├── .github └── workflows │ └── check-toolkit.yaml ├── .gitignore ├── INSTALL.rst ├── LICENSE.txt ├── README.rst ├── doc ├── Makefile ├── README.md ├── conf.py ├── index.rst ├── make.bat └── reference.rst ├── img ├── analysis_synthesis.png ├── global_spectrum.png ├── prosody_labeller.png └── screenshot.png ├── pyproject.toml ├── readthedocs.yml ├── samples ├── 01l_fact_0001.lab ├── 01l_fact_0001.wav ├── 14m_proosa_0002_0002.lab ├── 14m_proosa_0002_0002.wav ├── 40_N1_C_kissankello.TextGrid ├── 40_N1_C_kissankello.wav ├── 8hz_4hz_1hz.wav ├── kan_0001.F0 ├── kan_0001.lab ├── kan_0001.wav ├── libritts │ ├── 7127_75947_000010_000000.TextGrid │ ├── 7127_75947_000010_000000.wav │ ├── LJ050-0276.TextGrid │ ├── LJ050-0276.wav │ ├── LJ050-0277.TextGrid │ ├── LJ050-0277.wav │ ├── LJ050-0278.TextGrid │ └── LJ050-0278.wav ├── rjs_01_0003.F0 ├── rjs_01_0003.lab └── rjs_01_0003.wav ├── screenshot.png ├── test ├── diff_num.py ├── resources │ ├── 01l_fact_0001.cwt │ ├── libritts │ │ ├── 7127_75947_000010_000000.prom │ │ ├── LJ050-0276.prom │ │ ├── LJ050-0277.prom │ │ └── LJ050-0278.prom │ └── test_spectrum │ │ ├── 8hz_4hz_1hz.freqs.txt │ │ └── 8hz_4hz_1hz.spec.txt └── run_test.sh ├── tools.rst └── wavelet_prosody_toolkit ├── __init__.py ├── configs ├── default.yaml ├── libritts.yaml ├── libritts_boundary.yaml └── synthesis.yaml ├── cwt_analysis_synthesis.py ├── cwt_global_spectrum.py ├── prosody_labeller.py ├── prosody_tools ├── __init__.py ├── cwt_utils.py ├── duration_processing.py ├── energy_processing.py ├── f0_processing.py ├── filter.py ├── lab.py ├── loma.py ├── misc.py ├── pitch_tracker.py └── smooth_and_interp.py └── wavelet_gui.py /.github/workflows/check-toolkit.yaml: -------------------------------------------------------------------------------- 1 | name: check-wavelet-prosody-toolkit 2 | run-name: ${{ github.actor }} is in validation 3 | on: [push] 4 | jobs: 5 | build: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [ '3.8', '3.9', '3.10', 
'3.11' ] 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | # Setup python 15 | - name: Setup python 16 | uses: actions/setup-python@v1 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | architecture: x64 20 | 21 | # Install everything 22 | - name: Install wavelet-prosody-toolkit 23 | run: pip install -e . 24 | 25 | # Linux and macOS 26 | - name: Run the test 27 | shell: bash -l {0} 28 | run: | 29 | bash test/run_test.sh 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | doc/_build/ 66 | doc/_modules 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule.* 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | 106 | # wavelet specificities 107 | *.prom 108 | *.wav.* 109 | *.wav_*.* -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | Install procedure 2 | ================= 3 | 4 | Wavelet Prosody Analyzer is a toolkit comprising command line tools and a GUI application. 5 | All the tools are started from terminal, so some familiarity with command line tools is assumed. 6 | 7 | Installation has been tested only on one Ubuntu Linux, on Arch Linux and on MacOS Sierra machine. 8 | Running on windows might be possible if the required libraries can be installed. 9 | 10 | Default installation 11 | --------------------- 12 | 13 | To install the toolkit, simply run 14 | 15 | .. 
code:: sh 16 | 17 | pip install -e .[gui] 18 | 19 | It will install the dependencies needed to run the toolkit. 20 | 21 | To be able to run the application globally, the following line should be added to your shell profile file (~/.bashrc or ~/.profile in general): 22 | 23 | .. code:: sh 24 | 25 | export PATH=~/.local/bin:$PATH 26 | 27 | After restarting the shell, you can finally run the tool by calling them on the command line, like for example: 28 | 29 | .. code:: sh 30 | 31 | wavelet_gui 32 | 33 | Development mode installation 34 | ------------------------ 35 | 36 | Even if the setup doesn't require it, we advise to use the environment management system conda ( https://docs.conda.io/en/latest/miniconda.html ). 37 | Conda provides an easy way to define the environments and install precompiled packages. 38 | Therefore, the modification you will propose won't affect your system configuration. 39 | 40 | Assuming you have created activated the conda environment, you can install pre-compiled packages 41 | 42 | .. code:: sh 43 | 44 | conda install scipy numpy matplotlib joblib pyqt 45 | 46 | We then use the setup script to install the rest of the dependencies: 47 | 48 | .. code:: sh 49 | 50 | pip install -e .[full] 51 | 52 | To start the Wavelet Prosody Analyzer GUI, run the following commands: 53 | 54 | .. code:: sh 55 | 56 | wavelet_gui 57 | 58 | if it doesn’t work, please raise an issue on github here: https://github.com/asuni/wavelet_prosody_toolkit/issues . 
59 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Antti Suni 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |github-actions-badge| 2 | 3 | .. |github-actions-badge| image:: https://github.com/asuni/wavelet_prosody_toolkit/actions/workflows/check-toolkit.yaml/badge.svg 4 | .. 
_github-actions-badge: https://github.com/asuni/wavelet_prosody_toolkit/actions?query=check-wavelet-prosody-toolkit 5 | 6 | Wavelet prosody analyzer 7 | ======================== 8 | 9 | antti.suni@helsinki.fi 10 | 11 | **UPDATE 3.2.2020**, Additional command-line tools: **batch-processing, global spectrum and analysis-synthesis:** `tools.rst `__. 12 | 13 | |screenshot| 14 | 15 | .. |screenshot| image:: screenshot.png 16 | 17 | Description 18 | ----------- 19 | 20 | The program calculates f0, energy and duration features from speech 21 | wav-file, performs continuous wavelet analysis on combined features, 22 | finds prosodic events (prominences, boundaries) from the wavelet 23 | scalogram and aligns the events with transcribed units. 24 | 25 | See also: 26 | 27 | [1] Antti Suni, Juraj Šimko, Daniel Aalto, Martti Vainio, Hierarchical 28 | representation and estimation of prosody using continuous wavelet 29 | transform, Computer Speech & Language, Volume 45, 2017, Pages 123-136, 30 | ISSN 0885-2308, https://doi.org/10.1016/j.csl.2016.11.001. 31 | 32 | The default settings of the program are roughly the same as in the 33 | paper, duration signal was generated from word level labels. 34 | 35 | Requirements 36 | ------------ 37 | 38 | The wavelet prosody analysis depends on several packages which are installed automatically if you 39 | use the procedure describe in `./INSTALL.rst `__. 
40 | 41 | Here are the main dependencies: 42 | 43 | - **pycwt** for the wavelet analysis (see https://github.com/regeirk/pycwt/LICENSE.txt ) 44 | - **pyyaml** for the configuration (see https://github.com/yaml/pyyaml/blob/master/LICENSE ) 45 | - **soundfile** for playing waves (see https://github.com/bastibe/SoundFile/blob/master/LICENSE ) 46 | - **wavio** for reading/writing wav (see https://github.com/WarrenWeckesser/wavio/blob/master/README.rst ) 47 | - **tgt** for reading/writing textgrid (see https://github.com/hbuschme/TextGridTools/blob/master/LICENSE ) 48 | - **pyqt5** for the gui (see https://www.riverbankcomputing.com/commercial/pyqt ) 49 | - **matplotlib** for the plot rendering (see https://github.com/matplotlib/matplotlib/blob/master/LICENSE/LICENSE ) 50 | 51 | Here the optional dependencies: 52 | 53 | - **pyreaper** for the f0 extraction (see https://github.com/r9y9/pyreaper/blob/master/LICENSE.md ). 54 | 55 | **The user is invited to have a look at the license of the dependencies.** 56 | 57 | Installation 58 | ------------ 59 | 60 | see `./INSTALL.rst `__ 61 | 62 | Input information 63 | ----------------- 64 | 65 | - audio files in wav format 66 | - transcriptions in either htk .lab format or Praat textgrids 67 | 68 | Usage: 69 | ------ 70 | 71 | 1. Assuming the installation process is done in **global mode**, just do 72 | 73 | .. code:: sh 74 | 75 | wavelet_gui 76 | 77 | Otherwise, go to the root directory of the program in the terminal, and start by 78 | 79 | .. code:: sh 80 | 81 | python3 wavelet_prosody_toolkit/wavelet_gui.py 82 | 83 | 84 | 2. Select directory with speech and transciption files: 85 | ``Select Speech Directory...``. Some examples are provided in 86 | ``samples/`` directory. Files should have the same root, for example 87 | file1.wav, file1.lab or file2.wav file2.TextGrid. 88 | 89 | 3. Select features to use in analysis: ``Prosodic Feats for CWT..`` 90 | 91 | 4. 
Adjust Pitch tracking parameters for the speaker / environment, press 92 | ``Reprocess`` to see changes Set range for possible pitch values, 93 | typically males ~50-350Hz, females ~100-400Hz. If estimated track 94 | skips obviously voiced portions, move voicing threshold slider left. 95 | 96 | - Alternatively, pre-estimated f0 analyses can be used: file .f0 must 97 | exist and it should be either in praat matrix format or as a list 98 | file with one f0 value / line, frame shift must be constant 5ms. To 99 | get suitable format from Praat, select wav and do: 100 | 101 | - To Pitch: 0.005, 120, 400 102 | - To Matrix 103 | - Save as matrix text file: “/.f0” 104 | 105 | 5. Adjust the weights of prosodic features and choose if the final 106 | signal is combined by summing or multiplying the features 107 | 108 | 6. Select which tiers to use for durations signal generation / use 109 | duration estimated from signal 110 | 111 | 7. Select transcription level of interest: ``Select Tier`` 112 | 113 | 8. You can interactively zoom and move around with the button on top, 114 | and play the visible section 115 | 116 | 9. When everything is good, you can ``Process all`` which analyzes all 117 | utterances in the directory with the current settings, and saves 118 | prosodic labels in the speech directory as ``.prom`` 119 | 120 | Prosodic labels are saved in a tab separated form with the following 121 | columns: 122 | 123 | .. code:: 124 | 125 | 126 | 127 | Advanced Usage: 128 | --------------- 129 | 130 | Additional customization of the input signals and wavelet analysis is possible by modifying the configuration file. The default configuration is located in: 131 | 132 | .. code:: sh 133 | 134 | wavelet_prosody_toolkit/configs/default.yaml 135 | 136 | You can view an online version here: https://github.com/asuni/wavelet_prosody_toolkit/blob/master/wavelet_prosody_toolkit/configs/default.yaml 137 | 138 | You are recommended to make a copy of the default.yaml file (to e.g. 
myconfig.yaml), and modify the copy. To apply the modified configuration, start the program by 139 | 140 | .. code:: sh 141 | 142 | wavelet_gui --config path/to/myconfig.yaml 143 | 144 | Some helpful shortcuts 145 | ---------------------- 146 | 147 | Here are a list of shortcuts available in the GUI: 148 | 149 | - **CTRL+q** to quit 150 | - **F11** to switch between fullscreen et normal mode 151 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = wavelet-prosody-toolkit 8 | SOURCEDIR = . 9 | BUILDDIR = ../build/doc 10 | 11 | 12 | 13 | # Put it first so that "make" without argument is like "make help". 14 | help: 15 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 16 | 17 | .PHONY: help Makefile 18 | 19 | # Catch-all target: route all unknown targets to Sphinx using the new 20 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
21 | %: Makefile 22 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 23 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # How to generate the documentation 2 | 3 | - extract the info from the source code 4 | ```sh 5 | sphinx-apidoc ../wavelet_prosody_toolkit -o _modules -e -M 6 | ``` 7 | - generate the html documentation 8 | ```sh 9 | make html 10 | ``` 11 | - documentation is generated in `../build/docs/html` 12 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # wavelet-prosody-toolkit documentation build configuration file, created by 5 | # sphinx-quickstart on Tue Jan 9 14:55:15 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | # 20 | 21 | import sys 22 | import os 23 | sys.path.insert(0, os.path.abspath('../..')) 24 | 25 | # -- Fix non implicit call to sphinx-apidoc (see https://github.com/sphinx-doc/sphinx/issues/1861 ) 26 | import sphinx.apidoc 27 | 28 | def setup(app): 29 | """Helper to generate source code documentation 30 | """ 31 | sphinx.apidoc.main(['-f', '-T', '-e', '-o', 'doc/_modules', '../wavelet_prosody_toolkit/']) 32 | 33 | 34 | # -- General configuration ------------------------------------------------ 35 | 36 | # If your documentation needs a minimal Sphinx version, state it here. 37 | # 38 | # needs_sphinx = '1.0' 39 | 40 | # Add any Sphinx extension module names here, as strings. They can be 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 42 | # ones. 43 | extensions = ['sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.doctest', 46 | 'sphinx.ext.mathjax', 47 | 'sphinx.ext.ifconfig', 48 | 'sphinx.ext.viewcode', 49 | 'sphinx.ext.githubpages', 50 | 'sphinx.ext.napoleon', 51 | 'numpydoc'] 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | source_parsers = { 60 | '.md': 'recommonmark.parser.CommonMarkParser', 61 | } 62 | source_suffix = ['.rst', '.md'] 63 | # source_suffix = '.rst' 64 | 65 | # The master toctree document. 66 | master_doc = 'index' 67 | 68 | # General information about the project. 69 | project = 'Wavelet prosody analysis toolkit' 70 | copyright = '2018, Antti Suni' 71 | author = 'Antti Suni' 72 | 73 | # The version info for the project you're documenting, acts as replacement for 74 | # |version| and |release|, also used in various other places throughout the 75 | # built documents. 76 | # 77 | # The short X.Y version. 78 | version = '0.1a' 79 | # The full version, including alpha/beta/rc tags. 
80 | release = '0.1' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = None 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # This patterns also effect to html_static_path and html_extra_path 92 | exclude_patterns = ["README.md", "_modules/modules.rst"] 93 | 94 | # The name of the Pygments (syntax highlighting) style to use. 95 | pygments_style = 'sphinx' 96 | 97 | # If true, `todo` and `todoList` produce output, else they produce nothing. 98 | todo_include_todos = False 99 | 100 | 101 | # -- Options for HTML output ---------------------------------------------- 102 | 103 | # The theme to use for HTML and HTML Help pages. See the documentation for 104 | # a list of builtin themes. 105 | # 106 | html_theme = 'sphinx_rtd_theme' 107 | 108 | # Theme options are theme-specific and customize the look and feel of a theme 109 | # further. For a list of options available for each theme, see the 110 | # documentation. 111 | # 112 | # html_theme_options = {} 113 | 114 | # Add any paths that contain custom static files (such as style sheets) here, 115 | # relative to this directory. They are copied after the builtin static files, 116 | # so a file named "default.css" will overwrite the builtin "default.css". 117 | html_static_path = ['_static'] 118 | 119 | # Custom sidebar templates, must be a dictionary that maps document names 120 | # to template names. 
121 | # 122 | # This is required for the alabaster theme 123 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 124 | html_sidebars = { 125 | '**': [ 126 | 'relations.html', # needs 'show_related': True theme option to display 127 | 'searchbox.html', 128 | ] 129 | } 130 | 131 | 132 | # -- Options for HTMLHelp output ------------------------------------------ 133 | 134 | # Output file base name for HTML help builder. 135 | htmlhelp_basename = 'wavelet-prosody-toolkitdoc' 136 | 137 | 138 | # -- Options for LaTeX output --------------------------------------------- 139 | 140 | latex_elements = { 141 | # The paper size ('letterpaper' or 'a4paper'). 142 | # 143 | # 'papersize': 'letterpaper', 144 | 145 | # The font size ('10pt', '11pt' or '12pt'). 146 | # 147 | # 'pointsize': '10pt', 148 | 149 | # Additional stuff for the LaTeX preamble. 150 | # 151 | # 'preamble': '', 152 | 153 | # Latex figure (float) alignment 154 | # 155 | # 'figure_align': 'htbp', 156 | } 157 | 158 | # Grouping the document tree into LaTeX files. List of tuples 159 | # (source start file, target name, title, 160 | # author, documentclass [howto, manual, or own class]). 161 | latex_documents = [ 162 | (master_doc, 'wavelet-prosody-toolkit.tex', 'wavelet-prosody-toolkit Documentation', 163 | 'Antti Suni', 'manual'), 164 | ] 165 | 166 | 167 | # -- Options for manual page output --------------------------------------- 168 | 169 | # One entry per manual page. List of tuples 170 | # (source start file, name, description, authors, manual section). 171 | man_pages = [ 172 | (master_doc, 'wavelet-prosody-toolkit', 'wavelet-prosody-toolkit Documentation', 173 | [author], 1) 174 | ] 175 | 176 | 177 | # -- Options for Texinfo output ------------------------------------------- 178 | 179 | # Grouping the document tree into Texinfo files. 
List of tuples 180 | # (source start file, target name, title, author, 181 | # dir menu entry, description, category) 182 | texinfo_documents = [ 183 | (master_doc, 'wavelet-prosody-toolkit', 'wavelet-prosody-toolkit Documentation', 184 | author, 'wavelet-prosody-toolkit', 'One line description of project.', 185 | 'Miscellaneous'), 186 | ] 187 | 188 | # -- Options for Napoleon 189 | napoleon_google_docstring = True 190 | napoleon_numpy_docstring = True 191 | napoleon_include_init_with_doc = False 192 | napoleon_include_private_with_doc = False 193 | napoleon_include_special_with_doc = True 194 | napoleon_use_admonition_for_examples = False 195 | napoleon_use_admonition_for_notes = False 196 | napoleon_use_admonition_for_references = False 197 | napoleon_use_ivar = False 198 | napoleon_use_param = True 199 | napoleon_use_rtype = True 200 | 201 | numpydoc_show_inherited_class_members = False 202 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | 3 | API Documentation 4 | ----------------- 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | reference 10 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=PyCWT 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /doc/reference.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | .. toctree:: 4 | :maxdepth: 3 5 | 6 | _modules/wavelet_prosody_toolkit 7 | _modules/wavelet_prosody_toolkit.prosody_tools 8 | -------------------------------------------------------------------------------- /img/analysis_synthesis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/img/analysis_synthesis.png -------------------------------------------------------------------------------- /img/global_spectrum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/img/global_spectrum.png -------------------------------------------------------------------------------- /img/prosody_labeller.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/img/prosody_labeller.png -------------------------------------------------------------------------------- /img/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/img/screenshot.png 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = [ 4 | "setuptools>=61", 5 | ] 6 | 7 | [project] 8 | name = "wavelet_prosody_toolkit" 9 | version = "1.0" 10 | authors = [ 11 | {name="Antti Suni", email="antti.suni@helsinki.fi"}, 12 | {name="Sébastien Le Maguer", email="sebastien.lemaguer@helsinki.fi"} 13 | ] 14 | description = "Prosody wavelet analysis toolkit" 15 | readme = {file="README.rst", content-type="text/x-rst"} 16 | classifiers = [ 17 | 'Development Status :: 4 - Beta', 18 | # Audience 19 | 'Intended Audience :: Science/Research', 20 | # Topics 21 | 'Topic :: Multimedia :: Sound/Audio :: Speech', 22 | 'Topic :: Scientific/Engineering :: Information Analysis', 23 | 'Topic :: Scientific/Engineering :: Visualization', 24 | # Pick your license as you wish 25 | 'License :: OSI Approved :: MIT License', 26 | # Python version (FIXME: fix the list of python version based on travis results) 27 | 'Programming Language :: Python :: 3', 28 | 'Programming Language :: Python :: 3.7', 29 | 'Programming Language :: Python :: 3.8', 30 | 'Programming Language :: Python :: 3.9', 31 | 'Programming Language :: Python :: 3.10', 32 | 'Programming Language :: Python :: 3.11', 33 | ] 34 | dependencies = [ 35 | "pyyaml", 36 | "pycwt", 37 | "numpy", 38 | "scipy", 39 | "soundfile", 40 | "tgt", 41 | "wavio", 42 | "joblib" 43 | ] 44 | 45 | [project.optional-dependencies] 46 | gui = ["pyqt5", "matplotlib"] 47 | reaper = ["pyreaper"] 48 | docs = ["sphinx", "sphinx_rtd_theme", "numpydoc"] 49 | full = [ 50 | "pyqt5", 51 | "matplotlib", 52 | "pyreaper", 53 | "sphinx", 54 | "sphinx_rtd_theme", 55 | "numpydoc" 56 | ] 57 | dev = ["pre-commit"] 58 | 59 | 60 | [project.scripts] 61 | prosody_labeller = "wavelet_prosody_toolkit.prosody_labeller:main" 62 | cwt_analysis_synthesis = 
"wavelet_prosody_toolkit.cwt_analysis_synthesis:main" 63 | wavelet_gui = "wavelet_prosody_toolkit.wavelet_gui:main" 64 | 65 | [project.urls] 66 | Homepage = "https://github.com/asuni/wavelet_prosody_toolkit" 67 | Issues = "https://github.com/asuni/wavelet_prosody_toolkit/issues" 68 | git = "https://github.com/asuni/wavelet_prosody_toolkit.git" 69 | 70 | [tool.setuptools] 71 | packages = ["wavelet_prosody_toolkit"] 72 | 73 | [tool.black] 74 | line-length = 120 75 | target-version = ['py311'] 76 | include = '\.pyi?$' 77 | exclude = ''' 78 | /( 79 | \.toml 80 | |\.sh 81 | |\.git 82 | |\.ini 83 | |Dockerfile 84 | |Jenkinfile 85 | )/ 86 | ''' 87 | 88 | [tool.flake8] 89 | max-line-length = 120 90 | 91 | [tool.basedpyright] 92 | typeCheckingMode = "standard" 93 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | 3 | build: 4 | image: latest 5 | 6 | python: 7 | version: 3.6 8 | pip_install: true 9 | extra_requirements: 10 | - docs 11 | -------------------------------------------------------------------------------- /samples/01l_fact_0001.lab: -------------------------------------------------------------------------------- 1 | 0 11200000 # !SIL 2 | 11200000 11700000 p puheen 3 | 11700000 12200000 u 4 | 12200000 12900000 h 5 | 12900000 13550000 e 6 | 13550000 14050000 e 7 | 14050000 14550000 n 8 | 14550000 14950000 t tutkimus 9 | 14950000 15700000 u 10 | 15700000 16300000 t 11 | 16300000 16950000 k 12 | 16950000 17300000 i 13 | 17300000 18100000 m 14 | 18100000 18750000 u 15 | 18750000 20150000 s 16 | 20150000 20950000 o on 17 | 20950000 21300000 n 18 | 21300000 21700000 j jo 19 | 21700000 22250000 o 20 | 22250000 22850000 l lähtökohdiltaan 21 | 22850000 23650000 A_ 22 | 23650000 23950000 h 23 | 23950000 24500000 t 24 | 24500000 25050000 O_ 25 | 25050000 25700000 k 26 | 25700000 26350000 o 27 | 26350000 26650000 h 28 | 
26650000 27450000 d 29 | 27450000 27900000 i 30 | 27900000 28200000 l 31 | 28200000 28850000 t 32 | 28850000 29300000 a 33 | 29300000 29750000 a 34 | 29750000 30250000 n 35 | 30250000 31050000 m monia 36 | 31050000 31800000 o 37 | 31800000 32200000 n 38 | 32200000 33150000 i 39 | 33150000 33800000 a 40 | 33800000 35050000 e eri 41 | 35050000 35300000 r 42 | 35300000 35800000 i 43 | 35800000 36600000 t tieteenaloja 44 | 36600000 37350000 i 45 | 37350000 38050000 e 46 | 38050000 38950000 t 47 | 38950000 39600000 e 48 | 39600000 39850000 e 49 | 39850000 40600000 n 50 | 40600000 41500000 a 51 | 41500000 41850000 l 52 | 41850000 42350000 o 53 | 42350000 42800000 j 54 | 42800000 43400000 a 55 | 43400000 44500000 k kiinnostava 56 | 44500000 44850000 i 57 | 44850000 45200000 i 58 | 45200000 45500000 n 59 | 45500000 45800000 n 60 | 45800000 46150000 o 61 | 46150000 46950000 s 62 | 46950000 47350000 t 63 | 47350000 47800000 a 64 | 47800000 48350000 v 65 | 48350000 48650000 a 66 | 48650000 49650000 t tutkimuskohde 67 | 49650000 50200000 u 68 | 50200000 50850000 t 69 | 50850000 51650000 k 70 | 51650000 52000000 i 71 | 52000000 52500000 m 72 | 52500000 53150000 u 73 | 53150000 53750000 s 74 | 53750000 54650000 k 75 | 54650000 55300000 o 76 | 55300000 56400000 h 77 | 56400000 57100000 d 78 | 57100000 57350000 e 79 | 57350000 60550000 # 80 | 60550000 62300000 # !SIL 81 | -------------------------------------------------------------------------------- /samples/01l_fact_0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/01l_fact_0001.wav -------------------------------------------------------------------------------- /samples/14m_proosa_0002_0002.lab: -------------------------------------------------------------------------------- 1 | 0 1900000 # !SIL 2 | 1900000 2550000 t täällä 3 | 2550000 2850000 A_ 4 | 2850000 3250000 A_ 5 | 
3250000 3500000 l 6 | 3500000 3750000 l 7 | 3750000 4000000 A_ 8 | 4000000 4250000 v voi 9 | 4250000 4500000 o 10 | 4500000 4750000 i 11 | 4750000 5050000 o olla 12 | 5050000 5350000 l 13 | 5350000 5600000 l 14 | 5600000 5850000 a 15 | 5850000 6500000 h helvetin 16 | 6500000 7000000 e 17 | 7000000 7750000 l 18 | 7750000 8000000 v 19 | 8000000 8450000 e 20 | 8450000 9350000 t 21 | 9350000 9750000 i 22 | 9750000 10000000 n 23 | 10000000 11100000 h hauskaa 24 | 11100000 11700000 a 25 | 11700000 12250000 u 26 | 12250000 13450000 s 27 | 13450000 13850000 k 28 | 13850000 14100000 a 29 | 14100000 14650000 a 30 | 14650000 15500000 k kun 31 | 15500000 15950000 u 32 | 15950000 16300000 n 33 | 16300000 16550000 v vain 34 | 16550000 16800000 a 35 | 16800000 17050000 i 36 | 17050000 17400000 n 37 | 17400000 18250000 s sattuu 38 | 18250000 18650000 a 39 | 18650000 19750000 t 40 | 19750000 20250000 t 41 | 20250000 20750000 u 42 | 20750000 21150000 u 43 | 21150000 21650000 h hyvään 44 | 21650000 22250000 y 45 | 22250000 22550000 v 46 | 22550000 22800000 A_ 47 | 22800000 23700000 A_ 48 | 23700000 24050000 n 49 | 24050000 25100000 s seuraan 50 | 25100000 26300000 e 51 | 26300000 27200000 u 52 | 27200000 27750000 r 53 | 27750000 28600000 a 54 | 28600000 29300000 a 55 | 29300000 29550000 n 56 | 29550000 31400000 # !SIL 57 | -------------------------------------------------------------------------------- /samples/14m_proosa_0002_0002.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/14m_proosa_0002_0002.wav -------------------------------------------------------------------------------- /samples/40_N1_C_kissankello.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0 5 | xmax = 2.602675736961451 6 | tiers? 
7 | size = 5 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "sanat" 12 | xmin = 0 13 | xmax = 2.602675736961451 14 | intervals: size = 5 15 | intervals [1]: 16 | xmin = 0 17 | xmax = 0.3813532988377878 18 | text = "" 19 | intervals [2]: 20 | xmin = 0.3813532988377878 21 | xmax = 1.373576792765878 22 | text = "S0" 23 | intervals [3]: 24 | xmin = 1.373576792765878 25 | xmax = 1.9027481016021124 26 | text = "S1" 27 | intervals [4]: 28 | xmin = 1.9027481016021124 29 | xmax = 2.2477643338446045 30 | text = "S2" 31 | intervals [5]: 32 | xmin = 2.2477643338446045 33 | xmax = 2.602675736961451 34 | text = "" 35 | item [2]: 36 | class = "IntervalTier" 37 | name = "tavut" 38 | xmin = 0 39 | xmax = 2.602675736961451 40 | intervals: size = 6 41 | intervals [1]: 42 | xmin = 0 43 | xmax = 1.373576792765878 44 | text = "" 45 | intervals [2]: 46 | xmin = 1.373576792765878 47 | xmax = 1.6765392363669516 48 | text = "T1" 49 | intervals [3]: 50 | xmin = 1.6765392363669516 51 | xmax = 1.9027481016021124 52 | text = "T2" 53 | intervals [4]: 54 | xmin = 1.9027481016021124 55 | xmax = 2.087091927631417 56 | text = "T3" 57 | intervals [5]: 58 | xmin = 2.087091927631417 59 | xmax = 2.2477643338446045 60 | text = "T4" 61 | intervals [6]: 62 | xmin = 2.2477643338446045 63 | xmax = 2.602675736961451 64 | text = "" 65 | item [3]: 66 | class = "IntervalTier" 67 | name = "vokaalit" 68 | xmin = 0 69 | xmax = 2.602675736961451 70 | intervals: size = 9 71 | intervals [1]: 72 | xmin = 0 73 | xmax = 1.4793689320369707 74 | text = "" 75 | intervals [2]: 76 | xmin = 1.4793689320369707 77 | xmax = 1.571320010157871 78 | text = "V1" 79 | intervals [3]: 80 | xmin = 1.571320010157871 81 | xmax = 1.7530452937140502 82 | text = "" 83 | intervals [4]: 84 | xmin = 1.7530452937140502 85 | xmax = 1.8182176368032497 86 | text = "V2" 87 | intervals [5]: 88 | xmin = 1.8182176368032497 89 | xmax = 1.9419381861997345 90 | text = "" 91 | intervals [6]: 92 | xmin = 1.9419381861997345 93 | xmax = 
2.0138858704032554 94 | text = "V3" 95 | intervals [7]: 96 | xmin = 2.0138858704032554 97 | xmax = 2.159362246891967 98 | text = "" 99 | intervals [8]: 100 | xmin = 2.159362246891967 101 | xmax = 2.2477643338446045 102 | text = "V4" 103 | intervals [9]: 104 | xmin = 2.2477643338446045 105 | xmax = 2.602675736961451 106 | text = "" 107 | item [4]: 108 | class = "IntervalTier" 109 | name = "narina" 110 | xmin = 0 111 | xmax = 2.602675736961451 112 | intervals: size = 1 113 | intervals [1]: 114 | xmin = 0 115 | xmax = 2.602675736961451 116 | text = "" 117 | item [5]: 118 | class = "TextTier" 119 | name = "f0" 120 | xmin = 0 121 | xmax = 2.602675736961451 122 | points: size = 0 123 | -------------------------------------------------------------------------------- /samples/40_N1_C_kissankello.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/40_N1_C_kissankello.wav -------------------------------------------------------------------------------- /samples/8hz_4hz_1hz.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/8hz_4hz_1hz.wav -------------------------------------------------------------------------------- /samples/kan_0001.F0: -------------------------------------------------------------------------------- 1 | "ooTextFile" 2 | "Matrix" 3 | 0 6.0808125000000004 1212 0.0050000000000000001 0.012906249999999951 4 | 1 1 1 1 1 5 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 228.36184485720696 231.61652757293712 232.94093729501785 227.51383368857464 216.33460906087706 215.27691244045562 221.11449612190302 
218.73749076716285 210.80632647608695 216.90325929629068 223.94384903673213 225.41841484548863 226.11846290406339 226.11954324729203 224.88139992790673 223.62215680378435 222.51631893962201 223.03075366773194 222.96915357418595 223.73152556085446 224.80609593772559 225.50824037978597 226.81830524546291 228.14501252280334 228.78948597152723 229.07246619767847 230.14815441738497 235.26130461363218 236.32073042144751 236.49450434786604 238.46095315396661 240.84128480245542 244.37167124825746 248.87864181690765 252.7494925271013 256.02256272410347 259.5448056257564 261.76819010465573 263.64089652720781 267.16528052662721 270.36663728844292 272.30709486212839 274.06445940191497 278.16646409328678 283.03631150640149 286.33032571766074 289.32893079773515 292.74926085366747 294.47530691114048 296.20412557514214 299.77653491613984 301.34640298776617 300.72265065023493 300.57117180343602 300.54322241784689 300.87812316367229 296.91149071872832 288.47422347060598 271.90791284510249 259.16664198247071 263.5841298973549 270.99517054967697 277.56980539095446 284.69226936936599 292.39080752485023 298.14893975006152 300.64830326945861 300.535308712438 298.21337588807489 293.82945415167768 290.05836157813604 287.18566670309002 282.38276796082596 262.67393106580749 248.17949308134499 243.58330436526913 234.58666038254651 217.08028849022514 264.77749876705622 283.75529823726305 295.29117936685725 271.2658696840985 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 198.9937151583442 195.82911314957175 191.24619646152317 186.55121990123757 183.93790608393667 183.24964570276398 182.58674142580443 181.33201713840566 181.82060769563449 182.15963314579494 183.26864823387922 182.91813760591586 182.45025384584639 182.74787693489233 182.15209334520125 181.20248653105787 181.64176296643251 182.75239234491841 182.8105252313172 182.61521716020141 183.11764202454464 183.88269552193916 186.6996283750409 190.01788429522395 192.04589741202062 194.47419919231078 198.40352839749826 204.08051172009024 
207.15371998349769 212.90950156591094 216.39833239198393 216.70095096061533 230.47866350865178 234.55203378343182 234.72607756768389 226.95416182777086 261.61467344386904 273.77354649941964 268.10104244142565 258.44961985926363 259.5985309297489 276.08290840581191 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 238.73121837536968 231.65038175750107 227.00387059504189 218.55191493399556 212.813954226091 210.24313391777324 205.81156815827504 202.60817605765246 203.94956257286449 204.02247074006755 203.84687380119342 203.20810048697689 204.04584283404384 204.27213085035942 205.01051493569597 205.817153331632 206.93546010922626 210.70573458494977 214.00031103248378 215.94306869576735 218.44177530908811 219.7513254354416 221.57667262730408 222.71534374008451 222.39911569945374 222.2283197929068 222.99633068386871 223.23696243738738 223.91728945682996 224.31623142771136 226.4473588674978 229.20485949439603 230.131810794568 232.71389385289589 234.91434941444564 236.06569070482206 238.6461295490025 241.39250595463048 243.48809413261748 247.88868845252503 250.41994952660056 251.16545021663009 252.43215107592613 252.16637373750152 252.40989363141915 252.40615696983519 252.76927739314408 253.34286593112574 254.87372095819572 258.81715962176588 264.29701297096096 269.57150425336033 273.93073262777796 275.65352956053607 275.57767606915252 276.07298001511361 277.09045304728744 278.05862244527884 278.80239755104031 279.46087469795668 279.96428544241837 278.73668143065146 277.10602855821747 275.267213684946 270.71440548051459 261.25729813874545 250.67082887829321 251.74147179721723 261.93192397712158 265.56461357487308 268.31719701572075 269.2463986241122 270.29040468318146 269.96070474173581 268.96009005448661 264.53472984609004 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 210.57779289837967 208.72200414895624 200.98758135492108 198.0892655572633 
194.36902460857104 190.26772264367031 190.53710083685701 186.78475168693097 184.17419795629763 188.18261791415173 184.04517616078388 186.02976516762115 188.29901376666066 189.51076081203095 191.62248211585495 194.36706569453969 192.74401928457456 0 0 0 0 0 242.76956307244018 245.34296625486704 236.4593619292514 230.06892883318261 230.56880286244373 246.41440331914549 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 222.9773632188253 223.21560494756801 224.68006384268512 221.48185355999314 213.96882598904384 209.50588864003004 207.73562464175612 207.36391135453215 207.34667993749852 205.46253163203122 205.49317983442677 206.58259237278179 207.41174911006925 208.33644940205383 210.45806255817129 212.1271723390411 213.43304309411067 215.77599217722852 217.32825076801041 218.73018646892262 220.77372374500874 223.50874573647096 225.28445301587826 225.78586128346015 226.75681531299375 229.18509387962081 230.71369488860807 231.65744150710364 232.08883363375602 230.12881235918394 226.74688711060483 224.03602004015991 224.56770916465351 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 250.72415528073927 235.64345150821453 237.88962216356083 244.85877412818601 250.83363745289651 254.46671785519914 257.6102773584974 260.22376197805187 261.15932330660883 261.60156788006469 262.55281277174538 260.74378523617111 259.56357193214387 258.21892944587137 254.43961262710158 249.62001237593793 246.15549992812367 244.49906211565482 238.69522394781504 236.06780100937149 236.35362419753531 233.24196021019705 231.98236402530358 233.66758793248553 232.24441063254366 230.06784980811693 230.54524894551832 230.76548046114473 230.00538569147815 230.05031487502833 229.48008675848956 228.1879421579261 226.57687380235529 225.51335844587928 225.275881713904 224.59384441979793 224.88847859650778 222.88164326422691 219.94298326294577 218.6976195683113 214.37801341920294 209.52065521341018 207.11300405401644 
206.72809591393204 205.51812662856528 205.29733315493471 204.63974107730769 203.81440196851472 205.84394900680562 211.6695943612834 209.98221386329209 209.51272611029225 211.4758549299946 213.45660007028457 214.34010578091304 215.1969415637094 215.74168256218439 215.82024173143029 216.08637434019025 216.83700189350901 218.17647520585606 220.22870507113396 221.66948390212221 222.36652896972709 223.50259880963355 224.43233943941539 226.42185047368241 228.37813164181213 229.27643829583494 230.67273145621482 232.42982156985428 233.89256102656611 237.51360274036347 235.58744598339095 235.7061229212475 239.50880328850968 244.7050760102301 251.62676051743816 244.41235544486986 242.74041677023084 243.43034456053374 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 234.33011593775751 230.55758287210898 227.82790300891691 224.70855073399983 221.05060056992482 216.51557319954949 214.08846724939244 212.01444110917797 210.38417950696766 211.08960178934507 209.58586151700419 204.07276996040835 202.61117457980203 207.24036923786772 211.82686256218227 212.84519828917971 212.97687099568881 214.09143183527522 215.56193619437525 216.30317822772216 218.77319334891112 220.64943170143084 224.03417076130137 226.26463711529314 227.71292891189745 228.92478463087409 231.39963105256936 233.35726587383138 235.18184339568458 238.175655199673 240.9226026984899 241.69026758275251 242.23781436807914 243.761322830925 242.80998784908019 240.21811736137892 235.07547377361672 230.80524061841385 227.80249763147663 228.21379110285443 0 0 0 0 0 0 0 0 292.09856348586072 290.46298899787615 290.71949143530225 292.35433621454075 292.30523678880445 291.74070029277243 290.78919541757818 290.84386736590426 291.10656583730429 288.99131976717416 288.5427966773857 290.34976754415413 290.12116122964653 290.55055926258035 291.34236233693804 290.88460869133559 289.20848658844903 286.93040799204937 
282.55166910675905 262.91343080335326 251.39631081093097 239.15288547449609 221.65823166884323 222.4357525031954 221.96563281722302 212.47245441603911 209.3598698358505 210.71266660858771 206.93728472628271 206.05392812396502 224.56283402146508 239.72921973403791 244.27738688146906 249.58018815440261 243.19343735120745 238.09182243902595 233.92008214255333 230.91560724322798 227.780030579731 224.7263024707342 220.8377443704872 215.46891513625621 212.64780397297983 212.09226178608077 211.16518908048721 209.9662470394436 209.87602634910405 210.21883186948017 210.52424778129404 209.96465801539696 208.0709375121759 204.88624094713032 202.53144517375245 200.70418959125254 198.2048343808641 194.47628647781013 191.57540462581372 192.17638647270556 192.63457664151304 194.20891298977676 194.31854441066733 192.73015442924904 191.55708354452983 191.54400536020381 190.88775073286357 189.99455805631615 189.76488940445043 188.31604276636881 188.26620167826962 187.17701496709407 185.38257586939426 180.00071090430069 177.94143219647739 183.77511957738051 184.15213238004824 183.65681781893946 182.25731303302294 180.85217155761882 178.9063537120779 179.93393109888748 179.08979596280119 180.1556316806236 181.33000258865738 181.12465687877568 184.29970195297872 184.65679882426676 183.74038292792903 185.52442547496713 187.5800619014924 188.88388428753737 192.98650542741819 195.57022884466747 198.4434574842698 200.97658357697313 203.23726555571636 204.0306757615939 206.54848695209168 205.62494962780724 212.06920101870182 209.52108181320025 209.2081275810736 198.71541206943812 199.67671883143365 204.62827356504823 222.19733703222693 200.88818743081967 203.21649986166059 203.06897608803314 210.68770645195698 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 222.50616965196218 205.9044392660463 198.05767052561851 193.61783531783493 191.01914036991244 188.38185587157582 185.12928185457218 180.96706679447604 181.29083948756488 180.47383348602852 179.77796244633154 
179.13761775961137 180.87477148543988 193.5842746500789 221.51539336596508 219.2785971038868 216.56324152596497 215.70260531999457 213.66842374328701 209.48540964747329 208.02686883732582 207.20686693276204 206.64609470240188 208.11476684406546 209.79753325640499 210.77365482092569 210.24690926752032 208.86508723755199 205.99417207562809 203.42612739419141 202.06836372052888 201.08683839660313 202.6100708282317 203.77899261439697 203.90929176901111 203.73174494617857 204.25618849129413 205.19684306502194 205.10897248510665 204.49918300377468 207.80124634822366 209.48305921237784 206.93850674987567 206.89584234768796 207.75732328416248 207.93578837639419 206.64798401544473 205.37830542099522 204.0562171312844 203.59874277208991 202.61818742926718 202.12261707704059 202.82540556897882 202.21539407492276 202.41928537520073 203.6591913423625 204.16255074366478 203.59701602994809 203.93103996184141 206.80563311048212 207.63277664533754 207.3490117254114 208.6546475064082 208.55836742325991 211.79226896002973 212.97561328270854 212.29601377471761 211.47474432605878 213.88754613273028 213.89325885022939 211.37505611082847 209.79052370641114 210.31143010312297 210.88610831238142 207.64206301603565 205.78184809705527 205.42632932067573 168.93139437979798 168.95761281518313 170.2954996953361 171.80231954531038 209.96830939010982 0 0 0 0 0 0 0 0 0 0 0 0 233.93532122674213 227.61770669533681 214.59183701168834 210.54902741478102 211.23354071712194 211.94187209693339 210.49350093663716 207.94907881245379 205.39735829109333 201.70287027150908 199.14735095838074 196.9810894495607 192.34386595299893 191.04471313030561 189.46278851079839 184.24574920643693 182.01497709598524 173.87066442878151 168.44155642067741 179.80545787974154 203.55263360364185 194.61999298053954 201.27551758731889 197.22315915448334 198.13621794660938 157.84543411266995 164.17185067327807 171.06064875846695 169.8817412457569 169.69480530493968 169.52609055000278 164.92676799776379 168.00841100783254 
167.16039501885416 165.85202907101646 164.66857822110916 164.21509940271727 162.21061364082527 162.26883145090653 162.06494736479144 162.44864690276867 162.63332040640779 162.78820004943151 163.04256934411885 161.24311613025844 161.92398237885558 161.32044757540996 162.19415760274464 165.25750865846874 164.17151173872642 165.42156381092249 163.91085375276634 167.53511500488051 170.61114555801888 164.31902939328563 161.87365404885935 157.17393385493168 157.73938681824654 157.50935883358616 152.9724434977534 156.00014448517265 152.24785298001075 145.78832996357627 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | -------------------------------------------------------------------------------- /samples/kan_0001.lab: -------------------------------------------------------------------------------- 1 | 0 5600000 sil _UTTEND_ 2 | 5600000 5950000 a ameirika 3 | 5950000 7050000 m 4 | 7050000 8150000 e 5 | 8150000 8400000 i 6 | 8400000 8700000 r 7 | 8700000 9800000 i 8 | 9800000 10550000 k 9 | 10550000 13000000 a 10 | 13000000 14700000 sil _SPACE_ 11 | 14700000 16500000 s sannyukta 12 | 16500000 17250000 a 13 | 17250000 18300000 n 14 | 18300000 18600000 n 15 | 18600000 19450000 y 16 | 19450000 20500000 u 17 | 20500000 20650000 k 18 | 20650000 22000000 t 19 | 22000000 23700000 a 20 | 23700000 25050000 sil _SPACE_ 21 | 25050000 26600000 s sannsthaana 22 | 26600000 27100000 a 23 | 27100000 27350000 n 24 | 27350000 28050000 n 25 | 28050000 29150000 s 26 | 29150000 29850000 t 27 | 29850000 30500000 h 28 | 30500000 31450000 a 29 | 31450000 31650000 a 30 | 31650000 32450000 n 31 | 32450000 34050000 a 32 | 34050000 36950000 sil _SPACE_ 33 | 36950000 37800000 s sannvidhaana 34 | 37800000 38400000 a 35 | 38400000 38800000 n 36 | 38800000 39900000 n 37 | 39900000 40050000 v 38 | 40050000 41150000 i 39 | 41150000 
41850000 d 40 | 41850000 42000000 h 41 | 42000000 42200000 a 42 | 42200000 43500000 a 43 | 43500000 44600000 n 44 | 44600000 46150000 a 45 | 46150000 47500000 sil _SPACE_ 46 | 47500000 48100000 d dinaacharand 47 | 48100000 48850000 i 48 | 48850000 49600000 n 49 | 49600000 50550000 a 50 | 50550000 51450000 a 51 | 51450000 52000000 c 52 | 52000000 52550000 h 53 | 52550000 52800000 a 54 | 52800000 53500000 r 55 | 53500000 53750000 a 56 | 53750000 54150000 n 57 | 54150000 54750000 d 58 | 54750000 54750000 skip _TILDE_ 59 | 54750000 55700000 e e 60 | 55700000 60750000 sil _UTTEND_ 61 | -------------------------------------------------------------------------------- /samples/kan_0001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/kan_0001.wav -------------------------------------------------------------------------------- /samples/libritts/7127_75947_000010_000000.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0.0 5 | xmax = 5.1 6 | tiers? 
7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0.0 13 | xmax = 5.1 14 | intervals: size = 17 15 | intervals [1]: 16 | xmin = 0.000 17 | xmax = 0.680 18 | text = "yes" 19 | intervals [2]: 20 | xmin = 0.680 21 | xmax = 0.750 22 | text = "" 23 | intervals [3]: 24 | xmin = 0.750 25 | xmax = 0.890 26 | text = "the" 27 | intervals [4]: 28 | xmin = 0.890 29 | xmax = 1.450 30 | text = "character" 31 | intervals [5]: 32 | xmin = 1.450 33 | xmax = 1.670 34 | text = "which" 35 | intervals [6]: 36 | xmin = 1.670 37 | xmax = 1.880 38 | text = "your" 39 | intervals [7]: 40 | xmin = 1.880 41 | xmax = 2.250 42 | text = "royal" 43 | intervals [8]: 44 | xmin = 2.250 45 | xmax = 2.660 46 | text = "highness" 47 | intervals [9]: 48 | xmin = 2.660 49 | xmax = 3.190 50 | text = "assumed" 51 | intervals [10]: 52 | xmin = 3.190 53 | xmax = 3.380 54 | text = "is" 55 | intervals [11]: 56 | xmin = 3.380 57 | xmax = 3.520 58 | text = "in" 59 | intervals [12]: 60 | xmin = 3.520 61 | xmax = 3.960 62 | text = "perfect" 63 | intervals [13]: 64 | xmin = 3.960 65 | xmax = 4.380 66 | text = "harmony" 67 | intervals [14]: 68 | xmin = 4.380 69 | xmax = 4.550 70 | text = "with" 71 | intervals [15]: 72 | xmin = 4.550 73 | xmax = 4.710 74 | text = "your" 75 | intervals [16]: 76 | xmin = 4.710 77 | xmax = 5.080 78 | text = "own" 79 | intervals [17]: 80 | xmin = 5.080 81 | xmax = 5.1 82 | text = "" 83 | item [2]: 84 | class = "IntervalTier" 85 | name = "phones" 86 | xmin = 0.0 87 | xmax = 5.1 88 | intervals: size = 59 89 | intervals [1]: 90 | xmin = 0.000 91 | xmax = 0.170 92 | text = "Y" 93 | intervals [2]: 94 | xmin = 0.170 95 | xmax = 0.400 96 | text = "EH1" 97 | intervals [3]: 98 | xmin = 0.400 99 | xmax = 0.680 100 | text = "S" 101 | intervals [4]: 102 | xmin = 0.680 103 | xmax = 0.750 104 | text = "sp" 105 | intervals [5]: 106 | xmin = 0.750 107 | xmax = 0.810 108 | text = "DH" 109 | intervals [6]: 110 | xmin = 0.810 111 | xmax = 0.890 112 | text = 
"AH0" 113 | intervals [7]: 114 | xmin = 0.890 115 | xmax = 1.030 116 | text = "K" 117 | intervals [8]: 118 | xmin = 1.030 119 | xmax = 1.080 120 | text = "EH1" 121 | intervals [9]: 122 | xmin = 1.080 123 | xmax = 1.180 124 | text = "R" 125 | intervals [10]: 126 | xmin = 1.180 127 | xmax = 1.240 128 | text = "IH0" 129 | intervals [11]: 130 | xmin = 1.240 131 | xmax = 1.280 132 | text = "K" 133 | intervals [12]: 134 | xmin = 1.280 135 | xmax = 1.350 136 | text = "T" 137 | intervals [13]: 138 | xmin = 1.350 139 | xmax = 1.450 140 | text = "ER0" 141 | intervals [14]: 142 | xmin = 1.450 143 | xmax = 1.510 144 | text = "W" 145 | intervals [15]: 146 | xmin = 1.510 147 | xmax = 1.570 148 | text = "IH1" 149 | intervals [16]: 150 | xmin = 1.570 151 | xmax = 1.670 152 | text = "CH" 153 | intervals [17]: 154 | xmin = 1.670 155 | xmax = 1.700 156 | text = "Y" 157 | intervals [18]: 158 | xmin = 1.700 159 | xmax = 1.790 160 | text = "AO1" 161 | intervals [19]: 162 | xmin = 1.790 163 | xmax = 1.880 164 | text = "R" 165 | intervals [20]: 166 | xmin = 1.880 167 | xmax = 2.020 168 | text = "R" 169 | intervals [21]: 170 | xmin = 2.020 171 | xmax = 2.140 172 | text = "OY1" 173 | intervals [22]: 174 | xmin = 2.140 175 | xmax = 2.180 176 | text = "AH0" 177 | intervals [23]: 178 | xmin = 2.180 179 | xmax = 2.250 180 | text = "L" 181 | intervals [24]: 182 | xmin = 2.250 183 | xmax = 2.330 184 | text = "HH" 185 | intervals [25]: 186 | xmin = 2.330 187 | xmax = 2.440 188 | text = "AY1" 189 | intervals [26]: 190 | xmin = 2.440 191 | xmax = 2.480 192 | text = "N" 193 | intervals [27]: 194 | xmin = 2.480 195 | xmax = 2.570 196 | text = "AH0" 197 | intervals [28]: 198 | xmin = 2.570 199 | xmax = 2.660 200 | text = "S" 201 | intervals [29]: 202 | xmin = 2.660 203 | xmax = 2.740 204 | text = "AH0" 205 | intervals [30]: 206 | xmin = 2.740 207 | xmax = 2.890 208 | text = "S" 209 | intervals [31]: 210 | xmin = 2.890 211 | xmax = 3.080 212 | text = "UW1" 213 | intervals [32]: 214 | xmin = 3.080 215 | 
xmax = 3.150 216 | text = "M" 217 | intervals [33]: 218 | xmin = 3.150 219 | xmax = 3.190 220 | text = "D" 221 | intervals [34]: 222 | xmin = 3.190 223 | xmax = 3.300 224 | text = "IH1" 225 | intervals [35]: 226 | xmin = 3.300 227 | xmax = 3.380 228 | text = "Z" 229 | intervals [36]: 230 | xmin = 3.380 231 | xmax = 3.450 232 | text = "IH0" 233 | intervals [37]: 234 | xmin = 3.450 235 | xmax = 3.520 236 | text = "N" 237 | intervals [38]: 238 | xmin = 3.520 239 | xmax = 3.670 240 | text = "P" 241 | intervals [39]: 242 | xmin = 3.670 243 | xmax = 3.740 244 | text = "ER1" 245 | intervals [40]: 246 | xmin = 3.740 247 | xmax = 3.820 248 | text = "F" 249 | intervals [41]: 250 | xmin = 3.820 251 | xmax = 3.870 252 | text = "IH2" 253 | intervals [42]: 254 | xmin = 3.870 255 | xmax = 3.920 256 | text = "K" 257 | intervals [43]: 258 | xmin = 3.920 259 | xmax = 3.960 260 | text = "T" 261 | intervals [44]: 262 | xmin = 3.960 263 | xmax = 4.030 264 | text = "HH" 265 | intervals [45]: 266 | xmin = 4.030 267 | xmax = 4.070 268 | text = "AA1" 269 | intervals [46]: 270 | xmin = 4.070 271 | xmax = 4.150 272 | text = "R" 273 | intervals [47]: 274 | xmin = 4.150 275 | xmax = 4.200 276 | text = "M" 277 | intervals [48]: 278 | xmin = 4.200 279 | xmax = 4.240 280 | text = "AH0" 281 | intervals [49]: 282 | xmin = 4.240 283 | xmax = 4.290 284 | text = "N" 285 | intervals [50]: 286 | xmin = 4.290 287 | xmax = 4.380 288 | text = "IY0" 289 | intervals [51]: 290 | xmin = 4.380 291 | xmax = 4.430 292 | text = "W" 293 | intervals [52]: 294 | xmin = 4.430 295 | xmax = 4.470 296 | text = "IH0" 297 | intervals [53]: 298 | xmin = 4.470 299 | xmax = 4.550 300 | text = "TH" 301 | intervals [54]: 302 | xmin = 4.550 303 | xmax = 4.590 304 | text = "Y" 305 | intervals [55]: 306 | xmin = 4.590 307 | xmax = 4.620 308 | text = "UH1" 309 | intervals [56]: 310 | xmin = 4.620 311 | xmax = 4.710 312 | text = "R" 313 | intervals [57]: 314 | xmin = 4.710 315 | xmax = 4.890 316 | text = "OW1" 317 | intervals [58]: 
318 | xmin = 4.890 319 | xmax = 5.080 320 | text = "N" 321 | intervals [59]: 322 | xmin = 5.080 323 | xmax = 5.1 324 | text = "" 325 | -------------------------------------------------------------------------------- /samples/libritts/7127_75947_000010_000000.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/libritts/7127_75947_000010_000000.wav -------------------------------------------------------------------------------- /samples/libritts/LJ050-0276.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0.0 5 | xmax = 8.563673469387755 6 | tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0.0 13 | xmax = 8.563673469387755 14 | intervals: size = 26 15 | intervals [1]: 16 | xmin = 0.000 17 | xmax = 0.180 18 | text = "as" 19 | intervals [2]: 20 | xmin = 0.180 21 | xmax = 0.460 22 | text = "has" 23 | intervals [3]: 24 | xmin = 0.460 25 | xmax = 0.660 26 | text = "been" 27 | intervals [4]: 28 | xmin = 0.660 29 | xmax = 1.070 30 | text = "pointed" 31 | intervals [5]: 32 | xmin = 1.070 33 | xmax = 1.430 34 | text = "out" 35 | intervals [6]: 36 | xmin = 1.430 37 | xmax = 1.790 38 | text = "" 39 | intervals [7]: 40 | xmin = 1.790 41 | xmax = 1.900 42 | text = "the" 43 | intervals [8]: 44 | xmin = 1.900 45 | xmax = 2.330 46 | text = "commission" 47 | intervals [9]: 48 | xmin = 2.330 49 | xmax = 2.510 50 | text = "has" 51 | intervals [10]: 52 | xmin = 2.510 53 | xmax = 2.780 54 | text = "not" 55 | intervals [11]: 56 | xmin = 2.780 57 | xmax = 3.350 58 | text = "resolved" 59 | intervals [12]: 60 | xmin = 3.350 61 | xmax = 3.580 62 | text = "all" 63 | intervals [13]: 64 | xmin = 3.580 65 | xmax = 3.690 66 | text = "the" 67 | intervals [14]: 68 | xmin = 3.690 69 | 
xmax = 4.420 70 | text = "proposals" 71 | intervals [15]: 72 | xmin = 4.420 73 | xmax = 4.650 74 | text = "which" 75 | intervals [16]: 76 | xmin = 4.650 77 | xmax = 4.810 78 | text = "could" 79 | intervals [17]: 80 | xmin = 4.810 81 | xmax = 4.970 82 | text = "be" 83 | intervals [18]: 84 | xmin = 4.970 85 | xmax = 5.450 86 | text = "made" 87 | intervals [19]: 88 | xmin = 5.450 89 | xmax = 6.150 90 | text = "" 91 | intervals [20]: 92 | xmin = 6.150 93 | xmax = 6.250 94 | text = "the" 95 | intervals [21]: 96 | xmin = 6.250 97 | xmax = 6.740 98 | text = "commission" 99 | intervals [22]: 100 | xmin = 6.740 101 | xmax = 7.370 102 | text = "nevertheless" 103 | intervals [23]: 104 | xmin = 7.370 105 | xmax = 7.510 106 | text = "is" 107 | intervals [24]: 108 | xmin = 7.510 109 | xmax = 8.180 110 | text = "confident" 111 | intervals [25]: 112 | xmin = 8.180 113 | xmax = 8.470 114 | text = "that" 115 | intervals [26]: 116 | xmin = 8.470 117 | xmax = 8.563673469387755 118 | text = "" 119 | item [2]: 120 | class = "IntervalTier" 121 | name = "phones" 122 | xmin = 0.0 123 | xmax = 8.563673469387755 124 | intervals: size = 97 125 | intervals [1]: 126 | xmin = 0.000 127 | xmax = 0.110 128 | text = "ae" 129 | intervals [2]: 130 | xmin = 0.110 131 | xmax = 0.180 132 | text = "z" 133 | intervals [3]: 134 | xmin = 0.180 135 | xmax = 0.270 136 | text = "hh" 137 | intervals [4]: 138 | xmin = 0.270 139 | xmax = 0.370 140 | text = "ae" 141 | intervals [5]: 142 | xmin = 0.370 143 | xmax = 0.460 144 | text = "z" 145 | intervals [6]: 146 | xmin = 0.460 147 | xmax = 0.510 148 | text = "b" 149 | intervals [7]: 150 | xmin = 0.510 151 | xmax = 0.580 152 | text = "ih" 153 | intervals [8]: 154 | xmin = 0.580 155 | xmax = 0.660 156 | text = "n" 157 | intervals [9]: 158 | xmin = 0.660 159 | xmax = 0.740 160 | text = "p" 161 | intervals [10]: 162 | xmin = 0.740 163 | xmax = 0.870 164 | text = "oy" 165 | intervals [11]: 166 | xmin = 0.870 167 | xmax = 0.900 168 | text = "n" 169 | intervals [12]: 170 
| xmin = 0.900 171 | xmax = 0.960 172 | text = "t" 173 | intervals [13]: 174 | xmin = 0.960 175 | xmax = 0.990 176 | text = "ax" 177 | intervals [14]: 178 | xmin = 0.990 179 | xmax = 1.070 180 | text = "d" 181 | intervals [15]: 182 | xmin = 1.070 183 | xmax = 1.340 184 | text = "aw" 185 | intervals [16]: 186 | xmin = 1.340 187 | xmax = 1.430 188 | text = "t" 189 | intervals [17]: 190 | xmin = 1.430 191 | xmax = 1.790 192 | text = "sp" 193 | intervals [18]: 194 | xmin = 1.790 195 | xmax = 1.840 196 | text = "dh" 197 | intervals [19]: 198 | xmin = 1.840 199 | xmax = 1.900 200 | text = "ax" 201 | intervals [20]: 202 | xmin = 1.900 203 | xmax = 1.980 204 | text = "k" 205 | intervals [21]: 206 | xmin = 1.980 207 | xmax = 2.020 208 | text = "ax" 209 | intervals [22]: 210 | xmin = 2.020 211 | xmax = 2.080 212 | text = "m" 213 | intervals [23]: 214 | xmin = 2.080 215 | xmax = 2.140 216 | text = "ih" 217 | intervals [24]: 218 | xmin = 2.140 219 | xmax = 2.240 220 | text = "sh" 221 | intervals [25]: 222 | xmin = 2.240 223 | xmax = 2.280 224 | text = "ax" 225 | intervals [26]: 226 | xmin = 2.280 227 | xmax = 2.330 228 | text = "n" 229 | intervals [27]: 230 | xmin = 2.330 231 | xmax = 2.400 232 | text = "hh" 233 | intervals [28]: 234 | xmin = 2.400 235 | xmax = 2.440 236 | text = "ax" 237 | intervals [29]: 238 | xmin = 2.440 239 | xmax = 2.510 240 | text = "z" 241 | intervals [30]: 242 | xmin = 2.510 243 | xmax = 2.590 244 | text = "n" 245 | intervals [31]: 246 | xmin = 2.590 247 | xmax = 2.740 248 | text = "aa" 249 | intervals [32]: 250 | xmin = 2.740 251 | xmax = 2.780 252 | text = "t" 253 | intervals [33]: 254 | xmin = 2.780 255 | xmax = 2.840 256 | text = "r" 257 | intervals [34]: 258 | xmin = 2.840 259 | xmax = 2.870 260 | text = "iy" 261 | intervals [35]: 262 | xmin = 2.870 263 | xmax = 2.970 264 | text = "z" 265 | intervals [36]: 266 | xmin = 2.970 267 | xmax = 3.140 268 | text = "aa" 269 | intervals [37]: 270 | xmin = 3.140 271 | xmax = 3.220 272 | text = "l" 273 | 
intervals [38]: 274 | xmin = 3.220 275 | xmax = 3.270 276 | text = "v" 277 | intervals [39]: 278 | xmin = 3.270 279 | xmax = 3.350 280 | text = "d" 281 | intervals [40]: 282 | xmin = 3.350 283 | xmax = 3.520 284 | text = "ao" 285 | intervals [41]: 286 | xmin = 3.520 287 | xmax = 3.580 288 | text = "l" 289 | intervals [42]: 290 | xmin = 3.580 291 | xmax = 3.630 292 | text = "dh" 293 | intervals [43]: 294 | xmin = 3.630 295 | xmax = 3.690 296 | text = "ax" 297 | intervals [44]: 298 | xmin = 3.690 299 | xmax = 3.750 300 | text = "p" 301 | intervals [45]: 302 | xmin = 3.750 303 | xmax = 3.780 304 | text = "r" 305 | intervals [46]: 306 | xmin = 3.780 307 | xmax = 3.820 308 | text = "ax" 309 | intervals [47]: 310 | xmin = 3.820 311 | xmax = 3.940 312 | text = "p" 313 | intervals [48]: 314 | xmin = 3.940 315 | xmax = 4.100 316 | text = "ow" 317 | intervals [49]: 318 | xmin = 4.100 319 | xmax = 4.170 320 | text = "z" 321 | intervals [50]: 322 | xmin = 4.170 323 | xmax = 4.200 324 | text = "ax" 325 | intervals [51]: 326 | xmin = 4.200 327 | xmax = 4.310 328 | text = "l" 329 | intervals [52]: 330 | xmin = 4.310 331 | xmax = 4.420 332 | text = "z" 333 | intervals [53]: 334 | xmin = 4.420 335 | xmax = 4.490 336 | text = "w" 337 | intervals [54]: 338 | xmin = 4.490 339 | xmax = 4.550 340 | text = "ih" 341 | intervals [55]: 342 | xmin = 4.550 343 | xmax = 4.650 344 | text = "ch" 345 | intervals [56]: 346 | xmin = 4.650 347 | xmax = 4.720 348 | text = "k" 349 | intervals [57]: 350 | xmin = 4.720 351 | xmax = 4.780 352 | text = "uh" 353 | intervals [58]: 354 | xmin = 4.780 355 | xmax = 4.810 356 | text = "d" 357 | intervals [59]: 358 | xmin = 4.810 359 | xmax = 4.870 360 | text = "b" 361 | intervals [60]: 362 | xmin = 4.870 363 | xmax = 4.970 364 | text = "iy" 365 | intervals [61]: 366 | xmin = 4.970 367 | xmax = 5.060 368 | text = "m" 369 | intervals [62]: 370 | xmin = 5.060 371 | xmax = 5.360 372 | text = "ey" 373 | intervals [63]: 374 | xmin = 5.360 375 | xmax = 5.450 376 | 
text = "d" 377 | intervals [64]: 378 | xmin = 5.450 379 | xmax = 6.150 380 | text = "sp" 381 | intervals [65]: 382 | xmin = 6.150 383 | xmax = 6.210 384 | text = "dh" 385 | intervals [66]: 386 | xmin = 6.210 387 | xmax = 6.250 388 | text = "ax" 389 | intervals [67]: 390 | xmin = 6.250 391 | xmax = 6.340 392 | text = "k" 393 | intervals [68]: 394 | xmin = 6.340 395 | xmax = 6.390 396 | text = "ax" 397 | intervals [69]: 398 | xmin = 6.390 399 | xmax = 6.460 400 | text = "m" 401 | intervals [70]: 402 | xmin = 6.460 403 | xmax = 6.520 404 | text = "ih" 405 | intervals [71]: 406 | xmin = 6.520 407 | xmax = 6.630 408 | text = "sh" 409 | intervals [72]: 410 | xmin = 6.630 411 | xmax = 6.710 412 | text = "ax" 413 | intervals [73]: 414 | xmin = 6.710 415 | xmax = 6.740 416 | text = "n" 417 | intervals [74]: 418 | xmin = 6.740 419 | xmax = 6.800 420 | text = "n" 421 | intervals [75]: 422 | xmin = 6.800 423 | xmax = 6.880 424 | text = "eh" 425 | intervals [76]: 426 | xmin = 6.880 427 | xmax = 6.920 428 | text = "v" 429 | intervals [77]: 430 | xmin = 6.920 431 | xmax = 7.000 432 | text = "er" 433 | intervals [78]: 434 | xmin = 7.000 435 | xmax = 7.030 436 | text = "dh" 437 | intervals [79]: 438 | xmin = 7.030 439 | xmax = 7.100 440 | text = "ax" 441 | intervals [80]: 442 | xmin = 7.100 443 | xmax = 7.180 444 | text = "l" 445 | intervals [81]: 446 | xmin = 7.180 447 | xmax = 7.270 448 | text = "eh" 449 | intervals [82]: 450 | xmin = 7.270 451 | xmax = 7.370 452 | text = "s" 453 | intervals [83]: 454 | xmin = 7.370 455 | xmax = 7.400 456 | text = "ax" 457 | intervals [84]: 458 | xmin = 7.400 459 | xmax = 7.510 460 | text = "z" 461 | intervals [85]: 462 | xmin = 7.510 463 | xmax = 7.610 464 | text = "k" 465 | intervals [86]: 466 | xmin = 7.610 467 | xmax = 7.730 468 | text = "aa" 469 | intervals [87]: 470 | xmin = 7.730 471 | xmax = 7.780 472 | text = "n" 473 | intervals [88]: 474 | xmin = 7.780 475 | xmax = 7.880 476 | text = "f" 477 | intervals [89]: 478 | xmin = 7.880 479 | 
xmax = 7.910 480 | text = "ax" 481 | intervals [90]: 482 | xmin = 7.910 483 | xmax = 7.950 484 | text = "d" 485 | intervals [91]: 486 | xmin = 7.950 487 | xmax = 8.050 488 | text = "ax" 489 | intervals [92]: 490 | xmin = 8.050 491 | xmax = 8.110 492 | text = "n" 493 | intervals [93]: 494 | xmin = 8.110 495 | xmax = 8.180 496 | text = "t" 497 | intervals [94]: 498 | xmin = 8.180 499 | xmax = 8.230 500 | text = "dh" 501 | intervals [95]: 502 | xmin = 8.230 503 | xmax = 8.390 504 | text = "ae" 505 | intervals [96]: 506 | xmin = 8.390 507 | xmax = 8.470 508 | text = "t" 509 | intervals [97]: 510 | xmin = 8.470 511 | xmax = 8.563673469387755 512 | text = "sp" 513 | -------------------------------------------------------------------------------- /samples/libritts/LJ050-0276.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/libritts/LJ050-0276.wav -------------------------------------------------------------------------------- /samples/libritts/LJ050-0277.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0.0 5 | xmax = 8.714603174603175 6 | tiers? 
7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0.0 13 | xmax = 8.714603174603175 14 | intervals: size = 27 15 | intervals [1]: 16 | xmin = 0.000 17 | xmax = 0.170 18 | text = "with" 19 | intervals [2]: 20 | xmin = 0.170 21 | xmax = 0.340 22 | text = "the" 23 | intervals [3]: 24 | xmin = 0.340 25 | xmax = 0.740 26 | text = "active" 27 | intervals [4]: 28 | xmin = 0.740 29 | xmax = 1.590 30 | text = "cooperation" 31 | intervals [5]: 32 | xmin = 1.590 33 | xmax = 1.710 34 | text = "of" 35 | intervals [6]: 36 | xmin = 1.710 37 | xmax = 1.810 38 | text = "the" 39 | intervals [7]: 40 | xmin = 1.810 41 | xmax = 2.620 42 | text = "responsible" 43 | intervals [8]: 44 | xmin = 2.620 45 | xmax = 3.550 46 | text = "agencies" 47 | intervals [9]: 48 | xmin = 3.550 49 | xmax = 3.710 50 | text = "" 51 | intervals [10]: 52 | xmin = 3.710 53 | xmax = 4.000 54 | text = "and" 55 | intervals [11]: 56 | xmin = 4.000 57 | xmax = 4.170 58 | text = "with" 59 | intervals [12]: 60 | xmin = 4.170 61 | xmax = 4.310 62 | text = "the" 63 | intervals [13]: 64 | xmin = 4.310 65 | xmax = 5.030 66 | text = "understanding" 67 | intervals [14]: 68 | xmin = 5.030 69 | xmax = 5.130 70 | text = "of" 71 | intervals [15]: 72 | xmin = 5.130 73 | xmax = 5.240 74 | text = "the" 75 | intervals [16]: 76 | xmin = 5.240 77 | xmax = 5.590 78 | text = "people" 79 | intervals [17]: 80 | xmin = 5.590 81 | xmax = 5.700 82 | text = "of" 83 | intervals [18]: 84 | xmin = 5.700 85 | xmax = 5.780 86 | text = "the" 87 | intervals [19]: 88 | xmin = 5.780 89 | xmax = 6.240 90 | text = "united" 91 | intervals [20]: 92 | xmin = 6.240 93 | xmax = 6.720 94 | text = "states" 95 | intervals [21]: 96 | xmin = 6.720 97 | xmax = 6.820 98 | text = "in" 99 | intervals [22]: 100 | xmin = 6.820 101 | xmax = 7.000 102 | text = "their" 103 | intervals [23]: 104 | xmin = 7.000 105 | xmax = 7.450 106 | text = "demands" 107 | intervals [24]: 108 | xmin = 7.450 109 | xmax = 7.810 110 | text = 
"upon" 111 | intervals [25]: 112 | xmin = 7.810 113 | xmax = 7.980 114 | text = "their" 115 | intervals [26]: 116 | xmin = 7.980 117 | xmax = 8.620 118 | text = "president" 119 | intervals [27]: 120 | xmin = 8.620 121 | xmax = 8.714603174603175 122 | text = "" 123 | item [2]: 124 | class = "IntervalTier" 125 | name = "phones" 126 | xmin = 0.0 127 | xmax = 8.714603174603175 128 | intervals: size = 115 129 | intervals [1]: 130 | xmin = 0.000 131 | xmax = 0.060 132 | text = "w" 133 | intervals [2]: 134 | xmin = 0.060 135 | xmax = 0.140 136 | text = "ih" 137 | intervals [3]: 138 | xmin = 0.140 139 | xmax = 0.170 140 | text = "dh" 141 | intervals [4]: 142 | xmin = 0.170 143 | xmax = 0.200 144 | text = "dh" 145 | intervals [5]: 146 | xmin = 0.200 147 | xmax = 0.340 148 | text = "ax" 149 | intervals [6]: 150 | xmin = 0.340 151 | xmax = 0.540 152 | text = "ae" 153 | intervals [7]: 154 | xmin = 0.540 155 | xmax = 0.570 156 | text = "k" 157 | intervals [8]: 158 | xmin = 0.570 159 | xmax = 0.610 160 | text = "t" 161 | intervals [9]: 162 | xmin = 0.610 163 | xmax = 0.660 164 | text = "ax" 165 | intervals [10]: 166 | xmin = 0.660 167 | xmax = 0.740 168 | text = "v" 169 | intervals [11]: 170 | xmin = 0.740 171 | xmax = 0.850 172 | text = "k" 173 | intervals [12]: 174 | xmin = 0.850 175 | xmax = 0.940 176 | text = "ow" 177 | intervals [13]: 178 | xmin = 0.940 179 | xmax = 1.080 180 | text = "aa" 181 | intervals [14]: 182 | xmin = 1.080 183 | xmax = 1.140 184 | text = "p" 185 | intervals [15]: 186 | xmin = 1.140 187 | xmax = 1.270 188 | text = "er" 189 | intervals [16]: 190 | xmin = 1.270 191 | xmax = 1.380 192 | text = "ey" 193 | intervals [17]: 194 | xmin = 1.380 195 | xmax = 1.520 196 | text = "sh" 197 | intervals [18]: 198 | xmin = 1.520 199 | xmax = 1.550 200 | text = "ax" 201 | intervals [19]: 202 | xmin = 1.550 203 | xmax = 1.590 204 | text = "n" 205 | intervals [20]: 206 | xmin = 1.590 207 | xmax = 1.660 208 | text = "ah" 209 | intervals [21]: 210 | xmin = 1.660 211 | xmax 
= 1.710 212 | text = "v" 213 | intervals [22]: 214 | xmin = 1.710 215 | xmax = 1.740 216 | text = "dh" 217 | intervals [23]: 218 | xmin = 1.740 219 | xmax = 1.810 220 | text = "ax" 221 | intervals [24]: 222 | xmin = 1.810 223 | xmax = 1.860 224 | text = "r" 225 | intervals [25]: 226 | xmin = 1.860 227 | xmax = 1.920 228 | text = "iy" 229 | intervals [26]: 230 | xmin = 1.920 231 | xmax = 2.030 232 | text = "s" 233 | intervals [27]: 234 | xmin = 2.030 235 | xmax = 2.080 236 | text = "p" 237 | intervals [28]: 238 | xmin = 2.080 239 | xmax = 2.190 240 | text = "aa" 241 | intervals [29]: 242 | xmin = 2.190 243 | xmax = 2.260 244 | text = "n" 245 | intervals [30]: 246 | xmin = 2.260 247 | xmax = 2.320 248 | text = "s" 249 | intervals [31]: 250 | xmin = 2.320 251 | xmax = 2.370 252 | text = "ax" 253 | intervals [32]: 254 | xmin = 2.370 255 | xmax = 2.400 256 | text = "b" 257 | intervals [33]: 258 | xmin = 2.400 259 | xmax = 2.440 260 | text = "ax" 261 | intervals [34]: 262 | xmin = 2.440 263 | xmax = 2.620 264 | text = "l" 265 | intervals [35]: 266 | xmin = 2.620 267 | xmax = 2.770 268 | text = "ey" 269 | intervals [36]: 270 | xmin = 2.770 271 | xmax = 2.840 272 | text = "jh" 273 | intervals [37]: 274 | xmin = 2.840 275 | xmax = 2.880 276 | text = "ax" 277 | intervals [38]: 278 | xmin = 2.880 279 | xmax = 2.940 280 | text = "n" 281 | intervals [39]: 282 | xmin = 2.940 283 | xmax = 3.100 284 | text = "s" 285 | intervals [40]: 286 | xmin = 3.100 287 | xmax = 3.340 288 | text = "iy" 289 | intervals [41]: 290 | xmin = 3.340 291 | xmax = 3.550 292 | text = "z" 293 | intervals [42]: 294 | xmin = 3.550 295 | xmax = 3.710 296 | text = "sp" 297 | intervals [43]: 298 | xmin = 3.710 299 | xmax = 3.880 300 | text = "hh" 301 | intervals [44]: 302 | xmin = 3.880 303 | xmax = 3.930 304 | text = "ae" 305 | intervals [45]: 306 | xmin = 3.930 307 | xmax = 4.000 308 | text = "d" 309 | intervals [46]: 310 | xmin = 4.000 311 | xmax = 4.070 312 | text = "w" 313 | intervals [47]: 314 | xmin = 
4.070 315 | xmax = 4.140 316 | text = "ih" 317 | intervals [48]: 318 | xmin = 4.140 319 | xmax = 4.170 320 | text = "dh" 321 | intervals [49]: 322 | xmin = 4.170 323 | xmax = 4.200 324 | text = "dh" 325 | intervals [50]: 326 | xmin = 4.200 327 | xmax = 4.310 328 | text = "ax" 329 | intervals [51]: 330 | xmin = 4.310 331 | xmax = 4.390 332 | text = "ah" 333 | intervals [52]: 334 | xmin = 4.390 335 | xmax = 4.420 336 | text = "n" 337 | intervals [53]: 338 | xmin = 4.420 339 | xmax = 4.460 340 | text = "d" 341 | intervals [54]: 342 | xmin = 4.460 343 | xmax = 4.520 344 | text = "er" 345 | intervals [55]: 346 | xmin = 4.520 347 | xmax = 4.620 348 | text = "s" 349 | intervals [56]: 350 | xmin = 4.620 351 | xmax = 4.680 352 | text = "t" 353 | intervals [57]: 354 | xmin = 4.680 355 | xmax = 4.820 356 | text = "ae" 357 | intervals [58]: 358 | xmin = 4.820 359 | xmax = 4.860 360 | text = "n" 361 | intervals [59]: 362 | xmin = 4.860 363 | xmax = 4.910 364 | text = "d" 365 | intervals [60]: 366 | xmin = 4.910 367 | xmax = 4.960 368 | text = "ax" 369 | intervals [61]: 370 | xmin = 4.960 371 | xmax = 5.030 372 | text = "ng" 373 | intervals [62]: 374 | xmin = 5.030 375 | xmax = 5.080 376 | text = "ax" 377 | intervals [63]: 378 | xmin = 5.080 379 | xmax = 5.130 380 | text = "v" 381 | intervals [64]: 382 | xmin = 5.130 383 | xmax = 5.170 384 | text = "dh" 385 | intervals [65]: 386 | xmin = 5.170 387 | xmax = 5.240 388 | text = "ax" 389 | intervals [66]: 390 | xmin = 5.240 391 | xmax = 5.320 392 | text = "p" 393 | intervals [67]: 394 | xmin = 5.320 395 | xmax = 5.440 396 | text = "iy" 397 | intervals [68]: 398 | xmin = 5.440 399 | xmax = 5.490 400 | text = "p" 401 | intervals [69]: 402 | xmin = 5.490 403 | xmax = 5.520 404 | text = "ax" 405 | intervals [70]: 406 | xmin = 5.520 407 | xmax = 5.590 408 | text = "l" 409 | intervals [71]: 410 | xmin = 5.590 411 | xmax = 5.660 412 | text = "ah" 413 | intervals [72]: 414 | xmin = 5.660 415 | xmax = 5.700 416 | text = "v" 417 | intervals 
[73]: 418 | xmin = 5.700 419 | xmax = 5.740 420 | text = "dh" 421 | intervals [74]: 422 | xmin = 5.740 423 | xmax = 5.780 424 | text = "ax" 425 | intervals [75]: 426 | xmin = 5.780 427 | xmax = 5.830 428 | text = "y" 429 | intervals [76]: 430 | xmin = 5.830 431 | xmax = 5.860 432 | text = "uw" 433 | intervals [77]: 434 | xmin = 5.860 435 | xmax = 5.930 436 | text = "n" 437 | intervals [78]: 438 | xmin = 5.930 439 | xmax = 6.040 440 | text = "ay" 441 | intervals [79]: 442 | xmin = 6.040 443 | xmax = 6.110 444 | text = "t" 445 | intervals [80]: 446 | xmin = 6.110 447 | xmax = 6.160 448 | text = "ax" 449 | intervals [81]: 450 | xmin = 6.160 451 | xmax = 6.240 452 | text = "d" 453 | intervals [82]: 454 | xmin = 6.240 455 | xmax = 6.330 456 | text = "s" 457 | intervals [83]: 458 | xmin = 6.330 459 | xmax = 6.400 460 | text = "t" 461 | intervals [84]: 462 | xmin = 6.400 463 | xmax = 6.590 464 | text = "ey" 465 | intervals [85]: 466 | xmin = 6.590 467 | xmax = 6.640 468 | text = "t" 469 | intervals [86]: 470 | xmin = 6.640 471 | xmax = 6.720 472 | text = "s" 473 | intervals [87]: 474 | xmin = 6.720 475 | xmax = 6.760 476 | text = "ax" 477 | intervals [88]: 478 | xmin = 6.760 479 | xmax = 6.820 480 | text = "n" 481 | intervals [89]: 482 | xmin = 6.820 483 | xmax = 6.870 484 | text = "dh" 485 | intervals [90]: 486 | xmin = 6.870 487 | xmax = 6.930 488 | text = "eh" 489 | intervals [91]: 490 | xmin = 6.930 491 | xmax = 7.000 492 | text = "r" 493 | intervals [92]: 494 | xmin = 7.000 495 | xmax = 7.040 496 | text = "d" 497 | intervals [93]: 498 | xmin = 7.040 499 | xmax = 7.080 500 | text = "ax" 501 | intervals [94]: 502 | xmin = 7.080 503 | xmax = 7.180 504 | text = "m" 505 | intervals [95]: 506 | xmin = 7.180 507 | xmax = 7.330 508 | text = "ae" 509 | intervals [96]: 510 | xmin = 7.330 511 | xmax = 7.360 512 | text = "n" 513 | intervals [97]: 514 | xmin = 7.360 515 | xmax = 7.400 516 | text = "d" 517 | intervals [98]: 518 | xmin = 7.400 519 | xmax = 7.450 520 | text = "z" 
521 | intervals [99]: 522 | xmin = 7.450 523 | xmax = 7.540 524 | text = "ax" 525 | intervals [100]: 526 | xmin = 7.540 527 | xmax = 7.630 528 | text = "p" 529 | intervals [101]: 530 | xmin = 7.630 531 | xmax = 7.760 532 | text = "aa" 533 | intervals [102]: 534 | xmin = 7.760 535 | xmax = 7.810 536 | text = "n" 537 | intervals [103]: 538 | xmin = 7.810 539 | xmax = 7.850 540 | text = "dh" 541 | intervals [104]: 542 | xmin = 7.850 543 | xmax = 7.900 544 | text = "eh" 545 | intervals [105]: 546 | xmin = 7.900 547 | xmax = 7.980 548 | text = "r" 549 | intervals [106]: 550 | xmin = 7.980 551 | xmax = 8.060 552 | text = "p" 553 | intervals [107]: 554 | xmin = 8.060 555 | xmax = 8.120 556 | text = "r" 557 | intervals [108]: 558 | xmin = 8.120 559 | xmax = 8.180 560 | text = "eh" 561 | intervals [109]: 562 | xmin = 8.180 563 | xmax = 8.250 564 | text = "z" 565 | intervals [110]: 566 | xmin = 8.250 567 | xmax = 8.300 568 | text = "ax" 569 | intervals [111]: 570 | xmin = 8.300 571 | xmax = 8.370 572 | text = "d" 573 | intervals [112]: 574 | xmin = 8.370 575 | xmax = 8.460 576 | text = "ax" 577 | intervals [113]: 578 | xmin = 8.460 579 | xmax = 8.550 580 | text = "n" 581 | intervals [114]: 582 | xmin = 8.550 583 | xmax = 8.620 584 | text = "t" 585 | intervals [115]: 586 | xmin = 8.620 587 | xmax = 8.714603174603175 588 | text = "sp" 589 | -------------------------------------------------------------------------------- /samples/libritts/LJ050-0277.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/libritts/LJ050-0277.wav -------------------------------------------------------------------------------- /samples/libritts/LJ050-0278.TextGrid: -------------------------------------------------------------------------------- 1 | File type = "ooTextFile" 2 | Object class = "TextGrid" 3 | 4 | xmin = 0.0 5 | xmax = 8.923582766439909 6 | 
tiers? 7 | size = 2 8 | item []: 9 | item [1]: 10 | class = "IntervalTier" 11 | name = "words" 12 | xmin = 0.0 13 | xmax = 8.923582766439909 14 | intervals: size = 24 15 | intervals [1]: 16 | xmin = 0.000 17 | xmax = 0.100 18 | text = "the" 19 | intervals [2]: 20 | xmin = 0.100 21 | xmax = 0.910 22 | text = "recommendations" 23 | intervals [3]: 24 | xmin = 0.910 25 | xmax = 1.010 26 | text = "we" 27 | intervals [4]: 28 | xmin = 1.010 29 | xmax = 1.180 30 | text = "have" 31 | intervals [5]: 32 | xmin = 1.180 33 | xmax = 1.420 34 | text = "here" 35 | intervals [6]: 36 | xmin = 1.420 37 | xmax = 2.200 38 | text = "suggested" 39 | intervals [7]: 40 | xmin = 2.200 41 | xmax = 2.600 42 | text = "" 43 | intervals [8]: 44 | xmin = 2.600 45 | xmax = 2.780 46 | text = "would" 47 | intervals [9]: 48 | xmin = 2.780 49 | xmax = 3.270 50 | text = "greatly" 51 | intervals [10]: 52 | xmin = 3.270 53 | xmax = 3.740 54 | text = "advance" 55 | intervals [11]: 56 | xmin = 3.740 57 | xmax = 3.830 58 | text = "the" 59 | intervals [12]: 60 | xmin = 3.830 61 | xmax = 4.440 62 | text = "security" 63 | intervals [13]: 64 | xmin = 4.440 65 | xmax = 4.550 66 | text = "of" 67 | intervals [14]: 68 | xmin = 4.550 69 | xmax = 4.670 70 | text = "the" 71 | intervals [15]: 72 | xmin = 4.670 73 | xmax = 5.330 74 | text = "office" 75 | intervals [16]: 76 | xmin = 5.330 77 | xmax = 5.590 78 | text = "" 79 | intervals [17]: 80 | xmin = 5.590 81 | xmax = 5.940 82 | text = "without" 83 | intervals [18]: 84 | xmin = 5.940 85 | xmax = 6.170 86 | text = "any" 87 | intervals [19]: 88 | xmin = 6.170 89 | xmax = 6.830 90 | text = "impairment" 91 | intervals [20]: 92 | xmin = 6.830 93 | xmax = 7.080 94 | text = "" 95 | intervals [21]: 96 | xmin = 7.080 97 | xmax = 7.230 98 | text = "of" 99 | intervals [22]: 100 | xmin = 7.230 101 | xmax = 7.430 102 | text = "our" 103 | intervals [23]: 104 | xmin = 7.430 105 | xmax = 8.130 106 | text = "fundamental" 107 | intervals [24]: 108 | xmin = 8.130 109 | xmax = 
8.923582766439909 110 | text = "liberties" 111 | item [2]: 112 | class = "IntervalTier" 113 | name = "phones" 114 | xmin = 0.0 115 | xmax = 8.923582766439909 116 | intervals: size = 108 117 | intervals [1]: 118 | xmin = 0.000 119 | xmax = 0.050 120 | text = "dh" 121 | intervals [2]: 122 | xmin = 0.050 123 | xmax = 0.100 124 | text = "ax" 125 | intervals [3]: 126 | xmin = 0.100 127 | xmax = 0.190 128 | text = "r" 129 | intervals [4]: 130 | xmin = 0.190 131 | xmax = 0.260 132 | text = "eh" 133 | intervals [5]: 134 | xmin = 0.260 135 | xmax = 0.320 136 | text = "k" 137 | intervals [6]: 138 | xmin = 0.320 139 | xmax = 0.350 140 | text = "ax" 141 | intervals [7]: 142 | xmin = 0.350 143 | xmax = 0.400 144 | text = "m" 145 | intervals [8]: 146 | xmin = 0.400 147 | xmax = 0.460 148 | text = "ax" 149 | intervals [9]: 150 | xmin = 0.460 151 | xmax = 0.490 152 | text = "n" 153 | intervals [10]: 154 | xmin = 0.490 155 | xmax = 0.530 156 | text = "d" 157 | intervals [11]: 158 | xmin = 0.530 159 | xmax = 0.660 160 | text = "ey" 161 | intervals [12]: 162 | xmin = 0.660 163 | xmax = 0.760 164 | text = "sh" 165 | intervals [13]: 166 | xmin = 0.760 167 | xmax = 0.790 168 | text = "ax" 169 | intervals [14]: 170 | xmin = 0.790 171 | xmax = 0.840 172 | text = "n" 173 | intervals [15]: 174 | xmin = 0.840 175 | xmax = 0.910 176 | text = "z" 177 | intervals [16]: 178 | xmin = 0.910 179 | xmax = 0.970 180 | text = "w" 181 | intervals [17]: 182 | xmin = 0.970 183 | xmax = 1.010 184 | text = "iy" 185 | intervals [18]: 186 | xmin = 1.010 187 | xmax = 1.100 188 | text = "hh" 189 | intervals [19]: 190 | xmin = 1.100 191 | xmax = 1.130 192 | text = "ax" 193 | intervals [20]: 194 | xmin = 1.130 195 | xmax = 1.180 196 | text = "v" 197 | intervals [21]: 198 | xmin = 1.180 199 | xmax = 1.250 200 | text = "hh" 201 | intervals [22]: 202 | xmin = 1.250 203 | xmax = 1.360 204 | text = "ih" 205 | intervals [23]: 206 | xmin = 1.360 207 | xmax = 1.420 208 | text = "r" 209 | intervals [24]: 210 | xmin = 
1.420 211 | xmax = 1.540 212 | text = "s" 213 | intervals [25]: 214 | xmin = 1.540 215 | xmax = 1.610 216 | text = "ax" 217 | intervals [26]: 218 | xmin = 1.610 219 | xmax = 1.640 220 | text = "g" 221 | intervals [27]: 222 | xmin = 1.640 223 | xmax = 1.760 224 | text = "jh" 225 | intervals [28]: 226 | xmin = 1.760 227 | xmax = 1.850 228 | text = "eh" 229 | intervals [29]: 230 | xmin = 1.850 231 | xmax = 1.950 232 | text = "s" 233 | intervals [30]: 234 | xmin = 1.950 235 | xmax = 2.020 236 | text = "t" 237 | intervals [31]: 238 | xmin = 2.020 239 | xmax = 2.110 240 | text = "ax" 241 | intervals [32]: 242 | xmin = 2.110 243 | xmax = 2.200 244 | text = "d" 245 | intervals [33]: 246 | xmin = 2.200 247 | xmax = 2.600 248 | text = "sp" 249 | intervals [34]: 250 | xmin = 2.600 251 | xmax = 2.690 252 | text = "w" 253 | intervals [35]: 254 | xmin = 2.690 255 | xmax = 2.720 256 | text = "uh" 257 | intervals [36]: 258 | xmin = 2.720 259 | xmax = 2.780 260 | text = "d" 261 | intervals [37]: 262 | xmin = 2.780 263 | xmax = 2.880 264 | text = "g" 265 | intervals [38]: 266 | xmin = 2.880 267 | xmax = 2.950 268 | text = "r" 269 | intervals [39]: 270 | xmin = 2.950 271 | xmax = 3.080 272 | text = "ey" 273 | intervals [40]: 274 | xmin = 3.080 275 | xmax = 3.130 276 | text = "t" 277 | intervals [41]: 278 | xmin = 3.130 279 | xmax = 3.190 280 | text = "l" 281 | intervals [42]: 282 | xmin = 3.190 283 | xmax = 3.270 284 | text = "iy" 285 | intervals [43]: 286 | xmin = 3.270 287 | xmax = 3.310 288 | text = "ax" 289 | intervals [44]: 290 | xmin = 3.310 291 | xmax = 3.370 292 | text = "d" 293 | intervals [45]: 294 | xmin = 3.370 295 | xmax = 3.440 296 | text = "v" 297 | intervals [46]: 298 | xmin = 3.440 299 | xmax = 3.580 300 | text = "ae" 301 | intervals [47]: 302 | xmin = 3.580 303 | xmax = 3.640 304 | text = "n" 305 | intervals [48]: 306 | xmin = 3.640 307 | xmax = 3.740 308 | text = "s" 309 | intervals [49]: 310 | xmin = 3.740 311 | xmax = 3.780 312 | text = "dh" 313 | intervals [50]: 
314 | xmin = 3.780 315 | xmax = 3.830 316 | text = "ax" 317 | intervals [51]: 318 | xmin = 3.830 319 | xmax = 3.950 320 | text = "s" 321 | intervals [52]: 322 | xmin = 3.950 323 | xmax = 4.000 324 | text = "ax" 325 | intervals [53]: 326 | xmin = 4.000 327 | xmax = 4.030 328 | text = "k" 329 | intervals [54]: 330 | xmin = 4.030 331 | xmax = 4.150 332 | text = "y" 333 | intervals [55]: 334 | xmin = 4.150 335 | xmax = 4.180 336 | text = "uh" 337 | intervals [56]: 338 | xmin = 4.180 339 | xmax = 4.260 340 | text = "r" 341 | intervals [57]: 342 | xmin = 4.260 343 | xmax = 4.300 344 | text = "ax" 345 | intervals [58]: 346 | xmin = 4.300 347 | xmax = 4.380 348 | text = "t" 349 | intervals [59]: 350 | xmin = 4.380 351 | xmax = 4.440 352 | text = "iy" 353 | intervals [60]: 354 | xmin = 4.440 355 | xmax = 4.500 356 | text = "ax" 357 | intervals [61]: 358 | xmin = 4.500 359 | xmax = 4.550 360 | text = "v" 361 | intervals [62]: 362 | xmin = 4.550 363 | xmax = 4.630 364 | text = "ih" 365 | intervals [63]: 366 | xmin = 4.630 367 | xmax = 4.670 368 | text = "n" 369 | intervals [64]: 370 | xmin = 4.670 371 | xmax = 4.820 372 | text = "ao" 373 | intervals [65]: 374 | xmin = 4.820 375 | xmax = 4.920 376 | text = "f" 377 | intervals [66]: 378 | xmin = 4.920 379 | xmax = 5.010 380 | text = "ax" 381 | intervals [67]: 382 | xmin = 5.010 383 | xmax = 5.330 384 | text = "s" 385 | intervals [68]: 386 | xmin = 5.330 387 | xmax = 5.590 388 | text = "sp" 389 | intervals [69]: 390 | xmin = 5.590 391 | xmax = 5.670 392 | text = "w" 393 | intervals [70]: 394 | xmin = 5.670 395 | xmax = 5.710 396 | text = "ih" 397 | intervals [71]: 398 | xmin = 5.710 399 | xmax = 5.780 400 | text = "th" 401 | intervals [72]: 402 | xmin = 5.780 403 | xmax = 5.880 404 | text = "aw" 405 | intervals [73]: 406 | xmin = 5.880 407 | xmax = 5.940 408 | text = "t" 409 | intervals [74]: 410 | xmin = 5.940 411 | xmax = 6.000 412 | text = "eh" 413 | intervals [75]: 414 | xmin = 6.000 415 | xmax = 6.060 416 | text = "n" 417 | 
intervals [76]: 418 | xmin = 6.060 419 | xmax = 6.170 420 | text = "iy" 421 | intervals [77]: 422 | xmin = 6.170 423 | xmax = 6.200 424 | text = "ax" 425 | intervals [78]: 426 | xmin = 6.200 427 | xmax = 6.270 428 | text = "m" 429 | intervals [79]: 430 | xmin = 6.270 431 | xmax = 6.370 432 | text = "p" 433 | intervals [80]: 434 | xmin = 6.370 435 | xmax = 6.490 436 | text = "eh" 437 | intervals [81]: 438 | xmin = 6.490 439 | xmax = 6.550 440 | text = "r" 441 | intervals [82]: 442 | xmin = 6.550 443 | xmax = 6.610 444 | text = "m" 445 | intervals [83]: 446 | xmin = 6.610 447 | xmax = 6.660 448 | text = "ax" 449 | intervals [84]: 450 | xmin = 6.660 451 | xmax = 6.730 452 | text = "n" 453 | intervals [85]: 454 | xmin = 6.730 455 | xmax = 6.830 456 | text = "t" 457 | intervals [86]: 458 | xmin = 6.830 459 | xmax = 7.080 460 | text = "sp" 461 | intervals [87]: 462 | xmin = 7.080 463 | xmax = 7.160 464 | text = "ah" 465 | intervals [88]: 466 | xmin = 7.160 467 | xmax = 7.230 468 | text = "v" 469 | intervals [89]: 470 | xmin = 7.230 471 | xmax = 7.370 472 | text = "aw" 473 | intervals [90]: 474 | xmin = 7.370 475 | xmax = 7.430 476 | text = "er" 477 | intervals [91]: 478 | xmin = 7.430 479 | xmax = 7.570 480 | text = "f" 481 | intervals [92]: 482 | xmin = 7.570 483 | xmax = 7.650 484 | text = "ah" 485 | intervals [93]: 486 | xmin = 7.650 487 | xmax = 7.690 488 | text = "n" 489 | intervals [94]: 490 | xmin = 7.690 491 | xmax = 7.720 492 | text = "d" 493 | intervals [95]: 494 | xmin = 7.720 495 | xmax = 7.760 496 | text = "ax" 497 | intervals [96]: 498 | xmin = 7.760 499 | xmax = 7.840 500 | text = "m" 501 | intervals [97]: 502 | xmin = 7.840 503 | xmax = 7.890 504 | text = "eh" 505 | intervals [98]: 506 | xmin = 7.890 507 | xmax = 7.970 508 | text = "n" 509 | intervals [99]: 510 | xmin = 7.970 511 | xmax = 8.010 512 | text = "t" 513 | intervals [100]: 514 | xmin = 8.010 515 | xmax = 8.070 516 | text = "ax" 517 | intervals [101]: 518 | xmin = 8.070 519 | xmax = 8.130 520 | 
text = "l" 521 | intervals [102]: 522 | xmin = 8.130 523 | xmax = 8.220 524 | text = "l" 525 | intervals [103]: 526 | xmin = 8.220 527 | xmax = 8.260 528 | text = "ih" 529 | intervals [104]: 530 | xmin = 8.260 531 | xmax = 8.310 532 | text = "b" 533 | intervals [105]: 534 | xmin = 8.310 535 | xmax = 8.400 536 | text = "er" 537 | intervals [106]: 538 | xmin = 8.400 539 | xmax = 8.460 540 | text = "t" 541 | intervals [107]: 542 | xmin = 8.460 543 | xmax = 8.660 544 | text = "iy" 545 | intervals [108]: 546 | xmin = 8.660 547 | xmax = 8.923582766439909 548 | text = "z" 549 | -------------------------------------------------------------------------------- /samples/libritts/LJ050-0278.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/libritts/LJ050-0278.wav -------------------------------------------------------------------------------- /samples/rjs_01_0003.lab: -------------------------------------------------------------------------------- 1 | 0 1550000 pau !SENT_START 2 | 1550000 3050000 h high 3 | 3050000 4250000 ai 4 | 4250000 5050000 l labor 5 | 5050000 6500000 ei 6 | 6500000 7200000 b 7 | 7200000 8450000 @ 8 | 8450000 10450000 s small 9 | 10450000 10950000 m 10 | 10950000 12300000 oo 11 | 12300000 12750000 lw 12 | 12750000 14000000 s scale 13 | 14000000 14500000 k 14 | 14500000 16050000 ei 15 | 16050000 17450000 lw 16 | 17450000 18250000 e enterprises 17 | 18250000 18750000 n 18 | 18750000 19300000 t 19 | 19300000 19800000 @ 20 | 19800000 20600000 p 21 | 20600000 21500000 r 22 | 21500000 22350000 ai 23 | 22350000 23400000 z 24 | 23400000 24300000 i 25 | 24300000 25500000 z 26 | 25500000 25850000 pau 27 | 25850000 26200000 b by 28 | 26200000 27800000 ai 29 | 27800000 28050000 i employing 30 | 28050000 28950000 m 31 | 28950000 29800000 p 32 | 29800000 30150000 l 33 | 30150000 31550000 oi 34 | 31550000 31850000 i 35 | 
31850000 32700000 ng 36 | 32700000 33650000 l low 37 | 33650000 35200000 ou 38 | 35200000 36550000 k cost 39 | 36550000 38000000 o 40 | 38000000 38950000 s 41 | 38950000 39800000 t 42 | 39800000 40700000 m marginal 43 | 40700000 42250000 aa 44 | 42250000 43100000 jh 45 | 43100000 43350000 i 46 | 43350000 43900000 n 47 | 43900000 45500000 l! 48 | 45500000 45800000 l labor 49 | 45800000 47000000 ei 50 | 47000000 47600000 b 51 | 47600000 48700000 @ 52 | 48700000 51500000 pau 53 | 51500000 51900000 w which 54 | 51900000 52300000 i 55 | 52300000 53300000 ch 56 | 53300000 53600000 r require 57 | 53600000 53900000 i 58 | 53900000 54900000 k 59 | 54900000 55300000 w 60 | 55300000 57250000 ai 61 | 57250000 57550000 @ 62 | 57550000 58600000 l large 63 | 58600000 60700000 aa 64 | 60700000 62100000 jh 65 | 62100000 63000000 l labor 66 | 63000000 64100000 ei 67 | 64100000 64400000 b 68 | 64400000 65850000 @ 69 | 65850000 66100000 i inputs 70 | 66100000 67100000 n 71 | 67100000 67950000 p 72 | 67950000 68750000 u 73 | 68750000 70000000 t 74 | 70000000 71350000 s 75 | 71350000 73150000 pau !SENT_END 76 | -------------------------------------------------------------------------------- /samples/rjs_01_0003.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/samples/rjs_01_0003.wav -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/screenshot.png -------------------------------------------------------------------------------- /test/diff_num.py: -------------------------------------------------------------------------------- 1 | # travis test for comparing prominence and boundary values across versions. 
2 | # allow for minor differences in values 3 | import sys, glob 4 | import numpy as np 5 | ref_files = sorted(glob.glob(sys.argv[1]+"/*.prom")) 6 | test_files = sorted(glob.glob(sys.argv[2]+"/*.prom")) 7 | 8 | for i in range(len(ref_files)): 9 | ref = (open(ref_files[i], "r")).readlines() 10 | test = (open(test_files[i], "r")).readlines() 11 | 12 | val_ref = [] 13 | val_test = [] 14 | # compare prominence and boundary values with some tolerance 15 | for l in ref: 16 | val_ref.append(float(l.strip().split("\t")[-1])) 17 | val_ref.append(float(l.strip().split("\t")[-2])) 18 | for l in test: 19 | val_test.append(float(l.strip().split("\t")[-1])) 20 | val_test.append(float(l.strip().split("\t")[-2])) 21 | 22 | 23 | assert np.allclose(np.array(val_ref), np.array(val_test), atol=0.3), \ 24 | ref_files[i]+" and "+test_files[i]+ " differ too much!" 25 | -------------------------------------------------------------------------------- /test/resources/libritts/7127_75947_000010_000000.prom: -------------------------------------------------------------------------------- 1 | 7127_75947_000010_000000 0.000 0.680 yes 0.745 1.052 2 | 7127_75947_000010_000000 0.750 0.890 the -0.116 0.000 3 | 7127_75947_000010_000000 0.890 1.450 character 3.169 1.255 4 | 7127_75947_000010_000000 1.450 1.670 which -0.008 0.000 5 | 7127_75947_000010_000000 1.670 1.880 your 0.017 0.366 6 | 7127_75947_000010_000000 1.880 2.250 royal 1.332 0.361 7 | 7127_75947_000010_000000 2.250 2.660 highness 0.655 0.923 8 | 7127_75947_000010_000000 2.660 3.190 assumed 2.819 1.281 9 | 7127_75947_000010_000000 3.190 3.380 is -0.057 0.000 10 | 7127_75947_000010_000000 3.380 3.520 in 0.006 0.511 11 | 7127_75947_000010_000000 3.520 3.960 perfect 2.022 0.487 12 | 7127_75947_000010_000000 3.960 4.380 harmony 0.900 0.630 13 | 7127_75947_000010_000000 4.380 4.550 with -0.100 0.218 14 | 7127_75947_000010_000000 4.550 4.710 your -0.024 0.000 15 | 7127_75947_000010_000000 4.710 5.080 own 1.285 1.000 16 | 
-------------------------------------------------------------------------------- /test/resources/libritts/LJ050-0276.prom: -------------------------------------------------------------------------------- 1 | LJ050-0276 0.000 0.180 as 0.336 0.174 2 | LJ050-0276 0.180 0.460 has 1.829 0.000 3 | LJ050-0276 0.460 0.660 been -0.023 0.684 4 | LJ050-0276 0.660 1.070 pointed 0.584 0.247 5 | LJ050-0276 1.070 1.430 out 0.942 1.440 6 | LJ050-0276 1.790 1.900 the -0.011 0.000 7 | LJ050-0276 1.900 2.330 commission 1.712 0.000 8 | LJ050-0276 2.330 2.510 has -0.057 0.856 9 | LJ050-0276 2.510 2.780 not 2.328 0.364 10 | LJ050-0276 2.780 3.350 resolved 0.250 1.209 11 | LJ050-0276 3.350 3.580 all 1.080 0.000 12 | LJ050-0276 3.580 3.690 the 0.000 0.891 13 | LJ050-0276 3.690 4.420 proposals 1.988 0.578 14 | LJ050-0276 4.420 4.650 which 0.633 0.057 15 | LJ050-0276 4.650 4.810 could 0.023 0.692 16 | LJ050-0276 4.810 4.970 be -0.077 0.000 17 | LJ050-0276 4.970 5.450 made 1.280 0.978 18 | LJ050-0276 6.150 6.250 the -0.003 0.000 19 | LJ050-0276 6.250 6.740 commission 1.177 0.121 20 | LJ050-0276 6.740 7.370 nevertheless 0.347 1.309 21 | LJ050-0276 7.370 7.510 is 0.000 0.000 22 | LJ050-0276 7.510 8.180 confident 2.469 0.663 23 | LJ050-0276 8.180 8.470 that 0.723 1.000 24 | -------------------------------------------------------------------------------- /test/resources/libritts/LJ050-0277.prom: -------------------------------------------------------------------------------- 1 | LJ050-0277 0.000 0.170 with 0.430 1.010 2 | LJ050-0277 0.170 0.340 the -0.037 0.000 3 | LJ050-0277 0.340 0.740 active 3.736 1.174 4 | LJ050-0277 0.740 1.590 cooperation 0.959 0.439 5 | LJ050-0277 1.590 1.710 of 0.000 0.000 6 | LJ050-0277 1.710 1.810 the -0.027 1.361 7 | LJ050-0277 1.810 2.620 responsible 2.287 0.704 8 | LJ050-0277 2.620 3.550 agencies 1.066 0.939 9 | LJ050-0277 3.710 4.000 and 0.266 0.000 10 | LJ050-0277 4.000 4.170 with 0.864 0.244 11 | LJ050-0277 4.170 4.310 the 0.378 0.071 12 | LJ050-0277 4.310 5.030 
understanding 1.089 0.087 13 | LJ050-0277 5.030 5.130 of 0.000 0.854 14 | LJ050-0277 5.130 5.240 the -0.080 0.000 15 | LJ050-0277 5.240 5.590 people 2.182 0.000 16 | LJ050-0277 5.590 5.700 of -0.009 1.087 17 | LJ050-0277 5.700 5.780 the -0.060 0.000 18 | LJ050-0277 5.780 6.240 united 0.351 0.292 19 | LJ050-0277 6.240 6.720 states 0.911 0.204 20 | LJ050-0277 6.720 6.820 in 0.000 0.024 21 | LJ050-0277 6.820 7.000 their 0.095 0.083 22 | LJ050-0277 7.000 7.450 demands 0.350 0.774 23 | LJ050-0277 7.450 7.810 upon 2.003 0.767 24 | LJ050-0277 7.810 7.980 their -0.103 0.000 25 | LJ050-0277 7.980 8.620 president 1.032 1.000 26 | -------------------------------------------------------------------------------- /test/resources/libritts/LJ050-0278.prom: -------------------------------------------------------------------------------- 1 | LJ050-0278 0.000 0.100 the 0.000 0.000 2 | LJ050-0278 0.100 0.910 recommendations 1.652 1.132 3 | LJ050-0278 0.910 1.010 we 0.000 0.000 4 | LJ050-0278 1.010 1.180 have -0.049 0.124 5 | LJ050-0278 1.180 1.420 here 1.098 0.460 6 | LJ050-0278 1.420 2.200 suggested 1.787 1.327 7 | LJ050-0278 2.600 2.780 would 0.205 0.297 8 | LJ050-0278 2.780 3.270 greatly 2.707 0.259 9 | LJ050-0278 3.270 3.740 advance 0.002 1.320 10 | LJ050-0278 3.740 3.830 the -0.056 0.000 11 | LJ050-0278 3.830 4.440 security 1.914 0.000 12 | LJ050-0278 4.440 4.550 of 0.000 0.665 13 | LJ050-0278 4.550 4.670 the -0.033 0.000 14 | LJ050-0278 4.670 5.330 office 0.960 1.147 15 | LJ050-0278 5.590 5.940 without 1.523 0.000 16 | LJ050-0278 5.940 6.170 any 0.148 0.613 17 | LJ050-0278 6.170 6.830 impairment 1.496 1.364 18 | LJ050-0278 7.080 7.230 of 0.000 0.000 19 | LJ050-0278 7.230 7.430 our 1.700 0.000 20 | LJ050-0278 7.430 8.130 fundamental 0.053 0.552 21 | LJ050-0278 8.130 8.924 liberties 0.713 1.000 22 | -------------------------------------------------------------------------------- /test/resources/test_spectrum/8hz_4hz_1hz.freqs.txt: 
-------------------------------------------------------------------------------- 1 | 16.00000 14.92853 13.92881 12.99604 12.12573 11.31371 10.55606 9.84916 9.18959 8.57419 8.00000 7.46426 6.96440 6.49802 6.06287 5.65685 5.27803 4.92458 4.59479 4.28709 4.00000 3.73213 3.48220 3.24901 3.03143 2.82843 2.63902 2.46229 2.29740 2.14355 2.00000 1.86607 1.74110 1.62450 1.51572 1.41421 1.31951 1.23114 1.14870 1.07177 1.00000 0.93303 0.87055 0.81225 0.75786 0.70711 0.65975 0.61557 0.57435 0.53589 0.50000 0.46652 0.43528 0.40613 0.37893 0.35355 0.32988 0.30779 0.28717 0.26794 0.25000 -------------------------------------------------------------------------------- /test/resources/test_spectrum/8hz_4hz_1hz.spec.txt: -------------------------------------------------------------------------------- 1 | 0.06162 0.07188 0.08484 0.10062 0.12017 0.14874 0.19752 0.27625 0.37755 0.46408 0.48220 0.40586 0.27301 0.16186 0.11925 0.14038 0.20643 0.30600 0.42299 0.51960 0.53918 0.45020 0.29310 0.15702 0.08705 0.05898 0.04023 0.02544 0.02024 0.02280 0.02593 0.02505 0.02128 0.02030 0.02924 0.05343 0.09916 0.17173 0.26570 0.35278 0.38407 0.32698 0.20557 0.08974 0.02797 0.01087 0.00972 0.01044 0.01066 0.01113 0.01204 0.01370 0.01593 0.01885 0.02227 0.02558 0.02821 0.03015 0.03098 0.03060 0.03062 -------------------------------------------------------------------------------- /test/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check default script 4 | python3 wavelet_prosody_toolkit/cwt_analysis_synthesis.py -v samples/01l_fact_0001.wav 01l_fact_0001.cwt 5 | diff 01l_fact_0001.cwt test/resources/01l_fact_0001.cwt 6 | ret=$? 
7 | if [ $ret != 0 ]; then 8 | exit $ret 9 | fi 10 | 11 | # Check prosody labeller 12 | python3 wavelet_prosody_toolkit/prosody_labeller.py -v -o test_libri -c wavelet_prosody_toolkit/configs/libritts.yaml samples/libritts 13 | python3 test/diff_num.py test_libri test/resources/libritts 14 | #diff -r test_libri test/resources/libritts 15 | ret=$? 16 | if [ $ret != 0 ]; then 17 | exit $ret 18 | fi 19 | 20 | # Check global spectrum extractor 21 | python3 wavelet_prosody_toolkit/cwt_global_spectrum.py -v -o test_spectrum samples/8hz_4hz_1hz.wav 22 | diff -r test_spectrum/ test/resources/test_spectrum 23 | ret=$? 24 | if [ $ret != 0 ]; then 25 | exit $ret 26 | fi 27 | -------------------------------------------------------------------------------- /tools.rst: -------------------------------------------------------------------------------- 1 | Additional tools for prosody processing with wavelets 2 | ----------------------------------------------------- 3 | 4 | Besides the graphical Wavelet Prosody Analyzer, the repository contains additional command-line tools related to prosody processing with wavelets, described below. 5 | Precise usage of the tools can be checked by running the tools with the --help flag, for example: 6 | 7 | .. code:: sh 8 | 9 | python3 cwt_analysis_synthesis.py --help 10 | 11 | | 12 | | 13 | 14 | **prosody_labeller.py** 15 | 16 | .. image:: img/prosody_labeller.png 17 | :width: 600 18 | 19 | This tool provides the same functionality as the graphical wavelet prosody analyzer. With parallel processing and no graphical overhead, it is suitable for processing large speech corpora. We also provide configuration files fine-tuned for English prominence and boundary estimation. Try: 20 | 21 | .. code:: sh 22 | 23 | python3 prosody_labeller.py samples/libritts --config configs/libritts.yaml 24 | 25 | or 26 | 27 | .. 
code:: sh 28 | 29 | python3 prosody_labeller.py samples/libritts --config configs/libritts_boundary.yaml 30 | 31 | 32 | *Talman A, Suni A, Celikkanat H, Kakouros S, Tiedemann J, Vainio M. Predicting Prosodic Prominence from Text with Pre-trained Contextualized Word Representations. Nordic Conference of Computational Linguistics. 2019 Aug 9.* 33 | 34 | *Antti Suni, Juraj Šimko, Daniel Aalto, Martti Vainio, Hierarchical representation and estimation of prosody using continuous wavelet transform, Computer Speech & Language, Volume 45, 2017, Pages 123-136, ISSN 0885-2308, https://doi.org/10.1016/j.csl.2016.11.001.* 35 | 36 | | 37 | | 38 | 39 | **cwt_analysis_synthesis.py** 40 | 41 | .. image:: img/analysis_synthesis.png 42 | :width: 600 43 | 44 | This tool demonstrates how F0 can be decomposed to temporal scales which can be associated to phonological levels, and how the original F0 contour can be reconstructed from these scales. 45 | 46 | *Suni, A. S., Aalto, D., Raitio, T., Alku, P., & Vainio, M. (2013). Wavelets for intonation modeling in HMM speech synthesis. In A. Bonafonte (Ed.), 8th ISCA Workshop on Speech Synthesis, Proceedings, Barcelona, August 31 - September 2, 2013 (pp. 285-290). Barcelona: ISCA.* 47 | 48 | | 49 | | 50 | 51 | **cwt_global_spectrum.py** 52 | 53 | .. image:: img/global_spectrum.png 54 | :width: 600 55 | 56 | This script extracts global wavelet spectrum of the speech envelope, similar to amplitude modulation spectrum. 57 | 58 | *Suni , A , Kallio , H , Benus , S & Šimko , J 2019 , Characterizing second language fluency with global wavelet spectrum . 
in S Calhoun , P Escudero , M Tabain & P Warren (eds) , Proceedings of the 19th International Congress of Phonetic Sciences, Melbourne, Australia.* 59 | 60 | 61 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asuni/wavelet_prosody_toolkit/564d2aad4ae2401aab2e521255e1d65dacc3756d/wavelet_prosody_toolkit/__init__.py -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/configs/default.yaml: -------------------------------------------------------------------------------- 1 | f0: 2 | use_existing_f0: false # do not perform pitch tracking if .f0 file exists 3 | pitch_tracker: "inst_freq" # inst_freq (our algorithm) or REAPER (from google) 4 | min_f0: 50 5 | max_f0: 400 6 | voicing_threshold: 50 # from 0 (all voiced) to 100 (no voicing) 7 | interpolation_method: "true_envelope" # fill unvoiced sections; "linear", "pchip" or "true_envelope" 8 | 9 | energy: 10 | # subband for energy calculation (in hz) 11 | band_min: 200 12 | band_max: 5000 13 | calculation_method: "rms" # 'rms', 'hilbert' or 'true_envelope' (root mean square, hilbert envelope, or true envelope -inspired method) 14 | smooth_energy: True # smoothing reduces the effect of voicing related variation in energy contour 15 | 16 | 17 | duration: 18 | acoustic_estimation: False # estimate speech rate from signal using wavelet transform of energy 19 | delta_duration: False # use differential duration signal, useful for boundary detection 20 | duration_tiers: ["phones","words"] 21 | weights: [0.5, 0.5] # weight for selected duration tiers 22 | silence_symbols: ["#", "!pau", "", "pau", "!sil", "sil", "", " ","

", "", "." ,",","?"] 23 | linear: False # linear or logarithmic durations 24 | bump: False # more lively signal by emphasizing the differences 25 | # between adjacent unit duration 26 | 27 | feature_combination: 28 | type: "sum" # sum or product 29 | detrend: True # detrend can be used to remove typical downdrift during utterance, mainly produces more balanced looking scalograms 30 | weights: 31 | f0: 1.0 32 | energy: 1.0 33 | duration: 0.5 34 | 35 | labels: 36 | annotation_tier: "words" # adjust these to your annotation scheme 37 | 38 | wavelet: 39 | mother_wavelet: "mexican_hat" # mexican_hat, morlet or paul 40 | period: 3 # applies to morlet and paul wavelets (periods 2 - 5 are reasonable) 41 | 42 | scale_distance: 0.25 # distance between adjacent scales in octaves, (0.25 means 4 scales per octave) 43 | num_scales: 34 # number of wavelet scales 44 | magnitude: False # for purposes other than prosodic event annotation, 45 | # examinining wavelet magnitude might be useful (complex wavelets Morlet or Paul should be used) 46 | 47 | # lines of maximum amplitude, defined as octaves below and above the measured unit scale 48 | loma: 49 | prom_start: -3 # -3 meaning three octaves below unit scale 50 | prom_end: 0 51 | boundary_start: -2 52 | boundary_end: 1 # one octave higher than unit scale 53 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/configs/libritts.yaml: -------------------------------------------------------------------------------- 1 | f0: 2 | use_existing_f0: false # do not perform pitch tracking if .f0 file exists 3 | pitch_tracker: "inst_freq" # inst_freq (our algorithm) or REAPER (from google) 4 | min_f0: 50 5 | max_f0: 400 6 | voicing_threshold: 20 # from 0 (all voiced) to 100 (no voicing) 7 | interpolation_method: "true_envelope" # fill unvoiced sections; "linear", "pchip" or "true_envelope" 8 | 9 | energy: 10 | # subband for energy calculation (in hz) 11 | band_min: 400 12 | band_max: 4000 
13 | calculation_method: "rms" # 'rms', 'hilbert' or 'true_envelope' (root mean square, hilbert envelope, or true envelope -inspired method) 14 | smooth_energy: True # smoothing reduces the effect of voicing related variation in energy contour 15 | 16 | 17 | duration: 18 | acoustic_estimation: False # estimate speech rate from signal using wavelet transform of energy 19 | delta_duration: False # use differential duration signal, useful for boundary detection 20 | bump: False # more lively signal by emphasizing the differences 21 | # between adjacent unit durations 22 | duration_tiers: ["words", "phones"] 23 | weights: [0.5, 0.5] # weight for selected duration tiers 24 | #duration_tiers: ["words"] 25 | silence_symbols: ["#", "!pau", "", "pau","sp", "!sil", "sil", "", " ","

", "", "." ,",","?"] 26 | linear: False # linear or logarithmic durations 27 | feature_combination: 28 | type: "product" # sum or product 29 | detrend: True # detrend can be used to remove typical downdrift during utterance, mainly produces more balanced looking scalograms 30 | weights: 31 | f0: 1.0 32 | energy: 0.5 33 | duration: 1.0 34 | 35 | labels: 36 | annotation_tier: "words" # adjust these to your annotation scheme 37 | 38 | wavelet: 39 | mother_wavelet: "mexican_hat" # mexican_hat, morlet or paul 40 | period: 3 # applies to morlet and paul wavelets (periods 2 - 5 are reasonable) 41 | 42 | scale_distance: 0.25 # distance between adjacent scales in octaves, (0.25 means 4 scales per octave) 43 | num_scales: 40 # number of wavelet scales 44 | magnitude: False # for purposes other than prosodic event annotation, 45 | # examinining wavelet magnitude might be useful (complex wavelets Morlet or Paul should be used) 46 | 47 | # lines of maximum amplitude, defined as octaves below and above the measured unit scale 48 | loma: 49 | prom_start: -2 # -3 meaning three octaves below unit scale 50 | prom_end: 1 51 | boundary_start: -1 52 | boundary_end: 2 # one octave higher than unit scale 53 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/configs/libritts_boundary.yaml: -------------------------------------------------------------------------------- 1 | f0: 2 | use_existing_f0: false # do not perform pitch tracking if .f0 file exists 3 | pitch_tracker: "inst_freq" # inst_freq (our algorithm) or REAPER (from google) 4 | min_f0: 50 5 | max_f0: 400 6 | voicing_threshold: 20 # from 0 (all voiced) to 100 (no voicing) 7 | interpolation_method: "true_envelope" # fill unvoiced sections; "linear", "pchip" or "true_envelope" 8 | 9 | energy: 10 | # subband for energy calculation (in hz) 11 | band_min: 400 12 | band_max: 4000 13 | calculation_method: "true_envelope" # 'rms', 'hilbert' or 'true_envelope' (root mean square, 
hilbert envelope, or true envelope -inspired method) 14 | smooth_energy: True # smoothing reduces the effect of voicing related variation in energy contour 15 | 16 | 17 | duration: 18 | acoustic_estimation: False # estimate speech rate from signal using wavelet transform of energy 19 | delta_duration: True # use differential duration signal, useful for boundary detection 20 | bump: False # more lively duration signal by emphasizing the differences 21 | # between adjacent unit durations 22 | duration_tiers: ["words"] #, "phones"] 23 | 24 | weights: [0.5, 0.5] 25 | silence_symbols: ["#", "!pau", "", "pau","sp", "!sil", "sil", "", " ","

", "", "." ,",","?"] 26 | linear: True # linear log logarithmi durations 27 | 28 | feature_combination: 29 | type: "sum" # sum or product 30 | detrend: False # detrend can be used to remove typical downdrift during utterance, mainly produces more balanced looking scalograms 31 | weights: 32 | f0: 1.0 33 | energy: 1.0 34 | duration: 0.5 35 | 36 | labels: 37 | #annotation_tier: "word" # BURNC 38 | anotation_tier: "words" 39 | wavelet: 40 | mother_wavelet: "mexican_hat" # mexican_hat, morlet or paul 41 | period: 3 # applies to morlet and paul wavelets (periods 2 - 5 are reasonable) 42 | 43 | scale_distance: 0.25 # distance between adjacent scales in octaves, (0.25 means 4 scales per octave) 44 | num_scales: 40 # number of wavelet scales 45 | magnitude: False # for purposes other than prosodic event annotation, 46 | # examinining wavelet magnitude might be useful (complex wavelets Morlet or Paul should be used) 47 | 48 | # lines of maximum amplitude, defined as octaves below and above the measured unit scale 49 | loma: 50 | prom_start: -2 # -3 meaning three octaves below unit scale 51 | prom_end: 1 52 | boundary_start: -1 53 | boundary_end: 2.5 # one octave higher than unit scale 54 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/configs/synthesis.yaml: -------------------------------------------------------------------------------- 1 | wavelet: 2 | mother_wavelet: "mexican_hat" # mexican_hat, morlet or paul 3 | scale_distance: 1 # distance between adjacent scales in octaves, (0.25 means 4 scales per octave) 4 | num_scales: 12 # number of wavelet scales 5 | combined_scales: [[0, 2], [2, 4], [4, 6], [6, 8], [8, 12]] 6 | 7 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/cwt_analysis_synthesis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | AUTHOR 5 | 
- Antti Suni 6 | - Sébastien Le Maguer 7 | 8 | DESCRIPTION 9 | 10 | usage: cwt_analysis_synthesis.py [-h] [-v] [-M MODE] [-m MEAN_F0] [-o OUTPUT] 11 | [-P] 12 | input_file 13 | 14 | Tool for CWT analysis/synthesis of the F0 15 | 16 | positional arguments: 17 | input_file Input signal or F0 file 18 | 19 | optional arguments: 20 | -h, --help show this help message and exit 21 | -v, --verbosity increase output verbosity 22 | -M MODE, --mode MODE script mode: 0=analysis, 1=synthesis, 2=analysis/synthesis 23 | -m MEAN_F0, --mean_f0 MEAN_F0 24 | Mean f0 needed for synthesis (unsed for analysis modes) 25 | -o OUTPUT, --output OUTPUT 26 | output directory for analysis or filename for synthesis. 27 | (Default: input_file directory [Analysis] or .f0 [Synthesis]) 28 | -P, --plot Plot the results 29 | 30 | 31 | LICENSE 32 | See https://github.com/asuni/wavelet_prosody_toolkit/blob/master/LICENSE.txt 33 | """ 34 | 35 | import sys 36 | import os 37 | import traceback 38 | import argparse 39 | import time 40 | import logging 41 | 42 | import yaml 43 | 44 | # Collections 45 | from collections import defaultdict 46 | 47 | import warnings 48 | 49 | # Wavelet import 50 | from wavelet_prosody_toolkit.prosody_tools import misc 51 | from wavelet_prosody_toolkit.prosody_tools import cwt_utils 52 | from wavelet_prosody_toolkit.prosody_tools import f0_processing 53 | 54 | import numpy as np 55 | 56 | # List of logging levels used to setup everything using verbose option 57 | LEVEL = [logging.WARNING, logging.INFO, logging.DEBUG] 58 | 59 | # FIXME: be more specific! 
60 | warnings.simplefilter("ignore", np.ComplexWarning) # Plotting can't deal with complex, but we don't care 61 | 62 | 63 | ############################################################################### 64 | # Functions 65 | ############################################################################### 66 | def apply_configuration(current_configuration, updating_part): 67 | """Utils to update the current configuration using the updating part 68 | 69 | Parameters 70 | ---------- 71 | current_configuration: dict 72 | The current state of the configuration 73 | 74 | updating_part: dict 75 | The information to add to the current configuration 76 | 77 | Returns 78 | ------- 79 | dict 80 | the updated configuration 81 | """ 82 | if not isinstance(current_configuration, dict): 83 | return updating_part 84 | 85 | if current_configuration is None: 86 | return updating_part 87 | 88 | if updating_part is None: 89 | return current_configuration 90 | 91 | for k in updating_part: 92 | if k not in current_configuration: 93 | current_configuration[k] = updating_part[k] 94 | else: 95 | current_configuration[k] = apply_configuration(current_configuration[k], updating_part[k]) 96 | 97 | return current_configuration 98 | 99 | 100 | def load_f0(input_file, binary_mode=False, configuration=None): 101 | """Load the f0 from a text file or extract it from a wav file 102 | 103 | Parameters 104 | ---------- 105 | input_file: string 106 | The input file name. 
107 | 108 | Returns 109 | ------- 110 | 1D arraylike 111 | the raw f0 values 112 | """ 113 | if input_file.lower().endswith(".csv"): 114 | if binary_mode: 115 | raise Exception("cannot have a csv file in binary mode") 116 | else: 117 | raw_f0 = np.loadtxt(input_file) 118 | if input_file.lower().endswith(".f0"): 119 | if binary_mode: 120 | raw_f0 = np.fromfile(input_file, dtype=np.float32) 121 | else: 122 | raw_f0 = np.loadtxt(input_file) 123 | elif input_file.lower().endswith(".lf0"): 124 | if binary_mode: 125 | raw_f0 = np.fromfile(input_file, dtype=np.float32) 126 | else: 127 | raw_f0 = np.loadtxt(input_file) 128 | raw_f0 = np.exp(raw_f0) 129 | elif input_file.lower().endswith(".wav"): 130 | logging.info("Extracting the F0 from the signal") 131 | (fs, wav_form) = misc.read_wav(input_file) 132 | raw_f0 = f0_processing.extract_f0(wav_form, fs, 133 | configuration["f0"]["min_f0"], 134 | configuration["f0"]["max_f0"]) 135 | 136 | 137 | return raw_f0 138 | 139 | 140 | ############################################################################### 141 | # Main function 142 | ############################################################################### 143 | def run(): 144 | """Main entry function 145 | 146 | This function contains the code needed to achieve the analysis and/or the synthesis 147 | """ 148 | global args 149 | 150 | warnings.simplefilter("ignore", FutureWarning) # Plotting can't deal with complex, but we don't care 151 | 152 | # Loading default configuration 153 | configuration = defaultdict() 154 | with open(os.path.dirname(os.path.realpath(__file__)) + "/configs/default.yaml", 'r') as f: 155 | configuration = apply_configuration(configuration, defaultdict(lambda: False, yaml.safe_load(f))) 156 | logging.debug("default configuration") 157 | logging.debug(configuration) 158 | 159 | # Loading dedicated analysis.synthesis configuration 160 | with open(os.path.dirname(os.path.realpath(__file__)) + "/configs/synthesis.yaml", 'r') as f: 161 | configuration = 
apply_configuration(configuration, defaultdict(lambda: False, yaml.safe_load(f))) 162 | logging.debug("configuration filled with synthesis part") 163 | logging.debug(configuration) 164 | 165 | # Loading user configuration 166 | if args.configuration_file: 167 | try: 168 | with open(args.configuration_file, 'r') as f: 169 | configuration = apply_configuration(configuration, defaultdict(lambda: False, yaml.safe_load(f))) 170 | logging.debug("configuration filled with user part") 171 | logging.debug(configuration) 172 | except IOError as ex: 173 | logging.error("configuration file " + args.configuration_file + " could not be loaded:") 174 | logging.error(str(ex)) 175 | sys.exit(1) 176 | 177 | # Analysis Mode 178 | if args.mode == 0: 179 | raw_f0 = load_f0(args.input_file, args.binary_mode, configuration) 180 | 181 | logging.info("Processing f0") 182 | f0 = f0_processing.process(raw_f0) 183 | # FIXME: reintegrated 184 | if args.plot: 185 | # Plotting 186 | import matplotlib.pyplot as plt 187 | import matplotlib.colors as colors 188 | 189 | plt.title("F0 preprocessing and interpolation") 190 | plt.plot(f0, color="red", alpha=0.5, linewidth=3) 191 | plt.plot(raw_f0, color="gray", alpha=0.5) 192 | plt.show() 193 | 194 | # # FIXME: read this?
195 | # logging.info("writing interpolated lf0\t" + output_file + ".interp") 196 | # np.savetxt(output_file + ".interp", f0.astype('float'), 197 | # fmt="%f", delimiter="\n") 198 | 199 | # Perform continuous wavelet transform of mean-substracted f0 with 12 scales, one octave apart 200 | logging.info("Starting analysis with (num_scale=%d, scale_distance=%f, mother_name=%s)" % 201 | (configuration["wavelet"]["num_scales"], configuration["wavelet"]["scale_distance"], configuration["wavelet"]["mother_wavelet"])) 202 | full_scales, widths, _ = cwt_utils.cwt_analysis(f0 - np.mean(f0), 203 | mother_name=configuration["wavelet"]["mother_wavelet"], 204 | period=configuration["wavelet"]["period"], 205 | num_scales=configuration["wavelet"]["num_scales"], 206 | scale_distance=configuration["wavelet"]["scale_distance"], 207 | apply_coi=False) 208 | full_scales = np.real(full_scales) 209 | # SSW parameterization, adjacent scales combined (with extra scales to handle long utterances) 210 | scales = cwt_utils.combine_scales(np.real(full_scales), configuration["wavelet"]["combined_scales"]) 211 | for i in range(0, len(scales)): 212 | logging.debug("Mean scale[%d]: %s" % (i, str(np.mean(scales[i])))) 213 | 214 | # Saving matrix 215 | logging.info("writing wavelet matrix in \"%s\"" % args.output_file) 216 | if args.binary_mode: 217 | with open(args.output_file, "wb") as f_out: 218 | scales.T.astype(np.float32).tofile(f_out) 219 | else: 220 | np.savetxt(args.output_file, scales.T.astype('float'), fmt="%f", delimiter=",") 221 | 222 | # Synthesis mode 223 | if args.mode == 1: 224 | if args.binary_mode: 225 | scales = np.fromfile(args.input_file, dtype=np.float32) 226 | scales = scales.reshape(-1, len(configuration["wavelet"]["combined_scales"])).T 227 | else: 228 | scales = np.loadtxt(args.input_file, delimiter=",").T # FIXME: hardcoded 229 | 230 | rec = cwt_utils.cwt_synthesis(scales, args.mean_f0) 231 | 232 | logging.info("Save reconstructed f0 in %s" % args.output_file) 233 | if 
args.binary_mode: 234 | with open(args.output_file, "wb") as f_out: 235 | rec.astype(np.float32).tofile(f_out) 236 | else: 237 | np.savetxt(args.output_file, rec, fmt="%f") 238 | 239 | # Debugging /plotting part 240 | if args.plot: 241 | nb_sub = 2 242 | if args.mode == 0: 243 | nb_sub = 3 244 | 245 | ax = plt.subplot(nb_sub, 1, 1) 246 | # pylab.title("CWT decomposition to % scales and reconstructed signal" % len(configuration["wavelet"]["combined_scales"])) 247 | 248 | if args.mode == 0: 249 | plt.plot(f0, linewidth=1, color="red") 250 | rec = cwt_utils.cwt_synthesis(scales, np.mean(f0)) 251 | 252 | plt.plot(rec, color="blue", alpha=0.3) 253 | 254 | plt.subplot(nb_sub, 1, 2, sharex=ax) 255 | for i in range(0, len(scales)): 256 | plt.plot(scales[i] + max(rec)*1.5 + i*75, 257 | color="blue", alpha=0.5) 258 | #plt.plot(scales[len(scales)-i-1] + max(rec)*1.5 + i*75, 259 | 260 | 261 | 262 | if args.mode == 0: 263 | plt.subplot(nb_sub, 1, 3, sharex=ax) 264 | plt.contourf(np.real(full_scales), 100, 265 | norm=colors.SymLogNorm(linthresh=0.2, linscale=0.05, 266 | vmin=np.min(full_scales), vmax=np.max(full_scales)),cmap="jet") 267 | plt.show() 268 | 269 | 270 | ############################################################################### 271 | # Envelopping 272 | ############################################################################### 273 | def main(): 274 | """Entry point for CWT analysis/synthesis tool 275 | 276 | This function is a wrapper to deal with arguments and logging. 
277 | """ 278 | global args 279 | 280 | try: 281 | parser = argparse.ArgumentParser(description="Tool for CWT analysis/synthesis of the F0") 282 | 283 | # Add options 284 | parser.add_argument("-B", "--binary-mode", action="store_true", 285 | help="Activate binary mode, else files are assumed to be a csv for the f0/wavelet part") 286 | parser.add_argument("-c", "--configuration-file", default=None, help="configuration file") 287 | parser.add_argument("-M", "--mode", type=int, default=0, 288 | help="script mode: 0=analysis, 1=synthesis") 289 | parser.add_argument("-m", "--mean_f0", type=float, default=100, 290 | help="Mean f0 needed for synthesis (unsed for analysis modes)") 291 | parser.add_argument("-P", "--plot", action="store_true", 292 | help="Plot the results") 293 | parser.add_argument("-v", "--verbosity", action="count", default=0, 294 | help="increase output verbosity") 295 | 296 | # Add arguments 297 | parser.add_argument("input_file", help="Input signal or F0 file") 298 | parser.add_argument("output_file", 299 | help="output directory for analysis or filename for synthesis. 
" + 300 | "(Default: input_file directory [Analysis] or .f0 [Synthesis])") 301 | 302 | # Parsing arguments 303 | args = parser.parse_args() 304 | 305 | # Verbose level => logging level 306 | log_level = args.verbosity 307 | if (args.verbosity >= len(LEVEL)): 308 | log_level = len(LEVEL) - 1 309 | logging.basicConfig(level=LEVEL[log_level]) 310 | logging.warning("verbosity level is too high, I'm gonna assume you're taking the highest (%d)" % log_level) 311 | else: 312 | logging.basicConfig(level=LEVEL[log_level]) 313 | 314 | # Debug time 315 | start_time = time.time() 316 | logging.info("start time = " + time.asctime()) 317 | 318 | # Running main function <=> run application 319 | run() 320 | 321 | # Debug time 322 | logging.info("end time = " + time.asctime()) 323 | logging.info('TOTAL TIME IN MINUTES: %02.2f' % 324 | ((time.time() - start_time) / 60.0)) 325 | 326 | # Exit program 327 | sys.exit(0) 328 | except KeyboardInterrupt as e: # Ctrl-C 329 | raise e 330 | except SystemExit as e: # sys.exit() 331 | pass 332 | except Exception as e: 333 | logging.error('ERROR, UNEXPECTED EXCEPTION') 334 | logging.error(str(e)) 335 | traceback.print_exc(file=sys.stderr) 336 | sys.exit(-1) 337 | 338 | 339 | if __name__ == '__main__': 340 | main() 341 | 342 | # cwt_analysis_synthesis.py ends here 343 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/cwt_global_spectrum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | AUTHOR 5 | - Antti Suni 6 | - Sébastien Le Maguer 7 | 8 | DESCRIPTION 9 | 10 | usage: cwt_global_spectrum.py [-h] [-v] [-o OUTPUT] 11 | [-P] 12 | input_file 13 | 14 | 15 | Tool for extracting global wavelet spectrum of speech envelope 16 | introduced for second language fluency estimation in the following paper: 17 | 18 | @inproceedings{suni2019characterizing, 19 | title={Characterizing second language 
fluency with global wavelet spectrum}, 20 | author={Suni, Antti and Kallio, Heini and Benu{\v{s}}, {\v{S}}tefan and {\v{S}}imko, Juraj}, 21 | booktitle={International Congress of Phonetic Sciences}, 22 | pages={1947--1951}, 23 | year={2019}, 24 | organization={Australasian Speech Science and Technology Association Inc.} 25 | } 26 | 27 | positional arguments: 28 | input_file Input signal or F0 file 29 | 30 | optional arguments: 31 | -h, --help show this help message and exit 32 | -v, --verbosity increase output verbosity 33 | -o OUTPUT, --output OUTPUT 34 | output directory for analysis or filename for synthesis. 35 | (Default: input_file directory [Analysis] or .f0 [Synthesis]) 36 | -P, --plot Plot the results 37 | 38 | 39 | You should be able to see peak around 4Hz, corresponding to syllable rate. 40 | For longer speech files, lower frequency peaks related to phrasing should appear. 41 | Synthetic test file with 8Hz, 4Hz and 1Hz components is included in sample directory. 42 | 43 | 44 | LICENSE 45 | See https://github.com/asuni/wavelet_prosody_toolkit/blob/master/LICENSE.txt 46 | 47 | """ 48 | 49 | # System/default 50 | import sys 51 | import os 52 | 53 | # Arguments 54 | import argparse 55 | 56 | # Messaging/logging 57 | import traceback 58 | import time 59 | import logging 60 | 61 | # Math/plot 62 | import numpy as np 63 | import matplotlib.ticker 64 | import matplotlib.pyplot as plt 65 | 66 | # Libraries 67 | from wavelet_prosody_toolkit.prosody_tools import cwt_utils as cwt_utils 68 | from wavelet_prosody_toolkit.prosody_tools import misc as misc 69 | from wavelet_prosody_toolkit.prosody_tools import energy_processing as energy_processing 70 | 71 | 72 | ############################################################################### 73 | # global constants 74 | ############################################################################### 75 | LEVEL = [logging.WARNING, logging.INFO, logging.DEBUG] 76 | 77 | 78 | 
def calc_global_spectrum(wav_file, period=5, n_scales=60, plot=False):
    """Compute the global wavelet spectrum of a speech wave file.

    The amplitude envelope of the signal is extracted, analyzed with a
    continuous Morlet wavelet transform, and the power is averaged over
    time, yielding one power value per scale (the "global spectrum").

    Parameters
    ----------
    wav_file: str
        Path to the input wave file.
    period: int, optional
        Period of the Morlet mother wavelet; a larger period gives a
        sharper spectrum [default: 5].
    n_scales: int, optional
        Number of wavelet scales [default: 60].
    plot: bool, optional
        If True, display the scalogram and the global spectrum [default: False].

    Returns
    -------
    power_spec: ndarray
        Mean wavelet power per scale.
    freq: ndarray
        The frequency (Hz) associated with each scale.
    """

    # Extract signal envelope, scale and normalize
    (fs, waveform) = misc.read_wav(wav_file)
    waveform = misc.resample(waveform, fs, 16000)
    energy = energy_processing.extract_energy(waveform, min_freq=30, method="hilbert")
    energy[energy<0] = 0
    # cube root compresses the envelope dynamics; +0.1 avoids cbrt(0) flatness
    energy = np.cbrt(energy+0.1)
    params = misc.normalize_std(energy)


    # perform continuous wavelet transform on envelope with morlet wavelet

    # increase period to get sharper spectrum
    matrix, scales, freq = cwt_utils.cwt_analysis(params, first_freq = 16, num_scales = n_scales, scale_distance = 0.1,period=period, mother_name="Morlet",apply_coi=True)


    # power, arbitrary scaling to prevent underflow
    p_matrix = (abs(matrix)**2).astype('float32')*1000.0
    # nanmean: values outside the cone of influence were zeroed/ignored
    power_spec = np.nanmean(p_matrix,axis=1)

    if plot:
        # left panel: scalogram; right (narrow) panel: global spectrum
        f, wave_pics = plt.subplots(1, 2, gridspec_kw = {'width_ratios':[5, 1]}, sharey=True)
        f.subplots_adjust(hspace=10)
        f.subplots_adjust(wspace=0)
        wave_pics[0].set_ylim(0, n_scales)
        wave_pics[0].set_xlabel("Time(m:s)")
        wave_pics[0].set_ylabel("Frequency(Hz)")
        wave_pics[1].set_xlabel("power")
        wave_pics[1].tick_params(labelright=True)

        fname = os.path.basename(wav_file)
        title = "CWT Morlet(p="+str(period)+") global spectrum, "+ fname
        wave_pics[0].contourf(p_matrix, 100)
        wave_pics[0].set_title(title, loc="center")
        # overlay the (scaled) envelope on the scalogram for orientation
        wave_pics[0].plot(params*3, color="white",alpha=0.5)

        # only label scales whose frequency is "round enough" to be readable
        freq_labels = [round(x,3)
                       if (np.isclose(x, round(x)) or
                           (x < 2 and np.isclose(x*100., round(x*100))) or
                           (x < 0.5 and np.isclose(x*10000., round(x*10000))))
                       else ""
                       for x in list(freq)]

        wave_pics[0].set_yticks(np.linspace(0, len(freq_labels)-1, len(freq_labels)))
        wave_pics[0].set_yticklabels(freq_labels)
        # x tick value is a frame index at 200 frames/s (analysis frame rate);
        # convert it to a minutes:seconds label
        formatter = matplotlib.ticker.FuncFormatter(lambda ms, x: time.strftime('%M:%S', time.gmtime(ms // 200)))
        wave_pics[0].xaxis.set_major_formatter(formatter)
        wave_pics[1].grid(axis="y")
        wave_pics[1].plot(power_spec,np.linspace(0,len(power_spec), len(power_spec)),"-")
        plt.show()


    return (power_spec, freq)
if __name__ == '__main__':
    # Script entry point: parse arguments, configure logging, time the run.
    try:
        parser = argparse.ArgumentParser(description="")

        # Add options
        parser.add_argument("-l", "--log_file", default=None,
                            help="Logger file")
        parser.add_argument("-o", "--output_dir", default=None, type=str,
                            help="The output directory (if not defined, use the same directory than the wave file)")
        parser.add_argument("-P", "--plot", default=False, action="store_true",
                            help="Plot the results")
        parser.add_argument("-v", "--verbosity", action="count", default=0,
                            help="increase output verbosity")

        # Add arguments
        parser.add_argument("wav_file", help="The input wave file")

        # Parsing arguments
        args = parser.parse_args()

        # create logger and formatter
        logger = logging.getLogger()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

        # Verbose level => logging level
        log_level = args.verbosity
        if (args.verbosity >= len(LEVEL)):
            log_level = len(LEVEL) - 1
            # FIX: previously `logger.setLevel(log_level)` passed the raw
            # verbosity count (0-2) as a numeric logging level, which is
            # below DEBUG (10) and therefore let *every* record through;
            # the intended level is the one from the LEVEL table.
            logger.setLevel(LEVEL[log_level])
            logging.warning("verbosity level is too high, I'm gonna assume you're taking the highest (%d)" % log_level)
        else:
            logger.setLevel(LEVEL[log_level])

        # create console handler
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        logger.addHandler(ch)

        # create file handler
        if args.log_file is not None:
            fh = logging.FileHandler(args.log_file)
            logger.addHandler(fh)

        # Debug time
        start_time = time.time()
        logger.info("start time = " + time.asctime())

        # Running main function <=> run application
        main()

        # Debug time
        logging.info("end time = " + time.asctime())
        logging.info('TOTAL TIME IN MINUTES: %02.2f' %
                     ((time.time() - start_time) / 60.0))

        # Exit program
        sys.exit(0)
    except KeyboardInterrupt as e:  # Ctrl-C
        raise e
    except SystemExit:  # sys.exit()
        pass
    except Exception as e:
        logging.error('ERROR, UNEXPECTED EXCEPTION')
        logging.error(str(e))
        traceback.print_exc(file=sys.stderr)
        sys.exit(-1)


else:
    print("usage: cwt_global_spectrum.py ")
-------------------------------------------------------------------------------- /wavelet_prosody_toolkit/prosody_labeller.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | AUTHOR 5 | 6 | Sébastien Le Maguer 7 | 8 | DESCRIPTION 9 | 10 | LICENSE 11 | This script is in the public domain, free from copyrights or restrictions. 12 | Created: 27 January 2020 13 | """ 14 | 15 | # System/default 16 | import sys 17 | import os 18 | import glob 19 | 20 | # Arguments 21 | import argparse 22 | 23 | # Messaging/logging 24 | import traceback 25 | import time 26 | import logging 27 | import copy 28 | 29 | # Configuration 30 | import yaml 31 | from collections import defaultdict 32 | 33 | # Math and plotting 34 | import numpy as np 35 | import scipy.ndimage 36 | import matplotlib.pyplot as plt 37 | 38 | # Parallel job managment 39 | from joblib import Parallel, delayed 40 | 41 | # acoustic features 42 | from wavelet_prosody_toolkit.prosody_tools import energy_processing 43 | from wavelet_prosody_toolkit.prosody_tools import f0_processing 44 | from wavelet_prosody_toolkit.prosody_tools import duration_processing 45 | 46 | # helpers 47 | from wavelet_prosody_toolkit.prosody_tools import misc 48 | from wavelet_prosody_toolkit.prosody_tools import smooth_and_interp 49 | 50 | # wavelet transform 51 | from wavelet_prosody_toolkit.prosody_tools import cwt_utils, loma, lab 52 | 53 | ############################################################################### 54 | # global constants 55 | ############################################################################### 56 | LEVEL = [logging.WARNING, logging.INFO, logging.DEBUG] 57 | 58 | ############################################################################### 59 | # Functions 60 | ############################################################################### 61 | def get_logger(verbosity, log_file): 62 | 63 | # create logger and 
def apply_configuration(current_configuration, updating_part):
    """Recursively merge ``updating_part`` on top of ``current_configuration``.

    Nested dictionaries are merged key by key; any non-dict value in the
    current configuration is overwritten by the corresponding value of the
    updating part. Dict inputs are updated in place, and the merged
    configuration is also returned.

    Parameters
    ----------
    current_configuration: dict
        The current state of the configuration

    updating_part: dict
        The information to add to the current configuration

    Returns
    -------
    dict
        the updated configuration
    """
    # A non-dict current value (including None) is simply replaced.
    if not isinstance(current_configuration, dict):
        return updating_part

    if current_configuration is None:
        return updating_part

    if updating_part is None:
        return current_configuration

    for key, value in updating_part.items():
        if key in current_configuration:
            current_configuration[key] = apply_configuration(current_configuration[key], value)
        else:
            current_configuration[key] = value

    return current_configuration
def analysis(input_file, cfg, logger, annotation_dir=None, output_dir=None, plot=False):
    """Analyze prosodic prominences and boundaries of one wave file.

    Extracts energy, f0 and speech-rate signals, combines them into a single
    prosodic signal, applies a continuous wavelet transform and follows lines
    of maximum amplitude (LOMA) to assign a prominence and a boundary strength
    to every unit of the configured annotation tier. Results are written to
    "<basename>.prom" in the output directory.

    Parameters
    ----------
    input_file: str
        Path to the wave file to analyze.
    cfg: dict
        Configuration dictionary (see configs/default.yaml for keys).
    logger: logging.Logger
        Logger used for reporting.
    annotation_dir: str, optional
        Directory holding the .TextGrid/.lab annotation; defaults to the
        directory of `input_file`.
    output_dir: str, optional
        Output directory; defaults to the directory of `input_file`.
    plot: int, optional
        0: no plotting; > 0: show the figure; < 0: save it as a .png.

    Raises
    ------
    Exception
        If no annotation file is found for `input_file`.
    """

    # Load the wave file
    print("Analyzing %s starting..." % input_file)
    orig_sr, sig = misc.read_wav(input_file)

    # extract energy
    energy = energy_processing.extract_energy(sig, orig_sr,
                                              cfg["energy"]["band_min"],
                                              cfg["energy"]["band_max"],
                                              cfg["energy"]["calculation_method"])
    # cube root compresses the envelope dynamics
    energy = np.cbrt(energy+1)
    if cfg["energy"]["smooth_energy"]:
        energy = smooth_and_interp.peak_smooth(energy, 30, 3)  # FIXME: 30? 3?
        energy = smooth_and_interp.smooth(energy, 10)

    # extract f0
    raw_pitch = f0_processing.extract_f0(sig, orig_sr,
                                         f0_min=cfg["f0"]["min_f0"],
                                         f0_max=cfg["f0"]["max_f0"],
                                         voicing=cfg["f0"]["voicing_threshold"],
                                         #harmonics=cfg["f0"]["harmonics"],
                                         configuration=cfg["f0"]["pitch_tracker"])
    # interpolate, stylize
    pitch = f0_processing.process(raw_pitch)

    # extract speech rate (zeros until estimated below)
    rate = np.zeros(len(pitch))


    # Get annotations (if available)
    tiers = []
    if annotation_dir is None:
        annotation_dir = os.path.dirname(input_file)
    basename = os.path.splitext(os.path.basename(input_file))[0]
    # prefer a Praat TextGrid, fall back to an HTK label file
    grid = os.path.join(annotation_dir, "%s.TextGrid" % basename)
    if os.path.exists(grid):
        tiers = lab.read_textgrid(grid)
    else:
        grid = os.path.join(annotation_dir, "%s.lab" % basename)
        if not os.path.exists(grid):
            raise Exception("There is no annotations associated with %s" % input_file)
        tiers = lab.read_htk_label(grid)

    # Extract duration
    # NOTE(review): nesting below reconstructed from a flattened source dump;
    # dur_tiers is only defined when annotations exist — verify upstream.
    if len(tiers) > 0:
        dur_tiers = []
        for level in cfg["duration"]["duration_tiers"]:
            assert(level.lower() in tiers), level+" not defined in tiers: check that duration_tiers in config match the actual textgrid tiers"
            try:
                dur_tiers.append(tiers[level.lower()])
            except:
                print("\nerror: "+"\""+level+"\"" +" not in labels, modify duration_tiers in config\n\n")
                raise

        if not cfg["duration"]["acoustic_estimation"]:
            # label-based duration contour
            rate = duration_processing.get_duration_signal(dur_tiers,
                                                           weights=cfg["duration"]["weights"],
                                                           linear=cfg["duration"]["linear"],
                                                           sil_symbols=cfg["duration"]["silence_symbols"],
                                                           bump = cfg["duration"]["bump"])

        else:
            # signal-based (acoustic) speech-rate estimate
            rate = duration_processing.get_rate(energy)
            rate = smooth_and_interp.smooth(rate, 30)

        if cfg["duration"]["delta_duration"]:
            rate = np.diff(rate)

    # Combine signals: truncate everything to the shortest stream first
    min_length = np.min([len(pitch), len(energy), len(rate)])
    pitch = pitch[:min_length]
    energy = energy[:min_length]
    rate = rate[:min_length]

    if cfg["feature_combination"]["type"] == "product":
        # geometric (product) combination of min-max normalized features
        pitch = misc.normalize_minmax(pitch) ** cfg["feature_combination"]["weights"]["f0"]
        energy = misc.normalize_minmax(energy) ** cfg["feature_combination"]["weights"]["energy"]
        rate = misc.normalize_minmax(rate) ** cfg["feature_combination"]["weights"]["duration"]
        params = pitch * energy * rate

    else:
        # weighted sum of z-scored features
        params = misc.normalize_std(pitch) * cfg["feature_combination"]["weights"]["f0"] + \
                 misc.normalize_std(energy) * cfg["feature_combination"]["weights"]["energy"] + \
                 misc.normalize_std(rate) * cfg["feature_combination"]["weights"]["duration"]

    if cfg["feature_combination"]["detrend"]:
        params = smooth_and_interp.remove_bias(params, 800)

    params = misc.normalize_std(params)


    # CWT analysis
    (cwt, scales, freqs) = cwt_utils.cwt_analysis(params,
                                                  mother_name=cfg["wavelet"]["mother_wavelet"],
                                                  period=cfg["wavelet"]["period"],
                                                  num_scales=cfg["wavelet"]["num_scales"],
                                                  scale_distance=cfg["wavelet"]["scale_distance"],
                                                  apply_coi=False)
    cwt = np.real(cwt)
    scales *= 200  # FIXME: why 200?


    # Compute lines of maximum amplitude
    assert(cfg["labels"]["annotation_tier"].lower() in tiers), \
        cfg["labels"]["annotation_tier"]+" not defined in tiers: check that annotation_tier in config is found in the textgrid tiers"
    labels = tiers[cfg["labels"]["annotation_tier"].lower()]

    # get scale corresponding to avg unit length of selected tier
    n_scales = cfg["wavelet"]["num_scales"]
    scale_dist = cfg["wavelet"]["scale_distance"]
    scales = (1./freqs*200)*0.5  # FIXME: hardcoded vales
    unit_scale = misc.get_best_scale2(scales, labels)

    # Define the scale information (FIXME: description)
    pos_loma_start_scale = unit_scale + int(cfg["loma"]["prom_start"]/scale_dist)  # three octaves down from average unit length
    pos_loma_end_scale = unit_scale + int(cfg["loma"]["prom_end"]/scale_dist)
    neg_loma_start_scale = unit_scale + int(cfg["loma"]["boundary_start"]/scale_dist)  # two octaves down
    neg_loma_end_scale = unit_scale + int(cfg["loma"]["boundary_end"]/scale_dist)  # one octave up

    # positive LOMA -> prominences, negative LOMA -> boundaries
    pos_loma = loma.get_loma(cwt, scales, pos_loma_start_scale, pos_loma_end_scale)
    neg_loma = loma.get_loma(-cwt, scales, neg_loma_start_scale, neg_loma_end_scale)

    max_loma = loma.get_prominences(pos_loma, labels)
    prominences = np.array(max_loma)
    boundaries = np.array(loma.get_boundaries(max_loma, neg_loma, labels))


    # output results
    if output_dir is None:
        output_dir = os.path.dirname(input_file)
    os.makedirs(output_dir, exist_ok=True)

    basename = os.path.splitext(os.path.basename(input_file))[0]
    output_filename = os.path.join(output_dir, "%s.prom" % basename)
    print("Saving %s..." % (output_filename))
    loma.save_analyses(output_filename,
                       labels,
                       prominences,
                       boundaries)

    # Plotting
    if plot != 0:
        # six stacked panels: pitch, energy, rate, combined, scalogram, labels
        fig, ax = plt.subplots(6, 1, sharex=True,
                               figsize=(len(labels) / 10 * 8, 8),
                               gridspec_kw = {'height_ratios':[1, 1, 1, 2, 4, 1.5]})
        plt.subplots_adjust(hspace=0)

        # Plot individual signals
        ax[0].plot(pitch, linewidth=1)
        ax[0].set_ylabel("Pitch", rotation="horizontal", ha="right", va="center")

        ax[1].plot(energy, linewidth=1)
        ax[1].set_ylabel("Energy", rotation="horizontal", ha="right", va="center")

        ax[2].plot(rate, linewidth=1)
        ax[2].set_ylabel("Speech rate", rotation="horizontal", ha="right", va="center")

        # Plot combined signal
        ax[3].plot(params, linewidth=1)
        ax[3].set_ylabel("Combined \n signal", rotation="horizontal", ha="right", va="center")
        plt.xlim(0, len(params))

        # Wavelet and loma (log-compress positives, clip negatives for display)
        cwt[cwt>0] = np.log(cwt[cwt>0]+1.)
        cwt[cwt<-0.1] = -0.1
        ax[4].contourf(cwt,100, cmap="inferno")
        loma.plot_loma(pos_loma, ax[4], color="black")
        loma.plot_loma(neg_loma, ax[4], color="white")
        ax[4].set_ylabel("Wavelet & \n LOMA", rotation="horizontal", ha="right", va="center")

        # Add labels (text size scaled by relative prominence)
        prom_text = prominences[:, 1]/(np.max(prominences[:, 1]))*2.5 + 0.5
        lab.plot_labels(labels, ypos=0.3, size=6, prominences=prom_text, fig=ax[5], boundary=False, background=False)
        ax[5].set_ylabel("Labels", rotation="horizontal", ha="right", va="center")
        for i in range(0, len(labels)):
            for a in [0, 1, 2, 3, 4, 5]:
                ax[a].axvline(x=labels[i][0], color='black',
                              linestyle="-", linewidth=0.2, alpha=0.5)

                # end line thickness encodes boundary strength
                ax[a].axvline(x=labels[i][1], color='black',
                              linestyle="-", linewidth=0.2+boundaries[i][-1] * 2,
                              alpha=0.5)

        plt.xlim(0, cwt.shape[1])

        # Align ylabels and remove axis
        fig.align_ylabels(ax)
        for i in range(len(ax)-1):
            ax[i].tick_params(
                axis='x',          # changes apply to the x-axis
                which='both',      # both major and minor ticks are affected
                bottom=False,      # ticks along the bottom edge are off
                top=False,         # ticks along the top edge are off
                labelbottom=False) # labels along the bottom edge are off
            ax[i].tick_params(
                axis='y',          # changes apply to the x-axis
                which='both',      # both major and minor ticks are affected
                left=False,        # ticks along the bottom edge are off
                right=False,       # ticks along the top edge are off
                labelleft=False)   # labels along the bottom edge are off

        ax[len(ax)-1].tick_params(
            axis='y',          # changes apply to the x-axis
            which='both',      # both major and minor ticks are affected
            left=False,        # ticks along the bottom edge are off
            right=False,       # ticks along the top edge are off
            labelleft=False)   # labels along the bottom edge are off

        # Plot
        if plot < 0:
            output_filename = os.path.join(output_dir, "%s.png" % basename)
            logger.info("Save plot %s" % output_filename)
            fig.savefig(output_filename, bbox_inches='tight', dpi=400)
        elif plot > 0:
            plt.show()
if __name__ == '__main__':
    # Script entry point: parse arguments, configure logging, run `main`
    # and report timing; all failures funnel through the except clauses.
    try:
        parser = argparse.ArgumentParser(description="Command line application to analyze prosody using wavelets.")

        # Add options
        parser.add_argument("-a", "--annotation_directory", default=None, type=str,
                            help="Annotation directory. If not specified, the tool will by default try to load annotations from the directory containing the wav files")
        parser.add_argument("-j", "--nb_jobs", default=4, type=int,
                            help="Define the number of jobs to run in parallel")
        parser.add_argument("-c", "--config", default=None, type=str,
                            help="configuration file")
        parser.add_argument("-l", "--log_file", default=None, type=str,
                            help="Logger file")
        parser.add_argument("-o", "--output_directory", default=None, type=str,
                            help="The output directory. If not specified, the tool will output the result in a .prom file in the same directory than the wave files")
        parser.add_argument("-p", "--plot", default=False, action="store_true",
                            help="Plot the result (the number of jobs is de facto set to 1 if activated)")
        parser.add_argument("-v", "--verbosity", action="count", default=1,
                            help="increase output verbosity")

        # Add arguments
        parser.add_argument("input", help="directory with wave files or wave file to analyze (a label file with the same basename should be available)")



        # Parsing arguments
        args = parser.parse_args()
        # interactive plotting is incompatible with parallel workers
        if args.plot:
            args.nb_jobs = 1
        # Get the logger
        logger = get_logger(args.verbosity, args.log_file)

        # Debug time
        start_time = time.time()
        logger.info("start time = " + time.asctime())

        # Running main function <=> run application
        main()

        # Debug time
        logger.info("end time = " + time.asctime())
        logger.info('TOTAL TIME IN MINUTES: %02.2f' %
                    ((time.time() - start_time) / 60.0))

        # Exit program
        sys.exit(0)
    except KeyboardInterrupt as e:  # Ctrl-C
        raise e
    except SystemExit:  # sys.exit()
        pass
    except Exception as e:
        # root logger used here on purpose: `logger` may not exist yet
        logging.error('ERROR, UNEXPECTED EXCEPTION')
        logging.error(str(e))
        traceback.print_exc(file=sys.stderr)
        sys.exit(-1)
11 | 12 | LICENSE 13 | See https://github.com/asuni/wavelet_prosody_toolkit/blob/master/LICENSE.txt 14 | """ 15 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/prosody_tools/cwt_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | AUTHOR 6 | - Antti Suni 7 | - Sébastien Le Maguer 8 | 9 | DESCRIPTION 10 | Module which provides continuous wavelet transform (cwt) analysis/synthesis routines 11 | 12 | LICENSE 13 | See https://github.com/asuni/wavelet_prosody_toolkit/blob/master/LICENSE.txt 14 | """ 15 | 16 | from numpy import array,concatenate, sqrt, pad, mean, std, real, nan, zeros, nanmean, nanstd, pi, around, log2 17 | 18 | import pycwt as cwt 19 | 20 | ########################################################################################### 21 | # Private routines 22 | ########################################################################################### 23 | def _unpad(matrix, num): 24 | """Private function to unpad axis 1 of a matrix 25 | 26 | Parameters 27 | ---------- 28 | matrix: ndarray 29 | a NDarray 30 | num: int 31 | the unpadding size 32 | 33 | Returns 34 | ------- 35 | ndarray 36 | the unpadded matrix 37 | """ 38 | unpadded = matrix[:,num:len(matrix[0])-num] 39 | return unpadded 40 | 41 | 42 | def _padded_cwt(params, dt, dj, s0, J, mother, padding_len): 43 | """Private function to compute a wavelet transform on padded data 44 | 45 | Parameters 46 | ---------- 47 | params: arraylike 48 | The prosodic parameters. 49 | dt: ? 50 | ? 51 | dj: ? 52 | ? 53 | s0: ? 54 | ? 55 | J: ? 56 | ? 57 | mother: ? 58 | The mother wavelet. 59 | padding_len: int 60 | The padding length 61 | 62 | Returns 63 | ------- 64 | wavelet_matrix: ndarray 65 | The wavelet data resulting from the analysis 66 | scales: arraylike 67 | The scale indices corresponding to the wavelet data 68 | freqs: ? 69 | ? 
70 | coi: array 71 | The cone of influence values 72 | fft: ? 73 | ? 74 | fftfreqs: ? 75 | ? 76 | """ 77 | #padded = concatenate([params,params,params]) 78 | padded = pad(params, padding_len, mode='edge') #edge 79 | wavelet_matrix, scales, freqs, coi, fft, fftfreqs = cwt.cwt(padded, dt, dj, s0, J, mother) 80 | wavelet_matrix = _unpad(wavelet_matrix, padding_len) 81 | #wavelet_matrix = _unpad(wavelet_matrix, len(params)) 82 | 83 | return (wavelet_matrix, scales, freqs, coi, fft, fftfreqs) 84 | 85 | 86 | 87 | 88 | def _zero_outside_coi(wavelet_matrix,freqs, rate = 200): 89 | """Private function to set each elements outside of the Cone Of Influence (coi) to 0. 90 | 91 | Parameters 92 | ---------- 93 | wavelet_matrix: type 94 | description 95 | freqs: type 96 | description 97 | 98 | """ 99 | for i in range(0,wavelet_matrix.shape[0]): 100 | coi =int(1./freqs[i]*rate) 101 | wavelet_matrix[i,0:coi] = 0. 102 | wavelet_matrix[i,-coi:] = 0. 103 | return wavelet_matrix 104 | 105 | def _scale_for_reconstruction(wavelet_matrix,scales, dj, dt,mother="mexican_hat",period=3): 106 | """ ? 107 | 108 | Parameters 109 | ---------- 110 | wavelet_matrix: ndarray 111 | The wavelet data resulting from the analysis 112 | scales: arraylike 113 | The scale indices corresponding to the wavelet data 114 | dj: ? 115 | ? 116 | dt: ? 117 | ? 118 | mother: ? 119 | ? 120 | period: ? 121 | ? 122 | 123 | """ 124 | scaled = array(wavelet_matrix) 125 | 126 | # mexican Hat 127 | c = dj / (3.541 * 0.867) 128 | 129 | if mother=="morlet": 130 | cc = 1.83 131 | 132 | #periods 5 and 6 are correct, 3,4 approximate 133 | if period == 3: 134 | cc = 1.74 135 | if period == 4: 136 | cc = 1.1 137 | elif period==5: 138 | cc=0.9484 139 | elif period==6: 140 | cc == 0.7784 141 | 142 | c = dj / (cc * pi**(-0.25)) 143 | 144 | for i in range(0, len(scales)): 145 | scaled[i]*= c*sqrt(dt)/sqrt(scales[i]) 146 | # substracting the mean should not be necessary? 
def cwt_analysis(params, mother_name="mexican_hat", num_scales=12, first_scale=None, first_freq=None, scale_distance=1.0, apply_coi=True, period=5, frame_rate=200):
    """Achieve the continuous wavelet analysis of given parameters

    Parameters
    ----------
    params: arraylike
        The parameters to analyze.
    mother_name: string, optional
        The name of the mother wavelet [default: mexican_hat].
    num_scales: int, optional
        The number of scales [default: 12].
    first_scale: int, optional
        The width of the shortest scale
    first_freq: int, optional
        The highest frequency in Hz
    scale_distance: float, optional
        The distance between scales [default: 1.0].
    apply_coi: boolean, optional
        Apply the Cone Of Influence (coi)
    period: int, optional
        The period of the mother wavelet [default: 5].
    frame_rate: int, optional
        The signal frame rate [default: 200].

    Returns
    -------
    wavelet_matrix: ndarray
        The wavelet data resulting from the analysis
    scales: arraylike
        The scale indices corresponding to the wavelet data
    freqs: arraylike
        The frequency (Hz) associated with each scale
    """
    # setup wavelet transform
    dt = 1. / float(frame_rate)  # frame length in seconds

    if not first_scale:
        first_scale = dt  # first scale, here frame length

    # an explicit highest frequency overrides first_scale
    if first_freq:
        first_scale = _freq2scale(first_freq, mother_name, period)

    dj = scale_distance  # distance between scales in octaves
    J = num_scales  # number of scales

    mother = cwt.MexicanHat()

    if str.lower(mother_name) == "morlet":
        mother = cwt.Morlet(period)
    elif str.lower(mother_name) == "paul":
        mother = cwt.Paul(period)

    # pad with 400 edge frames on both sides to reduce boundary artifacts
    wavelet_matrix, scales, freqs, coi, fft, fftfreqs = _padded_cwt(params, dt, dj, first_scale, J, mother, 400)

    # rescale so that summing the scales reconstructs the input signal
    wavelet_matrix = _scale_for_reconstruction(wavelet_matrix, scales, dj, dt, mother=mother_name, period=period)

    if apply_coi:
        wavelet_matrix = _zero_outside_coi(wavelet_matrix, freqs, frame_rate)

    # FIX: removed leftover debug code (`import numpy as np` followed by
    # `np.set_printoptions(precision=3, suppress=True)`) which globally
    # mutated numpy's print options as a hidden side effect of analysis.
    return (wavelet_matrix, scales, freqs)
"""Synthesizing a signal given a wavelet dataset 265 | 266 | Parameters 267 | ---------- 268 | wavelet_matrix: ndarray 269 | The wavelet data matrix. 270 | mean: float 271 | The mean to translate the signal. 272 | 273 | Returns 274 | ------- 275 | arraylike 276 | The generated signal 277 | 278 | """ 279 | return sum(wavelet_matrix[:])+mean 280 | -------------------------------------------------------------------------------- /wavelet_prosody_toolkit/prosody_tools/duration_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | AUTHOR 6 | - Antti Suni 7 | - Sébastien Le Maguer 8 | 9 | DESCRIPTION 10 | Module which provides the duration routines to be able to apply a wavelet analysis 11 | 12 | LICENSE 13 | See https://github.com/asuni/wavelet_prosody_toolkit/blob/master/LICENSE.txt 14 | """ 15 | 16 | from . import smooth_and_interp, misc 17 | import numpy as np 18 | 19 | SIL_SYMBOLS = ["#","!pau", "sp", "", "pau", "!sil", "sil", "", " ","

", "", ".", ",","?"] 20 | 21 | 22 | def _get_dur_stats(labels, linear=False, sil_symbols=[]): 23 | durations = [] 24 | for i in range(len(labels)): 25 | (st,en, unit) = labels[i] 26 | if unit.lower() not in sil_symbols: 27 | dur = en-st 28 | if not linear: 29 | 30 | dur = np.log(dur+1.) 31 | durations.append(dur) 32 | durations = np.array(durations) 33 | return (np.min(durations), np.max(durations), np.mean(durations)) 34 | 35 | 36 | def get_rate(params,p=2,hp=10,lp=150, fig=None): 37 | """ 38 | estimation of speech rate as a center of gravity of wavelet spectrum 39 | similar to method described in "Boundary Detection using Continuous Wavelet Analysis" (2016) 40 | """ 41 | from . import cwt_utils 42 | 43 | params = smooth_and_interp.smooth(params, hp) 44 | params -= smooth_and_interp.smooth(params, lp) 45 | 46 | wavelet_matrix, scales, freqs = cwt_utils.cwt_analysis(params, mother_name="Morlet", \ 47 | num_scales=80, scale_distance=0.1,\ 48 | apply_coi=True,period=2) 49 | wavelet_matrix = abs(wavelet_matrix) 50 | 51 | rate = np.zeros(len(params)) 52 | 53 | for i in range(0,wavelet_matrix.shape[1]): 54 | frame_en = np.sum(wavelet_matrix[:,i]) 55 | # center of gravity 56 | rate[i] = np.nonzero(wavelet_matrix[:,i].cumsum() >=frame_en*0.5)[0].min() 57 | # maximum energy scale 58 | #rate[i]= np.argmax(wavelet_matrix[:,i]) #.astype('float')) 59 | 60 | if fig: 61 | fig.contourf((wavelet_matrix), 50) 62 | rate = smooth_and_interp.smooth(rate, 30) 63 | if fig: 64 | fig.plot(rate,color="black") 65 | 66 | return rate 67 | 68 | 69 | def duration(labels, rate=200,linear=False,bump=False, sil_symbols=SIL_SYMBOLS): 70 | """ 71 | construct duration signal from labels 72 | """ 73 | 74 | dur = np.zeros(len(labels)) 75 | params = np.zeros(int(labels[-1][1]*rate)) 76 | prev_end = 0 77 | (min_dur, max_dur, mean_dur) = _get_dur_stats(labels,linear, sil_symbols) 78 | 79 | for i in range(0,len(labels)): 80 | 81 | (st,en, unit) = labels[i] 82 | st*=rate 83 | en*=rate 84 | dur[i] = en-st 85 
def duration(labels, rate=200, linear=False, bump=False, sil_symbols=SIL_SYMBOLS):
    """
    construct duration signal from labels

    Each labelled unit contributes its (optionally log-compressed) duration
    as a point at the unit's temporal midpoint; the points are then
    interpolated into a continuous contour sampled at `rate` frames/s.

    Parameters
    ----------
    labels: list of (start, end, unit) tuples
        Segmentation, with times in seconds.
    rate: int, optional
        Output frame rate in frames per second [default: 200].
    linear: bool, optional
        If False, durations are compressed as log(dur + 1) [default: False].
    bump: bool, optional
        Emphasize differences between adjacent unit durations [default: False].
    sil_symbols: list of str, optional
        Unit names treated as silence (assigned the minimum duration).

    Returns
    -------
    arraylike
        The continuous duration contour.
    """

    dur = np.zeros(len(labels))
    # contour length: end time of the last label, in frames
    params = np.zeros(int(labels[-1][1]*rate))
    prev_end = 0
    (min_dur, max_dur, mean_dur) = _get_dur_stats(labels, linear, sil_symbols)

    for i in range(0, len(labels)):

        (st, en, unit) = labels[i]
        # convert seconds to frames
        st *= rate
        en *= rate
        dur[i] = en-st
        if not linear:
            dur[i] = np.log(dur[i]+1.)

        # silences are pinned to the minimum observed duration
        if unit.lower() in sil_symbols:
            dur[i] = min_dur

        # skip very short units, likely labelling errors
        if (en <= st+0.01):
            continue

        # unit duration -> height of the duration contour in the middle of the unit
        params[int(st+(en-st)/2.0)] = dur[i]

        # "bump" -> emphasize difference between adjacent unit durations
        if i > 0 and bump:
            params[int(st)] = (dur[i]+dur[i-1])/2. - (abs(dur[i]-dur[i-1]))

        # handle gaps in labels similarly to silences
        if st > prev_end and i > 1:
            params[int(prev_end+(st-prev_end)/2.0)] = min_dur
        prev_end = en

    # set endpoints to mean in order to avoid large "valleys"
    params[0] = np.mean(dur)
    params[-1] = np.mean(dur)

    # make continuous duration contour and smooth a bit
    params = smooth_and_interp.interpolate_zeros(params, 'pchip')
    params = smooth_and_interp.smooth(params, 20)

    return params
def get_duration_signal(tiers=None, weights=None, sil_symbols=SIL_SYMBOLS,
                        rate=1, linear=True, bump=False):
    """
    Construct duration contour from labels. If many tiers are selected,
    construct contours for each tier and return a weighted sum of those.

    Parameters
    ----------
    tiers: list, optional
        List of annotation tiers (each a list of (start, end, unit) tuples).
    weights: list, optional
        One weight per tier; uniform weights are used when the list does
        not match the number of tiers.
    sil_symbols: list of str, optional
        Unit names treated as silence.
    rate: int, optional
        Frame rate passed to `duration` [default: 1].
    linear: bool, optional
        Use linear (not log-compressed) durations [default: True].
    bump: bool, optional
        Emphasize adjacent duration differences [default: False].

    Returns
    -------
    arraylike
        The weighted sum of the per-tier duration contours.

    Raises
    ------
    IndexError
        If `tiers` is empty (no contour can be built).
    """
    # FIX: mutable default arguments ([]) replaced with None sentinels;
    # also removed the unused local `lengths`.
    if tiers is None:
        tiers = []
    if weights is None:
        weights = []

    durations = [misc.normalize_std(duration(t, rate=rate, sil_symbols=sil_symbols,
                                             linear=linear, bump=bump))
                 for t in tiers]

    durations = misc.match_length(durations)
    sum_durations = np.zeros(len(durations[0]))

    # fall back to uniform weighting when weights don't match the tiers
    if len(weights) != len(tiers):
        weights = np.ones(len(tiers))
    for contour, weight in zip(durations, weights):
        sum_durations += contour * weight

    return sum_durations
def extract_energy(orig_waveform, fs=16000, min_freq=200, max_freq=3000, method='rms', target_rate=200):
    """Extract an energy envelope from a waveform or a wav file.

    Parameters
    ----------
    orig_waveform: arraylike or string
        The waveform samples, or the name of a wav file to read.
    fs: int
        Sample rate of the waveform (overridden when a file is read).
    min_freq: int
        Low edge of the band used for energy estimation (Hz).
    max_freq: int
        High edge of the band used for energy estimation (Hz).
    method: string
        One of 'hilbert', 'true_envelope' or 'rms'.
    target_rate: int
        Frame rate of the returned energy contour.

    Returns
    -------
    arraylike
        The energy contour resampled to target_rate.
    """
    # accept both wav-files and waveform arrays
    if isinstance(orig_waveform, str):
        # BUGFIX: the waveform read from file was previously discarded and
        # the filename string itself was fed to the scaling below, raising
        # a TypeError; rebind orig_waveform to the samples instead.
        # (The py2 'basestring' compatibility hack was dropped: the file
        # declares python3.)
        (fs, orig_waveform) = misc.read_wav(orig_waveform)

    # NOTE: reconvert to int16 range (1/32768) to keep the consistency
    waveform = orig_waveform / 3.0517578125e-5

    import scipy.signal
    from . import filter

    # keep only the frequency band relevant for the energy measure
    lp_waveform = filter.butter_bandpass_filter(waveform, min_freq, max_freq, fs, order=5)

    # hilbert is sometimes prohibitively slow, should pad to next power of two
    if method == 'hilbert':
        energy = abs(scipy.signal.hilbert(lp_waveform))

    elif method == "true_envelope":
        # window should be about one pitch period, ~ 5 ms
        win = 0.005 * fs
        energy = smooth_and_interp.peak_smooth(abs(lp_waveform), 200, win)

    elif method == "rms":
        # per-sample magnitude; smoothing happens later in process()
        energy = np.sqrt(lp_waveform**2)

    logger.debug("fs = %d, target_rate = %d, fs/target_rate = %f" % (fs, target_rate, fs/target_rate))
    energy = misc.resample(energy, fs, target_rate)
    logger.debug("len(energy) = %d, len(energy)/target_rate = %f" % (len(energy), len(energy)/target_rate))
    return energy
import smooth_and_interp 28 | from . import pitch_tracker 29 | 30 | # Logging 31 | import logging 32 | logger = logging.getLogger(__name__) 33 | 34 | # Pyreaper 35 | try: 36 | import pyreaper 37 | USE_REAPER = True 38 | logger.info("Pyreaper is available") 39 | except ImportError: 40 | USE_REAPER = False 41 | logger.debug("Pyreaper is not available so falling back into the default pitch tracker") 42 | 43 | 44 | ############################################################################### 45 | 46 | 47 | def rolling_window(a, window): 48 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 49 | strides = a.strides + (a.strides[-1],) 50 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 51 | 52 | 53 | def _cut_boundary_vals(params, num_vals): 54 | cutted = np.array(params) 55 | for i in range(num_vals, len(params)-num_vals): 56 | if (params[i] <= 0) and (params[i+1] > 0): 57 | for j in range(i, i+num_vals): 58 | cutted[j] = 0.0 59 | 60 | if (params[i] > 0) and (params[i+1] <= 0): 61 | for j in range(i-num_vals, i+1): 62 | cutted[j] = 0.0 63 | 64 | return cutted 65 | 66 | 67 | def _remove_outliers(lf0, trace=False): 68 | 69 | if np.nanmean(lf0[lf0 > 0]) > 10: 70 | raise("logF0 expected") 71 | 72 | fixed = np.array(lf0) 73 | 74 | # remove f0 values from voicing boundaries, if they make a large difference for 75 | # interpolation 76 | boundary_cut = smooth_and_interp.interpolate_zeros(_cut_boundary_vals(fixed, 3), 'linear') 77 | interp = smooth_and_interp.interpolate_zeros(fixed, 'linear') 78 | fixed[abs(interp-boundary_cut) > 0.1] = 0 79 | interp = smooth_and_interp.interpolate_zeros(fixed, 'linear') 80 | 81 | # iterative outlier removal 82 | # 1. compare current contour estimate to a smoothed contour and remove deviates larger than threshold 83 | # 2. smooth current estimate with shorter window, thighten threshold 84 | # 3. goto 1. 
85 | 86 | # In practice, first handles large scale octave jump type errors, 87 | # finally small scale 'errors' like consonant perturbation effects and 88 | # other irregularities in voicing boundaries 89 | # 90 | # if this appears to remove too many correct values, increase thresholds 91 | num_iter = 30 92 | max_win_len = 100 93 | min_win_len = 10 # 20 94 | max_threshold = 3. # threshold with broad window 95 | 96 | min_threshold = 0.5 # threshold with shorted window 97 | 98 | if trace: 99 | pylab.rcParams['figure.figsize'] = 20, 5 100 | pylab.figure() 101 | pylab.title("outlier removal") 102 | 103 | _std = np.std(interp) 104 | # do not tie fixing to liveliness of the original 105 | _std = 0.3 106 | 107 | win_len = np.exp(np.linspace(np.log(max_win_len), np.log(min_win_len), 108 | num_iter+1)) 109 | outlier_threshold = np.linspace(_std*max_threshold, _std*min_threshold, 110 | num_iter+1) 111 | for i in range(0, num_iter): 112 | smooth_contour = smooth_and_interp.smooth(interp, win_len[i]) 113 | low_limit = smooth_contour - outlier_threshold[i] 114 | hi_limit = smooth_contour + outlier_threshold[i]*1.5 # bit more careful upwards, not to cut emphases 115 | 116 | # # octave jump down fix, more harm than good? 
117 | # fixed[interpsmooth_contour+0.45]=interp[interp>smooth_contour+0.45]-0.5 119 | fixed[interp > hi_limit] = 0 120 | fixed[interp < low_limit] = 0 121 | 122 | if trace: 123 | pylab.clf() 124 | pylab.title("outlier removal %d" % i) 125 | # pylab.ylim(3.5,7) 126 | pylab.plot((low_limit), 'black', linestyle='--') 127 | pylab.plot((hi_limit), 'black', linestyle='--') 128 | pylab.plot((smooth_contour), 'black', linestyle='--') 129 | pylab.plot((interp), linewidth=3) 130 | pylab.plot(lf0) 131 | pylab.show() 132 | 133 | interp = smooth_and_interp.interpolate_zeros(fixed, 'linear') 134 | 135 | # if trace: 136 | # raw_input("press any key to continue") 137 | 138 | return fixed 139 | 140 | 141 | def _interpolate(f0, method="true_envelope"): 142 | 143 | if method == "linear": 144 | return smooth_and_interp.interpolate_zeros(f0, 'linear') 145 | elif method == "pchip": 146 | return smooth_and_interp.interpolate_zeros(f0, 'pchip') 147 | 148 | elif method == 'true_envelope': 149 | interp = smooth_and_interp.interpolate_zeros(f0) 150 | 151 | _std = np.std(interp) 152 | _min = np.min(interp) 153 | low_limit = smooth_and_interp.smooth(interp, 200)-1.5*_std 154 | low_limit[low_limit < _min] = _min 155 | hi_limit = smooth_and_interp.smooth(interp, 100)+2.0*_std 156 | voicing = np.array(f0) 157 | constrained = np.array(f0) 158 | constrained = np.maximum(f0, low_limit) 159 | constrained = np.minimum(constrained, hi_limit) 160 | 161 | interp = smooth_and_interp.peak_smooth(constrained, 100, 20, 162 | voicing=voicing) 163 | # smooth voiced parts a bit too 164 | interp = smooth_and_interp.peak_smooth(interp, 3, 2) # ,voicing=raw) 165 | return interp 166 | else: 167 | raise("no such interpolation method: %s", method) 168 | 169 | 170 | def extract_f0(waveform, fs=16000, f0_min=30, f0_max=550, harmonics=10., voicing=50., configuration="pitch_tracker"): 171 | """Extract F0 from a waveform 172 | 173 | """ 174 | # first determine f0 without limits, then use mean and std of the first estimate 
def read_f0(filename):
    """Load an F0 track stored next to `filename` (extension .f0/.F0).

    NOTE: this is temporary: assumes 5ms frame shift, and the file format
    to be either one f0 value per line or praat matrix format.

    Parameters
    ----------
    filename: string
        Name of the related file (typically the wav); its extension is
        replaced by .f0/.F0 to locate the track.

    Returns
    -------
    arraylike or None
        The F0 values, or None if no readable F0 file was found.
    """
    import os.path
    for ext in [".f0", ".F0"]:
        f0_f = os.path.splitext(filename)[0]+ext

        if os.path.exists(f0_f):
            # BUGFIX: the filename must be passed as a lazy %-style
            # argument; the previous call passed it as an unused extra arg
            logger.info("reading F0 file %s", f0_f)
            try:
                # one f0 value per line
                return np.loadtxt(f0_f)
            except Exception:
                # praat matrix: skip the 4 header rows
                try:
                    return np.loadtxt(f0_f, skiprows=4)
                except Exception:
                    logger.error("unknown format for F0 value in file \"%s\"" % filename)

    return None
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    For more details see scipy.signal.butter documentation.

    Parameters
    ----------
    lowcut: int
        Lower cutoff frequency (Hz).
    highcut: type
        Upper cutoff frequency (Hz); clamped below the Nyquist frequency.
    fs: int
        Signal sample rate.
    order: int
        Order of the butter filter.

    Returns
    -------
    b: arraylike
        Numerator polynomial of the IIR filter.
    a: arraylike
        Denominator polynomial of the IIR filter.
    """
    nyquist = 0.5 * fs

    # keep the upper edge safely (5%) below the Nyquist frequency
    if highcut >= nyquist * 0.95:
        highcut = nyquist * 0.95

    band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band, btype='band')
def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter a signal with a Butterworth filter.

    For more details see scipy.signal.butter and scipy.signal.lfilter
    documentation.

    Parameters
    ----------
    data: arraylike
        An N-dimensional input array.
    lowcut: int
        The lowcut filtering value.
    highcut: type
        The highcut filtering value.
    fs: int
        The signal sample rate.
    order: int
        The order of the butter filter.

    Returns
    -------
    arraylike
        An N-dimensional filtered array.
    """
    numer, denom = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numer, denom, data)
def htk_to_ms(htk_time):
    """
    Convert time in HTK (100 ns) units to 5 ms frames.

    Accepts numbers or numeric strings.
    """
    # use isinstance instead of the old 'type(x)==type("string")' check
    if isinstance(htk_time, str):
        htk_time = float(htk_time)
    return htk_time / 50000.0
def read_htk_label(fname, scale="word", htk_time=True, only_words=False):
    """
    Read HTK label, assume: "start end phone word", where word is optional.
    Convert times from HTK units to 5 ms frames.

    Parameters
    ----------
    fname: string
        The label file name.
    scale: string
        Unused; kept for backward compatibility.
    htk_time: boolean
        Times are in HTK 100 ns units (True) or in seconds (False).
    only_words: boolean
        Unused; kept for backward compatibility.

    Returns
    -------
    dict
        {"segments": [[start, end, label], ...]} plus a "words" key when
        word labels were present.
    """
    import codecs

    # use a context manager; chain the original error instead of a bare except
    try:
        with codecs.open(fname, "r", "utf-8") as f:
            label = f.readlines()
    except IOError as err:
        raise Exception("htk label file %s not found" % fname) from err

    # split lines on whitespace
    label = [line.split() for line in label]

    segments = []
    words = []
    prev_end = 0.0
    prev_start = 0.0
    prev_word = "!SIL"
    word = ""
    for line in label:
        if len(line) == 4 and line[2] == 'skip':
            continue
        word = False
        if len(line) == 3:
            (start, end, segment) = line
            if start == "nan":
                continue
        elif len(line) == 4:
            (start, end, segment, word) = line
        else:
            print("Bad line length:")
            print(line)
            continue

        if htk_time:
            end = htk_to_ms(int(end))
            start = htk_to_ms(int(start))
        else:
            # times in seconds -> 5 ms frames (200 Hz)
            end = float(end)*200
            start = float(start)*200

        # skip zero-length units, likely labelling errors
        if start == end:
            continue
        prev_end = start

        segments.append([int(start), int(end), segment])

        # a word label closes the previous word span
        if word:
            words.append([int(prev_start), int(prev_end), prev_word])
            prev_start = start
            prev_word = word
            word = ""

    # handle the last word too
    if len(label[-1]) == 4:
        words.append([htk_to_ms(float(label[-1][0])), htk_to_ms(float(label[-1][1])), label[-1][3]])

    labs = {}
    if len(words) > 0:
        labs["words"] = words
    labs["segments"] = segments

    return labs
def save_analyses(fname, labels, prominences, boundaries, frame_rate=200, with_header=False):
    """Write prominence/boundary analyses to a tab-separated csv file.

    Parameters
    ----------
    fname: string
        The output csv filename.
    labels: list of tuple (float, float, string)
        List of labels which are lists of 3 elements [start, end, description].
    prominences: type
        description
    boundaries: type
        description
    frame_rate: int
        The speech frame rate used to convert frames to seconds.
    with_header: boolean
        Write the header (True) or not (False) [default: False].
    """
    import os.path

    base = os.path.splitext(os.path.basename(fname))[0]

    if with_header:
        header = ("Basename", "start", "end", "label", "prominence", "boundary")

    # one row per label: basename, times in seconds, label, strengths
    rows = []
    for i in range(len(labels)):
        rows.append((base,
                     "%.3f" % (float(labels[i][0]/frame_rate)),
                     "%.3f" % (float(labels[i][1]/frame_rate)),
                     labels[i][2],
                     "%.3f" % (prominences[i][1]),
                     "%.3f" % (boundaries[i][1])))

    logger.debug("Saving %s with following content:" % fname)
    if with_header:
        logger.debug(header)
    logger.debug(rows)

    import codecs
    with codecs.open(fname, "w", "utf-8") as prom_f:
        if with_header:
            prom_f.write(u"\t".join(header) + u"\n")
        for row in rows:
            prom_f.write(u'\t'.join(row)+u"\n")
def simplify(loma):
    """Reduce each line of maximum amplitude to a (position, strength) pair.

    The position is read from the middle point of the line, the strength
    from its last (highest-scale) point.

    Parameters
    ----------
    loma: type
        description
    """
    pairs = []
    for line in loma:
        # align loma to its position in the middle of the line
        midpoint = line[len(line) // 2][0]
        strength = line[-1][1]
        pairs.append((midpoint, strength))
    return pairs
key=itemgetter(1))[-1]) 152 | else: 153 | max_boundary_loma.append([st+(end-st)/2, 0]) 154 | 155 | # final boundary is not estimated 156 | max_boundary_loma.append((labels[-1][1],1)) 157 | return max_boundary_loma 158 | 159 | 160 | def _get_parent(child_index, parent_diff, parent_indices): 161 | """Private function to find the parent of the given child peak. At child peak index, follow the 162 | slope of parent scale upwards to find parent 163 | 164 | Parameters 165 | ---------- 166 | child_index: int 167 | Index of the current child peak 168 | parent_diff: list of ? 169 | ? 170 | parent_indices: list of int ? 171 | Indices of available parents 172 | 173 | Returns 174 | _______ 175 | int 176 | The parent index or None if there is no parent 177 | """ 178 | for i in range(0, len(parent_indices)): 179 | if (parent_indices[i] > child_index): 180 | if (parent_diff[int(child_index)] > 0): 181 | return parent_indices[i] 182 | else: 183 | if i > 0: 184 | return parent_indices[i-1] 185 | else: 186 | return parent_indices[0] 187 | 188 | if len(parent_indices) > 0: 189 | return parent_indices[-1] 190 | return None 191 | 192 | def get_loma(wavelet_matrix, scales, min_scale, max_scale): 193 | """Get the Line Of Maximum Amplitude (loma) 194 | 195 | Parameters 196 | ---------- 197 | wavelet_matrix: matrix of float 198 | The wavelet matrix 199 | scales: list of int 200 | The list of scales 201 | min_scale: int 202 | The minimum scale 203 | max_scale: int 204 | The maximum scale 205 | 206 | Returns 207 | ------- 208 | list of tuples 209 | ? 210 | 211 | Note 212 | ---- 213 | change this so that one level is done in one chunk, not one parent. 214 | """ 215 | psize = 100.0 216 | min_peak = -10000.0 # minimum peak amplitude to consider. NOTE:this has no meaning unless scales normalized 217 | max_dist = 10 # how far in time to look for parent peaks. NOTE: frame rate and scale dependent, FIXME: how dependent? 
def get_loma(wavelet_matrix, scales, min_scale, max_scale):
    """Get the Line Of Maximum Amplitude (loma)

    Starting from the peaks of the lowest scale, each peak is connected
    to a parent peak on the next scale up; chains of such links form the
    lomas, keyed by their root (lowest-scale) peak index.

    Parameters
    ----------
    wavelet_matrix: matrix of float
        The wavelet matrix
    scales: list of int
        The list of scales
    min_scale: int
        The minimum scale
    max_scale: int
        The maximum scale

    Returns
    -------
    list of tuples
        ?

    Note
    ----
    change this so that one level is done in one chunk, not one parent.
    """
    # NOTE(review): psize appears to be unused
    psize = 100.0
    min_peak = -10000.0 # minimum peak amplitude to consider. NOTE:this has no meaning unless scales normalized
    max_dist = 10 # how far in time to look for parent peaks. NOTE: frame rate and scale dependent, FIXME: how dependent?

    # get peaks from the first scale
    (peaks,indices) = misc.get_peaks(wavelet_matrix[min_scale],min_peak)

    loma=dict()
    root=dict()
    for i in range(0,len(peaks)):
        loma[indices[i]]=[]

        # keep track of roots of each loma
        root[indices[i]] = indices[i]

    for i in range(min_scale+1, max_scale):
        # search radius grows with the scale width
        max_dist = np.sqrt(scales[i])*4

        # find peaks in the parent scale
        (p_peaks,p_indices) = misc.get_peaks(wavelet_matrix[i], min_peak)

        parents = dict(zip(p_indices, p_peaks))

        # find a parent for each child peak
        children = dict()
        for p in p_indices:
            children[p] = []

        parent_diff = np.diff(wavelet_matrix[i],1)
        for j in range(0,len(indices)):
            parent =_get_parent(indices[j], parent_diff, p_indices)
            if parent:
                # only attach the child when the parent is close enough in time
                if abs(parent-indices[j]) < max_dist and peaks[j] > min_peak:# np.std(wavelet_matrix[i])*0.5:
                    children[parent].append([indices[j],peaks[j]])
        peaks=[];indices = []

        # for each parent, select max child

        for p in children:

            if len(children[p]) > 0:
                # maxi[0]: index
                # maxi[1]: peak height
                maxi = sorted(children[p], key=itemgetter(1))[-1]
                indices.append(p)
                # accumulate strength along the line
                peaks.append(maxi[1]+parents[p])

                #append child to correct loma
                loma[root[maxi[0]]].append([maxi[0],maxi[1]+parents[p], i, p])
                root[p] = root[maxi[0]]

    # keep only non-empty lomas, ordered by their root position
    sorted_loma = []
    for k in sorted(loma.keys()):
        if len(loma[k]) > 0:
            sorted_loma.append(loma[k])

    logger.debug(simplify(sorted_loma))
    return sorted_loma
def plot_loma(loma, fig, color='black'):
    """Plot the line of maximum amplitudes (loma)

    Parameters
    ----------
    loma: list of tuple (float, float, int, ?)
        the loma values
    fig: figure
        the figure where the loma are going to be plotted in
    color: string
        the color name/code
    """
    for line in loma:
        for point in line:
            scale_idx = point[2]
            # each segment links the child peak position to its parent
            # position one scale step up; the accumulated strength sets
            # the line width
            fig.plot([point[0], point[3]],
                     [scale_idx - 2, scale_idx - 1],
                     linewidth=point[1], color=color,
                     alpha=0.45, solid_capstyle='round')
difficulties with channels, 24bit files, various dtypes 45 | # pysoundfile appears to mostly work 46 | 47 | data, samplerate = soundfile.read(filename, always_2d=True) 48 | 49 | return (samplerate, data[:, 0].copy(order='C')) 50 | 51 | """Alternative solutions: 52 | # import wavio 53 | # wav = wavio.read(filename) 54 | # print wav.data.shape 55 | # pylab.plot(wav.data[:,0]) 56 | # return (wav.rate, wav.data[:, 0]) 57 | 58 | 59 | import scipy.io.wavfile 60 | try: 61 | return scipy.io.wavfile.read(filename) 62 | except Exception as e: 63 | 64 | print e 65 | """ 66 | 67 | 68 | def write_wav(filename, data, sr, format="WAV"): 69 | """Write audio file using soundfile 70 | 71 | Parameters 72 | ---------- 73 | filename: string 74 | The name of the wave file. 75 | data: 1D arraylike 76 | The audio samples. 77 | sr: int 78 | The sample rate. 79 | format: string 80 | The output audio format (Default value is WAV for wav file). 81 | 82 | """ 83 | 84 | soundfile.write(filename, data, sr, format=format) 85 | 86 | 87 | def resample(waveform, s_sr, t_sr): 88 | """resampling for waveforms, should work also with when source and 89 | target rate ratio is fractional 90 | 91 | Parameters 92 | ---------- 93 | waveform: np.array 94 | speech waveform, mono 95 | s_sr: float 96 | original sample rate 97 | t_sr: float 98 | target sample rate 99 | 100 | returns: resampled waveform as np.array 101 | """ 102 | ratio = fractions.Fraction(int(t_sr), int(s_sr)) 103 | return resample_poly(waveform.astype(float), ratio.numerator, ratio.denominator) 104 | 105 | 106 | def play(utt): 107 | wavfile = utt + ".wav" 108 | wavfile = wavfile.replace(" ", "\ ") 109 | st = 0.2 110 | end = 1 111 | 112 | while (st > 0.01): 113 | try: 114 | pts = ginput(1) 115 | st = pts[0][0] / 200.0 116 | end = 1.0 117 | except: 118 | continue 119 | os.system("play %s trim 0:0:%f 0:0:%f " % (wavfile, st, end)) 120 | 121 | 122 | def match_length(sig_list): 123 | """Reduce length of all signals to a the minimum one. 
def get_peaks(params, threshold=-10):
    """Find the peaks based on the given prosodic parameters.

    Parameters
    ----------
    params: ?
        Prosodic parameters
    threshold: int
        Minimum peak value to keep

    Returns
    -------
    peaks: arraylike
        array of peak values and peak indices
    """
    # local maxima: the sign of the first difference flips from + to -
    maxima_idx = (np.diff(np.sign(np.diff(params))) < 0).nonzero()[0] + 1
    heights = params[maxima_idx]

    keep = heights > threshold
    return np.array([heights[keep], maxima_idx[keep]])
def get_best_scale2(scales, labels):
    """Find the scale whose width is the closest to the average unit length
    represented in the labels.

    Parameters
    ----------
    scales: 1D arraylike
        The scale indices
    labels: list of tuple (float, float, string)
        List of labels which are lists of 3 elements [start, end, description]

    Returns
    -------
    int
        the index of the best scale
    """
    # average unit length over all labels
    total_length = sum(lab[1] - lab[0] for lab in labels)
    mean_length = total_length / len(labels)

    # pick the scale nearest to that mean length
    return np.argmin(np.abs(scales - mean_length))
def normalize_std(params, std=0):
    """Normalize parameters using a z-score paradigm

    Parameters
    ----------
    params: arraylike
        The parameters to normalize.
    std: float
        A given standard deviation. If 0, the standard deviation is
        computed on the params (NaN-aware). (Default: 0)

    Returns
    ------
    arraylike
        the normalized parameters; all zeros when the deviation is
        numerically zero (empty or constant input)
    """
    if std == 0:
        std = np.nanstd(params)

    # guard against an (almost) zero deviation: empty array or all equal
    if std < 0.00001:  # np.isclose([std,0]):
        return np.zeros(len(params))

    centered = params - np.nanmean(params)
    return centered / float(std)
from . import misc, cwt_utils, f0_processing, smooth_and_interp

import scipy.signal
# scipy.ndimage was used below but never imported (it only worked thanks to
# SciPy's lazy submodule loading); import it explicitly.
import scipy.ndimage
from scipy.signal import windows


def _get_f0(spec, energy, min_hz, max_hz, thresh, sil_thresh):
    """Return the frequency bin with maximum energy, if it is over the given
    threshold and the overall energy of the frame is over the silence
    threshold; otherwise return 0 (unvoiced).

    If the bin at roughly half the candidate frequency carries at least half
    of the candidate's energy, the lower bin is preferred (octave-error guard).
    """
    cand = int(min_hz) + np.argmax(spec[int(min_hz):int(max_hz)])
    if spec[cand] > thresh and energy > sil_thresh:
        if cand > 2 * min_hz and spec[int(round(cand / 2.))] > spec[cand] * 0.5:
            return int(round(cand / 2.))
        else:
            return cand
    return 0


def _track_pitch(pic, min_hz=50, max_hz=450, thresh=0.1, energy_thresh=1.0, DEBUG=False):
    """Extract a pitch contour from a time-frequency image.

    The bin with maximum energy per frame is chosen as a first f0 estimate,
    followed by refinement steps based on the assumption of continuity of the
    pitch track.
    """
    pitch = np.zeros(pic.shape[0])

    # calc energy threshold for voicing
    log_energy = np.log(np.sum(pic, axis=1))
    energy_thresh = np.min(smooth_and_interp.smooth(log_energy, 20)) + energy_thresh
    pic_smooth = pic * scipy.ndimage.gaussian_filter(pic, [2, 5])

    # first pass: find the frequency bin with maximal energy per frame
    for i in range(0, pic_smooth.shape[0]):
        pitch[i] = _get_f0(pic_smooth[i], log_energy[i], min_hz, max_hz, thresh, energy_thresh)

    # further passes with soft continuity constraints
    n_iters = 3
    for it in range(0, n_iters):
        smoothed = f0_processing.process(pitch)
        smoothed = smooth_and_interp.smooth(smoothed, int(200. / (it + 1.)))

        # gradually tightening gaussian window centered on the current
        # estimate, to softly constrain the next iteration
        win_len = 800
        g_window = windows.gaussian(win_len, int(np.mean(smoothed) * (1. / (it + 1.) ** 2)))

        for i in range(0, pic.shape[0]):
            window = np.zeros(len(pic_smooth[i]))
            st = int(np.max((0, int(smoothed[i] - win_len))))
            end = int(np.min((int(smoothed[i] + win_len * 0.5), win_len - st)))
            window[st:end] = g_window[win_len - end:]
            pitch[i] = _get_f0(pic_smooth[i] * window, log_energy[i], min_hz, max_hz, thresh, energy_thresh)

    return pitch


def _assign_to_bins(pic, freqs, mags):
    """Accumulate instantaneous-frequency magnitudes into the 1 Hz bins of
    the time-frequency image ``pic`` (frames x bins), in place."""
    for i in range(1, freqs.shape[0] - 1):
        for j in range(0, freqs.shape[1]):
            try:
                pic[j, int(freqs[i, j])] += mags[i, j]
            except (IndexError, ValueError):
                # frequency estimate outside the image, or non-finite: skip
                pass


def inst_freq_pitch_from_wav(utt_wav, min_hz=50, max_hz=400, acorr_weight=10., voicing_thresh=50., DEBUG=False, target_rate=200):
    """Read a wav file and extract its f0 track; see inst_freq_pitch."""
    (fs, wav_form) = misc.read_wav(utt_wav)

    return inst_freq_pitch(wav_form, fs, min_hz, max_hz, acorr_weight, voicing_thresh, DEBUG, target_rate)


def inst_freq_pitch(wav_form, fs, min_hz=50, max_hz=400, acorr_weight=10., voicing_thresh=50., DEBUG=False, target_rate=200):
    """Extract an f0 track from a speech waveform using instantaneous
    frequency calculated from a continuous wavelet transform.

    Returns the tuple (pitch, pic): the pitch track (Hz per frame) and the
    time-frequency image it was tracked from.
    """
    # adjust thresholds: they are empirically set and depend on the number of
    # bins, normalization, smoothing etc.; map the user-facing 0-100 ranges
    # to the internal ones
    voicing_thresh = (voicing_thresh - 50.0) / 100.0
    acorr_weight /= 100.

    # downsample to 4000 Hz and normalize
    sample_rate = 4000
    tmp_wav_form = misc.resample(wav_form, fs, sample_rate)
    tmp_wav_form = misc.normalize_std(tmp_wav_form)

    # init instantaneous frequency pic, with rather low time and frequency
    # resolution for speed; having 1 Hz / bin simplifies the implementation a
    # bit, but treats males and females differently (other values do not work)
    DEC = int(round(sample_rate / target_rate))
    pic = np.zeros(shape=(int(len(tmp_wav_form) / float(DEC)), int(sample_rate / 4.0)))

    # use continuous wavelet transform to get instantaneous frequencies;
    # morlet mother wavelet, 20 scales per octave over six octaves
    s0 = 2. / sample_rate
    dj = 0.05
    J = 120
    dt = 1. / sample_rate
    # periods = [3, 5, 7] would integrate several time/frequency resolutions
    # but is probably too slow to be the default
    periods = [5]
    for p in periods:
        (wavelet_matrix, scales, cwt_freqs) = cwt_utils.cwt_analysis(
            tmp_wav_form, mother_name="morlet", first_scale=s0, num_scales=J,
            scale_distance=dj, apply_coi=False, period=p, frame_rate=sample_rate)

        # instantaneous frequency = d(phase)/dt / (2*pi)
        phase = np.unwrap(np.angle(wavelet_matrix), axis=1)
        freqs = np.abs((np.gradient(phase, dt)[1]) / (2. * np.pi))

        # decimate to the target frame rate
        freqs = scipy.signal.decimate(freqs, DEC, zero_phase=True)
        mags = scipy.signal.decimate(abs(wavelet_matrix), DEC, zero_phase=True)

        # normalize magnitudes to [0, 1]
        mags = (mags - mags.min()) / mags.ptp()

        # construct time-frequency image
        _assign_to_bins(pic, freqs, mags)

    # perform frequency domain autocorrelation to enhance f0.
    # NOTE: the deprecated (and since removed) scipy.ndimage.filters
    # namespace is replaced by the top-level scipy.ndimage one, consistently
    # with _track_pitch above.
    pic = scipy.ndimage.gaussian_filter(pic, [1, 1])

    length = np.min((max_hz * 3, pic.shape[1])).astype(int)

    for i in range(0, pic.shape[0]):  # frame
        acorr1 = np.correlate(pic[i, :length], pic[i, :length], mode='same')
        pic[i, :int(length / 2.)] *= acorr1[int(len(acorr1) / 2.):]

    # generate pitch track from the image
    logger.debug("tracking pitch..")
    pitch = _track_pitch(pic, min_hz, max_hz, voicing_thresh, DEBUG=DEBUG)
    logger.debug("tracking pitch done.")

    return (pitch, pic)


# ---- file: wavelet_prosody_toolkit/prosody_tools/smooth_and_interp.py ----
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
AUTHOR
    - Antti Suni
    - Sébastien Le Maguer

DESCRIPTION
    Module which provides interpolation routines

LICENSE
    See https://github.com/asuni/wavelet_prosody_toolkit/blob/master/LICENSE.txt
"""

# Global/system packages
import sys

# Math/signal processing
import numpy as np
from scipy.io import wavfile
from scipy.signal import decimate
from scipy import interpolate
import pylab

# Logging
import logging
logger = logging.getLogger(__name__)


def remove_bias(params, win_len=300):
    """Remove the slowly varying bias by subtracting a smoothed (window
    length win_len) version of the signal."""
    return params - smooth(params, win_len)
# copied from https://stackoverflow.com/questions/23024950/interp-function-in-python-like-matlab/40346185#40346185
def interpolate_by_factor(vector, factor):
    """
    Interpolate, i.e. upsample, a given 1D vector by a specific interpolation factor.

    :param vector: 1D data vector
    :param factor: factor for interpolation (must be integer)
    :return: interpolated 1D vector by a given factor
    """
    x = np.arange(np.size(vector))
    f = interpolate.interp1d(x, vector)

    x_extended_by_factor = np.linspace(x[0], x[-1],
                                       int(round(np.size(x) * factor)))

    # evaluate the interpolant on the whole extended grid in one vectorized
    # call (the original looped point by point, which is equivalent but slow)
    return f(x_extended_by_factor)


def interpolate_zeros(params, method='pchip', min_val=0):
    """
    Interpolate through the samples equal to min_val (e.g. unvoiced 0 frames).

    :param params: 1D data vector
    :param method: 'pchip' (default), 'spline', or any "kind" accepted by
                   scipy.interpolate.interp1d
    :param min_val: value marking the samples to be replaced
    :return: 1D vector with the marked samples interpolated
    """
    voiced = np.array(params, float)
    # mark the samples to interpolate as NaN (vectorized form of the original
    # element-wise loop)
    voiced[voiced == min_val] = np.nan

    # the interpolators cannot extrapolate, so anchor both endpoints first
    if np.isnan(voiced[-1]):
        voiced[-1] = np.nanmin(voiced)
    if np.isnan(voiced[0]):
        voiced[0] = np.nanmean(voiced)

    not_nan = np.logical_not(np.isnan(voiced))
    indices = np.arange(len(voiced))

    if method == 'spline':
        interp = interpolate.UnivariateSpline(indices[not_nan],
                                              voiced[not_nan],
                                              k=2, s=0)
        # return voiced parts intact
        smoothed = interp(indices)
        for i in range(0, len(smoothed)):
            if not np.isnan(voiced[i]):
                smoothed[i] = params[i]

        return smoothed

    elif method == 'pchip':
        interp = interpolate.pchip(indices[not_nan], voiced[not_nan])
    else:
        interp = interpolate.interp1d(indices[not_nan], voiced[not_nan],
                                      method)
    return interp(indices)


def smooth(params, win, type="HAMMING"):
    """
    Gaussian type smoothing: convolution with a Hamming window (or a
    rectangular one when type != "HAMMING").

    :param params: 1D data vector
    :param win: window length in samples (clamped to the signal length and
                forced odd so the convolution stays centered)
    :param type: "HAMMING" or anything else for a rectangular window
    :return: smoothed vector, same length as params
    """
    win = int(win + 0.5)
    if win >= len(params) - 1:
        win = len(params) - 1

    if win % 2 == 0:
        win += 1

    # mirror-pad the edges to limit boundary artefacts
    s = np.r_[params[win - 1:0:-1], params, params[-1:-win:-1]]

    w = np.hamming(win) if type == "HAMMING" else np.ones(win)

    y = np.convolve(w / w.sum(), s, mode='valid')
    return y[int(win / 2):-int(win / 2)]


def peak_smooth(params, max_iter, win,
                min_win=2, voicing=None, TRACE=False):
    """
    Iterative smoothing while preserving peaks, 'true envelope' -style.

    :param params: 1D data vector
    :param max_iter: number of smoothing iterations
    :param win: initial window length; shrinks exponentially towards min_win
    :param voicing: optional voicing mask; where it is > 0 the original
                    samples are kept untouched
    :param TRACE: if True, plot intermediate results (requires pylab)
    :return: smoothed vector, same length as params
    """
    if voicing is None:
        # the original used a mutable default argument ([]); None is the
        # safe, behaviorally-identical replacement
        voicing = []

    smoothed = np.array(params)
    # exponentially decreasing window sizes, from win down to min_win
    win_reduce = np.exp(np.linspace(np.log(win), np.log(min_win), max_iter))

    if TRACE:
        pylab.ion()
        pylab.plot(params, 'black')

    for i in range(0, max_iter):
        # never drop below the original signal: this is what preserves peaks
        smoothed = np.maximum(params, smoothed)

        if len(voicing) > 0:
            smoothed = smooth(smoothed, int(win + 0.5))
            smoothed[voicing > 0] = params[voicing > 0]
        else:
            smoothed = smooth(smoothed, int(win + 0.5), type='rectangle')

        win = win_reduce[i]

    if TRACE:
        pylab.plot(smoothed, 'red', linewidth=2)
        pylab.show()
    return smoothed