├── CHANGES.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── README.txt
├── __init__.py
├── amfm_decompy
│   ├── __init__.py
│   ├── basic_tools.py
│   ├── pYAAPT.py
│   ├── pyQHM.py
│   └── sample.wav
├── bin
│   ├── AMFM_test.py
│   └── __init__.py
├── docs
│   ├── AMFM_decompy.pdf
│   ├── basic_tools.txt
│   ├── freq1.png
│   ├── index.txt
│   ├── interp.png
│   ├── mag3.png
│   ├── pYAAPT.txt
│   ├── pyQHM.txt
│   ├── samp_values.png
│   └── values.png
└── pyproject.toml
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
1 | v 1.0.12, 16/May/2025
2 | - issue due to the upsampling of silent signals fixed. Thanks to Divyesh Rajpura for
3 | reporting it.
4 |
5 | v 1.0.11, 23/Jan/2021
6 | - bug causing pitch halving errors with short audio samples fixed. Thanks to
7 | Esther Judd-Klabbers for reporting it.
8 |
9 | v 1.0.10, 12/Oct/2020
10 | - bug due to the spectral pitch standard deviation being equal to 0 fixed.
11 | Thanks to kwanUm for reporting it.
12 | - SyntaxWarnings in the upsample method from the Pitch object corrected (the "is"
13 | occurrences in the conditionals were replaced by "==" or "in").
14 |
15 | v 1.0.9.1, 11/Mar/2020
16 | - the minimum frame length required to use the spline interpolation was increased
17 | to 4. Otherwise, due to constraints imposed by the spline algorithm itself, the
18 | pitch extraction routine would be aborted in the cases where the frame has length
19 | equal to 3. Thanks to Richard Hemphill for reporting this issue.
20 |
21 | v 1.0.9, 20/Feb/2020
22 | - issue due to the wrong variable name "dtype" in the SignalObj initialization fixed.
23 | Thanks to Richard Hemphill for reporting it.
24 | - SignalObj objects can now be initialized using kwargs. Additionally, the data
25 | output_dtype can also be set using kwargs.
26 |
27 | v 1.0.8.1, 09/Jul/2018
28 | - issue caused when a PCM integer data signal is employed to create a SignalObj
29 | instance fixed. Thanks to Omar Altayyan for reporting and fixing it.
30 | - calculation of the nbits property of a SignalObj instance modified.
31 |
32 | v 1.0.8, 21/Jun/2018
33 | - issue caused by extra unneeded numpy array dimensions in pYAAPT.py fixed.
34 | Thanks to Omar Altayyan for reporting and fixing it.
35 | - deprecation warning for the hanning and kaiser windows from scipy fixed.
36 | Thanks to Omar Altayyan for reporting and fixing it.
37 | - pip module import issue in setup.py fixed.
38 |
39 | v 1.0.7, 27/Jul/2017
40 | - issue with negative indexes in the "crs_corr" function from pYAAPT.py due to
41 | short frame lengths in the "time_track" function fixed. Thanks to Paritosh
42 | Gupta, Puff Kan and tuanad121 for reporting it.
43 | - function "interrupt_main()" replaced by "assert", since the former was not
44 | working properly. Thanks to tuanad121 for reporting and fixing it.
45 | - "frame_lengtht" parameter (renamed to "tda_frame_length") from YAAPT 4.0
46 | added.
47 | - default value of the "frame_length" parameter in pYAAPT.py changed from
48 | 25 ms to 35 ms following the alteration made in the new YAAPT 4.0 MATLAB
49 | code.
50 | - default value of the "nccf_thresh1" parameter in pYAAPT.py changed from
51 | 0.25 to 0.3 following the alteration made in the new YAAPT 4.0 MATLAB code.
52 | - some extra minor alterations in pYAAPT.py from the new YAAPT 4.0 MATLAB code.
53 | - minor alterations in the SHC computation of the "spec_track" function from
54 | pYAAPT.py.
55 | - minor correction in the step interpolation of the "upsample" method from PitchObj
56 | in order to allow compatibility with Python 3.
57 |
58 | v1.0.6.1, 13/Mar/2017
59 | - error due to misuse of numpy's "ones" function in the spec_track function from
60 | pYAAPT fixed. Thanks to Paritosh Gupta for reporting it.
61 |
62 | v1.0.6, 23/Jan/2017
63 | - issue with incorrect high values at the beginning and at the end of the
64 | PitchObj.samp_interp array fixed. Thanks to Esther Judd-Klabbers for reporting
65 | it.
66 |
67 | v1.0.5.1, 26/Sep/2016
68 | - print functions updated in order to allow compatibility with Python 3. Thanks to
69 | Javier Villalba García for reporting it.
70 | - "xrange" function replaced by "range" in order to allow compatibility with
71 | Python 3.
72 | - "thread" module replaced by "raise KeyboardInterrupt" in pYAAPT.py and
73 | basic_tools.py.
74 | - corrected the "VisibleDeprecationWarning" regarding the misuse of float
75 | numbers as arguments to numpy.zeros() and as array indices in pYAAPT.py.
76 |
77 | v1.0.5, 22/Sep/2016
78 | - issue with the interpolation of short voiced frames fixed and also minor
79 | corrections in the code. Thanks to Michał Dankiewicz for reporting and fixing
80 | them.
81 | - "UnicodeDecodeError: 'charmap'" issue in setup.py fixed and also an extra
82 | workaround to avoid it added. Thanks to Javier Villalba García for reporting
83 | it.
84 | - corrected the array length mismatch in the edges_finder method from the
85 | PitchObj class in pYAAPT.py.
86 | - corrected the "VisibleDeprecationWarning" regarding the misuse of float
87 | numbers as array indices in pYAAPT.py.
88 | - requirements and classifiers added to setup.py.
89 | - distutils replaced by setuptools in setup.py.
90 |
91 | v1.0.4, 13/Jan/2015
92 | - segmentation fault issue in pYAAPT.py fixed.
93 | - online documentation added.
94 |
95 | v1.0.3, 23/Dec/2014
96 | - several bugs related to the analysis of 44.1 kHz files fixed. Thanks to
97 | Tomasz Toczyski for reporting them.
98 | - "growing pitch and merit vectors" bug in the peaks function from pYAAPT.py
99 | fixed.
100 | - "Stop in Dynamic" exceptions in the path1 function from pYAAPT.py removed. They
101 | were inherited from the original MATLAB code, but apparently don't have a
102 | real use or meaning.
103 | - internal interpolation functions changed from spline to pchip in order to
104 | keep the fidelity to the original MATLAB code and also to produce better
105 | results.
106 | - "pchip" option added and set as default for the PitchObj method "upsample" in
107 | pYAAPT.py. This interpolation method is favored over "spline" because it
108 | produces less severe pitch spikes.
109 | - "single sample upsample" bug in the PitchObj method "upsample" from
110 | pYAAPT.py fixed.
111 | - support for stereo files added.
112 | - documentation updated.
113 |
114 | v1.0.2, 27/Nov/2014
115 | - "missing thread package" bug in basic_tools.py fixed.
116 |
117 | v1.0.1, 20/Sep/2014
118 | - 'f0_min' bug in pYAAPT fixed.
119 | - PitchObj.mean_energy attribute added.
120 | - two SignalObj classes merged into only one.
121 | - minor PEP8, function and comment changes.
122 | - documentation added.
123 |
124 | v1.0.0, 10/Sep/2014
125 | - Initial release.
126 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 bjbschmitt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | recursive-include docs *.txt
3 | include *.pdf
4 | recursive-include docs *.pdf
5 | include *.png
6 | recursive-include docs *.png
7 | include *.wav
8 | recursive-include amfm_decompy *.wav
9 | prune dist*
10 | prune docs/_build*
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | AMFM_decompy
2 | =============
3 |
4 | version 1.0.12
5 |
6 | This Python package provides the tools necessary for decomposing the voiced part of a speech signal into its modulated components, aka AM-FM decomposition. This designation is used due to the fact that, in this method, the signal is modeled as a sum of amplitude- and frequency-modulated components.
7 |
8 | The goal is to overcome the drawbacks of Fourier-like techniques, e.g. the STFT, wavelets, etc., which are limited in time-frequency analysis by the so-called Heisenberg-Gabor inequality.
9 |
10 | The algorithms implemented here are the QHM (Quasi-Harmonic Model) and its upgrades, aQHM (adaptive Quasi-Harmonic Model) and eaQHM (extended adaptive Quasi-Harmonic Model). Their formulation can be found in references [2-4].
11 |
12 | Since the tools mentioned above require a fundamental frequency reference, the package also includes the pitch tracker YAAPT (Yet Another Algorithm for Pitch Tracking) [1], which is extremely robust for both high quality and telephone speech.
13 |
14 | The study of AM-FM decomposition algorithms was the theme of my Master's Thesis. The original YAAPT program in MATLAB is provided for free by its authors, while I implemented the QHM algorithms myself, also in MATLAB.
I'm now porting them to Python because:
15 |
16 | * the Python language is easier to share, read and understand, making it a better way to distribute the code;
17 | * it is more resourceful than MATLAB (different data structures, scripting options, etc.), which will be useful for me in future studies;
18 | * the computational performance of its numeric and scientific packages (numpy and scipy) is equivalent to MATLAB's;
19 | * Python is free to use, while MATLAB is proprietary software.
20 |
21 | Evaluations and future expansions
22 | =============
23 |
24 | As for the algorithms' computational performance, I optimized the YAAPT code, so my Python version now runs about twice as fast as the original MATLAB one. However, the QHM algorithms still run only as fast as their MATLAB counterparts. That's because the main bottleneck of both versions is the matrix dot and least-squares operations. Since numpy and MATLAB are already optimized to perform these tasks using internal Fortran functions, as far as I investigated there's no way to speed them up using Cython, for example. Nevertheless, numba and CUDA seem to be promising tools to speed up the code, so adding support for them is one of my priorities for future releases.
25 |
26 | In [1], YAAPT is compared with well-known pitch trackers like YIN and RAPT, and presents the best results. In fact, in my experience with it so far, the algorithm has proved to be very robust indeed. It must be emphasized that I merely translated the code, so I have only an average knowledge of its theoretical formulation. For deep questions concerning it, I would advise contacting the original authors.
27 |
28 | The QHM-like algorithms present some stability problems concerning small magnitude modulated components, which are already documented in [2,3]. In my Python code I implemented a workaround for this problem, but it is still a sub-optimal solution.
29 |
30 | Actually, I dedicated a chapter of my Master's Thesis to a deeper study of this problem and came up with a better solution. Unfortunately, due to stupid bureaucratic issues, I don't know if and when my work will be defended and published (in short, the deadline expired because my advisor and I needed more time to correct and improve the thesis text. We then requested an extension, but the lecturers' board declined it. So, basically, I was expelled from the post-graduate program with a finished and working thesis). Anyway, I'm still trying to figure out what to do with my work, and as soon as I find a solution, I'll add my own contributions to this package.
31 |
32 | IMPORTANT - Considerations about version 1.0.7 and later versions
33 | =============
34 |
35 | In the latest release of the original YAAPT MATLAB source code (YAAPT v4.0) the default values of the following parameters have been altered:
36 |
37 | * `frame_length` parameter changed from 25 ms to 35 ms;
38 | * `nccf_thresh1` parameter changed from 0.25 to 0.3;
39 |
40 | Moreover, a new parameter called `frame_lengtht` was added (please pay attention to the extra "t" at the end), whose name is quite similar to `frame_length`. In order to avoid confusion between them, an alternative (and preferred) alias for `frame_lengtht` called `tda_frame_length` is used in pYAAPT.py. Nevertheless, both inputs (`frame_lengtht` and `tda_frame_length`) are accepted, as the sketch below shows.
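A minimal sketch of the two equivalent spellings (assuming the bundled sample.wav as input):

```python
import amfm_decompy.basic_tools as basic
import amfm_decompy.pYAAPT as pYAAPT

# Load the example wav file shipped with the package.
signal = basic.SignalObj('sample.wav')

# Preferred alias:
pitch_a = pYAAPT.yaapt(signal, **{'tda_frame_length': 35.0})

# Original YAAPT 4.0 name; internally renamed to 'tda_frame_length':
pitch_b = pYAAPT.yaapt(signal, **{'frame_lengtht': 35.0})
```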
41 |
42 | Due to these modifications, if you were running AMFM_decompy 1.0.6 or earlier versions with their default settings, you may obtain slightly different results from the ones obtained by running AMFM_decompy 1.0.7 and later versions with the new default parameters.
43 |
44 | Therefore, if you really need to obtain exactly the same results as previous versions, you must provide the old parameter values to the yaapt function. For example, a 1.0.6 or earlier code like
45 |
46 | `pitch = pYAAPT.yaapt(signal)`
47 |
48 | should be rewritten in the 1.0.7 and later versions as
49 |
50 | `pitch = pYAAPT.yaapt(signal, **{'frame_length': 25.0, 'nccf_thresh1': 0.25, 'tda_frame_length': 25.0})`
51 |
52 | Installation
53 | =============
54 |
55 | The pypi page https://pypi.python.org/pypi/AMFM_decompy/1.0.12.2 is recommended for a quick installation. But you can also copy all directories here and then run
56 |
57 | ```python setup.py install```
58 |
59 | in the command line. After that, run the test script by typing
60 |
61 | `AMFM_test.py`
62 |
63 | to check if everything is ok (it can take a couple of minutes to calculate the results). This script is an example of how to use the package.
64 |
65 | I've tested the installation script and the package itself on Linux and Windows systems (but not on macOS) and everything went fine. So, if a problem comes up, it is most probably related to Python not finding the file paths.
66 |
67 | How to use
68 | =============
69 |
70 | Check the AMFM_decompy pdf documentation included in the docs folder or the online documentation at http://bjbschmitt.github.io/AMFM_decompy. The amfm_decompy folder contains the sample.wav file that is used to illustrate the package's code examples.
71 |
72 | Credits and Publications
73 | =============
74 |
75 | The original MATLAB YAAPT program was written by Hongbing Hu and Stephen A. Zahorian from the Speech Communication Laboratory of the State University of New York at Binghamton.
76 |
77 | It is available at http://www.ws.binghamton.edu/zahorian as free software. Further information about the program can be found in
78 |
79 | [1] Stephen A. Zahorian and Hongbing Hu, "A spectral/temporal method for robust fundamental frequency tracking," J. Acoust. Soc. Am. 123(6), June 2008.
80 |
81 | The QHM algorithm and its upgrades are formulated and presented in the following publications:
82 |
83 | [2] Y. Pantazis, PhD Thesis, University of Crete, 2010.
84 |
85 | [3] Y. Pantazis, O. Rosec and Y. Stylianou, IEEE Transactions on Audio, Speech and Language Processing, vol. 19, no. 2, 2011.
86 |
87 | [4] G. P. Kafentzis, Y. Pantazis, O. Rosec and Y. Stylianou, in IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2012.
88 |
89 | Copyright and contact
90 | =============
91 |
92 | AMFM_decompy is free to use, share and modify under the terms of the MIT license.
93 |
94 | Questions, comments, suggestions, and contributions are welcome. Please contact me at
95 |
96 | bernardo.jb.schmitt@gmail.com.
97 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | AMFM_decompy
2 | =============
3 |
4 | version 1.0.12
5 |
6 | This Python package provides the tools necessary for decomposing the voiced part
7 | of a speech signal into its modulated components, aka AM-FM decomposition.
This
8 | designation is used due to the fact that, in this method, the signal is modeled as
9 | a sum of amplitude- and frequency-modulated components.
10 |
11 | The goal is to overcome the drawbacks of Fourier-like techniques, e.g. the STFT,
12 | wavelets, etc., which are limited in time-frequency analysis by the so-called
13 | Heisenberg-Gabor inequality.
14 |
15 | The algorithms implemented here are the QHM (Quasi-Harmonic Model) and its
16 | upgrades, aQHM (adaptive Quasi-Harmonic Model) and eaQHM (extended adaptive
17 | Quasi-Harmonic Model). Their formulation can be found in references [2-4].
18 |
19 | Since the tools mentioned above require a fundamental frequency reference,
20 | the package also includes the pitch tracker YAAPT (Yet Another Algorithm for
21 | Pitch Tracking) [1], which is extremely robust for both high quality and
22 | telephone speech.
23 |
24 | The study of AM-FM decomposition algorithms was the theme of my Master's Thesis.
25 | The original YAAPT program in MATLAB is provided for free by its authors, while
26 | I implemented the QHM algorithms myself, also in MATLAB. I'm now porting them
27 | to Python because:
28 |
29 | * the Python language is easier to share, read and understand, making it a
30 | better way to distribute the code;
31 | * it is more resourceful than MATLAB (different data structures, scripting
32 | options, etc.), which will be useful for me in future studies;
33 | * the computational performance of its numeric and scientific packages (numpy
34 | and scipy) is equivalent to MATLAB's;
35 | * Python is free to use, while MATLAB is proprietary software.
36 |
37 | Evaluations and future expansions
38 | =============
39 |
40 | As for the algorithms' computational performance, I optimized the YAAPT code, so
41 | my Python version now runs about twice as fast as the original MATLAB one.
42 | However, the QHM algorithms still run only as fast as their MATLAB counterparts.
43 | That's because the main bottleneck of both versions is the matrix dot and
44 | least-squares operations. Since numpy and MATLAB are already optimized to perform
45 | these tasks using internal Fortran functions, as far as I investigated there's
46 | no way to speed them up using Cython, for example. Nevertheless, numba and CUDA
47 | seem to be promising tools to speed up the code, so adding support for them is
48 | one of my priorities for future releases.
49 |
50 | In [1], YAAPT is compared with well-known pitch trackers like YIN and
51 | RAPT, and presents the best results. In fact, in my experience with it so far,
52 | the algorithm has proved to be very robust indeed. It must be emphasized
53 | that I merely translated the code, so I have only an average knowledge of
54 | its theoretical formulation. For deep questions concerning it, I would advise
55 | contacting the original authors.
56 |
57 | The QHM-like algorithms present some stability problems concerning small
58 | magnitude modulated components, which are already documented in [2,3]. In my
59 | Python code I implemented a workaround for this problem, but it is still a
60 | sub-optimal solution.
61 |
62 | Actually, I dedicated a chapter of my Master's Thesis to a deeper study of
63 | this problem and came up with a better solution. Unfortunately, due to stupid
64 | bureaucratic issues, I don't know if and when my work will be defended and
65 | published (in short, the deadline expired because my advisor and I
66 | needed more time to correct and improve the thesis text.
We then requested an
67 | extension, but the lecturers' board declined it. So, basically, I was expelled
68 | from the post-graduate program with a finished and working thesis). Anyway, I'm
69 | still trying to figure out what to do with my work, and as soon as I find a
70 | solution, I'll add my own contributions to this package.
71 |
72 | IMPORTANT - Considerations about version 1.0.7 and later versions
73 | =============
74 |
75 | In the latest release of the original YAAPT MATLAB source code (YAAPT v4.0)
76 | the default values of the following parameters have been altered:
77 |
78 | * `frame_length` parameter changed from 25 ms to 35 ms;
79 | * `nccf_thresh1` parameter changed from 0.25 to 0.3;
80 |
81 | Moreover, a new parameter called `frame_lengtht` was added (please pay attention
82 | to the extra "t" at the end), whose name is quite similar to `frame_length`.
83 | In order to avoid confusion between them, an alternative (and preferred) alias
84 | for `frame_lengtht` called `tda_frame_length` is used in pYAAPT.py. Nevertheless,
85 | both inputs (`frame_lengtht` and `tda_frame_length`) are accepted.
86 |
87 | Due to these modifications, if you were running AMFM_decompy 1.0.6 or earlier
88 | versions with their default settings, you may obtain slightly different results
89 | from the ones obtained by running AMFM_decompy 1.0.7 and later versions with
90 | the new default parameters.
91 |
92 | Therefore, if you really need to obtain exactly the same results as previous
93 | versions, you must provide the old parameter values to the yaapt function. For
94 | example, a 1.0.6 or earlier code like
95 |
96 | `pitch = pYAAPT.yaapt(signal)`
97 |
98 | should be rewritten in the 1.0.7 and later versions as
99 |
100 | `pitch = pYAAPT.yaapt(signal, **{'frame_length': 25.0, 'nccf_thresh1': 0.25, 'tda_frame_length': 25.0})`
101 |
102 | Installation
103 | =============
104 |
105 | The pypi page https://pypi.python.org/pypi/AMFM_decompy/1.0.12.2 is recommended for
106 | a quick installation. But you can also copy all directories here and then run
107 |
108 | ```python setup.py install```
109 |
110 | in the command line. After that, run the test script by typing
111 |
112 | `AMFM_test.py`
113 |
114 | to check if everything is ok (it can take a couple of minutes to calculate the
115 | results). This script is an example of how to use the package.
116 |
117 | I've tested the installation script and the package itself on Linux and Windows
118 | systems (but not on macOS) and everything went fine. So, if a problem comes up,
119 | it is most probably related to Python not finding the file paths.
120 |
121 | How to use
122 | =============
123 |
124 | Check the AMFM_decompy pdf documentation included in the docs folder or the
125 | online documentation at http://bjbschmitt.github.io/AMFM_decompy. The amfm_decompy
126 | folder contains the sample.wav file that is used to illustrate the package's code
127 | examples.
128 |
129 | Credits and Publications
130 | =============
131 |
132 | The original MATLAB YAAPT program was written by Hongbing Hu and Stephen
133 | A. Zahorian from the Speech Communication Laboratory of the State University of
134 | New York at Binghamton.
135 |
136 | It is available at http://www.ws.binghamton.edu/zahorian as free software.
137 | Further information about the program can be found in
138 |
139 | [1] Stephen A. Zahorian and Hongbing Hu, "A spectral/temporal method for robust
140 | fundamental frequency tracking," J. Acoust. Soc. Am. 123(6), June 2008.
141 |
142 | The QHM algorithm and its upgrades are formulated and presented in the following publications:
143 |
144 | [2] Y. Pantazis, PhD Thesis, University of Crete, 2010.
145 |
146 | [3] Y. Pantazis, O. Rosec and Y. Stylianou, IEEE Transactions on Audio, Speech and
147 | Language Processing, vol. 19, no. 2, 2011.
148 |
149 | [4] G. P. Kafentzis, Y. Pantazis, O. Rosec and Y. Stylianou, in IEEE International Conference on Acoustics,
150 | Speech and Signal Processing (ICASSP), 2012.
151 |
152 | Copyright and contact
153 | =============
154 |
155 | AMFM_decompy is free to use, share and modify under the terms of the MIT
156 | license.
157 |
158 | Questions, comments, suggestions, and contributions are welcome. Please contact
159 | me at
160 |
161 | bernardo.jb.schmitt@gmail.com.
162 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/__init__.py
--------------------------------------------------------------------------------
/amfm_decompy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/amfm_decompy/__init__.py
--------------------------------------------------------------------------------
/amfm_decompy/basic_tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Auxiliary classes and functions used by the other AMFM_decompy modules.
4 |
5 | Version 1.0.12
6 | 16/May/2025 Bernardo J.B. Schmitt - bernardo.jb.schmitt@gmail.com
7 | """
8 |
9 | import numpy as np
10 | from scipy.signal import lfilter
11 |
12 |
13 | """
14 | Creates a signal object.
15 | """
16 |
17 | class SignalObj(object):
18 |
19 |     def __init__(self, *args, **kwargs):
20 |         output_dtype = kwargs.get('output_dtype', 'f')
21 |
22 |         # Read the signal data from the path of a wav file.
23 |         if len(args) == 1 or 'name' in kwargs:
24 |             name = args[0] if len(args) == 1 else kwargs['name']
25 |
26 |             try:
27 |                 from scipy.io import wavfile
28 |             except ImportError:
29 |                 print("ERROR: Wav module could not be loaded!")
30 |                 raise KeyboardInterrupt
31 |             self.fs, self.data = wavfile.read(name)
32 |             self.name = name
33 |
34 |         # Alternatively, read the signal from a Numpy array.
35 |         elif len(args) == 2 or all(k in kwargs for k in ('data', 'fs')):
36 |             data = args[0] if len(args) == 2 else kwargs['data']
37 |             fs = args[1] if len(args) == 2 else kwargs['fs']
38 |
39 |             self.data = data
40 |             self.fs = fs
41 |
42 |
43 |         # If the signal data is in the signed integer format (PCM), convert it
44 |         # to float.
45 |         if self.data.dtype.kind == 'i':
46 |             self.nbits = self.data.itemsize*8
47 |             self.data = pcm2float(self.data, output_dtype)
48 |
49 |         self.size = len(self.data)
50 |         self.fs = float(self.fs)
51 |
52 |         # Check if the wav file is stereo.
53 |         if self.size == self.data.size/2:
54 |             print("Warning: stereo wav file. Converting it to mono for the analysis.")
55 |             self.data = (self.data[:,0]+self.data[:,1])/2
56 |
57 |
58 |     """
59 |     Filters the signal data with a bandpass filter object and decimates it.
60 |     """
61 |     def filtered_version(self, bp_filter):
62 |
63 |         # Filter the signal.
64 |         tempData = lfilter(bp_filter.b, bp_filter.a, self.data)
65 |
66 |         # Decimate the filtered output.
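        # (The decimation below is a plain slice with step bp_filter.dec_factor;
        # no separate anti-aliasing stage is applied here, since the code relies
        # on the band-pass filter above to have already limited the bandwidth.)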
67 |         self.filtered = tempData[0:self.size:bp_filter.dec_factor]
68 |         self.new_fs = self.fs/bp_filter.dec_factor
69 |
70 |     """
71 |     Method that uses the pitch values to estimate the number of modulated
72 |     components in the signal.
73 |     """
74 |
75 |     def set_nharm(self, pitch_track, n_harm_max):
76 |
77 |         n_harm = (self.fs/2)/np.amax(pitch_track) - 0.5
78 |         self.n_harm = int(np.floor(min(n_harm, n_harm_max)))
79 |
80 |     """
81 |     Adds zero-mean Gaussian noise to the signal.
82 |     """
83 |
84 |     def noiser(self, pitch_track, SNR):
85 |
86 |         self.clean = np.empty((self.size))
87 |         self.clean[:] = self.data
88 |
89 |         RMS = np.std(self.data[pitch_track > 0])
90 |         noise = np.random.normal(0, RMS/(10**(SNR/20)), self.size)
91 |         self.data += noise
92 |
93 | """
94 | Transforms a raw PCM signal into a float one, with values limited between -1
95 | and 1.
96 | """
97 |
98 | def pcm2float(sig, output_dtype=np.float64):
99 |
100 |     # Make sure it's a NumPy array.
101 |     sig = np.asarray(sig)
102 |
103 |     # Check if it is an array of signed integers.
104 |     assert sig.dtype.kind == 'i', "'sig' must be an array of signed integers!"
105 |     # Set the array output format. Accepts string as input argument for the
106 |     # desired output format (e.g. 'f').
107 |     out_dtype = np.dtype(output_dtype)
108 |
109 |     # Note that 'min' has a greater (by 1) absolute value than 'max'!
110 |     # Therefore, we use 'min' here to avoid clipping.
111 |     return sig.astype(out_dtype) / out_dtype.type(-np.iinfo(sig.dtype).min)
112 |
--------------------------------------------------------------------------------
/amfm_decompy/pYAAPT.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | This is a ported version for Python of the YAAPT algorithm. The original MATLAB
4 | program was written by Hongbing Hu and Stephen A. Zahorian.
5 |
6 | The YAAPT program, designed for fundamental frequency tracking,
7 | is extremely robust for both high quality and telephone speech.
8 |
9 | The YAAPT program was created by the Speech Communication Laboratory of
10 | the State University of New York at Binghamton. The original program is
11 | available at http://www.ws.binghamton.edu/zahorian as free software. Further
12 | information about the program can be found in Stephen A. Zahorian and
13 | Hongbing Hu, "A spectral/temporal method for robust fundamental frequency
14 | tracking," J. Acoust. Soc. Am. 123(6), June 2008.
15 |
16 | It must be noted that, although this ported version is almost identical to the
17 | original, a few changes were made in order to make the program more "pythonic"
18 | and improve its performance. Nevertheless, the results obtained with both
19 | algorithms were similar.
20 |
21 | USAGE:
22 |     pitch = yaapt(signal, <options>)
23 |
24 | INPUTS:
25 |     signal: signal object created by amfm_decompy.basic_tools. For more
26 |     information about its properties, please consult the documentation file.
27 |
28 |     <options>: must be formatted as follows:
29 |                **{'option_name1' : value1, 'option_name2' : value2, ...}
30 |                The default configuration values for all of them are the same as
31 |                in the original version. The main yaapt function in this file
32 |                provides a short description about each option.
33 |                For more information, please refer to the original bibliography.
34 |
35 | OUTPUTS:
36 |     pitch: pitch object. For more information about its properties, please
37 |            consult the documentation file.
38 |
39 | Version 1.0.12
40 | 16/May/2025 Bernardo J.B.
Schmitt - bernardo.jb.schmitt@gmail.com
41 | """
42 |
43 | import numpy as np
44 | import numpy.lib.stride_tricks as stride_tricks
45 | from scipy.signal import firwin, medfilt, lfilter
46 | from scipy.signal.windows import hann, kaiser
47 | import scipy.interpolate as scipy_interp
48 |
49 | import amfm_decompy.basic_tools as basic
50 |
51 |
52 | """
53 | --------------------------------------------
54 | Classes.
55 | --------------------------------------------
56 | """
57 | """
58 | Auxiliary class to handle the class properties.
59 | """
60 | class ClassProperty(object):
61 |
62 |     def __init__(self, initval=None):
63 |         self.val = initval
64 |
65 |     def __get__(self, obj, objtype):
66 |         return self.val
67 |
68 |     def __set__(self, obj, val):
69 |         self.val = val
70 |
71 |
72 | """
73 | Creates a pitch object.
74 | """
75 | class PitchObj(object):
76 |
77 |     PITCH_HALF = ClassProperty(0)
78 |     PITCH_HALF_SENS = ClassProperty(2.9)
79 |     PITCH_DOUBLE = ClassProperty(0)
80 |     PITCH_DOUBLE_SENS = ClassProperty(2.9)
81 |     SMOOTH_FACTOR = ClassProperty(5)
82 |     SMOOTH = ClassProperty(5)
83 |     PTCH_TYP = ClassProperty(100.0)
84 |
85 |     def __init__(self, frame_size, frame_jump, nfft=8192):
86 |
87 |         self.nfft = nfft
88 |         self.frame_size = frame_size
89 |         self.frame_jump = frame_jump
90 |         self.noverlap = self.frame_size-self.frame_jump
91 |
92 |     def set_energy(self, energy, threshold):
93 |         self.mean_energy = np.mean(energy)
94 |         self.energy = energy/self.mean_energy
95 |         self.vuv = (self.energy > threshold)
96 |
97 |     def set_frames_pos(self, frames_pos):
98 |         self.frames_pos = frames_pos
99 |         self.nframes = len(self.frames_pos)
100 |
101 |     def set_values(self, samp_values, file_size, interp_tech='pchip'):
102 |         self.samp_values = samp_values
103 |         self.fix()
104 |         self.values = self.upsample(self.samp_values, file_size, 0, 0,
105 |                                     interp_tech)
106 |         self.edges = self.edges_finder(self.values)
107 |         self.interpolate()
108 |         self.values_interp = self.upsample(self.samp_interp, file_size,
109 |                                            self.samp_interp[0],
110 |                                            self.samp_interp[-1], interp_tech)
111 |
112 |     """
113 |     For the voiced/unvoiced version of the pitch data, finds the n samples where
114 |     the transitions between these two states occur.
115 |     """
116 |     def edges_finder(self, values):
117 |         vec1 = (np.abs(values[1:]+values[:-1]) > 0)
118 |         vec2 = (np.abs(values[1:]*values[:-1]) == 0)
119 |         edges = np.logical_and(vec1, vec2)
120 |         # The previous logical operation detects where voiced/unvoiced transitions
121 |         # occur. Thus, a 'True' in the edges[n] sample indicates that the sample
122 |         # value[n+1] has a different state than value[n] (i.e. if values[n] is
123 |         # voiced, then values[n+1] is unvoiced - and vice-versa). Consequently,
124 |         # the last sample from the edges array will always be 'False' and is not
125 |         # calculated (because "there is no n+1 sample" for it; that's why
126 |         # len(edges) = len(values)-1). However, just for the sake of comprehension
127 |         # (and also to avoid python warnings about array length mismatches), I
128 |         # add a 'False' to the edges array. But in practice, this 'False' is
129 |         # useless.
130 |         edges = np.append(edges, [False])
131 |         index = np.arange(len(values))
132 |         index = index[edges > 0]
133 |         return index.tolist()
134 |
135 |     """
136 |     This method corresponds to the first half of the ptch_fix.m file. It tries
137 |     to fix half pitch and double pitch errors.
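    The correction mode is set by the PITCH_HALF and PITCH_DOUBLE class
    properties: 0 leaves the samples untouched, 1 zeroes the outlier samples
    and 2 doubles (respectively halves) their values. The outlier thresholds
    are derived from PITCH_HALF_SENS and PITCH_DOUBLE_SENS.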
138 |     """
139 |     def fix(self):
140 |         if self.PITCH_HALF > 0:
141 |             nz_pitch = self.samp_values[self.samp_values > 0]
142 |             idx = self.samp_values < (np.mean(nz_pitch)-self.PITCH_HALF_SENS *
143 |                                       np.std(nz_pitch))
144 |             if self.PITCH_HALF == 1:
145 |                 self.samp_values[idx] = 0
146 |             elif self.PITCH_HALF == 2:
147 |                 self.samp_values[idx] = 2*self.samp_values[idx]
148 |
149 |         if self.PITCH_DOUBLE > 0:
150 |             nz_pitch = self.samp_values[self.samp_values > 0]
151 |             idx = self.samp_values > (np.mean(nz_pitch)+self.PITCH_DOUBLE_SENS *
152 |                                       np.std(nz_pitch))
153 |             if self.PITCH_DOUBLE == 1:
154 |                 self.samp_values[idx] = 0
155 |             elif self.PITCH_DOUBLE == 2:
156 |                 self.samp_values[idx] = 0.5*self.samp_values[idx]
157 |
158 |     """
159 |     Corresponds to the second half of the ptch_fix.m file. Creates the
160 |     interpolated pitch data.
161 |     """
162 |     def interpolate(self):
163 |         pitch = np.zeros((self.nframes))
164 |         pitch[:] = self.samp_values
165 |         pitch2 = medfilt(self.samp_values, self.SMOOTH_FACTOR)
166 |
167 |         # This part of the original code is kind of confusing and caused
168 |         # some problems with the extrapolated points before the first
169 |         # voiced frame and after the last voiced frame. So, I made some
170 |         # small modifications in order to make it work better.
171 |         edges = self.edges_finder(pitch)
172 |         first_sample = pitch[0]
173 |         last_sample = pitch[-1]
174 |
175 |         if len(np.nonzero(pitch2)[0]) < 2:
176 |             pitch[pitch == 0] = self.PTCH_TYP
177 |         else:
178 |             nz_pitch = pitch2[pitch2 > 0]
179 |             pitch2 = scipy_interp.pchip(np.nonzero(pitch2)[0],
180 |                                         nz_pitch)(range(self.nframes))
181 |             pitch[pitch == 0] = pitch2[pitch == 0]
182 |         if self.SMOOTH > 0:
183 |             pitch = medfilt(pitch, self.SMOOTH_FACTOR)
184 |         try:
185 |             if first_sample == 0:
186 |                 pitch[:edges[0]-1] = pitch[edges[0]]
187 |             if last_sample == 0:
188 |                 pitch[edges[-1]+1:] = pitch[edges[-1]]
189 |         except IndexError:
190 |             pass
191 |         self.samp_interp = pitch
192 |
193 |     """
194 |     Upsample the pitch data so that its length becomes the same as the speech
195 |     signal.
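    Supported interp_tech values are 'step' (zero-order hold), 'pchip' (the
    default) and 'spline'. For 'pchip' and 'spline', the voiced segments are
    interpolated separately; short segments for which these methods fail fall
    back to linear interpolation or to simple sample repetition.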
196 | """ 197 | def upsample(self, samp_values, file_size, first_samp=0, last_samp=0, 198 | interp_tech='pchip'): 199 | if interp_tech == 'step': 200 | beg_pad = int((self.noverlap)/2) 201 | up_version = np.zeros((file_size)) 202 | up_version[:beg_pad] = first_samp 203 | up_version[beg_pad:beg_pad+self.frame_jump*self.nframes] = \ 204 | np.repeat(samp_values, self.frame_jump) 205 | up_version[beg_pad+self.frame_jump*self.nframes:] = last_samp 206 | 207 | elif interp_tech in ['pchip', 'spline']: 208 | if np.amin(samp_values) > 0: 209 | if interp_tech == 'pchip': 210 | up_version = scipy_interp.pchip(self.frames_pos, 211 | samp_values)(range(file_size)) 212 | 213 | elif interp_tech == 'spline': 214 | tck, u_original = scipy_interp.splprep( 215 | [self.frames_pos, samp_values], 216 | u=self.frames_pos) 217 | up_version = scipy_interp.splev(range(file_size), tck)[1] 218 | else: 219 | beg_pad = int((self.noverlap)/2) 220 | up_version = np.zeros((file_size)) 221 | up_version[:beg_pad] = first_samp 222 | voiced_frames = np.nonzero(samp_values)[0] 223 | 224 | if len(voiced_frames) > 0: 225 | edges = np.nonzero((voiced_frames[1:]-voiced_frames[:-1]) > 1)[0] 226 | edges = np.insert(edges, len(edges), len(voiced_frames)-1) 227 | voiced_frames = np.split(voiced_frames, edges+1)[:-1] 228 | 229 | for frame in voiced_frames: 230 | up_interval = self.frames_pos[frame] 231 | tot_interval = np.arange(int(up_interval[0]-(self.frame_jump/2)), 232 | int(up_interval[-1]+(self.frame_jump/2))) 233 | 234 | if interp_tech == 'pchip' and len(frame) > 2: 235 | up_version[tot_interval] = scipy_interp.pchip( 236 | up_interval, 237 | samp_values[frame])(tot_interval) 238 | 239 | elif interp_tech == 'spline' and len(frame) > 3: 240 | tck, u_original = scipy_interp.splprep( 241 | [up_interval, samp_values[frame]], 242 | u=up_interval) 243 | up_version[tot_interval] = scipy_interp.splev(tot_interval, tck)[1] 244 | 245 | # In case len(frame)==2, above methods fail. 246 | # Therefore, linear interpolation is used instead. 247 | elif len(frame) > 1: 248 | up_version[tot_interval] = scipy_interp.interp1d( 249 | up_interval, 250 | samp_values[frame], 251 | fill_value='extrapolate')(tot_interval) 252 | 253 | elif len(frame) == 1: 254 | up_version[tot_interval] = samp_values[frame] 255 | 256 | 257 | up_version[beg_pad+self.frame_jump*self.nframes:] = last_samp 258 | 259 | return up_version 260 | 261 | """ 262 | Creates a bandpass filter object. 263 | """ 264 | class BandpassFilter(object): 265 | 266 | def __init__(self, fs, parameters): 267 | 268 | fs_min = 1000.0 269 | if (fs > fs_min): 270 | dec_factor = parameters['dec_factor'] 271 | else: 272 | dec_factor = 1 273 | 274 | filter_order = parameters['bp_forder'] 275 | f_hp = parameters['bp_low'] 276 | f_lp = parameters['bp_high'] 277 | 278 | f1 = f_hp/(fs/2) 279 | f2 = f_lp/(fs/2) 280 | 281 | self.b = firwin(filter_order+1, [f1, f2], pass_zero=False) 282 | self.a = 1 283 | self.dec_factor = dec_factor 284 | 285 | 286 | """ 287 | -------------------------------------------- 288 | Main function. 289 | -------------------------------------------- 290 | """ 291 | def yaapt(signal, **kwargs): 292 | 293 | # Rename the YAAPT v4.0 parameter "frame_lengtht" to "tda_frame_length" 294 | # (if provided). 295 | if 'frame_lengtht' in kwargs: 296 | if 'tda_frame_length' in kwargs: 297 | warning_str = 'WARNING: Both "tda_frame_length" and "frame_lengtht" ' 298 | warning_str += 'refer to the same parameter. 
Therefore, the value '
299 |         warning_str += 'of "frame_lengtht" is going to be discarded.'
300 |         print(warning_str)
301 |     else:
302 |         kwargs['tda_frame_length'] = kwargs.pop('frame_lengtht')
303 |
304 |     #---------------------------------------------------------------
305 |     # Set the default values for the parameters.
306 |     #---------------------------------------------------------------
307 |     parameters = {}
308 |     parameters['frame_length'] = kwargs.get('frame_length', 35.0)   #Length of each analysis frame (ms)
309 |     # WARNING: In the original MATLAB YAAPT 4.0 code the next parameter is called
310 |     # "frame_lengtht", which is quite similar to the previous one, "frame_length".
311 |     # Therefore, I've decided to rename it to "tda_frame_length" in order to
312 |     # avoid confusion between them. Nevertheless, both inputs ("frame_lengtht"
313 |     # and "tda_frame_length") are accepted when the function is called.
314 |     parameters['tda_frame_length'] = \
315 |                               kwargs.get('tda_frame_length', 35.0)  #Frame length employed in the time domain analysis (ms)
316 |     parameters['frame_space'] = kwargs.get('frame_space', 10.0)     #Spacing between analysis frames (ms)
317 |     parameters['f0_min'] = kwargs.get('f0_min', 60.0)               #Minimum F0 searched (Hz)
318 |     parameters['f0_max'] = kwargs.get('f0_max', 400.0)              #Maximum F0 searched (Hz)
319 |     parameters['fft_length'] = kwargs.get('fft_length', 8192)       #FFT length
320 |     parameters['bp_forder'] = kwargs.get('bp_forder', 150)          #Order of band-pass filter
321 |     parameters['bp_low'] = kwargs.get('bp_low', 50.0)               #Low frequency of filter passband (Hz)
322 |     parameters['bp_high'] = kwargs.get('bp_high', 1500.0)           #High frequency of filter passband (Hz)
323 |     parameters['nlfer_thresh1'] = kwargs.get('nlfer_thresh1', 0.75) #NLFER boundary for voiced/unvoiced decisions
324 |     parameters['nlfer_thresh2'] = kwargs.get('nlfer_thresh2', 0.1)  #Threshold for NLFER definitely unvoiced
325 |     parameters['shc_numharms'] = kwargs.get('shc_numharms', 3)      #Number of harmonics in SHC calculation
326 |     parameters['shc_window'] = kwargs.get('shc_window', 40.0)       #SHC window length (Hz)
327 |     parameters['shc_maxpeaks'] = kwargs.get('shc_maxpeaks', 4)      #Maximum number of SHC peaks to be found
328 |     parameters['shc_pwidth'] = kwargs.get('shc_pwidth', 50.0)       #Window width in SHC peak picking (Hz)
329 |     parameters['shc_thresh1'] = kwargs.get('shc_thresh1', 5.0)      #Threshold 1 for SHC peak picking
330 |     parameters['shc_thresh2'] = kwargs.get('shc_thresh2', 1.25)     #Threshold 2 for SHC peak picking
331 |     parameters['f0_double'] = kwargs.get('f0_double', 150.0)        #F0 doubling decision threshold (Hz)
332 |     parameters['f0_half'] = kwargs.get('f0_half', 150.0)            #F0 halving decision threshold (Hz)
333 |     parameters['dp5_k1'] = kwargs.get('dp5_k1', 11.0)               #Weight used in dynamic program
334 |     parameters['dec_factor'] = kwargs.get('dec_factor', 1)          #Factor for signal resampling
335 |     parameters['nccf_thresh1'] = kwargs.get('nccf_thresh1', 0.3)    #Threshold for considering a peak in NCCF
336 |     parameters['nccf_thresh2'] = kwargs.get('nccf_thresh2', 0.9)    #Threshold for terminating search in NCCF
337 |     parameters['nccf_maxcands'] = kwargs.get('nccf_maxcands', 3)    #Maximum number of candidates found
338 |     parameters['nccf_pwidth'] = kwargs.get('nccf_pwidth', 5)        #Window width in NCCF peak picking
339 |     parameters['merit_boost'] = kwargs.get('merit_boost', 0.20)     #Boost merit
340 |     parameters['merit_pivot'] = kwargs.get('merit_pivot', 0.99)     #Merit assigned to unvoiced candidates in
341 |                                                                     #definitely unvoiced frames
342 |
    parameters['merit_extra'] = kwargs.get('merit_extra', 0.4)      #Merit assigned to extra candidates
343 |                                                                     #in reducing F0 doubling/halving errors
344 |     parameters['median_value'] = kwargs.get('median_value', 7)      #Order of median filter
345 |     parameters['dp_w1'] = kwargs.get('dp_w1', 0.15)                 #DP weight factor for V-V transitions
346 |     parameters['dp_w2'] = kwargs.get('dp_w2', 0.5)                  #DP weight factor for V-UV or UV-V transitions
347 |     parameters['dp_w3'] = kwargs.get('dp_w3', 0.1)                  #DP weight factor for UV-UV transitions
348 |     parameters['dp_w4'] = kwargs.get('dp_w4', 0.9)                  #Weight factor for local costs
349 |
350 |     # Exclusive to pYAAPT.
351 |
352 |     parameters['spec_pitch_min_std'] = kwargs.get('spec_pitch_min_std', 0.05)
353 |                                                                     #Weight factor that sets a minimum
354 |                                                                     #spectral pitch standard deviation,
355 |                                                                     #which is calculated as
356 |                                                                     #min_std = pitch_avg*spec_pitch_min_std
357 |
358 |     #---------------------------------------------------------------
359 |     # Create the signal objects and filter them.
360 |     #---------------------------------------------------------------
361 |     fir_filter = BandpassFilter(signal.fs, parameters)
362 |     nonlinear_sign = basic.SignalObj(signal.data**2, signal.fs)
363 |
364 |     signal.filtered_version(fir_filter)
365 |     nonlinear_sign.filtered_version(fir_filter)
366 |
367 |     #---------------------------------------------------------------
368 |     # Create the pitch object.
369 |     #---------------------------------------------------------------
370 |     nfft = parameters['fft_length']
371 |     frame_size = int(np.fix(parameters['frame_length']*signal.fs/1000))
372 |     frame_jump = int(np.fix(parameters['frame_space']*signal.fs/1000))
373 |     pitch = PitchObj(frame_size, frame_jump, nfft)
374 |
375 |     assert pitch.frame_size > 15, 'Frame length value {} is too short.'.format(pitch.frame_size)
376 |     assert pitch.frame_size < 2048, 'Frame length value {} exceeds the limit.'.format(pitch.frame_size)
377 |
378 |
379 |     #---------------------------------------------------------------
380 |     # Calculate NLFER and determine voiced/unvoiced frames.
381 |     #---------------------------------------------------------------
382 |     nlfer(signal, pitch, parameters)
383 |
384 |     #---------------------------------------------------------------
385 |     # Calculate an approximate pitch track from the spectrum.
386 |     #---------------------------------------------------------------
387 |     spec_pitch, pitch_std = spec_track(nonlinear_sign, pitch, parameters)
388 |
389 |     #---------------------------------------------------------------
390 |     # Temporal pitch tracking based on NCCF.
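    # (NCCF = normalized cross-correlation function, computed frame by frame
    # by crs_corr over a lag range derived from the spectral pitch track.)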
391 |     #---------------------------------------------------------------
392 |     time_pitch1, time_merit1 = time_track(signal, spec_pitch, pitch_std, pitch,
393 |                                           parameters)
394 |
395 |     time_pitch2, time_merit2 = time_track(nonlinear_sign, spec_pitch, pitch_std,
396 |                                           pitch, parameters)
397 |
398 |     # Added in YAAPT 4.0
399 |     if time_pitch1.shape[1] < len(spec_pitch):
400 |         len_time = time_pitch1.shape[1]
401 |         len_spec = len(spec_pitch)
402 |         time_pitch1 = np.concatenate((time_pitch1, np.zeros((3,len_spec-len_time),
403 |                                       dtype=time_pitch1.dtype)),axis=1)
404 |         time_pitch2 = np.concatenate((time_pitch2, np.zeros((3,len_spec-len_time),
405 |                                       dtype=time_pitch2.dtype)),axis=1)
406 |         time_merit1 = np.concatenate((time_merit1, np.zeros((3,len_spec-len_time),
407 |                                       dtype=time_merit1.dtype)),axis=1)
408 |         time_merit2 = np.concatenate((time_merit2, np.zeros((3,len_spec-len_time),
409 |                                       dtype=time_merit2.dtype)),axis=1)
410 |
411 |     #---------------------------------------------------------------
412 |     # Refine pitch candidates.
413 |     #---------------------------------------------------------------
414 |     ref_pitch, ref_merit = refine(time_pitch1, time_merit1, time_pitch2,
415 |                                   time_merit2, spec_pitch, pitch, parameters)
416 |
417 |     #---------------------------------------------------------------
418 |     # Use dynamic programming to determine the final pitch.
419 |     #---------------------------------------------------------------
420 |     final_pitch = dynamic(ref_pitch, ref_merit, pitch, parameters)
421 |
422 |     pitch.set_values(final_pitch, signal.size)
423 |
424 |     return pitch
425 |
426 |
427 | """
428 | --------------------------------------------
429 | Side functions.
430 | --------------------------------------------
431 | """
432 |
433 | """
434 | Normalized Low Frequency Energy Ratio function. Corresponds to the nlfer.m file,
435 | but instead of returning the results to the main function, encapsulates them in
436 | the pitch object.
437 | """
438 | def nlfer(signal, pitch, parameters):
439 |
440 |     #---------------------------------------------------------------
441 |     # Set parameters.
442 |     #---------------------------------------------------------------
443 |     N_f0_min = np.around((parameters['f0_min']*2/float(signal.new_fs))*pitch.nfft)
444 |     N_f0_max = np.around((parameters['f0_max']/float(signal.new_fs))*pitch.nfft)
445 |
446 |     window = hann(pitch.frame_size+2)[1:-1]
447 |     data = np.zeros((signal.size))  #Needs other array, otherwise stride and
448 |     data[:] = signal.filtered       #windowing will modify signal.filtered
449 |
450 |     #---------------------------------------------------------------
451 |     # Main routine.
452 |     #---------------------------------------------------------------
453 |     samples = np.arange(int(np.fix(float(pitch.frame_size)/2)),
454 |                         signal.size-int(np.fix(float(pitch.frame_size)/2)),
455 |                         pitch.frame_jump)
456 |
457 |     data_matrix = np.empty((len(samples), pitch.frame_size))
458 |     data_matrix[:, :] = stride_matrix(data, len(samples),
459 |                                       pitch.frame_size, pitch.frame_jump)
460 |     data_matrix *= window
461 |
462 |     specData = np.fft.rfft(data_matrix, pitch.nfft)
463 |
464 |     frame_energy = np.abs(specData[:, int(N_f0_min-1):int(N_f0_max)]).sum(axis=1)
465 |     pitch.set_energy(frame_energy, parameters['nlfer_thresh1'])
466 |     pitch.set_frames_pos(samples)
467 |
468 | """
469 | Spectral pitch tracking. Computes estimates of pitch using nonlinearly processed
470 | speech (typically square or absolute value) and frequency domain processing.
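The spectral measure employed is the SHC (Spectral Harmonics Correlation),
whose window length, harmonic count, peak count and thresholds are set by the
shc_* parameters.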
471 | Search for frequencies which have energy at multiples of that frequency.
472 | Corresponds to the spec_trk.m file.
473 | """
474 | def spec_track(signal, pitch, parameters):
475 |
476 |     #---------------------------------------------------------------
477 |     # Set parameters.
478 |     #---------------------------------------------------------------
479 |     nframe_size = pitch.frame_size*2
480 |     maxpeaks = parameters['shc_maxpeaks']
481 |     delta = signal.new_fs/pitch.nfft
482 |
483 |     window_length = int(np.fix(parameters['shc_window']/delta))
484 |     half_window_length = int(np.fix(float(window_length)/2))
485 |     if not(window_length % 2):
486 |         window_length += 1
487 |
488 |     max_SHC = int(np.fix((parameters['f0_max']+parameters['shc_pwidth']*2)/delta))
489 |     min_SHC = int(np.ceil(parameters['f0_min']/delta))
490 |     num_harmonics = parameters['shc_numharms']
491 |
492 |     #---------------------------------------------------------------
493 |     # Main routine.
494 |     #---------------------------------------------------------------
495 |     cand_pitch = np.zeros((maxpeaks, pitch.nframes))
496 |     cand_merit = np.ones((maxpeaks, pitch.nframes))
497 |
498 |     data = np.append(signal.filtered,
499 |                      np.zeros((1, nframe_size +
500 |                               ((pitch.nframes-1)*pitch.frame_jump-signal.size))))
501 |
502 |     #Compute the SHC for the voiced frames.
503 |     window = kaiser(nframe_size, 0.5)
504 |     SHC = np.zeros((max_SHC))
505 |     row_mat_list = np.array([np.empty((max_SHC-min_SHC+1, window_length))
506 |                             for x in range(num_harmonics+1)])
507 |
508 |     magnitude = np.zeros(int((half_window_length+(pitch.nfft/2)+1)))
509 |
510 |     for frame in np.where(pitch.vuv)[0].tolist():
511 |         fir_step = frame*pitch.frame_jump
512 |
513 |         data_slice = data[fir_step:fir_step+nframe_size]*window
514 |         data_slice -= np.mean(data_slice)
515 |
516 |         magnitude[half_window_length:] = np.abs(np.fft.rfft(data_slice,
517 |                                                 pitch.nfft))
518 |
519 |         for idx,row_mat in enumerate(row_mat_list):
520 |             row_mat[:, :] = stride_matrix(magnitude[min_SHC*(idx+1):],
521 |                                           max_SHC-min_SHC+1,
522 |                                           window_length, idx+1)
523 |         SHC[min_SHC-1:max_SHC] = np.sum(np.prod(row_mat_list,axis=0),axis=1)
524 |
525 |         cand_pitch[:, frame], cand_merit[:, frame] = \
526 |             peaks(SHC, delta, maxpeaks, parameters)
527 |
528 |     #Extract the pitch candidates of the voiced frames for the future pitch selection.
529 |     spec_pitch = cand_pitch[0, :]
530 |     voiced_cand_pitch = cand_pitch[:, cand_pitch[0, :] > 0]
531 |     voiced_cand_merit = cand_merit[:, cand_pitch[0, :] > 0]
532 |     num_voiced_cand = len(voiced_cand_pitch[0, :])
533 |     avg_voiced = np.mean(voiced_cand_pitch[0, :])
534 |     std_voiced = np.std(voiced_cand_pitch[0, :])
535 |
536 |     #Interpolation of the weighted candidates.
537 |     delta1 = abs((voiced_cand_pitch - 0.8*avg_voiced))*(3-voiced_cand_merit)
538 |     index = delta1.argmin(0)
539 |
540 |     voiced_peak_minmrt = voiced_cand_pitch[index, range(num_voiced_cand)]
541 |     voiced_merit_minmrt = voiced_cand_merit[index, range(num_voiced_cand)]
542 |
543 |     voiced_peak_minmrt = medfilt(voiced_peak_minmrt,
544 |                                  max(1, parameters['median_value']-2))
545 |
546 |     #Replace the lowest merit candidates by the median smoothed ones
547 |     #computed from the highest merit peaks above.
548 |     voiced_cand_pitch[index, range(num_voiced_cand)] = voiced_peak_minmrt
549 |     voiced_cand_merit[index, range(num_voiced_cand)] = voiced_merit_minmrt
550 |
551 |     #Use dynamic programming to find the best overall path among the pitch
552 |     #candidates. The dynamic weight for transition costs balances between
553 |     #local and transition costs.
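    # (The transition weight below scales with the coefficient of variation,
    # std_voiced/avg_voiced, of the voiced pitch candidates.)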
554 | weight_trans = parameters['dp5_k1']*std_voiced/avg_voiced 555 | 556 | if num_voiced_cand > 2: 557 | voiced_pitch = dynamic5(voiced_cand_pitch, voiced_cand_merit, 558 | weight_trans, parameters['f0_min']) 559 | voiced_pitch = medfilt(voiced_pitch, max(1, parameters['median_value']-2)) 560 | 561 | else: 562 | if num_voiced_cand > 0: 563 | voiced_pitch = (np.ones((num_voiced_cand)))*150.0 564 | else: 565 | voiced_pitch = np.array([150.0]) 566 | cand_pitch[0, 0] = 0 567 | 568 | pitch_avg = np.mean(voiced_pitch) 569 | pitch_std = np.maximum(np.std(voiced_pitch), pitch_avg*parameters['spec_pitch_min_std']) 570 | spec_pitch[cand_pitch[0, :] > 0] = voiced_pitch[:] 571 | 572 | if (spec_pitch[0] < pitch_avg/2): 573 | spec_pitch[0] = pitch_avg 574 | 575 | if (spec_pitch[-1] < pitch_avg/2): 576 | spec_pitch[-1] = pitch_avg 577 | 578 | spec_voiced = np.array(np.nonzero(spec_pitch)[0]) 579 | spec_pitch = scipy_interp.pchip(spec_voiced, 580 | spec_pitch[spec_voiced])(range(pitch.nframes)) 581 | 582 | spec_pitch = lfilter(np.ones((3))/3, 1.0, spec_pitch) 583 | 584 | spec_pitch[0] = spec_pitch[2] 585 | spec_pitch[1] = spec_pitch[3] 586 | 587 | return spec_pitch, pitch_std 588 | 589 | """ 590 | Temporal pitch tracking. 591 | Corresponds to the tm_trk.m file. 592 | """ 593 | def time_track(signal, spec_pitch, pitch_std, pitch, parameters): 594 | 595 | #--------------------------------------------------------------- 596 | # Set parameters. 597 | #--------------------------------------------------------------- 598 | tda_frame_length = int(parameters['tda_frame_length']*signal.fs/1000) 599 | tda_noverlap = tda_frame_length-pitch.frame_jump 600 | tda_nframes = int((len(signal.data)-tda_noverlap)/pitch.frame_jump) 601 | 602 | len_spectral = len(spec_pitch) 603 | if tda_nframes < len_spectral: 604 | spec_pitch = spec_pitch[:tda_nframes] 605 | elif tda_nframes > len_spectral: 606 | tda_nframes = len_spectral 607 | 608 | merit_boost = parameters['merit_boost'] 609 | maxcands = parameters['nccf_maxcands'] 610 | freq_thresh = 5.0*pitch_std 611 | 612 | spec_range = np.maximum(spec_pitch-2.0*pitch_std, parameters['f0_min']) 613 | spec_range = np.vstack((spec_range, 614 | np.minimum(spec_pitch+2.0*pitch_std, parameters['f0_max']))) 615 | 616 | time_pitch = np.zeros((maxcands, tda_nframes)) 617 | time_merit = np.zeros((maxcands, tda_nframes)) 618 | 619 | #--------------------------------------------------------------- 620 | # Main routine. 
621 | #--------------------------------------------------------------- 622 | data = np.zeros((signal.size)) #Needs other array, otherwise stride and 623 | data[:] = signal.filtered #windowing will modify signal.filtered 624 | signal_frames = stride_matrix(data, tda_nframes,tda_frame_length, 625 | pitch.frame_jump) 626 | for frame in range(tda_nframes): 627 | lag_min0 = (int(np.fix(signal.new_fs/spec_range[1, frame])) - 628 | int(np.fix(parameters['nccf_pwidth']/2.0))) 629 | lag_max0 = (int(np.fix(signal.new_fs/spec_range[0, frame])) + 630 | int(np.fix(parameters['nccf_pwidth']/2.0))) 631 | 632 | phi = crs_corr(signal_frames[frame, :], lag_min0, lag_max0) 633 | time_pitch[:, frame], time_merit[:, frame] = \ 634 | cmp_rate(phi, signal.new_fs, maxcands, lag_min0, lag_max0, parameters) 635 | 636 | diff = np.abs(time_pitch - spec_pitch) 637 | match1 = (diff < freq_thresh) 638 | match = ((1 - diff/freq_thresh) * match1) 639 | time_merit = (((1+merit_boost)*time_merit) * match) 640 | 641 | return time_pitch, time_merit 642 | 643 | """ 644 | Refines pitch candidates obtained from NCCF using spectral pitch track and 645 | NLFER energy information. 646 | Corresponds to the refine.m file. 647 | """ 648 | def refine(time_pitch1, time_merit1, time_pitch2, time_merit2, spec_pitch, 649 | pitch, parameters): 650 | 651 | #--------------------------------------------------------------- 652 | # Set parameters. 653 | #--------------------------------------------------------------- 654 | nlfer_thresh2 = parameters['nlfer_thresh2'] 655 | merit_pivot = parameters['merit_pivot'] 656 | 657 | #--------------------------------------------------------------- 658 | # Main routine. 659 | #--------------------------------------------------------------- 660 | time_pitch = np.append(time_pitch1, time_pitch2, 0) 661 | time_merit = np.append(time_merit1, time_merit2, 0) 662 | maxcands = time_pitch.shape[0] 663 | 664 | idx = np.argsort(-time_merit, axis=0) 665 | time_merit.sort(axis=0) 666 | time_merit[:, :] = time_merit[::-1,:] 667 | 668 | time_pitch = time_pitch[idx, range(pitch.nframes)] 669 | 670 | best_pitch = medfilt(time_pitch[0, :], parameters['median_value'])*pitch.vuv 671 | 672 | idx1 = pitch.energy <= nlfer_thresh2 673 | idx2 = (pitch.energy > nlfer_thresh2) & (time_pitch[0, :] > 0) 674 | idx3 = (pitch.energy > nlfer_thresh2) & (time_pitch[0, :] <= 0) 675 | merit_mat = (time_pitch[1:maxcands-1, :] == 0) & idx2 676 | merit_mat = np.insert(merit_mat, [0, maxcands-2], 677 | np.zeros((1, pitch.nframes), dtype=bool), 0) 678 | 679 | time_pitch[:, idx1] = 0 680 | time_merit[:, idx1] = merit_pivot 681 | 682 | time_pitch[maxcands-1, idx2] = 0.0 683 | time_merit[maxcands-1, idx2] = 1.0-time_merit[0, idx2] 684 | time_merit[merit_mat] = 0.0 685 | 686 | time_pitch[0, idx3] = spec_pitch[idx3] 687 | time_merit[0, idx3] = np.minimum(1, pitch.energy[idx3]/2.0) 688 | time_pitch[1:maxcands, idx3] = 0.0 689 | time_merit[1:maxcands, idx3] = 1.0-time_merit[0, idx3] 690 | 691 | time_pitch[maxcands-2, :] = best_pitch 692 | non_zero_frames = best_pitch > 0.0 693 | time_merit[maxcands-2, non_zero_frames] = time_merit[0, non_zero_frames] 694 | time_merit[maxcands-2, ~(non_zero_frames)] = 1.0-np.minimum(1, 695 | pitch.energy[~(non_zero_frames)]/2.0) 696 | 697 | time_pitch[maxcands-3, :] = spec_pitch 698 | time_merit[maxcands-3, :] = pitch.energy/5.0 699 | 700 | return time_pitch, time_merit 701 | 702 | 703 | """ 704 | Dynamic programming used to compute local and transition cost matrices, 705 | enabling the lowest cost tracking of pitch 
candidates.
706 | It uses the NLFER from the spectrogram and the highly robust spectral F0 track,
707 | plus the merits, for the computation of the cost matrices.
708 | Corresponds to the dynamic.m file.
709 | """
710 | def dynamic(ref_pitch, ref_merit, pitch, parameters):
711 |
712 | #---------------------------------------------------------------
713 | # Set parameters.
714 | #---------------------------------------------------------------
715 | num_cands = ref_pitch.shape[0]
716 | best_pitch = ref_pitch[num_cands-2, :]
717 | mean_pitch = np.mean(best_pitch[best_pitch > 0])
718 |
719 | dp_w1 = parameters['dp_w1']
720 | dp_w2 = parameters['dp_w2']
721 | dp_w3 = parameters['dp_w3']
722 | dp_w4 = parameters['dp_w4']
723 |
724 | #---------------------------------------------------------------
725 | # Main routine.
726 | #---------------------------------------------------------------
727 | local_cost = 1 - ref_merit
728 | trans_cmatrix = np.ones((num_cands, num_cands, pitch.nframes))
729 |
730 | ref_mat1 = np.zeros((num_cands, num_cands, pitch.nframes))
731 | ref_mat2 = np.zeros((num_cands, num_cands, pitch.nframes))
732 | idx_mat1 = np.zeros((num_cands, num_cands, pitch.nframes), dtype=bool)
733 | idx_mat2 = np.zeros((num_cands, num_cands, pitch.nframes), dtype=bool)
734 | idx_mat3 = np.zeros((num_cands, num_cands, pitch.nframes), dtype=bool)
735 |
736 | ref_mat1[:, :, 1:] = np.tile(ref_pitch[:, 1:].reshape(1, num_cands,
737 | pitch.nframes-1), (num_cands, 1, 1))
738 | ref_mat2[:, :, 1:] = np.tile(ref_pitch[:, :-1].reshape(num_cands, 1,
739 | pitch.nframes-1), (1, num_cands, 1))
740 |
741 | idx_mat1[:, :, 1:] = (ref_mat1[:, :, 1:] > 0) & (ref_mat2[:, :, 1:] > 0)
742 | idx_mat2[:, :, 1:] = (((ref_mat1[:, :, 1:] == 0) & (ref_mat2[:, :, 1:] > 0)) |
743 | ((ref_mat1[:, :, 1:] > 0) & (ref_mat2[:, :, 1:] == 0)))
744 | idx_mat3[:, :, 1:] = (ref_mat1[:, :, 1:] == 0) & (ref_mat2[:, :, 1:] == 0)
745 |
746 | mat1_values = np.abs(ref_mat1-ref_mat2)/mean_pitch
747 | benefit2 = np.insert(np.minimum(1, abs(pitch.energy[:-1]-pitch.energy[1:])),
748 | 0, 0)
749 | benefit2 = np.tile(benefit2, (num_cands, num_cands, 1))
750 |
751 | trans_cmatrix[idx_mat1] = dp_w1*mat1_values[idx_mat1]
752 | trans_cmatrix[idx_mat2] = dp_w2*(1-benefit2[idx_mat2])
753 | trans_cmatrix[idx_mat3] = dp_w3
754 |
755 | trans_cmatrix = trans_cmatrix/dp_w4
756 | path = path1(local_cost, trans_cmatrix, num_cands, pitch.nframes)
757 | final_pitch = ref_pitch[path, range(pitch.nframes)]
758 |
759 | return final_pitch
760 |
761 | """
762 | --------------------------------------------
763 | Auxiliary functions.
764 | --------------------------------------------
765 | """
766 |
767 | """
768 | Computes the pitch candidates and the corresponding merit values associated
769 | with the peaks found in each frame of a frequency-domain function.
770 | Corresponds to the peaks.m file.
771 | """
772 | def peaks(data, delta, maxpeaks, parameters):
773 |
774 | #---------------------------------------------------------------
775 | # Set parameters.
776 | #---------------------------------------------------------------
777 | PEAK_THRESH1 = parameters['shc_thresh1']
778 | PEAK_THRESH2 = parameters['shc_thresh2']
779 |
780 | epsilon = 1e-14
781 |
782 | width = int(np.fix(parameters['shc_pwidth']/delta))
783 | if not (width % 2):
784 | width = width + 1
785 |
786 | center = int(np.ceil(width/2))
787 |
788 | min_lag = int(np.fix(parameters['f0_min']/delta - center))
789 | max_lag = int(np.fix(parameters['f0_max']/delta + center))
790 |
791 | if (min_lag < 1):
792 | min_lag = 1
793 | print('Min_lag is too low and was adjusted ({}).'.format(min_lag))
794 |
795 | if max_lag > (len(data) - width):
796 | max_lag = len(data) - width
797 | print('Max_lag is too high and was adjusted ({}).'.format(max_lag))
798 |
799 | pitch = np.zeros((maxpeaks))
800 | merit = np.zeros((maxpeaks))
801 |
802 | #---------------------------------------------------------------
803 | # Main routine.
804 | #---------------------------------------------------------------
805 | max_data = max(data[min_lag:max_lag+1])
806 |
807 | if (max_data > epsilon):
808 | data = data/max_data
809 |
810 | avg_data = np.mean(data[min_lag:max_lag+1])
811 |
812 | if (avg_data > 1/PEAK_THRESH1):
813 | pitch = np.zeros((maxpeaks))
814 | merit = np.ones((maxpeaks))
815 | return pitch, merit
816 |
817 | #---------------------------------------------------------------
818 | # Step 1 (this step was implemented differently from the original version).
819 | #---------------------------------------------------------------
820 | numpeaks = 0
821 | vec_back = (data[min_lag+center+1:max_lag-center+1] >
822 | data[min_lag+center:max_lag-center])
823 | vec_forw = (data[min_lag+center+1:max_lag-center+1] >
824 | data[min_lag+center+2:max_lag-center+2])
825 | above_thresh = (data[min_lag+center+1:max_lag-center+1] >
826 | PEAK_THRESH2*avg_data)
827 | peaks = np.logical_and(np.logical_and(vec_back, vec_forw), above_thresh)
828 |
829 | for n in (peaks.ravel().nonzero()[0]+min_lag+center+1).tolist():
830 | if np.argmax(data[n-center:n+center+1]) == center:
831 | if numpeaks >= maxpeaks:
832 | pitch = np.append(pitch, np.zeros((1)))
833 | merit = np.append(merit, np.zeros((1)))
834 |
835 | pitch[numpeaks] = float(n)*delta
836 | merit[numpeaks] = data[n]
837 | numpeaks += 1
838 |
839 | #---------------------------------------------------------------
840 | # Step 2
841 | #---------------------------------------------------------------
842 | if (max(merit)/avg_data < PEAK_THRESH1):
843 | pitch = np.zeros((maxpeaks))
844 | merit = np.ones((maxpeaks))
845 | return pitch, merit
846 |
847 | #---------------------------------------------------------------
848 | # Step 3
849 | #---------------------------------------------------------------
850 | idx = (-merit).ravel().argsort().tolist()
851 | merit = merit[idx]
852 | pitch = pitch[idx]
853 |
854 | numpeaks = min(numpeaks, maxpeaks)
855 | pitch = np.append(pitch[:numpeaks], np.zeros((maxpeaks-numpeaks)))
856 | merit = np.append(merit[:numpeaks], np.zeros((maxpeaks-numpeaks)))
857 |
858 | #---------------------------------------------------------------
859 | # Step 4
860 | #---------------------------------------------------------------
861 |
862 | if (0 < numpeaks < maxpeaks):
863 | pitch[numpeaks:maxpeaks] = pitch[0]
864 | merit[numpeaks:maxpeaks] = merit[0]
865 |
866 | else:
867 | pitch = np.zeros((maxpeaks))
868 | merit = np.ones((maxpeaks))
869 |
870 | return np.transpose(pitch), np.transpose(merit)
871 |
872 | """
873 | Dynamic programming used to compute local
and transition cost matrices,
874 | enabling the lowest cost tracking of pitch candidates.
875 | It uses the NLFER from the spectrogram and the highly robust spectral F0 track,
876 | plus the merits, for the computation of the cost matrices.
877 | Corresponds to the dynamic5.m file.
878 | """
879 | def dynamic5(pitch_array, merit_array, k1, f0_min):
880 |
881 | num_cand = pitch_array.shape[0]
882 | num_frames = pitch_array.shape[1]
883 |
884 | local = 1-merit_array
885 | trans = np.zeros((num_cand, num_cand, num_frames))
886 |
887 | trans[:, :, 1:] = abs(pitch_array[:, 1:].reshape(1, num_cand, num_frames-1) -
888 | pitch_array[:, :-1].reshape(num_cand, 1, num_frames-1))/f0_min
889 | trans[:, :, 1:] = 0.05*trans[:, :, 1:] + trans[:, :, 1:]**2
890 |
891 | trans = k1*trans
892 | path = path1(local, trans, num_cand, num_frames)
893 |
894 | final_pitch = pitch_array[path, range(num_frames)]
895 |
896 | return final_pitch
897 |
898 | """
899 | Finds the optimal path with the lowest cost, given two matrices (the local
900 | cost matrix and the transition cost matrix).
901 | Corresponds to the path1.m file.
902 | """
903 | def path1(local, trans, n_lin, n_col):
904 |
905 | # Apparently the following lines are useless.
906 | # Therefore, I removed them in version 1.0.3.
907 |
908 | # if n_lin >= 100:
909 | # print 'Stop in Dynamic due to M>100'
910 | # raise KeyboardInterrupt
911 | #
912 | # if n_col >= 1000:
913 | # print 'Stop in Dynamic due to N>1000'
914 | # raise KeyboardInterrupt
915 |
916 | PRED = np.zeros((n_lin, n_col), dtype=int)
917 | P = np.ones((n_col), dtype=int)
918 | p_small = np.zeros((n_col), dtype=int)
919 |
920 | PCOST = np.zeros((n_lin))
921 | CCOST = np.zeros((n_lin))
922 | PCOST = local[:, 0]
923 |
924 | for I in range(1, n_col):
925 |
926 | aux_matrix = PCOST+np.transpose(trans[:, :, I])
927 | K = n_lin-np.argmin(aux_matrix[:, ::-1], axis=1)-1
928 | PRED[:, I] = K
929 | CCOST = PCOST[K]+trans[K, range(n_lin), I]
930 |
931 | assert (CCOST < 1.0E+30).all(), 'CCOST>1.0E+30, Stop in Dynamic'
932 | CCOST = CCOST+local[:, I]
933 |
934 | PCOST[:] = CCOST
935 | J = n_lin - np.argmin(CCOST[::-1])-1
936 | p_small[I] = J
937 |
938 | P[-1] = p_small[-1]
939 |
940 | for I in range(n_col-2, -1, -1):
941 | P[I] = PRED[P[I+1], I+1]
942 |
943 | return P
944 |
945 | """
946 | Computes the NCCF (Normalized Cross Correlation Function) sequence based on
947 | the RAPT algorithm described by David Talkin.
948 | Corresponds to the crs_corr.m file.
949 | """
950 | def crs_corr(data, lag_min, lag_max):
951 |
952 | eps1 = 0.0
953 | data_len = len(data)
954 | N = data_len-lag_max
955 |
956 | error_str = 'ERROR: Negative index in the cross correlation calculation of '
957 | error_str += 'the pYAAPT time domain analysis. Please try to increase the '
958 | error_str += 'value of the "tda_frame_length" parameter.'
959 | assert N > 0, error_str
960 |
961 | phi = np.zeros((data_len))
962 | data -= np.mean(data)
963 | x_j = data[0:N]
964 | x_jr = data[lag_min:lag_max+N]
965 | p = np.dot(x_j, x_j)
966 |
967 | x_jr_matrix = stride_matrix(x_jr, lag_max-lag_min, N, 1)
968 |
969 | formula_nume = np.dot(x_jr_matrix, x_j)
970 | formula_denom = np.sum(x_jr_matrix*x_jr_matrix, axis=1)*p + eps1
971 |
972 | phi[lag_min:lag_max] = formula_nume/np.sqrt(formula_denom)
973 |
974 | return phi
975 |
976 | """
977 | Computes pitch estimates and the corresponding merit values associated with the
978 | peaks found in each frame based on the correlation sequence.
979 | Corresponds to the cmp_rate.m file.
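Each pitch candidate is derived from a peak lag of the NCCF sequence as
fs/(lag+1), and its merit is the NCCF value at that peak.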
980 | """ 981 | def cmp_rate(phi, fs, maxcands, lag_min, lag_max, parameters): 982 | 983 | #--------------------------------------------------------------- 984 | # Set parameters. 985 | #--------------------------------------------------------------- 986 | width = parameters['nccf_pwidth'] 987 | center = int(np.fix(width/2.0)) 988 | merit_thresh1 = parameters['nccf_thresh1'] 989 | merit_thresh2 = parameters['nccf_thresh2'] 990 | 991 | numpeaks = 0 992 | pitch = np.zeros((maxcands)) 993 | merit = np.zeros((maxcands)) 994 | 995 | #--------------------------------------------------------------- 996 | # Main routine. 997 | #(this step was implemented differently than in original version) 998 | #--------------------------------------------------------------- 999 | vec_back = (phi[lag_min+center:lag_max-center+1] > 1000 | phi[lag_min+center-1:lag_max-center]) 1001 | vec_forw = (phi[lag_min+center:lag_max-center+1] > 1002 | phi[lag_min+center+1:lag_max-center+2]) 1003 | above_thresh = phi[lag_min+center:lag_max-center+1] > merit_thresh1 1004 | peaks = np.logical_and(np.logical_and(vec_back, vec_forw), above_thresh) 1005 | 1006 | peaks = (peaks.ravel().nonzero()[0]+lag_min+center).tolist() 1007 | 1008 | if np.amax(phi) > merit_thresh2 and len(peaks) > 0: 1009 | max_point = peaks[np.argmax(phi[peaks])] 1010 | pitch[numpeaks] = fs/float(max_point+1) 1011 | merit[numpeaks] = np.amax(phi[peaks]) 1012 | numpeaks += 1 1013 | else: 1014 | for n in peaks: 1015 | if np.argmax(phi[n-center:n+center+1]) == center: 1016 | try: 1017 | pitch[numpeaks] = fs/float(n+1) 1018 | merit[numpeaks] = phi[n] 1019 | except: 1020 | pitch = np.hstack((pitch, fs/float(n+1))) 1021 | merit = np.hstack((merit, phi[n])) 1022 | numpeaks += 1 1023 | 1024 | #--------------------------------------------------------------- 1025 | # Sort the results. 1026 | #--------------------------------------------------------------- 1027 | idx = (-merit).ravel().argsort().tolist() 1028 | merit = merit[idx[:maxcands]] 1029 | pitch = pitch[idx[:maxcands]] 1030 | 1031 | if (np.amax(merit) > 1.0): 1032 | merit = merit/np.amax(merit) 1033 | 1034 | return pitch, merit 1035 | 1036 | """ 1037 | -------------------------------------------- 1038 | Extra functions. 1039 | -------------------------------------------- 1040 | """ 1041 | 1042 | def stride_matrix(vector, n_lin, n_col, hop): 1043 | 1044 | data_matrix = stride_tricks.as_strided(vector, shape=(n_lin, n_col), 1045 | strides=(vector.strides[0]*hop, vector.strides[0])) 1046 | 1047 | return data_matrix 1048 | -------------------------------------------------------------------------------- /amfm_decompy/pyQHM.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This package provides the tools necessary to decompose the voiced part of a 4 | speech signal into its modulated components, aka AM-FM decomposition. This 5 | designation is used due the fact that, in this method, the signal is modeled as 6 | a sum of amplitude- and frequency-modulated components. The goal is to overcome 7 | the drawbacks from Fourier-alike techniques, e.g. SFFT, wavelets, etc, which 8 | are limited in the time-frequency analysis by the so-called Heisenberg-Gabor 9 | inequality. 10 | 11 | The algorithms here implemented were the QHM (Quasi-Harmonic Model), and its 12 | upgrades, aQHM (adaptive Quasi-Harmonic Model) and eaQHM (extended adaptive 13 | Quasi-Harmonic Model). Their formulation can be found at references [1-3]. 
14 |
15 | USAGE:
16 | Please refer to the documentation for examples.
17 |
18 | References:
19 | [1] Y. Pantazis, “Decomposition of AM-FM signals with applications in
20 | speech processing”, PhD Thesis, University of Crete, 2010.
21 |
22 | [2] Y. Pantazis, O. Rosec and Y. Stylianou, “Adaptive AM-FM signal
23 | decomposition with application to speech analysis”, IEEE Transactions
24 | on Audio, Speech and Language Processing, vol. 19, no. 2, 2011.
25 |
26 | [3] G. P. Kafentzis, Y. Pantazis, O. Rosec and Y. Stylianou, “An extension
27 | of the adaptive quasi-harmonic model”, in IEEE International Conference
28 | on Acoustics, Speech and Signal Processing (ICASSP), 2012.
29 |
30 | Version 1.0.12
31 | 16/May/2025 Bernardo J.B. Schmitt - bernardo.jb.schmitt@gmail.com
32 | """
33 |
34 | import numpy as np
35 | import scipy
36 |
37 | """
38 | --------------------------------------------
39 | Classes.
40 | --------------------------------------------
41 | """
42 |
43 | """
44 | Creates a single component object.
45 | """
46 |
47 | class ComponentObj(object):
48 |
49 | def __init__(self, H, harm):
50 | self.mag = H[harm, 0, :]
51 | self.phase = H[harm, 1, :]
52 | self.freq = H[harm, 2, :]
53 |
54 | """
55 | Synthesizes the modulated component using the extracted magnitude and
56 | phase.
57 | """
58 |
59 | def synthesize(self):
60 | self.signal = 2*self.mag*np.cos(self.phase)
61 |
62 |
63 | """
64 | Creates the output signal object (which, in turn, is formed by n_harm
65 | modulated components).
66 | """
67 |
68 | class ModulatedSign(object):
69 |
70 | def __init__(self, n_harm, file_size, fs, phase_tech='phase'):
71 | self.n_harm = n_harm
72 | self.size = file_size
73 | self.fs = fs
74 | self.H = np.zeros((self.n_harm, 3, self.size))
75 | self.harmonics = [ComponentObj(self.H, i) for i in range(self.n_harm)]
76 | self.error = np.zeros(self.size)
77 | self.phase_tech = phase_tech
78 |
79 | """
80 | Updates the 3-dimensional array H, which stores the magnitude, phase and
81 | frequency values of all components. Its first dimension refers to the
82 | n_harm components, the second to the three composing parameters (where 0
83 | stands for the magnitude, 1 for the phase and 2 for the frequency) and the
84 | third dimension to the temporal axis.
85 | """
86 |
87 | def update_values(self, a, freq, frame):
88 | self.H[:, 0, frame] = np.abs(a)
89 | self.H[:, 1, frame] = np.angle(a)
90 | self.H[:, 2, frame] = freq
91 |
92 | """
93 | Interpolates the parameter values when the extraction is not performed
94 | sample-by-sample. While the interpolation of the magnitude and frequency
95 | is straightforward, the phase interpolation is not. References [1,2]
96 | present a solution to this problem.
97 | """
98 |
99 | def interpolate_samp(self, samp_frames, pitch_track):
100 |
101 | # Interpolation of the magnitude and frequency.
102 |
103 | for idx, func in [(0, 'linear'), (2, 'cubic')]:
104 | f = scipy.interpolate.interp1d(samp_frames,
105 | self.H[:, idx, samp_frames], kind=func)
106 | self.H[:, idx, np.nonzero(pitch_track)[0]] = f(
107 | np.nonzero(pitch_track)[0])
108 |
109 | # Interpolation of the phase.
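# The phase cannot be interpolated directly. Instead, as implemented below,
# the frequency track is integrated into a cumulative phase estimate, the
# mismatch between this estimate and the measured phase at the next sampled
# frame is rounded to a whole number of 2*pi cycles, and the residual is
# spread smoothly over the intermediate samples through a sinusoidal
# correction term. The result is then wrapped back to the [-pi, pi) range.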
110 |
111 | step = samp_frames[1]-samp_frames[0]
112 | sin_f = np.cumsum(np.sin(np.pi*np.arange(1, step)/step)).reshape(
113 | 1, step-1)
114 | for idx, frame in np.ndenumerate(samp_frames[1:]):
115 | if frame-samp_frames[idx] <= step:
116 | cum_phase = np.cumsum(self.H[:, 2, samp_frames[idx]+1:frame+1],
117 | axis=1)*2*np.pi
118 | bad_phase = cum_phase[:, -1]+self.H[:, 1, samp_frames[idx]]
119 | M = np.around(np.abs(self.H[:, 1, frame]-bad_phase)/(2*np.pi))
120 | if frame-samp_frames[idx] < step:
121 | end_step = frame-samp_frames[idx]
122 | func = np.cumsum(np.sin(np.pi*np.arange(1, end_step) /
123 | end_step)).reshape(1, end_step-1)
124 | else:
125 | func = sin_f
126 |
127 | r_vec = (np.pi*(self.H[:, 1, frame]+2*np.pi*M-bad_phase) /
128 | (2*(frame-samp_frames[idx]))).reshape(self.n_harm, 1)
129 |
130 | new_phase = cum_phase[:, :-1]+r_vec*func + \
131 | self.H[:, 1, samp_frames[idx]].reshape(self.n_harm, 1)
132 | self.H[:, 1, samp_frames[idx]+1:frame] = ((new_phase + np.pi) %
133 | (2*np.pi)-np.pi)
134 |
135 | """
136 | Synthesizes the final signal by initially creating each modulated component
137 | and then summing all of them.
138 | """
139 |
140 | def synthesize(self, N=None):
141 | if N is None:
142 | N = self.n_harm
143 | [self.harmonics[i].synthesize()
144 | for i in range(N)]
145 | self.signal = sum([self.harmonics[i].signal
146 | for i in range(N)])
147 |
148 | """
149 | Calculates the SRER (Signal-to-Reconstruction Error Ratio) for the
150 | synthesized signal.
151 | """
152 |
153 | def srer(self, orig_signal, pitch_track):
154 | self.SRER = 20*np.log10(np.std(orig_signal[np.nonzero(pitch_track)[0]]) /
155 | np.std(orig_signal[np.nonzero(pitch_track)[0]] -
156 | self.signal[np.nonzero(pitch_track)[0]]))
157 |
158 | """
159 | Extrapolates the phase at the borders of the voiced frames by integrating
160 | the edge frequency value. This procedure is necessary for subsequent aQHM
161 | calculations. Additionally, the method allows the replacement of the
162 | extracted phase by the cumulative frequency. The objective is to provide
163 | smoother bases for further aQHM and eaQHM calculations. Normally this is
164 | not necessary, since the interpolation process already smooths the
165 | phase vector. But in the sample-by-sample extraction case, this substitution
166 | is very helpful to avoid the degradation of the aQHM and eaQHM performance
167 | due to the wild behaviour of the phase.
168 | """
169 |
170 | def phase_edges(self, edges, window):
171 |
172 | # Selects whether the phase itself or the cumulative frequency will be
173 | # used.
174 | if self.phase_tech == 'phase':
175 | self.extrap_phase = np.unwrap(self.H[:, 1, :])
176 |
177 | elif self.phase_tech == 'freq':
178 | delta_phase = self.H[:, 1, edges[0]+1] - \
179 | self.H[:, 2, edges[0]+1]*2*np.pi
180 | self.extrap_phase = np.cumsum(self.H[:, 2, :], axis=1)*2*np.pi + \
181 | delta_phase.reshape(self.n_harm, 1)
182 |
183 | # Extrapolate the phase edges.
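# n_beg and n_end hold the sample offsets relative to each segment border;
# at both edges of every voiced segment, the phase is extended over
# window.N extra samples by integrating the constant edge frequency value,
# i.e. extrap_phase[n] = 2*pi*freq*n + old_phase.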
184 | n_beg = -window.half_len_vec[::-1][:-1].reshape(1, window.N)
185 | n_end = window.half_len_vec[1:].reshape(1, window.N)
186 |
187 | for beg, end in zip(edges[::2], edges[1::2]):
188 |
189 | old_phase = self.extrap_phase[:, beg+1].reshape(self.n_harm, 1)
190 | freq = self.H[:, 2, beg+1].reshape(self.n_harm, 1)
191 | self.extrap_phase[:, beg-window.N+1:beg+1] = \
192 | 2*np.pi*freq*n_beg+old_phase
193 |
194 | old_phase = self.extrap_phase[:, end].reshape(self.n_harm, 1)
195 | freq = self.H[:, 2, end].reshape(self.n_harm, 1)
196 | self.extrap_phase[:, end+1:end+window.N+1] = \
197 | 2*np.pi*freq*n_end+old_phase
198 |
199 |
200 | """
201 | Creates the sample window object.
202 | """
203 |
204 | class SampleWindow(object):
205 |
206 | def __init__(self, window_duration, fs):
207 | self.dur = window_duration # in seconds
208 | self.length = int(self.dur*fs+1)
209 | if not self.length % 2:
210 | self.length -= 1
211 | self.data = np.hamming(self.length)
212 | self.data2 = self.data**2
213 | self.N = int(self.dur*fs/2)
214 | self.half_len_vec = np.arange(self.N+1)
215 | self.len_vec = np.arange(-self.N, self.N+1)
216 |
217 | self.a0 = 0.54**2 + (0.46**2)/2
218 | self.a1 = 0.54*0.46
219 | self.a2 = (0.46**2)/4
220 |
221 | self.R0_diag = R_eq(0, g0, self)
222 | self.R2_diag = sum(self.data2*(self.len_vec**2))
223 |
224 |
225 | """
226 | --------------------------------------------
227 | Main Functions.
228 | --------------------------------------------
229 | """
230 |
231 | """
232 | Main QHM function.
233 | """
234 |
235 | def qhm(signal, pitch, window, samp_jump=None, N_iter=1, phase_tech='phase'):
236 |
237 | return HM_run(qhm_iteration, signal, pitch, window, samp_jump, N_iter,
238 | phase_tech)
239 |
240 | """
241 | Main aQHM function.
242 | """
243 |
244 | def aqhm(signal, previous_HM, pitch, window, samp_jump=None, N_iter=1,
245 | N_runs=float('Inf'), phase_tech='phase', eaQHM_flag=False):
246 |
247 | count = 1
248 | outflag = False
249 |
250 | while not outflag:
251 | func_options = [previous_HM, eaQHM_flag, 0]
252 | HM = HM_run(aqhm_iteration, signal, pitch, window, samp_jump, N_iter,
253 | phase_tech, func_options)
254 | if count == 1:
255 | previous_HM = HM
256 | elif (count > 1 and HM.SRER > previous_HM.SRER):
257 | previous_HM = HM
258 | else:
259 | outflag = True
260 |
261 | count += 1
262 |
263 | if count > N_runs:
264 | outflag = True
265 |
266 | return previous_HM
267 |
268 | """
269 | Main eaQHM function (which in fact differs very little from the aQHM).
270 | """
271 |
272 | def eaqhm(signal, previous_HM, pitch, window, samp_jump=None, N_iter=1,
273 | N_runs=float('Inf'), phase_tech='phase'):
274 |
275 | return aqhm(signal, previous_HM, pitch, window, samp_jump, N_iter, N_runs,
276 | phase_tech, eaQHM_flag=True)
277 |
278 | """
279 | Common driver for the three algorithms.
280 | """
281 |
282 | def HM_run(func, signal, pitch, window, samp_jump=None, N_iter=1,
283 | phase_tech='phase', func_options=None):
284 |
285 | # Creates the output signal object and the dummy frequency vector.
286 | HM = ModulatedSign(signal.n_harm, signal.size, signal.fs, phase_tech)
287 | freq = np.zeros(signal.n_harm)
288 |
289 | # Selects whether the extraction will be performed with temporal jumps or
290 | # not.
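# When samp_jump is given (in seconds), only every jump-th frame inside
# each voiced segment is analysed, and the skipped frames are filled in
# afterwards by ModulatedSign.interpolate_samp(). Otherwise, the
# extraction is performed sample-by-sample over all voiced frames.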
291 | if samp_jump is None:
292 | voiced_frames = np.nonzero(pitch.values)[0]
293 | else:
294 | jump = int(np.fix(max(samp_jump*signal.fs, 1.0)))
295 | voiced_frames = np.array([], dtype=int)
296 | for beg, end in zip(pitch.edges[::2], pitch.edges[1::2]):
297 | voiced_frames = np.append(voiced_frames, np.arange(
298 | beg+1, end-1, jump))
299 | voiced_frames = np.append(voiced_frames, end)
300 |
301 | # Run the algorithm in the selected voiced frames.
302 | for frame in voiced_frames:
303 | # Uses the pitch value and the harmonic definition f_k = k*f0 to create
304 | # a frequency reference vector, which is employed to keep each component
305 | # within a frequency band, thus avoiding least-squares instability.
306 | f0_ref = pitch.values[frame]*np.arange(1, signal.n_harm+1)/signal.fs
307 |
308 | # Set some algorithm options.
309 | if func is qhm_iteration:
310 | if frame-1 in pitch.edges[::2]:
311 | freq[:] = f0_ref
312 | func_options = freq
313 | elif func is aqhm_iteration:
314 | func_options[2] = frame
315 |
316 | # Core algorithm function.
317 | coef, freq, HM.error[frame] = func(
318 | signal.data[frame-window.N:frame+window.N+1],
319 | f0_ref, window, signal.fs, 20.0, func_options,
320 | N_iter)
321 |
322 | # Updates the frame parameter values in the 3-dimensional storage array H.
323 | HM.update_values(coef[:signal.n_harm], freq, frame)
324 |
325 | # If the extraction was performed with temporal jumps, interpolate the
326 | # results.
327 | if samp_jump is not None:
328 | HM.interpolate_samp(voiced_frames, pitch.values)
329 | HM.synthesize()
330 | HM.srer(signal.data, pitch.values)
331 | HM.phase_edges(pitch.edges, window)
332 |
333 | return HM
334 |
335 | """
336 | Core QHM function.
337 | """
338 |
339 | def qhm_iteration(data, f0_ref, window, fs, max_step, freq, N_iter=1):
340 |
341 | # Initialize and allocate variables.
342 | K = len(freq)
343 | coef = np.zeros((2*K))
344 |
345 | E = np.ones((window.length, 2*K), dtype=complex)
346 | E = exp_matrix(E, freq, window, K)
347 | E_windowed = np.ones((window.length, 2*K), dtype=complex)
348 |
349 | windowed_data = (window.data*data).reshape(window.length, 1)
350 |
351 | # Run the QHM algorithm N_iter times.
352 | for k in range(N_iter):
353 | # Calculate the a and b coefficients via least-squares.
354 | coef = least_squares(E, E_windowed, windowed_data, window, K)
355 |
356 | # Set a magnitude reference, which is used to detect and suppress
357 | # erroneous magnitude spikes.
358 | mag_ref = np.abs(coef[0])
359 |
360 | # Updates the frequency values.
361 | freq, ro = freq_correction(coef[:K], coef[K:], freq, f0_ref, mag_ref, K,
362 | max_step, fs)
363 |
364 | # Updates the complex exponentials matrix.
365 | E = exp_matrix(E, freq, window, K)
366 |
367 | # Compute the final coefficient values.
368 | coef = least_squares(E, E_windowed, windowed_data, window, K)
369 |
370 | # This part is a workaround not present in the original references [1-3].
371 | # It was created to detect and suppress erroneous magnitude spikes, which
372 | # degrade the final synthesized signal and, consequently, its SRER.
373 | # Alternatively, the magnitude signals could be smoothed after extraction.
374 | # For more details, check the README file.
375 | cond = (np.abs(coef[:K]) < 5.5*np.abs(coef[0]))
376 | if not cond.all():
377 | freq[~cond] = f0_ref[~cond]
378 |
379 | # Updates the complex exponentials matrix with the modified frequencies.
380 | E = exp_matrix(E, freq, window, K)
381 |
382 | # Recalculate the final coefficients.
383 | coef = least_squares(E, E_windowed, windowed_data, window, K)
384 |
385 | # Calculate the mean squared error between the original frame and the
386 | # synthesized one.
387 | err = error_calc(windowed_data, E, coef, window)
388 | return coef, freq, err
389 |
390 | """
391 | Core aQHM and eaQHM function.
392 | """
393 |
394 | def aqhm_iteration(data, f0_ref, window, fs, max_step, func_options,
395 | N_iter=1):
396 |
397 | # Initialize and allocate variables.
398 | previous_HM = func_options[0]
399 | eaQHM_flag = func_options[1]
400 | frame = func_options[2]
401 |
402 | freq = previous_HM.H[:, 2, frame]
403 | windowed_data = (window.data*data).reshape(window.length, 1)
404 |
405 | # Set a magnitude reference, which is used to detect and suppress
406 | # erroneous magnitude spikes.
407 | mag_ref = np.abs(previous_HM.H[0, 0, frame])
408 |
409 | # Adjust the phase frame.
410 | extrap_phase_center = previous_HM.extrap_phase[:, frame].reshape(
411 | previous_HM.n_harm, 1)
412 | phase_frame = previous_HM.extrap_phase[:, frame-window.N:frame+window.N+1] - \
413 | extrap_phase_center
414 |
415 | # Initialize the coefficients.
416 | coef = np.vstack((previous_HM.H[:, 0, frame].reshape(previous_HM.n_harm, 1) *
417 | np.exp(1j*extrap_phase_center), np.zeros((previous_HM.n_harm, 1))))[:, 0]
418 |
419 | # Initialize the matrices.
420 | E = np.ones((window.length, 2*previous_HM.n_harm), dtype=complex)
421 | E_ro = np.ones((window.length, 2*previous_HM.n_harm), dtype=complex)
422 | E_windowed = np.ones((window.length, 2*previous_HM.n_harm), dtype=complex)
423 |
424 | E[:, :previous_HM.n_harm] = np.exp(1j*phase_frame.T)
425 |
426 | # If the eaQHM algorithm was selected, adjust the exponential matrix with
427 | # the normalized magnitude.
428 | if eaQHM_flag:
429 | mag_center = previous_HM.H[:, 0, frame].reshape(previous_HM.n_harm, 1)
430 | mag_frame = previous_HM.H[:, 0, frame-window.N:frame+window.N+1] / \
431 | mag_center
432 | E[:, :previous_HM.n_harm] = mag_frame.T*E[:, :previous_HM.n_harm]
433 |
434 | E[:, previous_HM.n_harm:] = E[:, :previous_HM.n_harm] * \
435 | window.len_vec.reshape(window.length, 1)
436 |
437 | # Run the aQHM/eaQHM algorithm N_iter times.
438 | for k in range(N_iter):
439 |
440 | # Calculate the a and b coefficients via least-squares.
441 | coef = least_squares(E, E_windowed, windowed_data, window,
442 | previous_HM.n_harm)
443 |
444 | # Updates the frequency values.
445 | freq, ro = freq_correction(coef[:previous_HM.n_harm],
446 | coef[previous_HM.n_harm:], freq, f0_ref,
447 | mag_ref, previous_HM.n_harm, max_step, fs)
448 |
449 | # Updates the complex exponentials matrix.
450 | E = E*exp_matrix(E_ro, ro/(2*np.pi), window, previous_HM.n_harm)
451 |
452 | # Compute the final coefficient values.
453 | coef = least_squares(E, E_windowed, windowed_data, window,
454 | previous_HM.n_harm)
455 |
456 | # This part is a workaround not present in the original references [1-3].
457 | # It was created to detect and suppress erroneous magnitude spikes, which
458 | # degrade the final synthesized signal and, consequently, its SRER.
459 | # Alternatively, the magnitude signals could be smoothed after extraction.
460 | # For more details, check the README file.
461 | cond = (np.abs(coef[:previous_HM.n_harm]) < 5.5*mag_ref)
462 | if not cond.all():
463 | freq[~cond] = f0_ref[~cond]
464 |
465 | # Since the problematic aQHM/eaQHM exponentials degrade the results,
466 | # they are replaced by their QHM versions, which are more stable.
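# The mask ~np.append(cond, cond) selects both halves of E for the
# offending harmonics: the plain exponentials in the first n_harm columns
# and their time-weighted versions in the last n_harm columns.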
467 | E[:, ~np.append(cond, cond)] = exp_matrix(E_ro, freq, window,
468 | previous_HM.n_harm)[:, ~np.append(cond, cond)]
469 |
470 | # Recalculate the final coefficients.
471 | coef = least_squares(E, E_windowed, windowed_data, window,
472 | previous_HM.n_harm)
473 |
474 | # Calculate the mean squared error between the original frame and the
475 | # synthesized one.
476 | err = error_calc(windowed_data, E, coef, window)
477 |
478 | return coef, freq, err
479 |
480 | """
481 | --------------------------------------------
482 | Auxiliary Functions.
483 | --------------------------------------------
484 | """
485 |
486 | """
487 | Calculates the a and b coefficients via the least-squares method.
488 | """
489 |
490 | def least_squares(E, E_windowed, windowed_data, window, K):
491 |
492 | R = np.zeros((2*K, 2*K), dtype=complex)
493 | B = np.zeros((window.length, 1), dtype=complex)
494 |
495 | E_windowed[:, :] = E*window.data.reshape(window.length, 1)
496 | R = E_windowed.conj().T.dot(E_windowed)
497 | B = E_windowed.conj().T.dot(windowed_data)
498 |
499 | coef = np.linalg.solve(R, B)[:, 0]
500 |
501 | return coef
502 |
503 | """
504 | Calculates the frequency mismatch and updates the frequency values.
505 | """
506 |
507 | def freq_correction(a, b, freq, f0_ref, mag_ref, n_harm, max_step, fs):
508 |
509 | old_freq = np.zeros(n_harm)
510 | old_freq[:] = freq[:]
511 |
512 | ro = (a.real*b.imag-a.imag*b.real)/(np.abs(a)*np.abs(a))
513 |
514 | # If the mismatch is too high (> 20 Hz), the frequency update is saturated
515 | # at this value. This avoids big fluctuations, which could spoil the
516 | # convergence of the algorithm as a whole.
517 | over_ro = np.abs(ro) > max_step*2*np.pi/fs
518 | ro[over_ro] = np.sign(ro[over_ro])*(max_step*2*np.pi/fs)
519 | freq = freq+ro/(2*np.pi)
520 |
521 | # Checks whether each component frequency lies within its spectral band and
522 | # also checks whether there are magnitude spikes.
523 | cond = ((np.round(freq/f0_ref[0]) != np.arange(n_harm)+1) |
524 | (freq > 0.5) | (freq < 0) | (np.abs(a) > 5.5*mag_ref))
525 |
526 | freq[cond] = f0_ref[cond]
527 |
528 | return freq, (freq-old_freq)*(2*np.pi)
529 |
530 | """
531 | Calculates the mean squared error between the original frame and the
532 | synthesized one.
533 | """
534 |
535 | def error_calc(windowed_data, E, coef, window):
536 | h = E.dot(coef)
537 |
538 | err = np.sum((windowed_data-2*h.real*window.data)**2)
539 |
540 | return err
541 |
542 | """
543 | Builds the complex exponentials matrix.
544 | """
545 |
546 | def exp_matrix(E, freq, window, K):
547 |
548 | E[window.N+1:, :K] = np.exp(1j*np.pi*2*freq)
549 | E[window.N+1:, :K] = np.cumprod(E[window.N+1:, :K], axis=0)
550 | E[:window.N, :K] = np.conj(E[window.N+1:, :K][::-1, :])
551 |
552 | E[:, K:] = E[:, :K]*window.len_vec.reshape(window.length, 1)
553 |
554 | return E
555 |
556 | """
557 | Some side functions found in reference [2].
558 | """ 559 | 560 | def g0(x, N): 561 | if x != 0: 562 | return np.sin((2*N+1)*x/2)/np.sin(x/2) 563 | else: 564 | return 2*N+1 565 | 566 | def g1(x, N): 567 | if x != 0: 568 | return 1j*((np.sin(N*x)/(2*np.sin(x/2)**2)) - 569 | N*(np.cos((2*N+1)*x/2)/np.sin(x/2))) 570 | else: 571 | return 0 572 | 573 | def R_eq(delta_f, func, window): 574 | return (window.a0*func(2*np.pi*delta_f, window.N) + 575 | func(2*np.pi*(delta_f+1./(2*window.N)), window.N)*window.a1 + 576 | func(2*np.pi*(delta_f-1./(2*window.N)), window.N)*window.a1 + 577 | func(2*np.pi*(delta_f+1./window.N), window.N)*window.a2 + 578 | func(2*np.pi*(delta_f-1./window.N), window.N)*window.a2) 579 | -------------------------------------------------------------------------------- /amfm_decompy/sample.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/amfm_decompy/sample.wav -------------------------------------------------------------------------------- /bin/AMFM_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Script to test the AMFM_decompy package. 5 | 6 | Version 1.0.12 7 | 16/May/2025 Bernardo J.B. Schmitt - bernardo.jb.schmitt@gmail.com 8 | """ 9 | import amfm_decompy 10 | import amfm_decompy.pYAAPT as pyaapt 11 | import amfm_decompy.pyQHM as pyqhm 12 | import amfm_decompy.basic_tools as basic 13 | import os.path 14 | 15 | # Declare the variables. 16 | file_name = os.path.dirname(amfm_decompy.__file__)+os.sep+"sample.wav" 17 | window_duration = 0.015 # in seconds 18 | nharm_max = 25 19 | SNR = float('Inf') 20 | 21 | # Create the signal object. 22 | signal = basic.SignalObj(file_name) 23 | 24 | # Create the window object. 25 | window = pyqhm.SampleWindow(window_duration, signal.fs) 26 | 27 | # Create the pitch object and calculate its attributes. 28 | pitch = pyaapt.yaapt(signal) 29 | 30 | # Set the number of modulated components. 31 | signal.set_nharm(pitch.values, nharm_max) 32 | 33 | # Check if gaussian noise has to be added. 34 | if SNR != float('Inf'): 35 | signal.noiser(pitch.values, SNR) 36 | 37 | # Perform the QHM extraction. 38 | QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter = 3, phase_tech = 'phase') 39 | 40 | print ("QHM SRER: {}".format(QHM.SRER)) 41 | 42 | # Perform the aQHM extraction. 43 | aQHM = pyqhm.aqhm(signal, QHM, pitch, window, 0.001, N_iter = 3, N_runs = 2, 44 | phase_tech = 'phase') 45 | 46 | print ("aQHM SRER: {}".format(aQHM.SRER)) 47 | 48 | # Perform the eaQHM extraction. 
49 | eaQHM = pyqhm.eaqhm(signal, aQHM, pitch, window, 0.001, N_iter=3, N_runs=2,
50 | phase_tech='phase')
51 |
52 | print("eaQHM SRER: {}".format(eaQHM.SRER))
53 |
54 |
-------------------------------------------------------------------------------- /bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/bin/__init__.py -------------------------------------------------------------------------------- /docs/AMFM_decompy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/docs/AMFM_decompy.pdf -------------------------------------------------------------------------------- /docs/basic_tools.txt: -------------------------------------------------------------------------------- 1 | ===========
2 | basic_tools
3 | ===========
4 |
5 | This module contains a set of basic classes and functions that are commonly used by the other modules of the package.
6 |
7 | -------
8 | Classes
9 | -------
10 |
11 | SignalObj Class
12 | ---------------
13 |
14 | The SignalObj Class stores the speech signal and all the parameters related to it.
15 |
16 | USAGE:
17 |
18 | .. py:function:: SignalObj(*args, **kwargs)
19 | :module: amfm_decompy.basic_tools
20 |
21 | :param args: the input argument can be a string with the wav file path OR two arguments, where the first one is a numpy array containing the speech signal data and the second one represents its sampling frequency in Hz.
22 | :param kwargs: please check below for the options.
23 |
24 |
25 | :rtype: speech signal object.
26 |
27 | KWARGS OPTIONS:
28 |
29 | * 'data' - instead of initializing a SignalObj with two arguments, the input signal data can be alternatively passed using this kwarg. It must be used along with the 'fs' kwarg.
30 | * 'fs' - instead of initializing a SignalObj with two arguments, the input signal sample frequency can be alternatively passed using this kwarg. It must be used along with the 'data' kwarg.
31 | * 'name' - instead of initializing a SignalObj with one argument, the input wav file path can be alternatively passed using this kwarg.
32 | * 'output_dtype' - the numpy dtype of the output signal data.
33 |
34 | SIGNAL OBJECT ATTRIBUTES:
35 | ^^^^^^^^^^^^^^^^^^^^^^^^^
36 |
37 | .. py:attribute:: data
38 | :module: SignalObj
39 |
40 | Numpy array containing the speech signal data. It is set during the object's initialization.
41 |
42 | .. py:attribute:: fs
43 | :module: SignalObj
44 |
45 | Sample frequency in Hz. It is set during the object's initialization.
46 |
47 | .. py:attribute:: size
48 | :module: SignalObj
49 |
50 | Speech signal length. It is set during the object's initialization.
51 |
52 | .. py:attribute:: filtered
53 | :module: SignalObj
54 |
55 | Bandpass-filtered version of the speech data. It is set by the SignalObj.filtered_version method.
56 |
57 | .. py:attribute:: new_fs
58 | :module: SignalObj
59 |
60 | Sampling frequency of the downsampled speech data. It is set by the SignalObj.filtered_version method.
61 |
62 | .. py:attribute:: clean
63 | :module: SignalObj
64 |
65 | When the SignalObj.noiser method is called, this attribute is created and used to store a clean copy of the original signal.
66 |
67 |
68 | SIGNAL OBJECT METHODS:
69 | ^^^^^^^^^^^^^^^^^^^^^^^^
70 |
71 | .. 
py:method:: filtered_version(bp_filter)
72 | :module: SignalObj
73 |
74 | :param bp_filter: BandpassFilter object.
75 |
76 | Filters the signal data with a bandpass filter.
77 |
78 | .. py:method:: set_nharm(pitch_track, n_harm_max)
79 | :module: SignalObj
80 |
81 | :param pitch_track: extracted pitch values for each signal sample.
82 | :param n_harm_max: represents the maximum number of components that can be extracted from the signal.
83 |
84 | :type pitch_track: numpy array
85 | :type n_harm_max: int
86 |
87 | Uses the pitch values to estimate the number of modulated components in the signal.
88 |
89 | .. py:method:: noiser(pitch_track, SNR)
90 | :module: SignalObj
91 |
92 | :param pitch_track: extracted pitch values for each signal sample.
93 | :param SNR: desired signal-to-noise ratio of the output signal.
94 |
95 | :type pitch_track: numpy array
96 | :type SNR: float
97 |
98 | Adds zero-mean Gaussian noise to the signal.
99 |
100 | ---------
101 | Functions
102 | ---------
103 |
104 | pcm2float
105 | ---------
106 |
107 | USAGE:
108 |
109 | .. py:function:: pcm2float(sig[, dtype=numpy.float64])
110 | :module: amfm_decompy.basic_tools
111 |
112 | :param sig: PCM speech signal data.
113 | :param dtype: data type of the elements of the output array (default: numpy.float64).
114 |
115 | :type sig: numpy array
116 | :type dtype: numpy dtype
117 | :rtype: numpy array.
118 |
119 | Transforms a raw PCM signal into a float one, with values limited to the interval between -1 and 1.
120 |
-------------------------------------------------------------------------------- /docs/freq1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/docs/freq1.png -------------------------------------------------------------------------------- /docs/index.txt: -------------------------------------------------------------------------------- 1 | .. AMFM decompy documentation master file, created by
2 | sphinx-quickstart on Mon Jan 23 14:17:16 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to AMFM decompy's documentation!
7 | ========================================
8 |
9 | Contents:
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 | pYAAPT
15 | pyQHM
16 | basic_tools
17 | -------------------------------------------------------------------------------- /docs/interp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/docs/interp.png -------------------------------------------------------------------------------- /docs/mag3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/docs/mag3.png -------------------------------------------------------------------------------- /docs/pYAAPT.txt: -------------------------------------------------------------------------------- 1 | ======
2 | pYAAPT
3 | ======
4 |
5 | This is a Python port of the YAAPT (Yet Another Algorithm for Pitch Tracking) algorithm. The original MATLAB program was written by Hongbing Hu and Stephen A. Zahorian.
6 |
7 | The YAAPT program, designed for fundamental frequency tracking, is extremely robust for both high quality and telephone speech. 
The YAAPT program was created by the Speech Communication Laboratory of the State University of New York at Binghamton. The original program is available at http://www.ws.binghamton.edu/zahorian as free software. Further information about the program can be found in [ref1]_.
8 |
9 | It must be noted that, although this port is almost identical to the original, a few changes were made in order to make the program more "pythonic" and improve its performance. Nevertheless, the results obtained with both algorithms are similar.
10 |
11 | -----------
12 | Quick start
13 | -----------
14 |
15 | pYAAPT basically contains the whole set of functions needed to extract the pitch track from a speech signal. These functions, in turn, are independent of the pyQHM package. Therefore, pYAAPT can be used in any other speech processing application, not only in AM-FM decomposition.
16 |
17 | USAGE:
18 |
19 | .. py:function:: yaapt(signal [, options])
20 | :module: amfm_decompy.pYAAPT
21 |
22 | :param signal: created with amfm_decompy.basic_tools.
23 | :param options: the default configuration values are the same as in the original version. A short description of each option is presented in the next subitem. For more information about them, please refer to [ref1]_.
24 |
25 | :type signal: signal object
26 | :type options: Must be formatted as follows: **{'option_name1' : value1, 'option_name2' : value2, ...}
27 | :rtype: pitch object
28 |
29 | OPTIONS:
30 |
31 | * 'frame_length' - length of each analysis frame (default: 35 ms)
32 | * 'tda_frame_length' - frame length employed in the time domain analysis (default: 35 ms). IMPORTANT: in the original YAAPT v4.0 MATLAB source code this parameter is called 'frame_lengtht'. Since its name is quite similar to 'frame_length', the alternative alias 'tda_frame_length' is employed by pYAAPT in order to avoid confusion. Nevertheless, both inputs ('frame_lengtht' and 'tda_frame_length') are accepted by the yaapt function. 
33 | * 'frame_space' - spacing between analysis frames (default: 10 ms)
34 | * 'f0_min' - minimum pitch searched (default: 60 Hz)
35 | * 'f0_max' - maximum pitch searched (default: 400 Hz)
36 | * 'fft_length' - FFT length (default: 8192 samples)
37 | * 'bp_forder' - order of band-pass filter (default: 150)
38 | * 'bp_low' - low frequency of filter passband (default: 50 Hz)
39 | * 'bp_high' - high frequency of filter passband (default: 1500 Hz)
40 | * 'nlfer_thresh1' - NLFER (Normalized Low Frequency Energy Ratio) boundary for voiced/unvoiced decisions (default: 0.75)
41 | * 'nlfer_thresh2' - threshold for NLFER definitely unvoiced (default: 0.1)
42 | * 'shc_numharms' - number of harmonics in SHC (Spectral Harmonics Correlation) calculation (default: 3)
43 | * 'shc_window' - SHC window length (default: 40 Hz)
44 | * 'shc_maxpeaks' - maximum number of SHC peaks to be found (default: 4)
45 | * 'shc_pwidth' - window width in SHC peak picking (default: 50 Hz)
46 | * 'shc_thresh1' - threshold 1 for SHC peak picking (default: 5)
47 | * 'shc_thresh2' - threshold 2 for SHC peak picking (default: 1.25)
48 | * 'f0_double' - pitch doubling decision threshold (default: 150 Hz)
49 | * 'f0_half' - pitch halving decision threshold (default: 150 Hz)
50 | * 'dp5_k1' - weight used in dynamic program (default: 11)
51 | * 'dec_factor' - factor for signal resampling (default: 1)
52 | * 'nccf_thresh1' - threshold for considering a peak in NCCF (Normalized Cross Correlation Function) (default: 0.3)
53 | * 'nccf_thresh2' - threshold for terminating search in NCCF (default: 0.9)
54 | * 'nccf_maxcands' - maximum number of candidates found (default: 3)
55 | * 'nccf_pwidth' - window width in NCCF peak picking (default: 5)
56 | * 'merit_boost' - boost merit (default: 0.20)
57 | * 'merit_pivot' - merit assigned to unvoiced candidates in definitely unvoiced frames (default: 0.99)
58 | * 'merit_extra' - merit assigned to extra candidates in reducing pitch doubling/halving errors (default: 0.4)
59 | * 'median_value' - order of median filter (default: 7)
60 | * 'dp_w1' - DP (Dynamic Programming) weight factor for voiced-voiced transitions (default: 0.15)
61 | * 'dp_w2' - DP weight factor for voiced-unvoiced or unvoiced-voiced transitions (default: 0.5)
62 | * 'dp_w3' - DP weight factor of unvoiced-unvoiced transitions (default: 0.1)
63 | * 'dp_w4' - Weight factor for local costs (default: 0.9)
64 |
65 | Exclusive to pYAAPT:
66 |
67 | This extra parameter had to be added in order to fix a bug in the original code. More information about it can be found :ref:`here`.
68 |
69 | * 'spec_pitch_min_std' - Weight factor that sets a minimum spectral pitch standard deviation, which is calculated as min_std = pitch_avg*spec_pitch_min_std (default: 0.05, i.e. 5% of the average spectral pitch). 
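A minimal usage sketch of this exclusive parameter is shown below (the value 0.1 is hypothetical and chosen only for illustration)::

    import amfm_decompy.pYAAPT as pYAAPT
    import amfm_decompy.basic_tools as basic

    signal = basic.SignalObj('path_to_sample.wav')
    # Allow the spectral pitch standard deviation to be no lower than 10%
    # of the average spectral pitch.
    pitch = pYAAPT.yaapt(signal, **{'spec_pitch_min_std' : 0.1})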
70 |
71 |
72 | EXAMPLES:
73 |
74 | Example 1 - extract the pitch track from a signal using the default configurations::
75 |
76 | import amfm_decompy.pYAAPT as pYAAPT
77 | import amfm_decompy.basic_tools as basic
78 |
79 | signal = basic.SignalObj('path_to_sample.wav')
80 | pitch = pYAAPT.yaapt(signal)
81 |
82 | Example 2 - extract the pitch track from a signal with the minimum pitch set to 150 Hz, the frame length to 15 ms and the frame jump to 5 ms::
83 |
84 | import amfm_decompy.pYAAPT as pYAAPT
85 | import amfm_decompy.basic_tools as basic
86 |
87 | signal = basic.SignalObj('path_to_sample.wav')
88 | pitch = pYAAPT.yaapt(signal, **{'f0_min' : 150.0, 'frame_length' : 15.0, 'frame_space' : 5.0})
89 |
90 | ------------
91 | Classes
92 | ------------
93 |
94 |
95 | PitchObj Class
96 | -----------------------
97 |
98 | The PitchObj Class stores the extracted pitch and all the parameters related to it. A pitch object is necessary for the QHM algorithms. However, the pitch class structure was built in a way that it can be used by any other pitch tracker, not only the YAAPT.
99 |
100 | USAGE:
101 |
102 | .. py:function:: PitchObj(frame_size, frame_jump[, nfft=8192])
103 | :module: amfm_decompy.pYAAPT
104 |
105 | :param frame_size: analysis frame length.
106 | :param frame_jump: distance between the center of an extraction frame and the centers of its adjacent neighbours.
107 | :param nfft: FFT length.
108 |
109 | :type frame_size: int
110 | :type frame_jump: int
111 | :type nfft: int
112 | :rtype: pitch object.
113 |
114 | PITCH CLASS VARIABLES:
115 | ^^^^^^^^^^^^^^^^^^^^^^
116 |
117 | These variables are not related to the YAAPT algorithm itself, but to a post-processing step where the data is smoothed and halving/doubling errors are corrected.
118 |
119 | .. py:attribute:: PITCH_HALF
120 | :module: PitchObj
121 |
122 | This variable is a flag. When its value is equal to 1, the halving detector sets the halved pitch values to 0. If PITCH_HALF is equal to 2, the halved pitch values are multiplied by 2. For other PITCH_HALF values, the halving detector is not employed (default: 0).
123 |
124 | .. py:attribute:: PITCH_HALF_SENS
125 | :module: PitchObj
126 |
127 | Sets the halving detector sensitivity. A pitch sample is considered half-valued if it is not zero and lower than::
128 |
129 | mean(pitch) - PITCH_HALF_SENS*std(pitch)
130 |
131 | (default: 2.9).
132 |
133 | .. py:attribute:: PITCH_DOUBLE
134 | :module: PitchObj
135 |
136 | This variable is a flag. When its value is equal to 1, the doubling detector sets the doubled pitch values to 0. If PITCH_DOUBLE is equal to 2, the doubled pitch values are divided by 2. For other PITCH_DOUBLE values, the doubling detector is not employed (default: 0).
137 |
138 | .. py:attribute:: PITCH_DOUBLE_SENS
139 | :module: PitchObj
140 |
141 | Sets the doubling detector sensitivity. A pitch sample is considered double-valued if it is not zero and higher than::
142 |
143 | mean(pitch) + PITCH_DOUBLE_SENS*std(pitch)
144 |
145 | (default: 2.9).
146 |
147 | .. py:attribute:: SMOOTH_FACTOR
148 | :module: PitchObj
149 |
150 | Determines the median filter length used to smooth the interpolated pitch values (default: 5). [1]_
151 |
152 | .. py:attribute:: SMOOTH
153 | :module: PitchObj
154 |
155 | This variable is a flag. When its value is not equal to 0, the interpolated pitch is smoothed by a median filter (default: 5). [1]_
156 |
157 | .. 
py:attribute:: PTCH_TYP
158 | :module: PitchObj
159 |
160 | If there are fewer than 2 voiced frames in the file, the PTCH_TYP value is used in the interpolation (default: 100 Hz). [1]_
161 |
162 | .. rubric:: Footnotes
163 |
164 | .. [1] don't mistake this interpolation for the one performed by the PitchObj.upsample method. For more explanation, please refer to the PitchObj.samp_interp and PitchObj.values_interp attributes.
165 |
166 | EXAMPLE:
167 |
168 | Example 1 - the pitch is extracted from sample.wav with different smoothing and interpolation configurations::
169 |
170 | import amfm_decompy.pYAAPT as pYAAPT
171 | import amfm_decompy.basic_tools as basic
172 |
173 | signal = basic.SignalObj('path_to_sample.wav')
174 |
175 | pYAAPT.PitchObj.PITCH_DOUBLE = 2 # set new values
176 | pYAAPT.PitchObj.PITCH_HALF = 2
177 | pYAAPT.PitchObj.SMOOTH_FACTOR = 3
178 |
179 | pitch = pYAAPT.yaapt(signal) # calculate the pitch track
180 |
181 |
182 | PITCH OBJECT ATTRIBUTES:
183 | ^^^^^^^^^^^^^^^^^^^^^^^^
184 |
185 | .. py:attribute:: nfft
186 | :module: PitchObj
187 |
188 | Length in samples of the FFT used by the pitch tracker. It is set during the object's initialization.
189 |
190 | .. py:attribute:: frame_size
191 | :module: PitchObj
192 |
193 | Length in samples of the frames used by the pitch tracker. It is set during the object's initialization.
194 |
195 | .. py:attribute:: frame_jump
196 | :module: PitchObj
197 |
198 | Distance in samples between the center of an extraction frame and the centers of its adjacent neighbours. It is set during the object's initialization.
199 |
200 | .. py:attribute:: noverlap
201 | :module: PitchObj
202 |
203 | It is the difference between the frame size and the frame jump. It represents the number of samples that two adjacent frames share, i.e., how much they overlap each other. It is set during the object's initialization.
204 |
205 | .. py:attribute:: mean_energy
206 | :module: PitchObj
207 |
208 | Signal's low frequency band mean energy. It is set by the PitchObj.set_energy method.
209 |
210 | .. py:attribute:: energy
211 | :module: PitchObj
212 |
213 | Array that contains the low frequency band energy of each frame, normalized by PitchObj.mean_energy. It is set by the PitchObj.set_energy method.
214 |
215 | .. py:attribute:: vuv
216 | :module: PitchObj
217 |
218 | Boolean vector that indicates if each speech frame was classified as voiced (represented as 'True') or unvoiced (represented as 'False'). It is set by the PitchObj.set_energy method.
219 |
220 | .. py:attribute:: frames_pos
221 | :module: PitchObj
222 |
223 | A numpy array that contains the temporal location of the center of each extraction frame, which is also referred to as its time stamp. It is set by the PitchObj.set_frames_pos method. The locations are given in the sample domain, so their values in the time domain are calculated as::
224 |
225 | import amfm_decompy.pYAAPT as pYAAPT
226 | import amfm_decompy.basic_tools as basic
227 |
228 | signal = basic.SignalObj('path_to_sample.wav')
229 | pitch = pYAAPT.yaapt(signal)
230 |
231 | time_stamp_in_seconds = pitch.frames_pos/signal.fs
232 |
233 | .. py:attribute:: nframes
234 | :module: PitchObj
235 |
236 | Number of frames. It is set by the PitchObj.set_frames_pos method.
237 |
238 | .. py:attribute:: samp_values
239 | samp_interp
240 | :module: PitchObj
241 |
242 | Both arrays contain the pitch values from each of the nframes. 
The only difference is that, in PitchObj.samp_interp, the unvoiced segments are replaced by an interpolation computed from the edges of the adjacent voiced segments. This provides a non-zero version of the pitch track, which can be necessary for some applications.
243 |
244 | Example::
245 |
246 | import amfm_decompy.pYAAPT as pYAAPT
247 | import amfm_decompy.basic_tools as basic
248 | from matplotlib import pyplot as plt
249 |
250 | signal = basic.SignalObj('path_to_sample.wav')
251 | pitch = pYAAPT.yaapt(signal)
252 |
253 | plt.plot(pitch.samp_values, label='samp_values', color='blue')
254 | plt.plot(pitch.samp_interp, label='samp_interp', color='green')
255 |
256 | plt.xlabel('frames', fontsize=18)
257 | plt.ylabel('pitch (Hz)', fontsize=18)
258 | plt.legend(loc='upper right')
259 | axes = plt.gca()
260 | axes.set_xlim([0,90])
261 | plt.show()
262 |
263 | The output is presented below:
264 |
265 | .. image:: ../_images/samp_values.png
266 |
267 | Both attributes are set by the PitchObj.set_values method.
268 |
269 | .. py:attribute:: values
270 | values_interp
271 | :module: PitchObj
272 |
273 | PitchObj.values and PitchObj.values_interp are the upsampled versions of PitchObj.samp_values and PitchObj.samp_interp, respectively. Therefore, their length is equal to the original file length (for more information, check the PitchObj.upsample() method).
274 |
275 | Example::
276 |
277 | import amfm_decompy.pYAAPT as pYAAPT
278 | import amfm_decompy.basic_tools as basic
279 | from matplotlib import pyplot as plt
280 |
281 | signal = basic.SignalObj('path_to_sample.wav')
282 | pitch = pYAAPT.yaapt(signal)
283 |
284 | plt.plot(pitch.values, label='samp_values', color='blue')
285 | plt.plot(pitch.values_interp, label='samp_interp', color='green')
286 |
287 | plt.xlabel('samples', fontsize=18)
288 | plt.ylabel('pitch (Hz)', fontsize=18)
289 | plt.legend(loc='upper right')
290 | axes = plt.gca()
291 | axes.set_xlim([0,16000])
292 | plt.show()
293 |
294 | The output is presented below:
295 |
296 | .. image:: ../_images/values.png
297 |
298 | Both attributes are set by the PitchObj.set_values method.
299 |
300 | .. py:attribute:: edges
301 | :module: PitchObj
302 |
303 | A list that contains the indices where the unvoiced-voiced and voiced-unvoiced transitions occur in PitchObj.values. It is set by the PitchObj.set_values method, which internally employs the PitchObj.edges_finder method.
304 |
305 | PITCH OBJECT METHODS:
306 | ^^^^^^^^^^^^^^^^^^^^^^^^
307 |
308 | .. py:method:: set_energy(energy, threshold)
309 | :module: PitchObj
310 |
311 | :param energy: contains the low frequency energy for each frame.
312 | :param threshold: normalized threshold.
313 |
314 | :type energy: numpy array
315 |
316 | Sets the normalized low frequency energy by taking the input array and dividing it by its mean value. Normalized values above the threshold are considered voiced frames, while the ones below it are unvoiced frames.
317 |
318 | .. py:method:: set_frames_pos(frames_pos)
319 | :module: PitchObj
320 |
321 | :param frames_pos: sample positions of the frame centers.
322 |
323 | :type frames_pos: numpy array
324 |
325 | Sets the positions of the centers of the extraction frames.
326 |
327 | .. py:method:: set_values(samp_values, file_size [, interp_tech='pchip'])
328 | :module: PitchObj
329 |
330 | :param samp_values: pitch value for each frame.
331 | :param file_size: length of the speech signal.
332 | :param interp_tech: interpolation method employed to upsample the data. Can be 'pchip' (default), 'spline' or 'step'. 
.. py:method:: set_frames_pos(frames_pos)
   :module: PitchObj

   :param frames_pos: index with the sample positions.

   :type frames_pos: numpy array

   Sets the position of the center of the extraction frames.

.. py:method:: set_values(samp_values, file_size [, interp_tech='pchip'])
   :module: PitchObj

   :param samp_values: pitch value of each frame.
   :param file_size: length of the speech signal.
   :param interp_tech: interpolation method employed to upsample the data. Can be 'pchip' (default), 'spline' or 'step'.

   :type samp_values: numpy array
   :type file_size: int
   :type interp_tech: string

   Sets the pitch values and also calculates their interpolated version (for more information, check the PitchObj.samp_values and PitchObj.samp_interp attributes). A post-processing step based on the PitchObj class attributes (such as PITCH_HALF and SMOOTH_FACTOR) is then applied. After that, both arrays are upsampled, so that the output arrays have the same length as the original speech signal. A second interpolation is necessary in this process, and the technique employed is indicated by the interp_tech parameter.

   Example::

      import amfm_decompy.pYAAPT as pYAAPT
      import amfm_decompy.basic_tools as basic
      from matplotlib import pyplot as plt

      signal = basic.SignalObj('path_to_sample.wav')
      pitch = pYAAPT.yaapt(signal)

      plt.plot(pitch.values, label='pchip interpolation', color='green')

      pitch.set_values(pitch.samp_values, len(pitch.values), interp_tech='spline')
      plt.plot(pitch.values, label='spline interpolation', color='red')

      pitch.set_values(pitch.samp_values, len(pitch.values), interp_tech='step')
      plt.plot(pitch.values, label='step interpolation', color='blue')

      plt.xlabel('samples', fontsize=18)
      plt.ylabel('pitch (Hz)', fontsize=18)
      plt.legend(loc='upper right')
      axes = plt.gca()
      axes.set_xlim([0,16000])
      axes.set_ylim([150,250])

      plt.show()

   The output is presented below:

   .. image:: ../_images/interp.png

.. py:method:: edges_finder(values)
   :module: PitchObj

   :param values: array in which the voiced/unvoiced transitions are located.

   :type values: numpy array
   :rtype: list

   Returns the indexes of the samples where the unvoiced-voiced and voiced-unvoiced transitions occur.


BandpassFilter Class
--------------------

Creates a bandpass filter necessary for the YAAPT algorithm.

USAGE:

.. py:function:: BandpassFilter(fs, parameters)
   :module: amfm_decompy.pYAAPT

   :param fs: signal's sampling frequency.
   :param parameters: contains the parameter options of the YAAPT algorithm.

   :type fs: float
   :type parameters: dictionary
   :rtype: bandpass filter object.

BANDPASS FILTER ATTRIBUTES:
^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. py:attribute:: b
   :module: BandpassFilter

   Bandpass filter numerator (zeros) coefficients. It is set during the object's initialization.

.. py:attribute:: a
   :module: BandpassFilter

   Bandpass filter denominator (poles) coefficients. It is set during the object's initialization.

.. py:attribute:: dec_factor
   :module: BandpassFilter

   Decimation factor used for downsampling the data. It is set during the object's initialization.
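Although the filtering is handled internally by yaapt(), the coefficient arrays can be applied manually with the standard scipy tooling. A minimal sketch (bp_filter is assumed to be an existing BandpassFilter instance and data a 1-D numpy array with the speech samples)::

    from scipy.signal import lfilter

    # apply the zeros (b) and poles (a) coefficients to the raw data
    filtered = lfilter(bp_filter.b, bp_filter.a, data)

    # downsample the filtered signal by the decimation factor
    decimated = filtered[::bp_filter.dec_factor]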
.. _pyaapt-differences:

-------------------------------------------------
Differences between pYAAPT and the original YAAPT
-------------------------------------------------

As stated before, pYAAPT was conceived as a port of the original MATLAB YAAPT package. However, with the evolution of YAAPT and the constant feedback from pYAAPT users, there are currently a few important differences between the two codes:

YAAPT 4.0 processing speed
----------------------------

Version 4.0 of YAAPT came with an additional feature that allows the user to "skip" the spectral pitch tracking or, alternatively, the time-domain pitch tracking. Although I understand why the feature was implemented (MATLAB has some limitations when it comes to optimizing code performance), personally I consider this addition a bit questionable.

The strong point of YAAPT is its robustness, and from personal experience I would say that most of my speech processing projects relied on the efficiency of the pitch tracker. Sacrificing the robustness of the algorithm can therefore cause a snowball effect that could eventually compromise an entire project.

For that reason, the speed feature is not available in pYAAPT at the moment, especially because Python still has better optimization options to be explored, such as numba or CUDA. I might add this speed parameter in a future major release, since it does not require extensive code refactoring. But given that the feature is somewhat counter-productive, I do not currently see it as a priority.

spec_pitch_min_std parameter
----------------------------

In the function tm_trk.m from the original YAAPT code, the spectral pitch standard deviation (pStd) is employed to calculate the frequency threshold (freq_threshold) variable, which is later used to refine the merit of the pitch candidates.

However, in some corner cases all spectral pitch values may be the same, which results in a standard deviation equal to zero. Since freq_threshold is employed as the denominator of a fraction, this leads to a division by zero, which crashes the algorithm. This issue was reported in real-time applications using pYAAPT.

Since this bug is also present in the original MATLAB code, a custom solution had to be developed. The most reasonable approach was to use a percentage of the average spectral pitch as a floor value. This percentage, named spec_pitch_min_std, has a default value of 0.05: whenever the standard deviation of the spectral pitch is lower than 5% of its mean value, this fraction of the average pitch is employed instead of the standard deviation.
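A minimal sketch of this safeguard (illustrative pseudocode, not the verbatim library implementation; spec_pitch stands for the array of spectral pitch values)::

    import numpy as np

    spec_pitch = np.array([120.0, 120.0, 120.0]) # degenerate corner case
    spec_pitch_min_std = 0.05                    # default value

    pitch_std = np.std(spec_pitch)
    min_std = spec_pitch_min_std*np.mean(spec_pitch)

    # fall back to 5% of the mean spectral pitch when the deviation is too small
    if pitch_std < min_std:
        pitch_std = min_std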
.. [ref1] Stephen A. Zahorian and Hongbing Hu, "A spectral/temporal method for robust fundamental frequency tracking," J. Acoust. Soc. Am. 123(6), June 2008.
--------------------------------------------------------------------------------
/docs/pyQHM.txt:
--------------------------------------------------------------------------------
=====
pyQHM
=====

The algorithms implemented here are the QHM (Quasi-Harmonic Model) and its upgrades, the aQHM (adaptive Quasi-Harmonic Model) and the eaQHM (extended adaptive Quasi-Harmonic Model). Their formulations can be found in references [ref2]_, [ref3]_ and [ref4]_.

These algorithms perform the so-called AM-FM decomposition. This designation is due to the fact that, in this method, the signal is modeled as a sum of amplitude- and frequency-modulated components. The goal is to overcome the drawbacks of Fourier-like techniques, e.g. the STFT, wavelets, etc., which are limited in time-frequency analysis by the so-called Heisenberg-Gabor inequality.

-----------
Quick start
-----------

The pyQHM module provides a function for each of the QHM family algorithms:

USAGE:

.. py:function:: qhm(signal, pitch, window[, samp_jump=None, N_iter=1, phase_tech='phase'])
                 aqhm(signal, previous_HM, pitch, window[, samp_jump=None, N_iter=1, N_runs=float('Inf'), phase_tech='phase'])
                 eaqhm(signal, previous_HM, pitch, window[, samp_jump=None, N_iter=1, N_runs=float('Inf'), phase_tech='phase'])
   :module: amfm_decompy.pyQHM

   :param signal: contains the signal data and its parameters.
   :param pitch: contains the pitch track and its parameters.
   :param window: contains the sample window and some reference arrays.
   :param samp_jump: distance in seconds between the center of an extraction frame and the center of its adjacent neighbours (default: sample-by-sample analysis).
   :param N_iter: number of iterations for each frame estimation (default: 1).
   :param phase_tech: has two options: 'phase' (default) and 'freq'. The objective is to choose the smoother base for further aQHM and eaQHM calculations, in order to avoid degrading their performance due to the wild behaviour of the phase. Normally, when a sample jump is employed, the 'phase' option is enough, since the interpolation process already smooths the phase signal. However, in a sample-by-sample analysis the use of 'freq' (cumulative frequency) is favoured.
   :param previous_HM: previously extracted AM-FM signal, used as the base for the aQHM and eaQHM calculations.
   :param N_runs: after the aQHM/eaQHM algorithm has been applied to the whole signal, the function takes the output modulated signal object as a new input and restarts the aQHM/eaQHM until N_runs have been performed OR until the output SRER (Signal-to-Reconstruction Error Ratio) stops growing. The goal is to refine the results (default: keeps restarting the algorithm until the maximum SRER is reached).

   :type signal: signal object
   :type pitch: pitch object
   :type window: window object
   :type samp_jump: float
   :type N_iter: int
   :type phase_tech: str
   :type previous_HM: modulated signal object
   :type N_runs: int
   :rtype: modulated signal object

EXAMPLES:

Example 1 - the parameters of a speech signal are extracted sample by sample through QHM. After that, its output is used as the input for the first of two aQHM runs with a 1 ms sample jump. Finally, the result is used to start one run of eaQHM, again with a 1 ms sample jump. All three algorithms perform 3 iterations per frame estimation::

    import amfm_decompy.pYAAPT as pyaapt
    import amfm_decompy.pyQHM as pyqhm
    import amfm_decompy.basic_tools as basic

    # Declare the variables.
    window_duration = 0.015
    nharm_max = 25

    # Create the signal object.
    signal = basic.SignalObj('path_to_sample.wav')

    # Create the window object.
    window = pyqhm.SampleWindow(window_duration, signal.fs)

    # Create the pitch object and calculate its attributes.
    pitch = pyaapt.yaapt(signal)

    # Use the pitch track to set the number of modulated components.
    signal.set_nharm(pitch.values, nharm_max)

    # Perform the QHM extraction.
    QHM = pyqhm.qhm(signal, pitch, window, N_iter = 3, phase_tech = 'freq')

    # Perform the aQHM extraction.
    aQHM = pyqhm.aqhm(signal, QHM, pitch, window, 0.001, N_iter = 3, N_runs = 2)

    # Perform the eaQHM extraction.
    eaQHM = pyqhm.eaqhm(signal, aQHM, pitch, window, 0.001, N_iter=3, N_runs=1)
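Each returned modulated signal object carries its own SRER (see the ModulatedSign.SRER attribute below), so a quick sanity check of the refinement chain might look like this (sketch, assuming Example 1 has just been run)::

    # each refinement stage should ideally increase the SRER
    print(QHM.SRER, aQHM.SRER, eaQHM.SRER)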
------------
Classes
------------

ModulatedSign Class
-----------------------

The ModulatedSign class stores the extracted modulated signal and all the parameters related to it. The data structure provided by this class is used by all the QHM algorithms, since the model of a modulated signal is basically the same for all of them.

USAGE:

.. py:function:: ModulatedSign(n_harm, file_size, fs[, phase_tech='phase'])
   :module: amfm_decompy.pyQHM

   :param n_harm: number of modulated components that form the signal.
   :param file_size: length of the speech signal in samples.
   :param fs: sampling frequency in Hz.
   :param phase_tech: has two options: 'phase' (default) and 'freq'. The objective is to choose the smoother base for further aQHM and eaQHM calculations, in order to avoid degrading their performance due to the wild behaviour of the phase. Normally, when a sample jump is employed, the 'phase' option is enough, since the interpolation process already smooths the phase signal. However, in a sample-by-sample analysis the use of 'freq' (cumulative frequency) is favoured.

   :type n_harm: int
   :type file_size: int
   :type fs: float
   :type phase_tech: str
   :rtype: modulated signal object.

MODULATED SIGNAL ATTRIBUTES:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. py:attribute:: n_harm
   :module: ModulatedSign

   Number of modulated components that form the signal. It is set during the object's initialization.

.. py:attribute:: size
   :module: ModulatedSign

   Length of the speech signal in samples. It is set during the object's initialization.

.. py:attribute:: fs
   :module: ModulatedSign

   Sampling frequency in Hz. It is set during the object's initialization.

.. py:attribute:: H
   :module: ModulatedSign

   3-dimensional numpy array (n_harm, 3, file_size) which stores the magnitude, phase and frequency values of all components. Its first dimension refers to the n_harm components, the second to the three composing parameters (where 0 stands for the magnitude, 1 for the phase and 2 for the frequency) and the third to the temporal axis. It is created during the object's initialization.

.. py:attribute:: harmonics
   :module: ModulatedSign

   List where each element is a modulated component. Read more about it in the ComponentObj Class section. It is created during the object's initialization.

.. py:attribute:: error
   :module: ModulatedSign

   Numpy array where each element is the mean squared error between the original signal frame and its synthesized version. It is created during the object's initialization.

.. py:attribute:: phase_tech
   :module: ModulatedSign

   Name of the phase smoothing method used to create a reference for future aQHM/eaQHM calculations. Can be 'phase' or 'freq'. It is set during the object's initialization.

.. py:attribute:: signal
   :module: ModulatedSign

   Final signal synthesized with the extracted parameters. It is created by the ModulatedSign.synthesize method.
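Given the layout described above, individual parameter tracks can be read directly from H. A short sketch (assuming a ModulatedSign instance named QHM, as produced in the Quick start example)::

    mag0 = QHM.H[0, 0, :]       # magnitude envelope of the first component
    phase0 = QHM.H[0, 1, :]     # phase track of the first component
    freq_all = QHM.H[:, 2, :]   # frequency tracks of all components at once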
.. py:attribute:: SRER
   :module: ModulatedSign

   Signal-to-Reconstruction Error Ratio, which measures the similarity between the original signal and its synthesized version. The bigger its value, the better the reconstruction. It is calculated by the ModulatedSign.srer method.

.. py:attribute:: extrap_phase
   :module: ModulatedSign

   2-dimensional numpy array (n_harm, file_size) which contains a modified version of the extracted phase track of each component. The signals are smoothed (check the ModulatedSign.phase_tech attribute) and their edge values are extrapolated for future aQHM/eaQHM runs. It is calculated by the ModulatedSign.phase_edges method.

MODULATED SIGNAL METHODS:
^^^^^^^^^^^^^^^^^^^^^^^^^

.. py:method:: update_values(a, freq, frame)
   :module: ModulatedSign

   :param a: contains the extracted complex coefficients from the harmonic model (for more information about them, please check the references).
   :param freq: instantaneous frequency of each of the components.
   :param frame: sample where the center of the moving sample window is located.

   :type a: numpy array
   :type freq: numpy array
   :type frame: int

   Updates the values of magnitude, phase and instantaneous frequency in the H matrix.

.. py:method:: interpolate_samp(samp_frames, pitch)
   :module: ModulatedSign

   :param samp_frames: contains the sample locations where the algorithm was employed.
   :param pitch: pitch information.

   :type samp_frames: numpy array
   :type pitch: pitch object

   Interpolates the parameter values when the extraction is not performed sample by sample.

.. py:method:: synthesize([N=None])
   :module: ModulatedSign

   :param N: selects which of the components are going to be synthesized (default: all of them).

   Runs the ComponentObj.synthesize method for each of the n_harm components and, after that, sums them to construct the final synthesized signal.

.. py:method:: srer(orig_signal, pitch_track)
   :module: ModulatedSign

   :param orig_signal: original signal.
   :param pitch_track: pitch values for each sample.

   :type orig_signal: numpy array
   :type pitch_track: numpy array

   Calculates the SRER (Signal-to-Reconstruction Error Ratio) for the synthesized signal. It is mathematically defined as::

      20*log10(std(orig_signal) / std(orig_signal - synth_signal))

   where synth_signal is the synthesized version of orig_signal.

.. py:method:: phase_edges(edges, window)
   :module: ModulatedSign

   :param edges: indexes where the pitch transitions between unvoiced-voiced and voiced-unvoiced occur.
   :param window: sample window and its parameters.

   :type edges: list
   :type window: window object

   Extrapolates the phase at the borders of the voiced frames by integrating the edge frequency value. This procedure is necessary for posterior aQHM calculations. Additionally, the method allows the replacement of the extracted phase by the cumulative frequency. The objective is to provide smoother bases for further aQHM and eaQHM calculations. Normally this is not necessary, since the interpolation process already smooths the phase vector. But in a sample-by-sample extraction this substitution is very helpful to avoid degrading the aQHM and eaQHM performance due to the wild behaviour of the phase.
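The SRER formula above translates directly into numpy. A minimal sketch with made-up arrays, just to make the definition concrete::

    import numpy as np

    def srer_db(orig_signal, synth_signal):
        # 20*log10(std(original)/std(reconstruction error)), as defined above
        return 20*np.log10(np.std(orig_signal)/np.std(orig_signal - synth_signal))

    orig = np.sin(np.linspace(0, 100, 16000))        # toy "original" signal
    synth = orig + np.random.normal(0, 0.01, 16000)  # toy "reconstruction"
    print(srer_db(orig, synth))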
ComponentObj Class
-----------------------

Creates a single component object, whose data is stored in the ModulatedSign.H matrix. The ComponentObj class thus provides an alternative interface to access and manipulate each component separately.

USAGE:

.. py:function:: ComponentObj(H, harm)
   :module: amfm_decompy.pyQHM

   :param H: 3-dimensional array where the component data is stored (for more information, check the ModulatedSign.H attribute).
   :param harm: the component index.

   :type H: numpy array
   :type harm: int
   :rtype: component object.


MODULATED COMPONENT ATTRIBUTES:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. py:attribute:: mag
   :module: ComponentObj

   Magnitude envelope of the component. It is set during the object's initialization.

.. py:attribute:: phase
   :module: ComponentObj

   Phase angle track of the component in radians. It is set during the object's initialization.

.. py:attribute:: freq
   :module: ComponentObj

   Instantaneous normalized frequency track of the component. To get the values in Hz, just multiply this array by the sampling frequency. It is set during the object's initialization.

.. py:attribute:: signal
   :module: ComponentObj

   Component signal synthesized with the extracted parameters. It is created by the ComponentObj.synthesize method.

EXAMPLES:

Example 1 - shows how to access the data of a specific component::

    import amfm_decompy.pYAAPT as pyaapt
    import amfm_decompy.pyQHM as pyqhm
    import amfm_decompy.basic_tools as basic
    from matplotlib import pyplot as plt

    # Declare the variables.
    window_duration = 0.015
    nharm_max = 25

    # Create the signal object.
    signal = basic.SignalObj('path_to_sample.wav')

    # Create the window object.
    window = pyqhm.SampleWindow(window_duration, signal.fs)

    # Create the pitch object and calculate its attributes.
    pitch = pyaapt.yaapt(signal)

    # Use the pitch track to set the number of modulated components.
    signal.set_nharm(pitch.values, nharm_max)

    # Perform the QHM extraction.
    QHM = pyqhm.qhm(signal, pitch, window, 0.001, N_iter = 3)

    fig1 = plt.figure()

    # Plot the instantaneous frequency of the fundamental harmonic.
    # The ComponentObj objects are stored inside the harmonics list.
    # For more information, please check the ModulatedSign.harmonics attribute.
    plt.plot(QHM.harmonics[0].freq*signal.fs)

    plt.xlabel('samples', fontsize=18)
    plt.ylabel('pitch (Hz)', fontsize=18)

    fig2 = plt.figure()

    # Plot the magnitude envelope of the third harmonic.
    # The ComponentObj objects are stored inside the harmonics list.
    # For more information, please check the ModulatedSign.harmonics attribute.
    plt.plot(QHM.harmonics[2].mag, color='green')

    plt.xlabel('samples', fontsize=18)
    plt.ylabel('magnitude', fontsize=18)

    plt.show()

The results are presented in the next two pictures:

.. image:: ../_images/freq1.png

.. image:: ../_images/mag3.png

NOTE:
   It must be noticed that the attributes of a ComponentObj can be sliced like regular numpy arrays. For example::

      QHM.harmonics[0].freq[920:1000]

   will return an array containing only the segment of the fundamental frequency between samples 920 and 999, while::

      QHM.harmonics[2].mag[950]

   will return the magnitude of the third harmonic at sample 950. However, since ModulatedSign.harmonics is a plain Python list, slicing it returns a list of ComponentObj objects, and a list itself has no freq or mag attributes. For example::

      QHM.harmonics[0:3].freq[920:1000]
      QHM.harmonics[0:2].mag[950]

   will raise an AttributeError. Therefore, the only way to simultaneously get the data of a group of components is to access the ModulatedSign.H matrix directly (or to use a for loop, but this option is slower)::

      QHM.H[0:3, 2, 920:1000]
      QHM.H[0:2, 0, 950]
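The for-loop alternative mentioned above would look like this (sketch, reusing the QHM object from Example 1)::

    import numpy as np

    # stack the frequency tracks of the first three components into a single array
    freqs = np.array([harm.freq[920:1000] for harm in QHM.harmonics[0:3]])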
MODULATED COMPONENT METHODS:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. py:method:: synthesize()
   :module: ComponentObj

   Synthesizes the modulated component by using the extracted magnitude and phase.

SampleWindow Class
-----------------------

Creates the sample Hamming window object and some related index arrays.

USAGE:

.. py:function:: SampleWindow(window_duration, fs)
   :module: amfm_decompy.pyQHM

   :param window_duration: window duration in seconds.
   :param fs: sampling frequency in Hz.

   :type window_duration: float
   :type fs: float
   :rtype: sample window object.

SAMPLE WINDOW ATTRIBUTES:
^^^^^^^^^^^^^^^^^^^^^^^^^

.. py:attribute:: dur
   :module: SampleWindow

   Window duration in seconds. It is set during the object's initialization.

.. py:attribute:: length
   :module: SampleWindow

   Window length in samples. It is set during the object's initialization.

.. py:attribute:: data
   :module: SampleWindow

   Array containing the Hamming window data. It is set during the object's initialization.

.. py:attribute:: data2
   :module: SampleWindow

   Array containing the squared Hamming window data, i.e., each element raised to the power of 2. It is set during the object's initialization.

.. py:attribute:: N
   :module: SampleWindow

   Half-window length, i.e., SampleWindow.length/2 - 1. It is set during the object's initialization.

.. py:attribute:: half_len_vec
   :module: SampleWindow

   Numpy array that contains the indexes from zero to N, i.e., [0, 1, ..., N]. It is set during the object's initialization.

.. py:attribute:: len_vec
   :module: SampleWindow

   Numpy array that contains the indexes from -N to N, i.e., [-N, -N+1, ..., N-1, N]. It is set during the object's initialization.
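For reference, the main SampleWindow arrays can be reproduced with plain numpy. A rough sketch, assuming a hypothetical 15 ms window at 16 kHz (the class itself may differ in off-by-one details)::

    import numpy as np

    fs = 16000                     # hypothetical sampling frequency in Hz
    dur = 0.015                    # 15 ms window duration
    length = int(dur*fs)           # window length in samples
    data = np.hamming(length)      # Hamming window data
    data2 = data**2                # squared window
    N = length//2 - 1              # half-window length, following the definition above
    half_len_vec = np.arange(N+1)  # [0, 1, ..., N]
    len_vec = np.arange(-N, N+1)   # [-N, -N+1, ..., N-1, N]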
.. [ref2] Y. Pantazis, "Decomposition of AM-FM signals with applications in speech processing", PhD Thesis, University of Crete, 2010.

.. [ref3] Y. Pantazis, O. Rosec and Y. Stylianou, "Adaptive AM-FM signal decomposition with application to speech analysis", IEEE Transactions on Audio, Speech and Language Processing, vol. 19, no. 2, 2011.

.. [ref4] G. P. Kafentzis, Y. Pantazis, O. Rosec and Y. Stylianou, "An extension of the adaptive quasi-harmonic model", in IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2012.
--------------------------------------------------------------------------------
/docs/samp_values.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/docs/samp_values.png
--------------------------------------------------------------------------------
/docs/values.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bjbschmitt/AMFM_decompy/5c6c9bc48006d9eb5d5e874dd44f9a146a5ee38b/docs/values.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "AMFM_decompy"
version = "1.0.12.2"
description = "Package containing the tools necessary for decomposing a speech signal into its modulated components, aka AM-FM decomposition."
readme = "README.md"
requires-python = ">=3.5"
license = "MIT"
license-files = ["LICENSE"]
authors = [
    { name = "Bernardo J.B. Schmitt", email = "bernardo.jb.schmitt@gmail.com" }
]
keywords = ["Python", "speech", "pitch", "QHM", "YAAPT", "modulated components", "AM-FM decomposition"]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.5",
    "Programming Language :: Python :: 3.6",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering",
    "Topic :: Scientific/Engineering :: Human Machine Interfaces",
    "Topic :: Scientific/Engineering :: Information Analysis",
    "Topic :: Software Development :: Libraries :: Python Modules"
]
dependencies = [
    "numpy",
    "scipy"
]
urls = { "Homepage" = "https://github.com/bjbschmitt/AMFM_decompy/" }

[project.scripts]
AMFM_test = "bin.AMFM_test:main"

[tool.setuptools.packages.find]
where = ["."]
include = ["amfm_decompy"]

[tool.setuptools.package-data]
amfm_decompy = ["*.wav"]

[tool.setuptools]
zip-safe = false
include-package-data = true
--------------------------------------------------------------------------------