├── .github
│   └── FUNDING.yml
├── .gitignore
├── .travis.yml
├── AlternetiveTravisCI
├── CONTRIBUTING.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── __init__.py
├── _config.yml
├── _images
│   ├── Speech_GIF.gif
│   ├── follow-twitter.gif
│   ├── pipeline.jpg
│   ├── readme.rst
│   ├── speech.gif
│   ├── speechpy_logo.gif
│   └── stackframes.png
├── docs
│   ├── Makefile
│   ├── _config.yml
│   ├── requirements.txt
│   └── source
│       ├── _static
│       │   └── img
│       │       ├── 08063416.pdf
│       │       ├── Speech_GIF.gif
│       │       ├── installation_logo.gif
│       │       ├── installation_logo.jpg
│       │       ├── speech.gif
│       │       ├── speech.jpg
│       │       ├── speechpy_logo.gif
│       │       ├── speechpy_logo.jpg
│       │       └── stackframes.png
│       ├── _templates
│       │   ├── breadcrumbs.html
│       │   └── breadcrumbs.html~
│       ├── conf.py
│       ├── content
│       │   ├── features.rst
│       │   ├── postprocessing.rst
│       │   └── preprocessing.rst
│       ├── epilogue
│       │   ├── CONTRIBUTING.rst
│       │   ├── finalnote.rst
│       │   └── test.rst
│       ├── index.rst
│       └── intro
│           └── introductions.rst
├── example
│   ├── Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
│   ├── test_local.py
│   └── test_package.py
├── paper
│   ├── paper.bib
│   ├── paper.md
│   ├── paper.pdf
│   └── test
│       ├── _imgs
│       │   ├── Scheme_of_speech_recognition_system.png
│       │   ├── packageview.png
│       │   └── travicCI.png
│       └── test.md
├── requirements.txt
├── setup.cfg
├── setup.py
├── speechpy
│   ├── __init__.py
│   ├── feature.py
│   ├── functions.py
│   └── processing.py
└── tests
    ├── Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
    └── test_speechpy.py
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [astorfi]
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |
4 | - "2.7"
5 | - "3.4"
6 | - "3.5"
7 |
8 | # command to install dependencies
9 | install:
10 | - pip install -r requirements.txt
11 | - pip install coveralls
12 | - pip install codecov
13 |
14 | script:
15 | - coverage run --omit=*.virtualenvs*,*virtualenv* example/test_package.py test
16 | - coverage run --omit=*.virtualenvs*,*virtualenv* example/test_local.py test
17 | - pytest tests/
18 |
19 |
20 | after_success:
21 | - coveralls
22 | - codecov
23 |
24 | sudo: enabled
25 | dist: trusty
26 |
--------------------------------------------------------------------------------
/AlternetiveTravisCI:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |
4 | - "2.7"
5 | - "3.4"
6 | - "3.5"
7 | - "3.5-dev" # 3.5 development branch
8 |
9 | # command to install dependencies
10 | install: "pip install -r requirements.txt"
11 |
12 | # command to run tests
13 | script: python setup.py develop
14 |
15 | sudo: enabled
16 | dist: trusty
17 |
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 |
2 | *************
3 | Contributing
4 | *************
5 |
6 | When contributing to this repository, you are more than welcome to discuss your feedback with any of the owners of this repository. *For typos, please do not create a pull request. Instead, declare them in issues or email the repository owner*. For technical and conceptual questions please feel free to **directly contact the repository owner**. Before asking general questions related to the concepts and techniques provided in this project, **please make sure to read and understand its associated paper**.
7 |
8 | Please note we have a code of conduct, please follow it in all your interactions with the project.
9 |
10 | ====================
11 | Pull Request Process
12 | ====================
13 |
14 | Please consider the following criteria to help us review your contribution:
15 |
16 | 1. The pull request is mainly expected to be a code script suggestion or improvement.
17 | 2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section.
18 | 3. Ensure any install or build dependencies are removed before doing a build and creating a
19 |    pull request.
20 | 4. Add comments with details of changes to the interface; this includes new environment
21 |    variables, exposed ports, useful file locations and container parameters.
22 | 5. You may merge the pull request once you have the sign-off of at least one other developer. If you
23 |    do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed.
24 |
25 | ============
26 | Final Note
27 | ============
28 |
29 | We look forward to your kind feedback. Please help us improve this open source project and make our work better.
30 | For contribution, please create a pull request and we will investigate it promptly. Once again, we appreciate
31 | your feedback and thorough code inspections.
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {2017} {Amirsina Torfi}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | init:
2 | pip install -r requirements.txt
3 |
4 | test:
5 | nosetests tests
6 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: _images/speechpy_logo.gif
2 | :target: https://github.com/astorfi/speech_feature_extraction/blob/master/images/speechpy_logo.gif
3 |
4 | ===============================================
5 | `SpeechPy Official Project Documentation`_
6 | ===============================================
7 |
8 | .. image:: https://pepy.tech/badge/speechpy
9 | :target: https://pepy.tech/project/speechpy
10 | .. image:: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat
11 | :target: https://github.com/astorfi/speechpy/pulls
12 | .. image:: https://coveralls.io/repos/github/astorfi/speechpy/badge.svg?branch=master
13 | :target: https://coveralls.io/github/astorfi/speechpy?branch=master
14 | .. image:: https://codecov.io/gh/astorfi/speechpy/branch/master/graph/badge.svg
15 | :target: https://codecov.io/gh/astorfi/speechpy
16 | .. image:: https://badge.fury.io/py/speechpy.svg
17 | :target: https://badge.fury.io/py/speechpy
18 | .. image:: http://joss.theoj.org/papers/10.21105/joss.00749/status.svg
19 | :target: https://doi.org/10.21105/joss.00749
20 | .. image:: https://img.shields.io/twitter/follow/amirsinatorfi.svg?label=Follow&style=social
21 | :target: https://twitter.com/amirsinatorfi
22 |
23 | .. .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.810391.svg
24 | .. :target: https://doi.org/10.5281/zenodo.810391
25 |
26 | .. _SpeechPy Official Project Documentation: http://speechpy.readthedocs.io
27 |
28 |
29 | ==========================
30 | Table of Contents
31 | ==========================
32 | .. contents::
33 | :local:
34 | :depth: 3
35 |
36 | ---------------------
37 | Documentation
38 | ---------------------
39 |
40 | This library provides the most frequently used speech features, including MFCCs and filterbank energies, alongside the log energy of filterbanks.
41 | If you are interested in what MFCCs are and how they are generated, please refer to the
42 | `Mel-frequency cepstrum <https://en.wikipedia.org/wiki/Mel-frequency_cepstrum>`_ Wikipedia page.
43 |
44 | .. image:: _images/speech.gif
45 |
46 |
47 | Please refer to the following links for further information:
48 |
49 | `SpeechPy Official Project Documentation`_
50 |
51 | `Paper`_
52 |
53 | .. _SpeechPy Official Project Documentation: http://speechpy.readthedocs.io
54 | .. _Paper: https://doi.org/10.21105/joss.00749
55 |
56 | ------------------------------------------
57 | Which Python versions are supported?
58 | ------------------------------------------
59 |
60 | Currently, the package has been tested and verified using Python ``2.7``, ``3.4`` and ``3.5``.
61 |
62 | ---------------------
63 | Citation
64 | ---------------------
65 |
66 | If you use this package, please kindly cite it as follows:
67 |
68 | .. code:: bibtex
69 |
70 | @article{torfi2018speechpy,
71 | title={SpeechPy-A Library for Speech Processing and Recognition},
72 | author={Torfi, Amirsina},
73 | journal={arXiv preprint arXiv:1803.01094},
74 | year={2018}
75 | }
76 |
77 | ---------------------
78 | How to Install?
79 | ---------------------
80 |
81 | There are two ways to install this package: local installation and PyPI.
82 |
83 | ~~~~~~~~~~~~~~~~~~~
84 | Local Installation
85 | ~~~~~~~~~~~~~~~~~~~
86 |
87 | For local installation, first clone the repository::
88 |
89 | git clone https://github.com/astorfi/speech_feature_extraction.git
90 |
91 | After cloning the repository, change into the repository directory and then execute::
92 |
93 | python setup.py develop
94 |
95 | ~~~~~
96 | PyPI
97 | ~~~~~
98 |
99 | The package is available on PyPI. For direct installation, simply execute the following:
100 |
101 | .. code-block:: shell
102 |
103 | pip install speechpy
104 |
105 |
106 | ------------------------------------------
107 | What Features are supported?
108 | ------------------------------------------
109 | - Mel Frequency Cepstral Coefficients (MFCCs)
110 | - Filterbank Energies
111 | - Log Filterbank Energies
112 |
113 | Please refer to `SpeechPy Official Project Documentation`_ for details about the supported features.
114 |
115 | ~~~~~~~~~~~~~~
116 | MFCC Features
117 | ~~~~~~~~~~~~~~
118 |
119 | |pic1| |pic2|
120 |
121 | .. |pic1| image:: _images/Speech_GIF.gif
122 | :width: 45%
123 |
124 | .. |pic2| image:: _images/pipeline.jpg
125 | :width: 45%
126 |
127 | The supported attributes for generating MFCC features can be seen in the signature of the related function:
128 |
129 | .. code-block:: python
130 |
131 | def mfcc(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, num_cepstral=13,
132 |          num_filters=40, fft_length=512, low_frequency=0, high_frequency=None, dc_elimination=True):
133 | """Compute MFCC features from an audio signal.
134 | :param signal: the audio signal from which to compute features. Should be an N x 1 array
135 | :param sampling_frequency: the sampling frequency of the signal we are working with.
136 | :param frame_length: the length of each frame in seconds. Default is 0.020s
137 | :param frame_stride: the step between successive frames in seconds. Default is 0.01s.
138 | :param num_filters: the number of filters in the filterbank, default 40.
139 | :param fft_length: number of FFT points. Default is 512.
140 | :param low_frequency: lowest band edge of mel filters. In Hz, default is 0.
141 | :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2
142 | :param num_cepstral: Number of cepstral coefficients.
143 | :param dc_elimination: If the first DC component should be eliminated or not.
144 | :returns: A numpy array of size (num_frames x num_cepstral) containing mfcc features.
145 | """
146 |
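As a minimal usage sketch of the call above (the one-second synthetic sine wave is only a stand-in for a real recording; the ``speechpy.feature.mfcc`` call itself mirrors the example later in this README):

.. code-block:: python

    import numpy as np
    import speechpy

    # Synthetic one-second signal at 16 kHz, standing in for real audio.
    fs = 16000
    signal = np.sin(2 * np.pi * 440.0 * np.arange(fs) / fs)

    # 20 ms frames with a 10 ms stride, 13 cepstral coefficients per frame.
    mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs,
                                 frame_length=0.020, frame_stride=0.01,
                                 num_cepstral=13, num_filters=40, fft_length=512)
    print(mfcc.shape)  # (num_frames, 13)
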
147 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
148 | Filterbank Energy Features
149 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
150 |
151 |
152 | .. code-block:: python
153 |
154 | def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
155 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
156 | """Compute Mel-filterbank energy features from an audio signal.
157 | :param signal: the audio signal from which to compute features. Should be an N x 1 array
158 | :param sampling_frequency: the sampling frequency of the signal we are working with.
159 | :param frame_length: the length of each frame in seconds. Default is 0.020s
160 | :param frame_stride: the step between successive frames in seconds. Default is 0.01s.
161 | :param num_filters: the number of filters in the filterbank, default 40.
162 | :param fft_length: number of FFT points. Default is 512.
163 | :param low_frequency: lowest band edge of mel filters. In Hz, default is 0.
164 | :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2
165 | :returns:
166 | features: the energy of the filterbank: num_frames x num_filters
167 | frame_energies: the energy of each frame: num_frames x 1
168 | """
169 |
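A minimal sketch of calling ``mfe`` (again with a synthetic stand-in signal). Per the docstring above, it returns both the filterbank energies and the per-frame energies:

.. code-block:: python

    import numpy as np
    import speechpy

    fs = 16000
    signal = np.sin(2 * np.pi * 440.0 * np.arange(fs) / fs)  # stand-in audio

    features, frame_energies = speechpy.feature.mfe(
        signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
        num_filters=40, fft_length=512)
    print(features.shape)  # (num_frames, 40) filterbank energies
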
170 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Log Filterbank Energy Features
172 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
173 |
174 | The attributes for log filterbank energies are the same as those for filterbank energies.
175 |
176 | .. code-block:: python
177 |
178 | def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
179 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
180 | """Compute log Mel-filterbank energy features from an audio signal.
181 | :param signal: the audio signal from which to compute features. Should be an N x 1 array
182 | :param sampling_frequency: the sampling frequency of the signal we are working with.
183 | :param frame_length: the length of each frame in seconds. Default is 0.020s
184 | :param frame_stride: the step between successive frames in seconds. Default is 0.01s.
185 | :param num_filters: the number of filters in the filterbank, default 40.
186 | :param fft_length: number of FFT points. Default is 512.
187 | :param low_frequency: lowest band edge of mel filters. In Hz, default is 0.
188 | :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2
189 | :returns:
190 | features: the energy of the filterbank: num_frames x num_filters
191 | frame_log_energies: the log energy of each frame: num_frames x 1
192 | """
193 |
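A minimal sketch, mirroring the ``lmfe`` call used in the full example later in this README (the synthetic signal is a stand-in):

.. code-block:: python

    import numpy as np
    import speechpy

    fs = 16000
    signal = np.sin(2 * np.pi * 440.0 * np.arange(fs) / fs)  # stand-in audio

    logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs,
                                      frame_length=0.020, frame_stride=0.01,
                                      num_filters=40, fft_length=512)
    print(logenergy.shape)  # (num_frames, 40)
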
194 | ~~~~~~~~~~~~
195 | Stack Frames
196 | ~~~~~~~~~~~~
197 |
198 | The ``stack_frames`` function generates a stack of frames from the signal.
199 |
200 | .. code-block:: python
201 |
202 | def stack_frames(sig, sampling_frequency, frame_length=0.020, frame_stride=0.020, Filter=lambda x: numpy.ones((x,)),
203 | zero_padding=True):
204 | """Frame a signal into overlapping frames.
205 | :param sig: The audio signal to frame of size (N,).
206 | :param sampling_frequency: The sampling frequency of the signal.
207 | :param frame_length: The length of the frame in seconds.
208 | :param frame_stride: The stride between frames.
209 | :param Filter: The time-domain filter to apply to each frame. By default it is all ones, so nothing is changed.
210 | :param zero_padding: If the number of samples is not a multiple of the frame length (in samples), the signal
211 |                      will be zero-padded to generate the last frame.
212 | :returns: Array of frames. size: number_of_frames x frame_len.
213 | """
214 |
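As a minimal sketch (synthetic stand-in signal; the frame filter is left at its default of all ones):

.. code-block:: python

    import numpy as np
    import speechpy

    fs = 16000
    signal = np.sin(2 * np.pi * 440.0 * np.arange(fs) / fs)  # stand-in audio

    # Non-overlapping 20 ms frames (the stride equals the frame length).
    frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs,
                                              frame_length=0.020,
                                              frame_stride=0.020,
                                              zero_padding=True)
    print(frames.shape)  # (num_frames, 320): 20 ms frames at 16 kHz
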
215 | ---------------------
216 | Post Processing
217 | ---------------------
218 |
219 | There are some post-processing operations that are supported in ``speechpy``.
220 |
221 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
222 | Global cepstral mean and variance normalization (CMVN)
223 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
224 |
225 | This function performs global cepstral mean and variance normalization
226 | (CMVN) to remove the channel effects. The code assumes that there is one
227 | observation per row.
228 |
229 | .. code-block:: python
230 |
231 | def cmvn(vec, variance_normalization=False):
232 | """
233 | This function performs global ``cepstral mean and variance normalization``
234 | (CMVN) on the input feature vector "vec". The code assumes that there is one observation per row.
235 |
236 | :param:
237 | vec: input feature matrix (size:(num_observation,num_features))
238 | variance_normalization: If variance normalization should be performed or not.
239 | :return:
240 | The mean(or mean+variance) normalized feature vector.
241 | """
242 |
243 |
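A minimal sketch on a hypothetical feature matrix (random numbers stand in for real features; the ``cmvn`` call follows the signature above):

.. code-block:: python

    import numpy as np
    import speechpy

    # Hypothetical feature matrix: 100 observations (frames) x 13 features.
    features = np.random.randn(100, 13)

    normalized = speechpy.processing.cmvn(features, variance_normalization=True)
    print(np.allclose(normalized.mean(axis=0), 0))  # per-feature mean is ~0
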
244 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
245 | Local cepstral mean and variance normalization (CMVN) over a sliding window
246 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
247 |
248 | This function performs local cepstral mean and variance normalization
249 | (CMVN) over sliding windows. The code assumes that there is one
250 | observation per row.
251 |
252 | .. code-block:: python
253 |
254 | def cmvnw(vec, win_size=301, variance_normalization=False):
255 | """
256 | This function performs local cepstral mean and variance normalization
257 | (CMVN) over a sliding window on the input feature vector "vec". The code assumes that there is one observation per row.
258 | :param
259 | vec: input feature matrix (size:(num_observation,num_features))
260 | win_size: The size of the sliding window for local normalization; it should be odd.
261 |          default=301, which is around 3s if a 100 Hz frame rate is considered (== 10ms frame stride)
262 | variance_normalization: If variance normalization should be performed or not.
263 |
264 | :return: The mean(or mean+variance) normalized feature vector.
265 | """
266 |
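A minimal sketch, again on a hypothetical feature matrix; note the odd window size, which mirrors the ``cmvnw`` call in the full example below:

.. code-block:: python

    import numpy as np
    import speechpy

    features = np.random.randn(500, 13)  # hypothetical frame-level features

    # 301 frames is roughly 3 s of context at a 10 ms frame stride.
    local_norm = speechpy.processing.cmvnw(features, win_size=301,
                                           variance_normalization=True)
    print(local_norm.shape)  # (500, 13)
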
267 | -----
268 | Tests
269 | -----
270 |
271 | SpeechPy includes some unit tests. To run the tests, ``cd`` into the
272 | ``speechpy/tests`` directory and run:
273 |
274 | .. code-block:: shell
275 |
276 | python -m pytest
277 |
278 | The only requirement you need to install for the tests is ``pytest``.
279 |
280 | ------------
281 | Example
282 | ------------
283 |
284 | The test example, found in the ``example`` directory, is shown below:
285 |
286 | .. code-block:: python
287 |
288 | import scipy.io.wavfile as wav
289 | import numpy as np
290 | import speechpy
291 | import os
292 |
293 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
294 | fs, signal = wav.read(file_name)
295 | signal = signal[:,0]
296 |
297 | # Example of pre-emphasizing.
298 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
299 |
300 | # Example of stacking frames
301 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, filter=lambda x: np.ones((x,)),
302 | zero_padding=True)
303 |
304 | # Example of extracting power spectrum
305 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
306 | print('power spectrum shape=', power_spectrum.shape)
307 |
308 | ############# Extract MFCC features #############
309 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
310 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
311 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
312 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
313 |
314 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
315 | print('mfcc feature cube shape=', mfcc_feature_cube.shape)
316 |
317 | ############# Extract logenergy features #############
318 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
319 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
320 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
321 | print('logenergy features=', logenergy.shape)
322 |
323 | To extract features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked-frames collection.
324 |
325 | ---------------------
326 | Dependencies
327 | ---------------------
328 |
329 | The two packages ``SciPy`` and ``NumPy`` are the required dependencies and will be installed automatically by running the ``setup.py`` file.
330 |
331 | ---------------------
332 | Acknowledgements
333 | ---------------------
334 |
335 | This work is based upon work supported by the Center for Identification Technology Research and the National Science Foundation under Grant #1650474.
336 |
337 |
338 | ---------------------
339 | Contributing
340 | ---------------------
341 |
342 | When contributing to this repository, you are more than welcome to discuss your feedback with any of the owners of this repository. *For typos, please do not create a pull request. Instead, declare them in issues or email the repository owner*. For technical and conceptual questions please feel free to **directly contact the repository owner**. Before asking general questions related to the concepts and techniques provided in this project, **please make sure to read and understand its associated paper**.
343 |
344 | ~~~~~~~~~~~~~~~~~~~~~~~~
345 | Pull Request Process
346 | ~~~~~~~~~~~~~~~~~~~~~~~~
347 |
348 | Please consider the following criteria to help us review your contribution:
349 |
350 | 1. The pull request is mainly expected to be a code script suggestion or improvement.
351 | 2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section.
352 | 3. Ensure any install or build dependencies are removed before doing a build and creating a
353 |    pull request.
354 | 4. Add comments with details of changes to the interface; this includes new environment
355 |    variables, exposed ports, useful file locations and container parameters.
356 | 5. You may merge the pull request once you have the sign-off of at least one other developer. If you
357 |    do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed.
358 |
359 | ~~~~~~~~~~~~~~~~~~~~~~~~
360 | Declaring issues
361 | ~~~~~~~~~~~~~~~~~~~~~~~~
362 |
363 | To declare an issue, you can directly email the repository owner. However, please prefer to create an issue, as other
364 | followers of the repository may encounter the same problem. That way, the question is answered for other developers as well.
365 |
366 | ~~~~~~~~~~~~~~~~~~~~~~~~
367 | Final Note
368 | ~~~~~~~~~~~~~~~~~~~~~~~~
369 |
370 | We look forward to your kind feedback. Please help us improve this open source project and make our work better.
371 | For contribution, please create a pull request and we will investigate it promptly. Once again, we appreciate
372 | your feedback and thorough code inspections.
373 |
374 |
375 |
376 | ---------------------
377 | Disclaimer
378 | ---------------------
379 |
380 | Although with dramatic changes, some portions of this library are inspired by the `python speech features`_ library.
381 |
382 | .. _python speech features: https://github.com/jameslyons/python_speech_features
383 |
384 | We claim the following advantages for our library:
385 |
386 | 1. More accurate operations have been performed for the mel-frequency calculations.
387 | 2. The package supports different ``Python`` versions.
388 | 3. The features are generated in a more organized way, as cubic feature arrays.
389 | 4. The package is well-tested and integrated.
390 | 5. The package is up-to-date and actively developed.
391 | 6. The package has been used for research purposes.
392 | 7. Exceptions and extreme cases are handled in this library.
393 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/__init__.py
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-architect
--------------------------------------------------------------------------------
/_images/Speech_GIF.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/Speech_GIF.gif
--------------------------------------------------------------------------------
/_images/follow-twitter.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/follow-twitter.gif
--------------------------------------------------------------------------------
/_images/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/pipeline.jpg
--------------------------------------------------------------------------------
/_images/readme.rst:
--------------------------------------------------------------------------------
1 | The images used for this repository.
2 |
--------------------------------------------------------------------------------
/_images/speech.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/speech.gif
--------------------------------------------------------------------------------
/_images/speechpy_logo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/speechpy_logo.gif
--------------------------------------------------------------------------------
/_images/stackframes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/stackframes.png
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = SpeechPy
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-time-machine
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | -e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme
3 |
--------------------------------------------------------------------------------
/docs/source/_static/img/08063416.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/08063416.pdf
--------------------------------------------------------------------------------
/docs/source/_static/img/Speech_GIF.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/Speech_GIF.gif
--------------------------------------------------------------------------------
/docs/source/_static/img/installation_logo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/installation_logo.gif
--------------------------------------------------------------------------------
/docs/source/_static/img/installation_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/installation_logo.jpg
--------------------------------------------------------------------------------
/docs/source/_static/img/speech.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speech.gif
--------------------------------------------------------------------------------
/docs/source/_static/img/speech.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speech.jpg
--------------------------------------------------------------------------------
/docs/source/_static/img/speechpy_logo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speechpy_logo.gif
--------------------------------------------------------------------------------
/docs/source/_static/img/speechpy_logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speechpy_logo.jpg
--------------------------------------------------------------------------------
/docs/source/_static/img/stackframes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/stackframes.png
--------------------------------------------------------------------------------
/docs/source/_templates/breadcrumbs.html:
--------------------------------------------------------------------------------
1 | {# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
2 |
3 | {% if page_source_suffix %}
4 | {% set suffix = page_source_suffix %}
5 | {% else %}
6 | {% set suffix = source_suffix %}
7 | {% endif %}
8 |
9 | {% if meta is defined and meta is not none %}
10 | {% set check_meta = True %}
11 | {% else %}
12 | {% set check_meta = False %}
13 | {% endif %}
14 |
15 | {% if check_meta and 'github_url' in meta %}
16 | {% set display_github = True %}
17 | {% endif %}
18 |
19 | {% if check_meta and 'bitbucket_url' in meta %}
20 | {% set display_bitbucket = True %}
21 | {% endif %}
22 |
23 | {% if check_meta and 'gitlab_url' in meta %}
24 | {% set display_gitlab = True %}
25 | {% endif %}
26 |
27 |
28 |
29 |
70 |
71 | {% if (theme_prev_next_buttons_location == 'top' or theme_prev_next_buttons_location == 'both') and (next or prev) %}
72 |
80 | {% endif %}
81 |
82 |
83 |
--------------------------------------------------------------------------------
/docs/source/_templates/breadcrumbs.html~:
--------------------------------------------------------------------------------
1 | {# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
2 |
3 | {% if page_source_suffix %}
4 | {% set suffix = page_source_suffix %}
5 | {% else %}
6 | {% set suffix = source_suffix %}
7 | {% endif %}
8 |
9 | {% if meta is defined and meta is not none %}
10 | {% set check_meta = True %}
11 | {% else %}
12 | {% set check_meta = False %}
13 | {% endif %}
14 |
15 | {% if check_meta and 'github_url' in meta %}
16 | {% set display_github = True %}
17 | {% endif %}
18 |
19 | {% if check_meta and 'bitbucket_url' in meta %}
20 | {% set display_bitbucket = True %}
21 | {% endif %}
22 |
23 | {% if check_meta and 'gitlab_url' in meta %}
24 | {% set display_gitlab = True %}
25 | {% endif %}
26 |
27 |
28 |
29 |
70 |
71 | {% if (theme_prev_next_buttons_location == 'top' or theme_prev_next_buttons_location == 'both') and (next or prev) %}
72 |
80 | {% endif %}
81 |
82 |
83 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # SpeechPy documentation build configuration file, created by
4 | # sphinx-quickstart on Wed Nov 22 14:40:49 2017.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | # If extensions (or modules to document with autodoc) are in another directory,
16 | # add these directories to sys.path here. If the directory is relative to the
17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
18 | #
19 | import os
20 | import sys
21 | sys.path.insert(0, os.path.abspath('../../'))
22 | import speechpy
23 | import numpy
24 | import sphinx_rtd_theme
25 |
26 |
27 | # -- General configuration ------------------------------------------------
28 |
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 | 'sphinx.ext.autodoc',
38 | 'sphinx.ext.autosummary',
39 | 'sphinx.ext.doctest',
40 | 'sphinx.ext.intersphinx',
41 | 'sphinx.ext.todo',
42 | 'sphinx.ext.coverage',
43 | 'sphinx.ext.mathjax',
44 | 'sphinx.ext.napoleon',
45 | 'sphinx.ext.viewcode',
46 | # 'sphinxcontrib.googleanalytics',
47 | ]
48 |
49 | # True to use the :ivar: role for instance variables. False to use the .. attribute:: directive instead. Defaults to False.
50 | # Refer to http://www.sphinx-doc.org/en/stable/ext/napoleon.html
51 | napoleon_use_ivar = True
52 |
53 | # Add any paths that contain templates here, relative to this directory.
54 | templates_path = ['_templates']
55 |
56 | # The suffix(es) of source filenames.
57 | # You can specify multiple suffix as a list of string:
58 | #
59 | # source_suffix = ['.rst', '.md']
60 | source_suffix = '.rst'
61 |
62 | # The master toctree document.
63 | master_doc = 'index'
64 |
65 | # General information about the project.
66 | project = u'SpeechPy'
67 | copyright = u'2017, Amirsina Torfi'
68 | author = u'Amirsina Torfi'
69 |
70 | # The version info for the project you're documenting, acts as replacement for
71 | # |version| and |release|, also used in various other places throughout the
72 | # built documents.
73 | #
74 | version = 'master (' + '2.3.0' + ' )'
75 | # The full version, including alpha/beta/rc tags.
76 | # TODO: verify this works as expected
77 | release = 'master'
78 |
79 | # The language for content autogenerated by Sphinx. Refer to documentation
80 | # for a list of supported languages.
81 | #
82 | # This is also used if you do content translation via gettext catalogs.
83 | # Usually you set "language" from the command line for these cases.
84 | language = None
85 |
86 | # List of patterns, relative to source directory, that match files and
87 | # directories to ignore when looking for source files.
88 | # This patterns also effect to html_static_path and html_extra_path
89 | exclude_patterns = []
90 |
91 | # The name of the Pygments (syntax highlighting) style to use.
92 | pygments_style = 'sphinx'
93 |
94 | # If true, `todo` and `todoList` produce output, else they produce nothing.
95 | todo_include_todos = False
96 |
97 |
98 | # -- Options for HTML output ----------------------------------------------
99 |
100 | # The theme to use for HTML and HTML Help pages. See the documentation for
101 | # a list of builtin themes.
102 | #
103 | html_theme = 'sphinx_rtd_theme'
104 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
105 |
106 | # Theme options are theme-specific and customize the look and feel of a theme
107 | # further. For a list of options available for each theme, see the
108 | # documentation.
109 | #
110 | # html_theme_options = {}
111 |
112 | html_theme_options = {
113 | 'collapse_navigation': False,
114 | 'display_version': True,
115 | 'logo_only': True,
116 | 'sticky_navigation': False
117 | }
118 |
119 | html_context = {
120 | "display_github": True, # Add 'Edit on Github' link instead of 'View page source'
121 | "last_updated": True,
122 | "commit": False,
123 | }
124 |
125 | html_logo = '_static/img/speechpy_logo.gif'
126 |
127 | # Add any paths that contain custom static files (such as style sheets) here,
128 | # relative to this directory. They are copied after the builtin static files,
129 | # so a file named "default.css" will overwrite the builtin "default.css".
130 | html_static_path = ['_static']
131 |
132 | # -- Options for HTMLHelp output ------------------------------------------
133 |
134 | # Output file base name for HTML help builder.
135 | htmlhelp_basename = 'SpeechPydoc'
136 |
137 |
138 | # -- Options for LaTeX output ---------------------------------------------
139 |
140 | # -- Options for LaTeX output ---------------------------------------------
141 |
142 | # latex_engine = 'pdflatex'
143 |
144 | # latex_engine = 'lualatex'
145 | # latex_elements = {
146 |
147 | # 'papersize': 'a4paper',
148 | # 'releasename':" ",
149 | # 'figure_align':'htbp',
150 | # 'pointsize': '12pt',
151 | # 'fontpkg': r'''
152 | # \setmainfont{Times New Roman}
153 | # \setsansfont{Times New Roman}
154 | # \setmonofont{Times New Roman}
155 | # ''',
156 | # 'preamble': r'''
157 | # \usepackage[titles]{tocloft}
158 | # \cftsetpnumwidth {1.25cm}\cftsetrmarg{1.5cm}
159 | # \setlength{\cftchapnumwidth}{0.75cm}
160 | # \setlength{\cftsecindent}{\cftchapnumwidth}
161 | # \setlength{\cftsecnumwidth}{1.25cm}
162 | # ''',
163 | # 'fncychap': r'\usepackage[Bjornstrup]{fncychap}',
164 | # 'printindex': r'\footnotesize\raggedright\printindex',
165 | # }
166 |
167 |
168 |
169 |
170 | latex_elements = {
171 | # The paper size ('letterpaper' or 'a4paper').
172 | #
173 | 'papersize': 'letterpaper',
174 |
175 | # The font size ('10pt', '11pt' or '12pt').
176 | #
177 | 'pointsize': '10pt',
178 |
179 | # Additional stuff for the LaTeX preamble.
180 | #
181 | 'preamble': '',
182 |
183 | # Latex figure (float) alignment
184 | #
185 | 'figure_align': 'htbp',
186 | }
187 |
188 | # Grouping the document tree into LaTeX files. List of tuples
189 | # (source start file, target name, title,
190 | # author, documentclass [howto, manual, or own class]).
191 | latex_documents = [
192 | (master_doc, 'test.tex', u'test Documentation',
193 | u'test', 'manual'),
194 | ]
195 |
196 | # The name of an image file (relative to this directory) to place at the top of
197 | # the title page.
198 | #
199 | # latex_logo = None
200 |
201 | # If true, show page references after internal links.
202 | #
203 | # latex_show_pagerefs = False
204 |
205 | # If true, show URL addresses after external links.
206 | #
207 | # latex_show_urls = False
208 |
209 | # Documents to append as an appendix to all manuals.
210 | #
211 | # latex_appendices = []
212 |
213 | # If false, no module index is generated.
214 | #
215 | # latex_domain_indices = True
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 | latex_logo = '_static/img/speechpy_logo.jpg'
232 |
233 |
234 | # latex_elements = {
235 | # # The paper size ('letterpaper' or 'a4paper').
236 | # #
237 | # 'papersize': 'a4paper',
238 | # 'releasename':" ",
239 | # # Sonny, Lenny, Glenn, Conny, Rejne, Bjarne and Bjornstrup
240 | # # 'fncychap': '\\usepackage[Lenny]{fncychap}',
241 | # 'fncychap': '\\usepackage{fncychap}',
242 | # # 'fontpkg': ' ',
243 | #
244 | # 'figure_align':'htbp',
245 | # # The font size ('10pt', '11pt' or '12pt').
246 | # #
247 | # 'pointsize': '14pt',
248 | #
249 | # # Additional stuff for the LaTeX preamble.
250 | #
251 | # # 'preamble': r'''
252 | # # %%%%%%%%%%%%%%%%%%%% Sina %%%%%%%%%%%%%%%%%%
253 | # # %%%add number to subsubsection 2=subsection, 3=subsubsection
254 | # # %%% below subsubsection is not good idea.
255 | # # \setcounter{secnumdepth}{3}
256 | # # %
257 | # # %%%% Table of content upto 2=subsection, 3=subsubsection
258 | # # %\setcounter{tocdepth}{2}
259 | # #
260 | # # \usepackage{amsmath,amsfonts,amssymb,amsthm}
261 | # # \usepackage{graphicx}
262 | # #
263 | # # %\usepackage{minted}
264 | # # %\fvset{breaklines=true}
265 | # #
266 | # # %%% reduce spaces for Table of contents, figures and tables
267 | # # %%% it is used "\addtocontents{toc}{\vskip -1.2cm}" etc. in the document
268 | # # \usepackage[notlot,nottoc,notlof]{}
269 | # #
270 | # # \usepackage{color}
271 | # # \usepackage{transparent}
272 | # # \usepackage{eso-pic}
273 | # # \usepackage{lipsum}
274 | # #
275 | # # \usepackage{footnotebackref} %%link at the footnote to go to the place of footnote in the text
276 | # #
277 | # # %% spacing between line
278 | # # \usepackage{setspace}
279 | # # %%%%\onehalfspacing
280 | # # %%%%\doublespacing
281 | # # %\singlespacing
282 | # #
283 | # #
284 | # # %%%%%%%%%%% datetime
285 | # # \usepackage{datetime}
286 | # #
287 | # # \newdateformat{MonthYearFormat}{%
288 | # # \monthname[\THEMONTH], \THEYEAR}
289 | # #
290 | # #
291 | # # %% RO, LE will not work for 'oneside' layout.
292 | # # %% Change oneside to twoside in document class
293 | # # %\usepackage{fancyhdr}
294 | # # %\pagestyle{fancy}
295 | # # %\fancyhf{}
296 | # #
297 | # # %%% Alternating Header for oneside
298 | # # %\fancyhead[L]{\ifthenelse{\isodd{\value{page}}}{ \small \nouppercase{\leftmark} }{}}
299 | # # %\fancyhead[R]{\ifthenelse{\isodd{\value{page}}}{}{ \small \nouppercase{\rightmark} }}
300 | # #
301 | # # %%% Alternating Header for two side
302 | # # %\fancyhead[RO]{\small \nouppercase{\rightmark}}
303 | # # %\fancyhead[LE]{\small \nouppercase{\leftmark}}
304 | # #
305 | # # %% for oneside: change footer at right side. If you want to use Left and right then use same as header defined above.
306 | # # %\fancyfoot[R]{\ifthenelse{\isodd{\value{page}}}{{\tiny Amirsina Torfi} }{\href{https://github.com/astorfi/speechpy}{\tiny SpeechPy}}}
307 | # #
308 | # # %%% Alternating Footer for two side
309 | # # %\fancyfoot[RO, RE]{\scriptsize Amirsina Torfi (amirsina.torfi@gmail.com)}
310 | # #
311 | # # %%% page number
312 | # # %\fancyfoot[CO, CE]{\thepage}
313 | # #
314 | # # %\renewcommand{\headrulewidth}{0.5pt}
315 | # # %\renewcommand{\footrulewidth}{0.5pt}
316 | # #
317 | # # %\RequirePackage{tocbibind} %%% comment this to remove page number for following
318 | # # %\addto\captionsenglish{\renewcommand{\contentsname}{Table of contents}}
319 | # # %\addto\captionsenglish{\renewcommand{\listfigurename}{List of figures}}
320 | # # %\addto\captionsenglish{\renewcommand{\listtablename}{List of tables}}
321 | # # %\addto\captionsenglish{\renewcommand{\listtablename}{List of tables}} %%% Heading for TOC
322 | # #
323 | # #
324 | # # %%reduce spacing for itemize
325 | # # \usepackage{enumitem}
326 | # # %\setlist{nosep}
327 | # #
328 | # # %%%%%%%%%%% Quote Styles at the top of chapter
329 | # # %\usepackage{epigraph}
330 | # # %\setlength{\epigraphwidth}{0.8\columnwidth}
331 | # # %\newcommand{\chapterquote}[2]{\epigraphhead[60]{\epigraph{\textit{#1}}{\textbf {\textit{--#2}}}}}
332 | # # %%%%%%%%%%% Quote for all places except Chapter
333 | # # %\newcommand{\sectionquote}[2]{{\quote{\textit{``#1''}}{\textbf {\textit{--#2}}}}}
334 | # # ''',
335 | # #
336 | # #
337 | # # 'maketitle': r'''
338 | # # \pagenumbering{Roman} %%% to avoid page 1 conflict with actual page 1
339 | # #
340 | # # \begin{titlepage}
341 | # # \centering
342 | # #
343 | # # \vspace*{40mm} %%% * is used to give space from top
344 | # # \textbf{\Huge {SpeechPy: Speech Recognition Library}}
345 | # #
346 | # # \vspace{0mm}
347 | # # \begin{figure}[!h]
348 | # # \centering
349 | # # \includegraphics[scale=0.8]{speechpy_logo.jpg}
350 | # # \end{figure}
351 | # #
352 | # # \vspace{0mm}
353 | # # \Large \textbf{{Amirsina Torfi}}
354 | # #
355 | # # % \small Created on : October, 2017
356 | # #
357 | # # \vspace*{0mm}
358 | # # \small Last updated : \MonthYearFormat\today
359 | # #
360 | # #
361 | # # %% \vfill adds at the bottom
362 | # # \vfill
363 | # # \small \textit{Please refer to project repository at }{\href{https://github.com/astorfi/speechpy}{SpeechPy}}
364 | # # \end{titlepage}
365 | # #
366 | # # \clearpage
367 | # # \pagenumbering{roman}
368 | # # \tableofcontents
369 | # # % \listoffigures
370 | # # % \listoftables
371 | # # \clearpage
372 | # # \pagenumbering{arabic}
373 | # #
374 | # # ''',
375 | # # Latex figure (float) alignment
376 | # #
377 | # # 'figure_align': 'htbp',
378 | # # 'sphinxsetup': \
379 | # # #'hmargin={0.7in,0.7in}, vmargin={1in,1in}, \
380 | # # 'verbatimwithframe=true, \
381 | # # TitleColor={rgb}{0,0,0}',
382 | # # 'tableofcontents':' ',
383 | #
384 | # }
385 |
386 |
387 |
388 | # Grouping the document tree into LaTeX files. List of tuples
389 | # (source start file, target name, title,
390 | # author, documentclass [howto, manual, or own class]).
391 | latex_documents = [
392 | (master_doc, 'speechpy.tex', 'SpeechPy Documentation',
393 | 'Amirsina Torfi', 'manual'),
394 | ]
395 |
396 |
397 | # -- Options for manual page output ---------------------------------------
398 |
399 | # One entry per manual page. List of tuples
400 | # (source start file, name, description, authors, manual section).
401 | man_pages = [
402 | (master_doc, 'speechpy', u'SpeechPy Documentation',
403 | [author], 1)
404 | ]
405 |
406 |
407 | # -- Options for Texinfo output -------------------------------------------
408 |
409 | # Grouping the document tree into Texinfo files. List of tuples
410 | # (source start file, target name, title, author,
411 | # dir menu entry, description, category)
412 | texinfo_documents = [
413 | (master_doc, 'SpeechPy', u'SpeechPy Documentation',
414 | author, 'SpeechPy', 'A library for Speech Recognition and Feature Extraction.',
415 | 'Miscellaneous'),
416 | ]
417 |
418 |
419 | # Example configuration for intersphinx: refer to the Python standard library.
420 | intersphinx_mapping = {
421 | 'python': ('https://docs.python.org/3/', None),
422 | 'numpy': ('https://docs.scipy.org/doc/numpy/', None),
423 | }
424 |
425 |
426 |
--------------------------------------------------------------------------------
/docs/source/content/features.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 | Features
5 | =========
6 |
7 | .. automodule:: speechpy.feature
8 | .. currentmodule:: speechpy.feature
9 |
10 |
11 | :hidden:`MFCC`
12 | ~~~~~~~~~~~~~~
13 |
14 | .. autofunction:: speechpy.feature.mfcc
15 |
16 |
17 | :hidden:`Mel Frequency Energy`
18 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
19 |
20 | .. autofunction:: speechpy.feature.mfe
21 |
22 |
23 | :hidden:`Log Mel Frequency Energy`
24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 |
26 | .. autofunction:: speechpy.feature.lmfe
27 |
28 |
29 | :hidden:`Extract Derivative Features`
30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 |
32 | .. autofunction:: speechpy.feature.extract_derivative_feature
33 |
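34 | As a quick orientation, the following is a minimal sketch of how these
35 | functions can be combined; the ``sample.wav`` file name is only a
36 | placeholder for a mono recording of your own:
37 | 
38 | .. code-block:: python
39 | 
40 |     import scipy.io.wavfile as wav
41 |     import speechpy
42 | 
43 |     # Read a signal; the file name below is a placeholder.
44 |     fs, signal = wav.read('sample.wav')
45 | 
46 |     # 13 MFCCs per 20ms frame with a 10ms stride.
47 |     mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs,
48 |                                  frame_length=0.020, frame_stride=0.01,
49 |                                  num_filters=40, fft_length=512)
50 | 
51 |     # Stack the features with their first and second derivatives,
52 |     # yielding the cubic features mentioned elsewhere in these docs.
53 |     mfcc_cube = speechpy.feature.extract_derivative_feature(mfcc)
54 | 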
--------------------------------------------------------------------------------
/docs/source/content/postprocessing.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 |
5 | Postprocessing
6 | ==============
7 |
8 | .. automodule:: speechpy.processing
9 | .. currentmodule:: speechpy.processing
10 |
11 |
12 | :hidden:`Global Cepstral Mean and Variance Normalization`
13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
14 |
15 | .. autofunction:: speechpy.processing.cmvn
16 |
17 |
18 | :hidden:`Local Cepstral Mean and Variance Normalization over Sliding Window`
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 |
21 | .. autofunction:: speechpy.processing.cmvnw
22 |
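23 | As a brief illustration, the sketch below contrasts the two normalizations;
24 | here ``mfcc`` is assumed to be a (num_frames x num_coefficients) feature
25 | matrix, e.g. the output of ``speechpy.feature.mfcc``:
26 | 
27 | .. code-block:: python
28 | 
29 |     import speechpy
30 | 
31 |     # Global normalization over the whole utterance.
32 |     mfcc_cmvn = speechpy.processing.cmvn(mfcc, variance_normalization=True)
33 | 
34 |     # Local normalization over a 301-frame sliding window.
35 |     mfcc_cmvnw = speechpy.processing.cmvnw(mfcc, win_size=301,
36 |                                            variance_normalization=True)
37 | 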
--------------------------------------------------------------------------------
/docs/source/content/preprocessing.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 |
4 |
5 | Preprocessing
6 | =============
7 |
8 | .. automodule:: speechpy.processing
9 | .. currentmodule:: speechpy.processing
10 |
11 | :hidden:`Pre-emphasis`
12 | ~~~~~~~~~~~~~~~~~~~~~~~~~
13 | .. autofunction:: speechpy.processing.preemphasis
14 |
15 | :hidden:`Stacking`
16 | ~~~~~~~~~~~~~~~~~~
17 | .. autofunction:: speechpy.processing.stack_frames
18 |
19 | :hidden:`FFT Spectrum`
20 | ~~~~~~~~~~~~~~~~~~~~~~
21 | .. autofunction:: speechpy.processing.fft_spectrum
22 |
23 | :hidden:`Power Spectrum`
24 | ~~~~~~~~~~~~~~~~~~~~~~~~
25 | .. autofunction:: speechpy.processing.power_spectrum
26 |
27 | :hidden:`Power Spectrum Log`
28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29 | .. autofunction:: speechpy.processing.log_power_spectrum
30 |
31 | :hidden:`Derivative Extraction`
32 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
33 |
34 | .. autofunction:: speechpy.processing.derivative_extraction
35 |
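36 | As a short sketch of a typical preprocessing chain (``fs`` and the
37 | one-dimensional ``signal`` array are assumed to come from a previously
38 | read wave file):
39 | 
40 | .. code-block:: python
41 | 
42 |     import numpy as np
43 |     import speechpy
44 | 
45 |     # Emphasize the high-frequency content before framing.
46 |     signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
47 | 
48 |     # Slice the signal into overlapping 20ms frames with a 10ms stride.
49 |     frames = speechpy.processing.stack_frames(
50 |         signal_preemphasized, sampling_frequency=fs,
51 |         frame_length=0.020, frame_stride=0.01,
52 |         filter=lambda x: np.ones((x,)), zero_padding=True)
53 | 
54 |     # Power spectrum of each frame over 512 FFT points.
55 |     power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
56 | 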
--------------------------------------------------------------------------------
/docs/source/epilogue/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 |
2 | ========================
3 | Contributing
4 | ========================
5 |
6 | When contributing to this repository, you are more than welcome to discuss your feedback with any of the owners of this repository. *For typos, please do not create a pull request. Instead, declare them in issues or email the repository owner*. For technical and conceptual questions please feel free to **directly contact the repository owner**. Before asking general questions related to the concepts and techniques provided in this project, **please make sure to read and understand its associated paper**.
7 |
8 | Please note we have a code of conduct, please follow it in all your interactions with the project.
9 |
10 | ----------------------
11 | Pull Request Process
12 | ----------------------
13 |
14 | Please consider the following criteria to help us review your contribution effectively:
15 |
16 | 1. The pull request is mainly expected to be a code script suggestion or improvement.
17 | 2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section.
18 | 3. Ensure any install or build dependencies are removed before the end of the layer when doing a
19 | build and creating a pull request.
20 | 4. Add comments with details of changes to the interface, this includes new environment
21 | variables, exposed ports, useful file locations and container parameters.
22 | 5. You may merge the Pull Request once you have the sign-off of at least one other developer, or, if you
23 | do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed.
24 |
25 | ----------------------
26 | Final Note
27 | ----------------------
28 |
29 | We are looking forward to your kind feedback. Please help us to improve this open source project and make our work better.
30 | For contribution, please create a pull request and we will investigate it promptly. Once again, we appreciate
31 | your kind feedback and elaborate code inspections.
32 |
--------------------------------------------------------------------------------
/docs/source/epilogue/finalnote.rst:
--------------------------------------------------------------------------------
1 | ==================
2 | Epilogue
3 | ==================
4 |
5 | -------------
6 | Disclaimer
7 | -------------
8 |
9 | Although with dramatic changes, some portions of this library are inspired by the `python speech features`_ library.
10 |
11 | .. _python speech features: https://github.com/jameslyons/python_speech_features
12 |
13 | We claim the following advantages for our library:
14 |
15 | 1. More accurate operations have been performed for the mel-frequency calculations.
16 | 2. The package supports different ``Python`` versions.
17 | 3. The features are generated in a more organized way, as cubic features.
18 | 4. The package is well-tested and integrated.
19 | 5. The package is up-to-date and actively developed.
20 | 6. The package has been used for research purposes.
21 | 7. Exceptions and extreme cases are handled in this library.
22 |
23 |
24 | -------------
25 | Contributing
26 | -------------
27 |
28 | When contributing to this repository, please first discuss the change you wish to make via issue,
29 | email, or any other method with the owners of this repository before making a change. *For typos, please
30 | do not create a pull request. Instead, declare them in issues or email the repository owner*.
31 |
32 | Please note we have a code of conduct, please follow it in all your interactions with the project.
33 |
34 | --------------------------
35 | Pull Request Process
36 | --------------------------
37 |
38 | Please consider the following criteria to help us review your contribution effectively:
39 |
40 | 1. The pull request is mainly expected to be a code script suggestion or improvement.
41 | 2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section.
42 | 3. Ensure any install or build dependencies are removed before the end of the layer when doing a
43 | build and creating a pull request.
44 | 4. Add comments with details of changes to the interface, this includes new environment
45 | variables, exposed ports, useful file locations and container parameters.
46 | 5. You may merge the Pull Request once you have the sign-off of at least one other developer, or, if you
47 | do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed.
48 |
49 | -------------
50 | Final Note
51 | -------------
52 |
53 | We are looking forward to your kind feedback. Please help us to improve this open source project and make our work better.
54 | For contribution, please create a pull request and we will investigate it promptly. Once again, we appreciate
55 | your kind feedback and elaborate code inspections.
56 |
--------------------------------------------------------------------------------
/docs/source/epilogue/test.rst:
--------------------------------------------------------------------------------
1 | ============
2 | test
3 | ============
4 |
5 | -------------
6 | Test Package
7 | -------------
8 | Once the package has been installed, a test file can be directly run to show the results.
9 | The test example can be seen in ``test/test_package.py`` as below:
10 |
11 | .. code-block:: python
12 |
13 | import scipy.io.wavfile as wav
14 | import numpy as np
15 | import speechpy
16 | import os
17 |
18 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
19 | fs, signal = wav.read(file_name)
20 | signal = signal[:,0]
21 |
22 | # Example of pre-emphasizing.
23 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
24 |
25 | # Example of stacking frames
26 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, filter=lambda x: np.ones((x,)),
27 | zero_padding=True)
28 |
29 | # Example of extracting power spectrum
30 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
31 | print('power spectrum shape=', power_spectrum.shape)
32 |
33 | ############# Extract MFCC features #############
34 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
35 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
36 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
37 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
38 |
39 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
40 | print('mfcc feature cube shape=', mfcc_feature_cube.shape)
41 |
42 | ############# Extract logenergy features #############
43 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
44 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
45 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
46 | print('logenergy features=', logenergy.shape)
47 |
48 |
49 |
50 |
51 |
52 |
53 | -----------
54 | Test Local
55 | -----------
56 |
57 | There is an alternative way of testing locally, without the necessity of installing the package.
58 | The local test example can be found in ``test/test_local.py`` as follows:
59 |
60 | .. code-block:: python
61 |
62 | import scipy.io.wavfile as wav
63 | import numpy as np
64 | import os
65 | import sys
66 | lib_path = os.path.abspath(os.path.join('..'))
67 | print(lib_path)
68 | sys.path.append(lib_path)
69 | import speechpy
70 |
71 |
72 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
73 | fs, signal = wav.read(file_name)
74 | signal = signal[:,0]
75 |
76 | # Example of pre-emphasizing.
77 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
78 |
79 | # Example of stacking frames
80 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, filter=lambda x: np.ones((x,)),
81 | zero_padding=True)
82 |
83 | # Example of extracting power spectrum
84 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
85 | print('power spectrum shape=', power_spectrum.shape)
86 |
87 | ############# Extract MFCC features #############
88 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
89 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
90 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True)
91 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
92 |
93 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
94 | print('mfcc feature cube shape=', mfcc_feature_cube.shape)
95 |
96 | ############# Extract logenergy features #############
97 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
98 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
99 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
100 | print('logenergy features=', logenergy.shape)
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 | For extracting the features, the signal samples are first stacked into frames, and the features are then computed for each frame in the stacked-frames collection. For instance, at an assumed sampling rate of 16 kHz, the default 20ms frames and 10ms stride correspond to 320 and 160 samples respectively, so one second of audio yields roughly 100 frames.
117 |
118 | -------------
119 | Dependencies
120 | -------------
121 |
122 | The two packages ``SciPy`` and ``NumPy`` are the required dependencies, and they will be installed automatically by running the ``setup.py`` file.
123 |
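124 | If the dependencies need to be installed manually, for instance before running
125 | the local test above, a plain ``pip`` command should suffice:
126 | 
127 | .. code-block:: shell
128 | 
129 |     pip install numpy scipy
130 | 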
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. SpeechPy documentation master file, created by
2 | sphinx-quickstart on Wed Nov 22 14:40:49 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | :github_url: https://github.com/astorfi/speechpy
7 |
8 | Welcome to SpeechPy's documentation!
9 | ====================================
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 | :caption: Preface
14 |
15 | intro/introductions
16 |
17 | .. toctree::
18 | :maxdepth: 2
19 | :caption: Package Reference
20 |
21 | content/preprocessing
22 | content/features
23 | content/postprocessing
24 |
25 | .. toctree::
26 | :maxdepth: 2
27 | :caption: Epilogue
28 |
29 | epilogue/test
30 | epilogue/CONTRIBUTING
31 |
32 |
33 | Indices and tables
34 | ==================
35 |
36 | * :ref:`genindex`
37 | * :ref:`modindex`
38 | * :ref:`search`
39 |
--------------------------------------------------------------------------------
/docs/source/intro/introductions.rst:
--------------------------------------------------------------------------------
1 |
2 | ============
3 | Introduction
4 | ============
5 |
6 | -------------------------
7 | Foreword
8 | -------------------------
9 |
10 | The purpose of this project is to provide a package for speech processing and
11 | feature extraction. This library provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of the filterbanks.
12 |
13 |
14 | .. image:: ../_static/img/speech.jpg
15 | :height: 200px
16 | :width: 400 px
17 | :scale: 100 %
18 | :alt: alternate text
19 | :align: center
20 |
21 | -------------------------
22 | Motivation
23 | -------------------------
24 |
25 | There are different motivations for this open source project.
26 |
27 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
28 | Deep Learning application
29 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
30 |
31 | One of the main reasons for creating this package was to provide the necessary features for deep learning applications such as ASR (Automatic Speech Recognition) or SR (Speaker Recognition).
32 | As a result, most of the necessary features are provided here.
33 |
34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
35 | Pythonic Packaging
36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
37 |
38 | Another reason for creating this package was to have a Pythonic environment for
39 | speech recognition and feature extraction, due to the fact that the Python language
40 | is becoming ubiquitous!
41 |
42 |
43 | -------------------------
44 | How to Install?
45 | -------------------------
46 |
47 | .. image:: ../_static/img/installation_logo.jpg
48 | :height: 100 px
49 | :width: 200 px
50 | :scale: 80 %
51 | :alt: alternate text
52 | :align: center
53 |
54 |
55 | There are two possible ways to install this package: local installation and PyPI.
56 |
57 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
58 | Local Installation
59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
60 |
61 | For local installation, the repository must first be cloned::
62 |
63 | git clone https://github.com/astorfi/speech_feature_extraction.git
64 |
65 |
66 | After cloning the repository, navigate to the repository directory and then execute::
67 |
68 | python setup.py develop
69 |
70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
71 | PyPI
72 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~
73 |
74 | The package is available on PyPI. For direct installation, simply execute the following:
75 |
76 |
77 | .. code-block:: shell
78 |
79 | pip install speechpy
80 |
81 | -------------------------
82 | Citation
83 | -------------------------
84 |
85 | If you use this package, please cite it as follows:
86 |
87 | .. code:: latex
88 |
89 | @misc{amirsina_torfi_2017_840395,
90 | author = {Amirsina Torfi},
91 | title = {{SpeechPy: Speech recognition and feature extraction}},
92 | month = aug,
93 | year = 2017,
94 | doi = {10.5281/zenodo.840395},
95 | url = {https://doi.org/10.5281/zenodo.840395}
96 | }
97 |
--------------------------------------------------------------------------------
/example/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/example/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
--------------------------------------------------------------------------------
/example/test_local.py:
--------------------------------------------------------------------------------
1 | """
2 | This example is provided to test the package locally.
3 | There is no need to install the package using pip.
4 | Only cloning the project repository is required.
5 | """
6 |
7 | import scipy.io.wavfile as wav
8 | import numpy as np
9 | import os
10 | import sys
11 | lib_path = os.path.abspath(os.path.join('..'))
12 | print(lib_path)
13 | sys.path.append(lib_path)
14 | from speechpy import processing
15 | from speechpy import feature
16 |
17 |
18 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
19 | fs, signal = wav.read(file_name)
20 | signal = signal[:,0]
21 |
22 | # Pre-emphasizing.
23 | signal_preemphasized = processing.preemphasis(signal, cof=0.98)
24 |
25 | # Stacking frames
26 | frames = processing.stack_frames(signal, sampling_frequency=fs,
27 | frame_length=0.020,
28 | frame_stride=0.01,
29 | filter=lambda x: np.ones((x,)),
30 | zero_padding=True)
31 |
32 | # Extracting power spectrum
33 | power_spectrum = processing.power_spectrum(frames, fft_points=512)
34 | print('power spectrum shape=', power_spectrum.shape)
35 |
36 | ############# Extract MFCC features #############
37 | mfcc = feature.mfcc(signal, sampling_frequency=fs,
38 | frame_length=0.020, frame_stride=0.01,
39 | num_filters=40, fft_length=512, low_frequency=0,
40 | high_frequency=None)
41 |
42 | # Cepstral mean variance normalization.
43 | mfcc_cmvn = processing.cmvn(mfcc,variance_normalization=True)
44 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
45 |
46 | # Extracting derivative features
47 | mfcc_feature_cube = feature.extract_derivative_feature(mfcc)
48 | print('mfcc feature cube shape=', mfcc_feature_cube.shape)
49 |
50 | ############# Extract logenergy features #############
51 | logenergy = feature.lmfe(signal, sampling_frequency=fs,
52 | frame_length=0.020, frame_stride=0.01,
53 | num_filters=40, fft_length=512,
54 | low_frequency=0, high_frequency=None)
55 | logenergy_feature_cube = feature.extract_derivative_feature(logenergy)
56 | print('logenergy features=', logenergy.shape)
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/example/test_package.py:
--------------------------------------------------------------------------------
1 | """
2 | This example is provided to test the installed package.
3 | The package should be installed from PyPi using pip install speechpy.
4 | """
5 |
6 | import scipy.io.wavfile as wav
7 | import numpy as np
8 | import speechpy
9 | import os
10 |
11 | # Reading the sample wave file
12 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),
13 | 'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
14 | fs, signal = wav.read(file_name)
15 | signal = signal[:,0]
16 |
17 | # Pre-emphasizing.
18 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)
19 |
20 | # Stacking frames
21 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs,
22 | frame_length=0.020,
23 | frame_stride=0.01,
24 | filter=lambda x: np.ones((x,)),
25 | zero_padding=True)
26 |
27 | # Extracting power spectrum
28 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
29 | print('power spectrum shape=', power_spectrum.shape)
30 |
31 | ############# Extract MFCC features #############
32 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs,
33 | frame_length=0.020, frame_stride=0.01,
34 | num_filters=40, fft_length=512, low_frequency=0,
35 | high_frequency=None)
36 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,
37 | variance_normalization=True)
38 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)
39 |
40 | # Extracting derivative features
41 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
42 | print('mfcc feature cube shape=', mfcc_feature_cube.shape)
43 |
44 | ############# Extract logenergy features #############
45 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs,
46 | frame_length=0.020, frame_stride=0.01,
47 | num_filters=40, fft_length=512,
48 | low_frequency=0, high_frequency=None)
49 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
50 | print('logenergy features=', logenergy.shape)
51 |
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{hirsch2000aurora,
2 | title={The Aurora experimental framework for the performance evaluation of speech recognition systems under noisy conditions},
3 | author={Hirsch, Hans-G{\"u}nter and Pearce, David},
4 | booktitle={ASR2000-Automatic Speech Recognition: Challenges for the new Millenium ISCA Tutorial and Research Workshop (ITRW)},
5 | year={2000}
6 | }
7 |
8 | @book{guyon2008feature,
9 | title={Feature extraction: foundations and applications},
10 | author={Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi A},
11 | volume={207},
12 | year={2008},
13 | publisher={Springer}
14 | }
15 |
16 | @article{furui1986speaker,
17 | title={Speaker-independent isolated word recognition using dynamic features of speech spectrum},
18 | author={Furui, Sadaoki},
19 | journal={IEEE Transactions on Acoustics, Speech, and Signal Processing},
20 | volume={34},
21 | number={1},
22 | pages={52--59},
23 | year={1986},
24 | publisher={IEEE}
25 | }
26 |
27 | @book{yu2016automatic,
28 | title={AUTOMATIC SPEECH RECOGNITION.},
29 | author={Yu, Dong and Deng, Li},
30 | year={2016},
31 | publisher={Springer}
32 | }
33 |
34 | @book{rabiner1993fundamentals,
35 | title={Fundamentals of speech recognition},
36 | author={Rabiner, Lawrence R and Juang, Biing-Hwang},
37 | volume={14},
38 | year={1993},
39 | publisher={PTR Prentice Hall Englewood Cliffs}
40 | }
41 |
42 | @article{campbell1997speaker,
43 | title={Speaker recognition: A tutorial},
44 | author={Campbell, Joseph P},
45 | journal={Proceedings of the IEEE},
46 | volume={85},
47 | number={9},
48 | pages={1437--1462},
49 | year={1997},
50 | publisher={IEEE}
51 | }
52 |
53 |
54 | @inproceedings{deng2013recent,
55 | title={Recent advances in deep learning for speech research at Microsoft},
56 | author={Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, Frank and Seltzer, Michael and Zweig, Geoff and He, Xiaodong and Williams, Jason and others},
57 | booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on},
58 | pages={8604--8608},
59 | year={2013},
60 | organization={IEEE}
61 | }
62 |
63 | @inproceedings{lee2009unsupervised,
64 | title={Unsupervised feature learning for audio classification using convolutional deep belief networks},
65 | author={Lee, Honglak and Pham, Peter and Largman, Yan and Ng, Andrew Y},
66 | booktitle={Advances in neural information processing systems},
67 | pages={1096--1104},
68 | year={2009}
69 | }
70 |
71 | @inproceedings{yu2011improved,
72 | title={Improved bottleneck features using pretrained deep neural networks},
73 | author={Yu, Dong and Seltzer, Michael L},
74 | booktitle={Twelfth Annual Conference of the International Speech Communication Association},
75 | year={2011}
76 | }
77 |
78 | @article{giannakopoulos2015pyaudioanalysis,
79 | title={pyAudioAnalysis: An Open-Source Python Library for Audio Signal Analysis},
80 | author={Giannakopoulos, Theodoros},
81 | journal={PloS one},
82 | volume={10},
83 | number={12},
84 | year={2015},
85 | publisher={Public Library of Science}
86 | }
87 |
88 | @article{torfi2017text,
89 | title={Text-independent speaker verification using 3d convolutional neural networks},
90 | author={Torfi, Amirsina and Nasrabadi, Nasser M and Dawson, Jeremy},
91 | journal={arXiv preprint arXiv:1705.09422},
92 | year={2017}
93 | }
94 |
95 | @article{torfi20173d,
96 | title={3D Convolutional Neural Networks for Cross Audio-Visual Matching Recognition},
97 | author={Torfi, Amirsina and Iranmanesh, Seyed Mehdi and Nasrabadi, Nasser and Dawson, Jeremy},
98 | journal={IEEE Access},
99 | volume={5},
100 | pages={22081--22091},
101 | year={2017},
102 | publisher={IEEE}
103 | }
104 |
105 | @article{prechelt2000empirical,
106 | title={An empirical comparison of c, c++, java, perl, python, rexx and tcl},
107 | author={Prechelt, Lutz},
108 | journal={IEEE Computer},
109 | volume={33},
110 | number={10},
111 | pages={23--29},
112 | year={2000}
113 | }
114 |
115 | @misc{torfispeechpy,
116 | author = {Amirsina Torfi},
117 | title = {{SpeechPy: Speech recognition and feature extraction}},
118 | month = aug,
119 | year = 2017,
120 | doi = {10.5281/zenodo.810391},
121 | url = {https://doi.org/10.5281/zenodo.810391}}
122 |
123 | @article{torfi2017coupled,
124 | title={Coupled 3D Convolutional Neural Networks for Audio-Visual Recognition},
125 | author={Torfi, Amirsina and Iranmanesh, Seyed Mehdi and Nasrabadi, Nasser M and Dawson, Jeremy},
126 | journal={arXiv preprint arXiv:1706.05739},
127 | year={2017}
128 | }
129 |
130 | @article{torfi2017construction,
131 | title={On the Construction of Polar Codes for Achieving the Capacity of Marginal Channels},
132 | author={Torfi, Amirsina and Soleymani, Sobhan and Vakili, Vahid Tabataba},
133 | journal={arXiv preprint arXiv:1707.04512},
134 | year={2017}
135 | }
136 |
137 | @article{shannon2001mathematical,
138 | title={A mathematical theory of communication},
139 | author={Shannon, Claude Elwood},
140 | journal={ACM SIGMOBILE Mobile Computing and Communications Review},
141 | volume={5},
142 | number={1},
143 | pages={3--55},
144 | year={2001},
145 | publisher={ACM}
146 | }
147 |
148 | @article{gurban2009information,
149 | title={Information theoretic feature extraction for audio-visual speech recognition},
150 | author={Gurban, Mihai and Thiran, Jean-Philippe},
151 | journal={IEEE Transactions on signal processing},
152 | volume={57},
153 | number={12},
154 | pages={4765--4776},
155 | year={2009},
156 | publisher={IEEE}
157 | }
158 |
159 | @inproceedings{variani2014deep,
160 | title={Deep neural networks for small footprint text-dependent speaker verification},
161 | author={Variani, Ehsan and Lei, Xin and McDermott, Erik and Moreno, Ignacio Lopez and Gonzalez-Dominguez, Javier},
162 | booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2014 IEEE International Conference on},
163 | pages={4052--4056},
164 | year={2014},
165 | organization={IEEE}
166 | }
167 |
168 | @article{hinton2012deep,
169 | title={Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups},
170 | author={Hinton, Geoffrey and Deng, Li and Yu, Dong and Dahl, George E and Mohamed, Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Sainath, Tara N and others},
171 | journal={IEEE Signal Processing Magazine},
172 | volume={29},
173 | number={6},
174 | pages={82--97},
175 | year={2012},
176 | publisher={IEEE}
177 | }
178 |
179 | @article{lecun2015deep,
180 | title={Deep learning},
181 | author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
182 | journal={nature},
183 | volume={521},
184 | number={7553},
185 | pages={436},
186 | year={2015},
187 | publisher={Nature Publishing Group}
188 | }
189 |
190 | @article{liu2015deep,
191 | title={Deep feature for text-dependent speaker verification},
192 | author={Liu, Yuan and Qian, Yanmin and Chen, Nanxin and Fu, Tianfan and Zhang, Ya and Yu, Kai},
193 | journal={Speech Communication},
194 | volume={73},
195 | pages={1--13},
196 | year={2015},
197 | publisher={Elsevier}
198 | }
199 |
200 | @article{torfi2018attention,
201 | title={Attention-Based Guided Structured Sparsity of Deep Neural Networks},
202 | author={Torfi, Amirsina and Shirvani, Rouzbeh A},
203 | journal={arXiv preprint arXiv:1802.09902},
204 | year={2018}
205 | }
206 |
207 |
208 |
--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'SpeechPy - A Library for Speech Processing and Recognition'
3 | tags:
4 | - Python
5 | authors:
6 | - name: Amirsina Torfi
7 | orcid: 0000-0003-2282-4361
8 | affiliation: "1"
9 | affiliations:
10 | - name: Virginia Tech, Department of Computer Science
11 | index: 1
12 | date: 15 May 2018
13 | bibliography: paper.bib
14 | ---
15 |
16 | # Abstract
17 | SpeechPy is an open source Python package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification.
18 |
19 | # Introduction
20 | Automatic Speech Recognition (ASR) requires three main components for
21 | further analysis: Preprocessing, feature extraction, and
22 | post-processing. Feature extraction, in an abstract sense, is
23 | extracting descriptive features from the raw signal for speech
24 | classification purposes. Due to its high
25 | dimensionality, the raw signal can be less informative than
26 | extracted higher-level features. Feature extraction turns
27 | the high-dimensional signal into a lower-dimensional and yet
28 | more informative version of it for sound recognition and
29 | classification [@furui1986speaker; @guyon2008feature; @hirsch2000aurora].
30 |
31 | 
32 |
33 | Feature extraction, in essence, should be done considering the specific
34 | application at hand. For example, in ASR applications, the linguistic
35 | characteristics of the raw signal are of great importance and the other
36 | characteristics must be
37 | ignored [@yu2016automatic; @rabiner1993fundamentals]. On the other hand,
38 | in Speaker Recognition (SR) task, solely voice-associated information
39 | must be contained in the extracted feature [@campbell1997speaker]. So the
40 | feature extraction goal is to extract the relevant feature from the raw
41 | signal and map it to a lower dimensional feature space. The problem of
42 | feature extraction has been investigated in pattern classification aimed
43 | at preventing the curse of dimensionality. There are some feature
44 | extraction approaches based on information theory
45 | [@torfi2017construction; @shannon2001mathematical] applied to multimodal
46 | signals and demonstrated promising results [@gurban2009information].
47 |
48 | The speech features can be categorized into two general types:
49 | acoustic and linguistic features. The former is mainly related to
50 | non-verbal sounds and the latter is associated with ASR and SR
51 | systems, for which the verbal part has the major role. Perhaps one of the most
52 | famous linguistic features, which is hard to beat, is the Mel-Frequency
53 | Cepstral Coefficients (MFCC). It uses raw speech frames in the range
54 | from 20ms to 40ms so that they have stationary
55 | characteristics [@rabiner1993fundamentals].
56 | ASR and SR tasks and more recently in the associated deep learning
57 | applications as the input to the network rather than directly feeding
58 | the signal [@deng2013recent; @lee2009unsupervised; @yu2011improved].
59 | With the advent of deep learning [@lecun2015deep; @torfi2018attention],
60 | major improvements have been achieved by using deep neural networks
61 | rather than traditional methods for speech recognition
62 | applications [@variani2014deep; @hinton2012deep; @liu2015deep].
63 |
64 | Although free software for speech recognition such as
65 | VOICEBOX is available, most of these packages are Matlab-based, which limits
66 | their reproducibility due to commercial licensing issues. Another great package is
67 | PyAudioAnalysis [@giannakopoulos2015pyaudioanalysis], which is a
68 | comprehensive package developed in Python. However, the issues with
69 | PyAudioAnalysis are its complexity and verbosity for
70 | extracting simple features; it also lacks some important
71 | preprocessing and post-processing operations in its current version.
72 |
73 | Considering the recent advent of deep learning in ASR and SR and the
74 | importance of accurate speech feature extraction, here are the
75 | motivations behind the SpeechPy package:
76 |
77 | * Developing a free open source package which covers important
78 | preprocessing techniques, speech features, and post-processing
79 | operations required for ASR and SR applications.
80 |
81 | * A simple package with a minimum degree of complexity should be
82 | available for beginners.
83 |
84 | * A well-tested and continuously integrated package for future
85 | developments should be developed.
86 |
87 | SpeechPy has been developed to satisfy the aforementioned needs. It
88 | contains the most important preprocessing and post-processing operations
89 | and a selection of frequently used speech features. The package is free
90 | and released as open source software. Continuous integration,
91 | used for instant error checking and validation of changes, has been deployed
92 | for SpeechPy. Moreover, prior to the latest official release of
93 | SpeechPy, the package has successfully been utilized for research
94 | purposes [@torfi20173d; @torfi2017text].
95 |
96 | # Package Eco-system
97 |
98 |
99 | SpeechPy has been developed using the Python language for its interface and
100 | backend as well. An empirical study demonstrated that Python, as a
101 | scripting language, is more effective and productive than conventional
102 | languages for some programming problems, and its memory consumption is
103 | often “better than Java and not much worse than C or
104 | C++” [@prechelt2000empirical]. We chose Python due to its simplicity and
105 | popularity. Third-party libraries are avoided except *Numpy* and *Scipy*
106 | for handling data and numeric computations.
107 |
108 | ## Complexity
109 |
110 | As the user should not, and does not even need to, manipulate the internal
111 | package structure, object-oriented programming is mostly used for
112 | package development, which provides an easier interface for the user at the
113 | cost of some simplicity of the code. However, the internal code
114 | complexity of the package does not affect the user experience since the
115 | modules can easily be called with the associated arguments. SpeechPy is
116 | a library with a collection of sub-modules.
117 |
118 | ## Code Style and Documentation
119 |
120 | SpeechPy is constructed based on PEP 8 style guide for Python codes.
121 | Moreover, it is extensively documented using the formatted docstrings
122 | and Sphinx for further automatic modifications to the document in
123 | case of changing internal modules. The full documentation of the project
124 | will be generated in HTML and PDF format using Sphinx and is hosted
125 | online. The official releases of the project are hosted on the Zenodo as
126 | well [@torfispeechpy].
127 |
128 | 
129 |
130 | ## Continuous Testing and Extensibility
131 |
132 | The output of each function has been evaluated using different
133 | tests and checked against other existing standard packages. For continuous
134 | testing, the code is hosted on GitHub and integrated with Travis CI.
135 | Each modification to the code must pass the unit tests defined for the
136 | continuous integration. This will ensure the package does not break with
137 | unadapted code scripts. However, the validity of the modifications
138 | should always be investigated with the owner or authorized collaborators
139 | of the project. The code will be tested at each time of modification for
140 | Python versions *“2.7”*, *“3.4”* and *“3.5”*. In the future, these
141 | versions are subject to change.
142 |
143 | 
144 |
145 | # Availability
146 |
147 | ## Operating system {#operating-system .unnumbered}
148 |
149 | Tested on Ubuntu 14.04 and 16.04 LTS Linux, Apple Mac OS X 10.9.5, and
150 | Microsoft Windows 7 & 10. We expect that SpeechPy works on any
151 | distribution as long as Python and the package dependencies are
152 | installed.
153 |
154 | ## Programming language {#programming-language .unnumbered}
155 |
156 | The package has been tested with Python 2.7, 3.4 and 3.5. However, using
157 | Python 3.5 is suggested.
158 |
159 | ## Additional system requirements & dependencies {#additional-system-requirements-dependencies .unnumbered}
160 |
161 | SpeechPy is a lightweight package, and small computational power is
162 | enough for running it, although the speed of execution is totally
163 | dependent on the system architecture. The dependencies are as follows:
164 |
165 | * Numpy
166 |
167 | * SciPy
168 |
169 | # Acknowledgement
170 |
171 | This work has been completed with computational resources provided by the West Virginia University and Virginia Tech and is based upon a work
172 | supported by the Center for Identification Technology Research (CITeR) and the National Science Foundation (NSF) under Grant \#1650474.
173 | I would like to thank professor Nasser Nasrabadi for supporting me through this project and for his valuable supervision regarding my research in speech technology.
174 |
175 | # References
176 |
--------------------------------------------------------------------------------
/paper/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/paper.pdf
--------------------------------------------------------------------------------
/paper/test/_imgs/Scheme_of_speech_recognition_system.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/test/_imgs/Scheme_of_speech_recognition_system.png
--------------------------------------------------------------------------------
/paper/test/_imgs/packageview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/test/_imgs/packageview.png
--------------------------------------------------------------------------------
/paper/test/_imgs/travicCI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/test/_imgs/travicCI.png
--------------------------------------------------------------------------------
/paper/test/test.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'SpeechPy - A Library for Speech Processing and Recognition'
3 | tags:
4 | - Python
5 | authors:
6 | - name: Amirsina Torfi
7 | orcid: 0000-0003-2282-4361
8 | affiliation: "1"
9 | affiliations:
10 | - name: Virginia Tech, Department of Computer Science
11 | index: 1
12 | date: 15 May 2018
13 | bibliography: paper.bib
14 | ---
15 |
16 | # Abstract
17 | SpeechPy is an open source Python package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification.
18 |
19 | # Introduction
20 | Automatic Speech Recognition (ASR) requires three main components for
21 | further analysis: Preprocessing, feature extraction, and
22 | post-processing. Feature extraction, in an abstract sense, is
23 | extracting descriptive features from the raw signal for speech
24 | classification purposes. Due to its high
25 | dimensionality, the raw signal can be less informative than
26 | extracted higher-level features. Feature extraction turns
27 | the high-dimensional signal into a lower-dimensional and yet
28 | more informative version of it for sound recognition and
29 | classification [@furui1986speaker; @guyon2008feature; @hirsch2000aurora].
30 |
31 | 
32 |
33 | Feature extraction, in essence, should be done considering the specific
34 | application at hand. For example, in ASR applications, the linguistic
35 | characteristics of the raw signal are of great importance and the other
36 | characteristics must be
37 | ignored [@yu2016automatic; @rabiner1993fundamentals]. On the other hand,
38 | in Speaker Recognition (SR) task, solely voice-associated information
39 | must be contained in the extracted feature [@campbell1997speaker]. So the
40 | feature extraction goal is to extract the relevant feature from the raw
41 | signal and map it to a lower dimensional feature space. The problem of
42 | feature extraction has been investigated in pattern classification aimed
43 | at preventing the curse of dimensionality. There are some feature
44 | extraction approaches based on information theory
45 | [@torfi2017construction; @shannon2001mathematical] applied to multimodal
46 | signals and demonstrated promising results [@gurban2009information].
47 |
48 | The speech features can be categorized into two general types:
49 | acoustic and linguistic features. The former is mainly related to
50 | non-verbal sounds and the latter is associated with ASR and SR
51 | systems, for which the verbal part has the major role. Perhaps one of the most
52 | famous linguistic features, which is hard to beat, is the Mel-Frequency
53 | Cepstral Coefficients (MFCC). It uses raw speech frames in the range
54 | from 20ms to 40ms so that they have stationary
55 | characteristics [@rabiner1993fundamentals].
56 | ASR and SR tasks and more recently in the associated deep learning
57 | applications as the input to the network rather than directly feeding
58 | the signal [@deng2013recent; @lee2009unsupervised; @yu2011improved].
59 | With the advent of deep learning [@lecun2015deep; @torfi2018attention],
60 | major improvements have been achieved by using deep neural networks
61 | rather than traditional methods for speech recognition
62 | applications [@variani2014deep; @hinton2012deep; @liu2015deep].
63 |
64 | Although free software for speech recognition such as
65 | VOICEBOX[^1] is available, most of these packages are Matlab-based, which limits
66 | their reproducibility due to commercial licensing issues. Another great package is
67 | PyAudioAnalysis [@giannakopoulos2015pyaudioanalysis], which is a
68 | comprehensive package developed in Python. However, the issues with
69 | PyAudioAnalysis are its complexity and verbosity for
70 | extracting simple features; it also lacks some important
71 | preprocessing and post-processing operations in its current version.
72 |
73 | Considering the recent advent of deep learning in ASR and SR and the
74 | importance of accurate speech feature extraction, here are the
75 | motivations behind the SpeechPy package:
76 |
77 | * Developing a free open source package which covers important
78 | preprocessing techniques, speech features, and post-processing
79 | operations required for ASR and SR applications.
80 |
81 | * A simple package with a minimum degree of complexity should be
82 | available for beginners.
83 |
84 | * A well-tested and continuously integrated package for future
85 | developments should be developed.
86 |
87 | SpeechPy has been developed to satisfy the aforementioned needs. It
88 | contains the most important preprocessing and post-processing operations
89 | and a selection of frequently used speech features. The package is free
90 | and released as open source software[^2]. Continuous integration,
91 | used for instant error checking and validation of changes, has been deployed
92 | for SpeechPy. Moreover, prior to the latest official release of
93 | SpeechPy, the package has successfully been utilized for research
94 | purposes [@torfi20173d; @torfi2017text].
95 |
96 | # Package Eco-system
97 |
98 |
99 | SpeechPy has been developed using the Python language for its interface and
100 | backend as well. An empirical study demonstrated that Python, as a
101 | scripting language, is more effective and productive than conventional
102 | languages[^3] for some programming problems, and its memory consumption is
103 | often “better than Java and not much worse than C or
104 | C++” [@prechelt2000empirical]. We chose Python due to its simplicity and
105 | popularity. Third-party libraries are avoided except *Numpy* and *Scipy*
106 | for handling data and numeric computations.
107 |
108 | ## Complexity
109 |
110 | As the user should not, and does not even need to, manipulate the internal
111 | package structure, object-oriented programming is mostly used for
112 | package development, which provides an easier interface for the user at the
113 | cost of some simplicity of the code. However, the internal code
114 | complexity of the package does not affect the user experience since the
115 | modules can easily be called with the associated arguments. SpeechPy is
116 | a library with a collection of sub-modules.
117 |
118 | ## Code Style and Documentation
119 |
120 | SpeechPy is constructed based on PEP 8 style guide for Python codes.
121 | Moreover, it is extensively documented using the formatted docstrings
122 | and Sphinx[^4] for further automatic modifications to the document in
123 | case of changing internal modules. The full documentation of the project
124 | will be generated in HTML and PDF format using Sphinx and is hosted
125 | online. The official releases of the project are hosted on the Zenodo as
126 | well[^5] [@torfispeechpy].
127 |
128 | 
129 |
130 | ## Continuous Testing and Extensibility
131 |
132 | The output of each function has been evaluated using different
133 | tests and checked against other existing standard packages. For continuous
134 | testing, the code is hosted on GitHub and integrated with Travis CI.
135 | Each modification to the code must pass the unit tests defined for the
136 | continuous integration. This will ensure the package does not break with
137 | unadapted code scripts. However, the validity of the modifications
138 | should always be investigated with the owner or authorized collaborators
139 | of the project. The code will be tested at each time of modification for
140 | Python versions *“2.7”*, *“3.4”* and *“3.5”*. In the future, these
141 | versions are subject to change.
142 |
143 | 
144 |
145 | # Availability
146 |
147 | ## Operating system {#operating-system .unnumbered}
148 |
149 | Tested on Ubuntu 14.04 and 16.04 LTS Linux, Apple Mac OS X 10.9.5, and
150 | Microsoft Windows 7 & 10. We expect that SpeechPy works on any
151 | distribution as long as Python and the package dependencies are
152 | installed.
153 |
154 | ## Programming language {#programming-language .unnumbered}
155 |
156 | The package has been tested with Python 2.7, 3.4 and 3.5. However, using
157 | Python 3.5 is suggested.
158 |
159 | ## Additional system requirements & dependencies {#additional-system-requirements-dependencies .unnumbered}
160 |
161 | SpeechPy is a lightweight package, and small computational power is
162 | enough for running it, although the speed of execution is totally
163 | dependent on the system architecture. The dependencies are as follows:
164 |
165 | * Numpy
166 |
167 | * SciPy
168 |
169 | # Acknowledgement
170 |
171 | This work has been completed in part with computational resources
172 | provided by the West Virginia University and is based upon a work
173 | supported by the Center for Identification Technology Research (CITeR)
174 | and the National Science Foundation (NSF) under Grant \#1650474.
175 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scipy
3 | speechpy
4 | python-coveralls
5 | pytest
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(name='speechpy',
4 | version='2.4',
5 | description='The python package for extracting speech features.',
6 | author='Amirsina Torfi',
7 | author_email='amirsina.torfi@gmail.com',
8 | url='https://github.com/astorfi/speechpy',
9 | download_url = 'https://github.com/astorfi/speechpy/archive/2.4.zip',
10 | packages=find_packages(exclude=('tests', 'docs')),
11 | include_package_data=True,
12 | install_requires=[
13 | 'scipy',
14 | 'numpy',
15 | ],
16 | zip_safe=False)
17 |
--------------------------------------------------------------------------------
/speechpy/__init__.py:
--------------------------------------------------------------------------------
1 | from . import feature
2 | from . import processing
3 |
--------------------------------------------------------------------------------
/speechpy/feature.py:
--------------------------------------------------------------------------------
1 | """feature module.
2 |
3 | This module provides functions for calculating the main speech
4 | features that the package is aimed to extract as well as the required
5 | elements.
6 |
7 |
8 | Functions:
9 |
10 | filterbanks: Compute the Mel-filterbanks
11 | The filterbanks must be created for extracting
12 | speech features such as MFCC.
13 |
14 | mfcc: Extracting Mel Frequency Cepstral Coefficient feature.
15 |
16 | mfe: Extracting Mel Energy feature.
17 |
18 | lmfe: Extracting Log Mel Energy feature.
19 |
20 | extract_derivative_feature: Extract the first and second derivative
21 |     features. This function directly uses the ``derivative_extraction``
22 | function in the ``processing`` module.
23 |
24 | """
25 |
26 | from __future__ import division
27 | import numpy as np
28 | from . import processing
29 | from scipy.fftpack import dct
30 | from . import functions
31 |
32 |
33 | def filterbanks(
34 | num_filter,
35 | coefficients,
36 | sampling_freq,
37 | low_freq=None,
38 | high_freq=None):
39 | """Compute the Mel-filterbanks. Each filter will be stored in one rows.
40 | The columns correspond to fft bins.
41 |
42 | Args:
43 |         num_filter (int): the number of filters in the filterbank.
44 | coefficients (int): (fftpoints//2 + 1). Default is 257.
45 | sampling_freq (float): the samplerate of the signal we are working
46 | with. It affects mel spacing.
47 |         low_freq (float): lowest band edge of mel filters, default 300 Hz
48 | high_freq (float): highest band edge of mel filters,
49 | default samplerate/2
50 |
51 | Returns:
52 | array: A numpy array of size num_filter x (fftpoints//2 + 1)
53 |             containing the filterbanks.
54 | """
55 | high_freq = high_freq or sampling_freq / 2
56 | low_freq = low_freq or 300
57 | s = "High frequency cannot be greater than half of the sampling frequency!"
58 | assert high_freq <= sampling_freq / 2, s
59 | assert low_freq >= 0, "low frequency cannot be less than zero!"
60 |
61 | # Computing the Mel filterbank
62 | # converting the upper and lower frequencies to Mels.
63 | # num_filter + 2 is because for num_filter filterbanks we need
64 |     # num_filter+2 points.
65 | mels = np.linspace(
66 | functions.frequency_to_mel(low_freq),
67 | functions.frequency_to_mel(high_freq),
68 | num_filter + 2)
69 |
70 | # we should convert Mels back to Hertz because the start and end-points
71 | # should be at the desired frequencies.
72 | hertz = functions.mel_to_frequency(mels)
73 |
74 | # The frequency resolution required to put filters at the
75 | # exact points calculated above should be extracted.
76 | # So we should round those frequencies to the closest FFT bin.
77 | freq_index = (
78 | np.floor(
79 | (coefficients +
80 | 1) *
81 | hertz /
82 | sampling_freq)).astype(int)
83 |
84 | # Initial definition
85 | filterbank = np.zeros([num_filter, coefficients])
86 |
87 | # The triangular function for each filter
88 | for i in range(0, num_filter):
89 | left = int(freq_index[i])
90 | middle = int(freq_index[i + 1])
91 | right = int(freq_index[i + 2])
92 | z = np.linspace(left, right, num=right - left + 1)
93 |         filterbank[i, left:right + 1] = functions.triangle(
94 |             z,
95 |             left=left,
96 |             middle=middle,
97 |             right=right)
98 |
99 | return filterbank
100 |
101 |
102 | def mfcc(
103 | signal,
104 | sampling_frequency,
105 | frame_length=0.020,
106 | frame_stride=0.01,
107 | num_cepstral=13,
108 | num_filters=40,
109 | fft_length=512,
110 | low_frequency=0,
111 | high_frequency=None,
112 | dc_elimination=True):
113 | """Compute MFCC features from an audio signal.
114 |
115 | Args:
116 |
117 | signal (array): the audio signal from which to compute features.
118 | Should be an N x 1 array
119 | sampling_frequency (int): the sampling frequency of the signal
120 | we are working with.
121 | frame_length (float): the length of each frame in seconds.
122 | Default is 0.020s
123 |         frame_stride (float): the step between successive frames in seconds.
124 |             Default is 0.01s (i.e. 50% overlap with the default frame length)
125 | num_filters (int): the number of filters in the filterbank,
126 | default 40.
127 | fft_length (int): number of FFT points. Default is 512.
128 | low_frequency (float): lowest band edge of mel filters.
129 | In Hz, default is 0.
130 | high_frequency (float): highest band edge of mel filters.
131 | In Hz, default is samplerate/2
132 | num_cepstral (int): Number of cepstral coefficients.
133 |         dc_elimination (bool): If the first DC component should
134 |             be eliminated or not.
135 |
136 | Returns:
137 | array: A numpy array of size (num_frames x num_cepstral) containing mfcc features.
138 | """
139 | feature, energy = mfe(signal, sampling_frequency=sampling_frequency,
140 | frame_length=frame_length, frame_stride=frame_stride,
141 | num_filters=num_filters, fft_length=fft_length,
142 | low_frequency=low_frequency,
143 | high_frequency=high_frequency)
144 | if len(feature) == 0:
145 | return np.empty((0, num_cepstral))
146 | feature = np.log(feature)
147 | feature = dct(feature, type=2, axis=-1, norm='ortho')[:, :num_cepstral]
148 |
149 | # replace first cepstral coefficient with log of frame energy for DC
150 | # elimination.
151 | if dc_elimination:
152 | feature[:, 0] = np.log(energy)
153 | return feature
154 |
155 |
156 | def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
157 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
158 | """Compute Mel-filterbank energy features from an audio signal.
159 |
160 | Args:
161 | signal (array): the audio signal from which to compute features.
162 | Should be an N x 1 array
163 | sampling_frequency (int): the sampling frequency of the signal
164 | we are working with.
165 | frame_length (float): the length of each frame in seconds.
166 | Default is 0.020s
167 |         frame_stride (float): the step between successive frames in seconds.
168 |             Default is 0.01s (i.e. 50% overlap with the default frame length)
169 | num_filters (int): the number of filters in the filterbank,
170 | default 40.
171 | fft_length (int): number of FFT points. Default is 512.
172 | low_frequency (float): lowest band edge of mel filters.
173 | In Hz, default is 0.
174 | high_frequency (float): highest band edge of mel filters.
175 | In Hz, default is samplerate/2
176 |
177 | Returns:
178 |         array: features - the filterbank energies of size num_frames x num_filters, and the energy of each frame of size num_frames x 1.
179 | """
180 |
181 | # Convert to float
182 | signal = signal.astype(float)
183 |
184 | # Stack frames
185 | frames = processing.stack_frames(
186 | signal,
187 | sampling_frequency=sampling_frequency,
188 | frame_length=frame_length,
189 | frame_stride=frame_stride,
190 |         filter=lambda x: np.ones(
191 |             (x,)
192 |         ),
193 | zero_padding=False)
194 |
195 | # getting the high frequency
196 | high_frequency = high_frequency or sampling_frequency / 2
197 |
198 |     # calculation of the power spectrum
199 | power_spectrum = processing.power_spectrum(frames, fft_length)
200 | coefficients = power_spectrum.shape[1]
201 | # this stores the total energy in each frame
202 | frame_energies = np.sum(power_spectrum, 1)
203 |
204 |     # Handling zero energies.
205 | frame_energies = functions.zero_handling(frame_energies)
206 |
207 | # Extracting the filterbank
208 | filter_banks = filterbanks(
209 | num_filters,
210 | coefficients,
211 | sampling_frequency,
212 | low_frequency,
213 | high_frequency)
214 |
215 | # Filterbank energies
216 | features = np.dot(power_spectrum, filter_banks.T)
217 | features = functions.zero_handling(features)
218 |
219 | return features, frame_energies
220 |
221 |
222 | def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
223 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
224 | """Compute log Mel-filterbank energy features from an audio signal.
225 |
226 |
227 | Args:
228 | signal (array): the audio signal from which to compute features.
229 | Should be an N x 1 array
230 | sampling_frequency (int): the sampling frequency of the signal
231 | we are working with.
232 | frame_length (float): the length of each frame in seconds.
233 | Default is 0.020s
234 |         frame_stride (float): the step between successive frames in seconds.
235 |             Default is 0.01s (i.e. 50% overlap with the default frame length)
236 | num_filters (int): the number of filters in the filterbank,
237 | default 40.
238 | fft_length (int): number of FFT points. Default is 512.
239 | low_frequency (float): lowest band edge of mel filters.
240 | In Hz, default is 0.
241 | high_frequency (float): highest band edge of mel filters.
242 | In Hz, default is samplerate/2
243 |
244 | Returns:
245 |         array: Features - the log filterbank energies of size num_frames x num_filters.
246 | """
247 |
248 | feature, frame_energies = mfe(signal,
249 | sampling_frequency=sampling_frequency,
250 | frame_length=frame_length,
251 | frame_stride=frame_stride,
252 | num_filters=num_filters,
253 | fft_length=fft_length,
254 | low_frequency=low_frequency,
255 | high_frequency=high_frequency)
256 | feature = np.log(feature)
257 |
258 | return feature
259 |
260 |
261 | def extract_derivative_feature(feature):
262 | """
263 | This function extracts temporal derivative features which are
264 | first and second derivatives.
265 |
266 | Args:
267 |         feature (array): The feature matrix of size N x M
268 |
269 | Return:
270 | array: The feature cube vector which contains the static, first and second derivative features of size: N x M x 3
271 | """
272 | first_derivative_feature = processing.derivative_extraction(
273 | feature, DeltaWindows=2)
274 | second_derivative_feature = processing.derivative_extraction(
275 | first_derivative_feature, DeltaWindows=2)
276 |
277 |     # Creating the feature cube for each file
278 | feature_cube = np.concatenate(
279 | (feature[:, :, None], first_derivative_feature[:, :, None],
280 | second_derivative_feature[:, :, None]),
281 | axis=2)
282 | return feature_cube
283 |
--------------------------------------------------------------------------------
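
To make the signatures above concrete, here is a minimal usage sketch of the feature.py pipeline. The input path 'example.wav' is a hypothetical mono WAV file; any 1-D signal read with scipy works the same way:

import scipy.io.wavfile as wav
from speechpy import feature

# Hypothetical input; replace with any mono WAV file.
fs, signal = wav.read('example.wav')

# 13 MFCCs per 20 ms frame with a 10 ms stride (the defaults).
mfcc = feature.mfcc(signal, sampling_frequency=fs)
print(mfcc.shape)          # (num_frames, 13)

# Log Mel filterbank energies over the same framing.
log_energies = feature.lmfe(signal, sampling_frequency=fs, num_filters=40)
print(log_energies.shape)  # (num_frames, 40)

# Static + first + second derivative features stacked along a third axis.
mfcc_cube = feature.extract_derivative_feature(mfcc)
print(mfcc_cube.shape)     # (num_frames, 13, 3)
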
/speechpy/functions.py:
--------------------------------------------------------------------------------
1 | """function module.
2 |
3 | This module contains necessary functions for calculating the features
4 | in the `feature` module.
5 |
6 |
7 | Attributes:
8 |
9 | frequency_to_mel: Converting the frequency to Mel scale.
10 | This is necessary for filterbank energy calculation.
11 | mel_to_frequency: Converting the Mel to frequency scale.
12 | This is necessary for filterbank energy calculation.
13 | triangle: Creating a triangle for filterbanks.
14 | This is necessary for filterbank energy calculation.
15 | zero_handling: Handling zero values due to the possible
16 | issues regarding the log functions.
17 | """
18 |
19 | from __future__ import division
20 | import numpy as np
21 | from . import processing
22 | from scipy.fftpack import dct
23 | import math
24 |
25 |
26 | def frequency_to_mel(f):
27 | """converting from frequency to Mel scale.
28 |
29 |     :param f: The frequency values (or a single frequency) in Hz.
30 |     :returns: The mel scale values (or a single mel).
31 | """
32 | return 1127 * np.log(1 + f / 700.)
33 |
34 |
35 | def mel_to_frequency(mel):
36 | """converting from Mel scale to frequency.
37 |
38 |     :param mel: The mel scale values (or a single mel).
39 |     :returns: The frequency values (or a single frequency) in Hz.
40 | """
41 | return 700 * (np.exp(mel / 1127.0) - 1)
42 |
43 |
44 | def triangle(x, left, middle, right):
45 | out = np.zeros(x.shape)
46 | out[x <= left] = 0
47 | out[x >= right] = 0
48 | first_half = np.logical_and(left < x, x <= middle)
49 | out[first_half] = (x[first_half] - left) / (middle - left)
50 | second_half = np.logical_and(middle <= x, x < right)
51 | out[second_half] = (right - x[second_half]) / (right - middle)
52 | return out
53 |
54 |
55 | def zero_handling(x):
56 | """
57 |     This function handles zero values before they are passed
58 |     as an argument to a log function.
59 | :param x: The vector.
60 | :return: The vector with zeros substituted with epsilon values.
61 | """
62 | return np.where(x == 0, np.finfo(float).eps, x)
63 |
--------------------------------------------------------------------------------
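
As a quick check of the helpers above: the two Mel conversions are exact inverses of each other, and zero_handling only substitutes exact zeros. A minimal sketch:

import numpy as np
from speechpy import functions

freqs = np.array([300.0, 1000.0, 4000.0])

# frequency_to_mel and mel_to_frequency invert each other.
mels = functions.frequency_to_mel(freqs)
print(np.allclose(functions.mel_to_frequency(mels), freqs))  # True

# zero_handling swaps exact zeros for machine epsilon, so a following
# np.log call cannot produce -inf.
x = np.array([0.0, 0.5, 2.0])
print(np.log(functions.zero_handling(x)))
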
/speechpy/processing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Processing module for signal processing operations.
3 |
4 | This module provides the signal processing
5 | functions which are required as internal computations in the package.
6 |
7 |
8 | Attributes:
9 |
10 | preemphasis: Preemphasising on the signal. This is a preprocessing step.
11 |
12 | stack_frames: Create stacking frames from the raw signal.
13 |
14 | fft_spectrum: Calculation of the Fast Fourier Transform.
15 |
16 | power_spectrum: Power Spectrum calculation.
17 |
18 | log_power_spectrum: Log Power Spectrum calculation.
19 |
20 |     derivative_extraction: Calculation of the derivative of the extracted features.
21 |
22 | cmvn: Cepstral mean variance normalization. This is a post processing operation.
23 |
24 | cmvnw: Cepstral mean variance normalization over the sliding window. This is a post processing operation.
25 |
26 | """
27 |
28 | __license__ = "MIT"
29 | __author__ = "Amirsina Torfi"
30 | __docformat__ = 'reStructuredText'
31 |
32 | import decimal
33 | import numpy as np
34 | import math
35 |
36 |
37 | # 1.4 becomes 1 and 1.6 becomes 2. special case: 1.5 becomes 2.
38 | def round_half_up(number):
39 | return int(
40 | decimal.Decimal(number).quantize(
41 | decimal.Decimal('1'),
42 | rounding=decimal.ROUND_HALF_UP))
43 |
44 |
45 | def preemphasis(signal, shift=1, cof=0.98):
46 | """preemphasising on the signal.
47 |
48 | Args:
49 | signal (array): The input signal.
50 | shift (int): The shift step.
51 |         cof (float): The preemphasising coefficient. A value of 0 means no filtering.
52 |
53 | Returns:
54 | array: The pre-emphasized signal.
55 | """
56 |
57 | rolled_signal = np.roll(signal, shift)
58 | return signal - cof * rolled_signal
59 |
60 |
61 | def stack_frames(
62 | sig,
63 | sampling_frequency,
64 | frame_length=0.020,
65 | frame_stride=0.020,
66 |         filter=lambda x: np.ones(
67 |             (x,)
68 |         ),
69 | zero_padding=True):
70 | """Frame a signal into overlapping frames.
71 |
72 | Args:
73 | sig (array): The audio signal to frame of size (N,).
74 | sampling_frequency (int): The sampling frequency of the signal.
75 |         frame_length (float): The length of each frame in seconds.
76 |         frame_stride (float): The stride between successive frames in seconds.
77 | filter (array): The time-domain filter for applying to each frame.
78 | By default it is one so nothing will be changed.
79 |         zero_padding (bool): If the number of samples is not a multiple of
80 |             the frame length (in samples), zero padding will
81 |             be done for generating the last frame.
82 |
83 | Returns:
84 | array: Stacked_frames-Array of frames of size (number_of_frames x frame_len).
85 |
86 | """
87 |
88 | # Check dimension
89 | s = "Signal dimention should be of the format of (N,) but it is %s instead"
90 | assert sig.ndim == 1, s % str(sig.shape)
91 |
92 | # Initial necessary values
93 | length_signal = sig.shape[0]
94 | frame_sample_length = int(
95 | np.round(
96 | sampling_frequency *
97 | frame_length)) # Defined by the number of samples
98 | frame_stride = float(np.round(sampling_frequency * frame_stride))
99 |
100 | # Zero padding is done for allocating space for the last frame.
101 | if zero_padding:
102 | # Calculation of number of frames
103 | numframes = (int(math.ceil((length_signal
104 | - frame_sample_length) / frame_stride)))
105 |
106 |
107 | # Zero padding
108 | len_sig = int(numframes * frame_stride + frame_sample_length)
109 | additive_zeros = np.zeros((len_sig - length_signal,))
110 | signal = np.concatenate((sig, additive_zeros))
111 |
112 | else:
113 | # No zero padding! The last frame which does not have enough
114 | # samples(remaining samples <= frame_sample_length), will be dropped!
115 | numframes = int(math.floor((length_signal
116 | - frame_sample_length) / frame_stride))
117 |
118 | # new length
119 | len_sig = int((numframes - 1) * frame_stride + frame_sample_length)
120 | signal = sig[0:len_sig]
121 |
122 | # Getting the indices of all frames.
123 |     indices = (
124 |         np.tile(np.arange(0, frame_sample_length),
125 |                 (numframes, 1))
126 |         + np.tile(
127 |             np.arange(0, numframes * frame_stride, frame_stride),
128 |             (frame_sample_length, 1)
129 |         ).T
130 |     )
131 | indices = np.array(indices, dtype=np.int32)
132 |
133 | # Extracting the frames based on the allocated indices.
134 | frames = signal[indices]
135 |
136 |     # Apply the window function
137 |     window = np.tile(filter(frame_sample_length), (numframes, 1))
138 |     extracted_frames = frames * window
139 |     return extracted_frames
140 |
141 |
142 | def fft_spectrum(frames, fft_points=512):
143 | """This function computes the one-dimensional n-point discrete Fourier
144 | Transform (DFT) of a real-valued array by means of an efficient algorithm
145 | called the Fast Fourier Transform (FFT). Please refer to
146 | https://docs.scipy.org/doc/numpy/reference/generated/numpy.fft.rfft.html
147 | for further details.
148 |
149 | Args:
150 | frames (array): The frame array in which each row is a frame.
151 |         fft_points (int): The length of FFT. If fft_points is greater than the frame length, the frames will be zero-padded.
152 |
153 | Returns:
154 | array: The fft spectrum.
155 |             If frames is a num_frames x sample_per_frame matrix, output
156 |             will be num_frames x (fft_points // 2 + 1).
157 | """
158 |     spectrum_vector = np.fft.rfft(frames, n=fft_points, axis=-1, norm=None)
159 |     return np.absolute(spectrum_vector)
160 |
161 |
162 | def power_spectrum(frames, fft_points=512):
163 | """Power spectrum of each frame.
164 |
165 | Args:
166 | frames (array): The frame array in which each row is a frame.
167 |         fft_points (int): The length of FFT. If fft_points is greater than the frame length, the frames will be zero-padded.
168 |
169 | Returns:
170 | array: The power spectrum.
171 |             If frames is a num_frames x sample_per_frame matrix, output
172 |             will be num_frames x (fft_points // 2 + 1).
173 | """
174 | return 1.0 / fft_points * np.square(fft_spectrum(frames, fft_points))
175 |
176 |
177 | def log_power_spectrum(frames, fft_points=512, normalize=True):
178 | """Log power spectrum of each frame in frames.
179 |
180 | Args:
181 | frames (array): The frame array in which each row is a frame.
182 | fft_points (int): The length of FFT. If fft_length is greater than
183 | frame_len, the frames will be zero-padded.
184 | normalize (bool): If normalize=True, the log power spectrum
185 | will be normalized.
186 |
187 | Returns:
188 |         array: The log power spectrum - If frames is a
189 |             num_frames x sample_per_frame matrix, output will be
190 |             num_frames x (fft_points // 2 + 1).
191 | """
192 | power_spec = power_spectrum(frames, fft_points)
193 | power_spec[power_spec <= 1e-20] = 1e-20
194 | log_power_spec = 10 * np.log10(power_spec)
195 | if normalize:
196 | return log_power_spec - np.max(log_power_spec)
197 | else:
198 | return log_power_spec
199 |
200 |
201 | def derivative_extraction(feat, DeltaWindows):
202 | """This function the derivative features.
203 |
204 | Args:
205 | feat (array): The main feature vector(For returning the second
206 | order derivative it can be first-order derivative).
207 | DeltaWindows (int): The value of DeltaWindows is set using
208 | the configuration parameter DELTAWINDOW.
209 |
210 | Returns:
211 | array: Derivative feature vector - A NUMFRAMESxNUMFEATURES numpy
212 | array which is the derivative features along the features.
213 | """
214 |
215 | # Getting the shape of the vector.
216 | rows, cols = feat.shape
217 |
218 |     # Defining the vector of differences.
219 | DIF = np.zeros(feat.shape, dtype=feat.dtype)
220 | Scale = 0
221 |
222 | # Pad only along features in the vector.
223 | FEAT = np.lib.pad(feat, ((0, 0), (DeltaWindows, DeltaWindows)), 'edge')
224 | for i in range(DeltaWindows):
225 | # Start index
226 | offset = DeltaWindows
227 |
228 | # The dynamic range
229 | Range = i + 1
230 |
231 |         dif = Range * (FEAT[:, offset + Range:offset + Range + cols]
232 |                        - FEAT[:, offset - Range:offset - Range + cols])
233 | Scale += 2 * np.power(Range, 2)
234 | DIF += dif
235 |
236 | return DIF / Scale
237 |
238 |
239 | def cmvn(vec, variance_normalization=False):
240 | """ This function is aimed to perform global cepstral mean and
241 | variance normalization (CMVN) on input feature vector "vec".
242 | The code assumes that there is one observation per row.
243 |
244 | Args:
245 | vec (array): input feature matrix
246 | (size:(num_observation,num_features))
247 | variance_normalization (bool): If the variance
248 |             normalization should be performed or not.
249 |
250 | Return:
251 | array: The mean(or mean+variance) normalized feature vector.
252 | """
253 | eps = 2**-30
254 | rows, cols = vec.shape
255 |
256 | # Mean calculation
257 | norm = np.mean(vec, axis=0)
258 | norm_vec = np.tile(norm, (rows, 1))
259 |
260 | # Mean subtraction
261 | mean_subtracted = vec - norm_vec
262 |
263 | # Variance normalization
264 | if variance_normalization:
265 | stdev = np.std(mean_subtracted, axis=0)
266 | stdev_vec = np.tile(stdev, (rows, 1))
267 | output = mean_subtracted / (stdev_vec + eps)
268 | else:
269 | output = mean_subtracted
270 |
271 | return output
272 |
273 |
274 | def cmvnw(vec, win_size=301, variance_normalization=False):
275 | """ This function is aimed to perform local cepstral mean and
276 | variance normalization on a sliding window. The code assumes that
277 | there is one observation per row.
278 |
279 | Args:
280 | vec (array): input feature matrix
281 | (size:(num_observation,num_features))
282 | win_size (int): The size of sliding window for local normalization.
283 |             Default=301 which is around 3s if a 100 Hz frame rate is
284 |             considered (== 10ms frame stride)
285 |         variance_normalization (bool): If the variance normalization should
286 | be performed or not.
287 |
288 | Return:
289 | array: The mean(or mean+variance) normalized feature vector.
290 | """
291 | # Get the shapes
292 | eps = 2**-30
293 | rows, cols = vec.shape
294 |
295 |     # Window size must be odd.
296 |     assert isinstance(win_size, int), "Size must be of type 'int'!"
297 |     assert win_size % 2 == 1, "Window size must be odd!"
298 |
299 | # Padding and initial definitions
300 | pad_size = int((win_size - 1) / 2)
301 | vec_pad = np.lib.pad(vec, ((pad_size, pad_size), (0, 0)), 'symmetric')
302 | mean_subtracted = np.zeros(np.shape(vec), dtype=np.float32)
303 |
304 | for i in range(rows):
305 | window = vec_pad[i:i + win_size, :]
306 | window_mean = np.mean(window, axis=0)
307 | mean_subtracted[i, :] = vec[i, :] - window_mean
308 |
309 | # Variance normalization
310 | if variance_normalization:
311 |
312 | # Initial definitions.
313 | variance_normalized = np.zeros(np.shape(vec), dtype=np.float32)
314 | vec_pad_variance = np.lib.pad(
315 | mean_subtracted, ((pad_size, pad_size), (0, 0)), 'symmetric')
316 |
317 | # Looping over all observations.
318 | for i in range(rows):
319 | window = vec_pad_variance[i:i + win_size, :]
320 | window_variance = np.std(window, axis=0)
321 | variance_normalized[i, :] \
322 | = mean_subtracted[i, :] / (window_variance + eps)
323 | output = variance_normalized
324 | else:
325 | output = mean_subtracted
326 |
327 | return output
328 |
329 |
330 | # def resample_Fn(wave, fs, f_new=16000):
331 | # """This function resample the data to arbitrary frequency
332 | # :param fs: Frequency of the sound file.
333 | # :param wave: The sound file itself.
334 | # :returns:
335 | # f_new: The new frequency.
336 | # signal_new: The new signal samples at new frequency.
337 | #
338 | # dependency: from scikits.samplerate import resample
339 | # """
340 | #
341 | #     # Resampling using interpolation (there are other
342 | #     # methods than 'sinc_best')
343 | # signal_new = resample(wave, float(f_new) / fs, 'sinc_best')
344 | #
345 | # # Necessary data converting for saving .wav file using scipy.
346 | # signal_new = np.asarray(signal_new, dtype=np.int16)
347 | #
348 | # # # Uncomment if you want to save the audio file
349 | # # # Save using new format
350 | # # wav.write(filename='resample_rainbow_16k.wav',rate=fr,data=signal_new)
351 | # return signal_new, f_new
352 |
--------------------------------------------------------------------------------
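
A minimal sketch tying the processing.py primitives together, assuming a synthetic 16 kHz signal (any 1-D float array works):

import numpy as np
from speechpy import processing

fs = 16000
signal = np.random.normal(0, 0.1, fs)  # one second of noise

# Pre-emphasis, then 20 ms frames with a 20 ms stride (no overlap).
emphasized = processing.preemphasis(signal, cof=0.98)
frames = processing.stack_frames(emphasized, sampling_frequency=fs,
                                 frame_length=0.020, frame_stride=0.020,
                                 zero_padding=True)

# Power spectrum of each frame: (num_frames, 512 // 2 + 1) = (num_frames, 257).
power = processing.power_spectrum(frames, fft_points=512)
print(power.shape)

# Global CMVN over a feature matrix (one observation per row); the
# per-column means of the output are ~0.
normalized = processing.cmvn(np.log(power + 1e-20),
                             variance_normalization=True)
print(np.round(normalized.mean(axis=0), 6))
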
/tests/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/tests/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
--------------------------------------------------------------------------------
/tests/test_speechpy.py:
--------------------------------------------------------------------------------
1 | import scipy.io.wavfile as wav
2 | import numpy as np
3 | import os
4 | import sys
5 | lib_path = os.path.abspath(os.path.join('..'))
6 | print(lib_path)
7 | sys.path.append(lib_path)
8 | from speechpy import processing
9 | from speechpy import feature
10 | from speechpy import functions
11 |
12 | # Random signal generation for testing
13 | mu, sigma = 0, 0.1 # mean and standard deviation
14 | signal = np.random.normal(mu, sigma, 1000000)
15 | fs = 16000
16 |
17 | # Generating stacked frames with SpeechPy
18 | frame_length = 0.02
19 | frame_stride = 0.02
20 | num_filters = 40
21 |
22 |
23 | class Test_Methods_Exists(object):
24 | def test_processing(self):
25 |
26 |         # Checking the availability of functions in the chosen module
27 | assert hasattr(processing, 'preemphasis')
28 | assert hasattr(processing, 'stack_frames')
29 | assert hasattr(processing, 'fft_spectrum')
30 | assert hasattr(processing, 'power_spectrum')
31 | assert hasattr(processing, 'log_power_spectrum')
32 | assert hasattr(processing, 'derivative_extraction')
33 | assert hasattr(processing, 'cmvn')
34 | assert hasattr(processing, 'cmvnw')
35 |
36 | def test_feature(self):
37 |
38 |         # Checking the availability of functions in the chosen module
39 | assert hasattr(feature, 'filterbanks')
40 | assert hasattr(feature, 'mfcc')
41 | assert hasattr(feature, 'mfe')
42 | assert hasattr(feature, 'lmfe')
43 | assert hasattr(feature, 'extract_derivative_feature')
44 |
45 | def test_functions(self):
46 |
47 |         # Checking the availability of functions in the chosen module
48 | assert hasattr(functions, 'frequency_to_mel')
49 | assert hasattr(functions, 'mel_to_frequency')
50 | assert hasattr(functions, 'triangle')
51 | assert hasattr(functions, 'zero_handling')
52 |
53 |
54 | class Test_Processing(object):
55 |
56 | def test_preemphasis(self):
57 |
58 | # Performing the operation on the generated signal.
59 | signal_preemphasized = processing.preemphasis(signal, cof=0.98)
60 |
61 | # Shape matcher
62 | assert signal_preemphasized.ndim == 1
63 | assert signal_preemphasized.shape == signal.shape
64 |
65 | def test_stack_frames(self):
66 |
67 | frames = processing.stack_frames(signal, sampling_frequency=fs,
68 | frame_length=frame_length,
69 | frame_stride=frame_stride,
70 | filter=lambda x: np.ones((x,)),
71 | zero_padding=True)
72 |
73 | # Direct calculation using numpy
74 | window = int(np.round(frame_length * fs))
75 | step = int(np.round(frame_stride * fs))
76 | all_frames = (int(np.ceil((signal.shape[0]
77 | - window) / step)))
78 |
79 | # Shape matching of stacked frames
80 | assert all_frames == frames.shape[0]
81 |
82 | def test_cmvn(self):
83 |
84 |         feature_vector = np.random.rand(50, 100)
85 | normalized_feature = processing.cmvn(feature_vector, variance_normalization=True)
86 |
87 | # Shape match
88 | assert normalized_feature.shape == feature_vector.shape
89 |
90 | # Check the std and mean of the output vector
91 |         assert np.allclose(np.mean(normalized_feature, axis=0), np.zeros((1, normalized_feature.shape[1])))
92 |         assert np.allclose(np.std(normalized_feature, axis=0), np.ones((1, normalized_feature.shape[1])))
93 |
94 |
95 | class Test_feature(object):
96 |
97 | def test_mfcc(self):
98 |
99 | num_cepstral = 13
100 | mfcc = feature.mfcc(signal, sampling_frequency=fs,
101 | frame_length=0.020, num_cepstral=num_cepstral, frame_stride=0.01,
102 | num_filters=num_filters, fft_length=512, low_frequency=0,
103 | high_frequency=None)
104 |
105 | # Shape matcher
106 | assert mfcc.shape[1] == num_cepstral
107 |
--------------------------------------------------------------------------------
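
The test module above targets pytest. A minimal sketch of running it programmatically, equivalent to `pytest tests/test_speechpy.py -q` from the repository root:

import pytest

# Assumes pytest is installed and the working directory is the repo root.
pytest.main(["tests/test_speechpy.py", "-q"])
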