├── .github
│   └── FUNDING.yml
├── .gitignore
├── .travis.yml
├── AlternetiveTravisCI
├── CONTRIBUTING.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── __init__.py
├── _config.yml
├── _images
│   ├── Speech_GIF.gif
│   ├── follow-twitter.gif
│   ├── pipeline.jpg
│   ├── readme.rst
│   ├── speech.gif
│   ├── speechpy_logo.gif
│   └── stackframes.png
├── docs
│   ├── Makefile
│   ├── _config.yml
│   ├── requirements.txt
│   └── source
│       ├── _static
│       │   └── img
│       │       ├── 08063416.pdf
│       │       ├── Speech_GIF.gif
│       │       ├── installation_logo.gif
│       │       ├── installation_logo.jpg
│       │       ├── speech.gif
│       │       ├── speech.jpg
│       │       ├── speechpy_logo.gif
│       │       ├── speechpy_logo.jpg
│       │       └── stackframes.png
│       ├── _templates
│       │   ├── breadcrumbs.html
│       │   └── breadcrumbs.html~
│       ├── conf.py
│       ├── content
│       │   ├── features.rst
│       │   ├── postprocessing.rst
│       │   └── preprocessing.rst
│       ├── epilogue
│       │   ├── CONTRIBUTING.rst
│       │   ├── finalnote.rst
│       │   └── test.rst
│       ├── index.rst
│       └── intro
│           └── introductions.rst
├── example
│   ├── Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
│   ├── test_local.py
│   └── test_package.py
├── paper
│   ├── paper.bib
│   ├── paper.md
│   ├── paper.pdf
│   └── test
│       ├── _imgs
│       │   ├── Scheme_of_speech_recognition_system.png
│       │   ├── packageview.png
│       │   └── travicCI.png
│       └── test.md
├── requirements.txt
├── setup.cfg
├── setup.py
├── speechpy
│   ├── __init__.py
│   ├── feature.py
│   ├── functions.py
│   └── processing.py
└── tests
    ├── Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
    └── test_speechpy.py
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
github: [astorfi]
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:

- "2.7"
- "3.4"
- "3.5"

# command to install dependencies
install:
- pip install -r requirements.txt
- pip install coveralls
- pip install codecov

script:
- coverage run --omit=*.virtualenvs*,*virtualenv* example/test_package.py test
- coverage run --omit=*.virtualenvs*,*virtualenv* example/test_local.py test
- pytest tests/


after_success:
- coveralls
- codecov

sudo: enabled
dist: trusty
--------------------------------------------------------------------------------
/AlternetiveTravisCI:
--------------------------------------------------------------------------------
language: python
python:

- "2.7"
- "3.4"
- "3.5"
- "3.5-dev"  # 3.5 development branch

# command to install dependencies
install: "pip install -r requirements.txt"

# command to run tests
script: python setup.py develop

sudo: enabled
dist: trusty
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------

*************
Contributing
*************

When contributing to this repository, you are more than welcome to discuss your feedback with any of the owners of this repository. *For typos, please do not create a pull request. Instead, declare them in issues or email the repository owner*. For technical and conceptual questions please feel free to **directly contact the repository owner**. Before asking general questions related to the concepts and techniques provided in this project, **please make sure to read and understand its associated paper**.

Please note we have a code of conduct; please follow it in all your interactions with the project.

====================
Pull Request Process
====================

Please consider the following criteria so that you can help us more effectively:

1. The pull request is mainly expected to be a code script suggestion or improvement.
2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section.
3.
Ensure any install or build dependencies are removed before the end of the layer when doing a build and creating a pull request.
4. Add comments with details of changes to the interface; this includes new environment variables, exposed ports, useful file locations and container parameters.
5. You may merge the pull request once you have the sign-off of at least one other developer. If you do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed.

============
Final Note
============

We are looking forward to your kind feedback. Please help us improve this open source project and make our work better. To contribute, please create a pull request and we will investigate it promptly. Once again, we appreciate your kind feedback and elaborate code inspections.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {2017} {Amirsina Torfi} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
init:
	pip install -r requirements.txt

test:
	nosetests tests
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
.. image:: _images/speechpy_logo.gif
   :target: https://github.com/astorfi/speech_feature_extraction/blob/master/images/speechpy_logo.gif

===============================================
`SpeechPy Official Project Documentation`_
===============================================

.. image:: https://pepy.tech/badge/speechpy
   :target: https://pepy.tech/project/speechpy
.. image:: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat
   :target: https://github.com/astorfi/speechpy/pulls
.. image:: https://coveralls.io/repos/github/astorfi/speechpy/badge.svg?branch=master
   :target: https://coveralls.io/github/astorfi/speechpy?branch=master
.. image:: https://codecov.io/gh/astorfi/speechpy/branch/master/graph/badge.svg
   :target: https://codecov.io/gh/astorfi/speechpy
.. image:: https://badge.fury.io/py/speechpy.svg
   :target: https://badge.fury.io/py/speechpy
.. image:: http://joss.theoj.org/papers/10.21105/joss.00749/status.svg
   :target: https://doi.org/10.21105/joss.00749
.. image:: https://img.shields.io/twitter/follow/amirsinatorfi.svg?label=Follow&style=social
   :target: https://twitter.com/amirsinatorfi

.. .. image:: https://zenodo.org/badge/DOI/10.5281/zenodo.810391.svg
..    :target: https://doi.org/10.5281/zenodo.810391

.. _SpeechPy Official Project Documentation: http://speechpy.readthedocs.io


==========================
Table of Contents
==========================
.. contents::
   :local:
   :depth: 3

---------------------
Documentation
---------------------

This library provides the most frequently used speech features, including MFCCs and filterbank energies, alongside the log-energy of the filterbanks.
If you are interested in what MFCCs are and how they are generated, please refer to this
`wiki `_ page.

.. image:: _images/speech.gif


Please refer to the following links for further information:

`SpeechPy Official Project Documentation`_

`Paper`_

.. _SpeechPy Official Project Documentation: http://speechpy.readthedocs.io
.. _Paper: https://doi.org/10.21105/joss.00749

------------------------------------------
Which Python versions are supported
------------------------------------------

Currently, the package has been tested and verified using Python ``2.7``, ``3.4`` and ``3.5``.

---------------------
Citation
---------------------

If you use this package, please kindly cite it as follows:
.. code:: bibtex

    @article{torfi2018speechpy,
      title={SpeechPy-A Library for Speech Processing and Recognition},
      author={Torfi, Amirsina},
      journal={arXiv preprint arXiv:1803.01094},
      year={2018}
    }

---------------------
How to Install?
---------------------

There are two ways to install this package: local installation and PyPI.

~~~~~~~~~~~~~~~~~~~
Local Installation
~~~~~~~~~~~~~~~~~~~

For local installation, the repository must first be cloned::

    git clone https://github.com/astorfi/speech_feature_extraction.git

After cloning the repository, navigate to the repository directory and execute::

    python setup.py develop

~~~~~
PyPI
~~~~~

The package is available on PyPI. For direct installation simply execute the following:

.. code-block:: shell

    pip install speechpy


------------------------------------------
What Features are supported?
------------------------------------------
- Mel Frequency Cepstral Coefficients (MFCCs)
- Filterbank Energies
- Log Filterbank Energies

Please refer to `SpeechPy Official Project Documentation`_ for details about the supported features.

~~~~~~~~~~~~~~
MFCC Features
~~~~~~~~~~~~~~

|pic1| |pic2|

.. |pic1| image:: _images/Speech_GIF.gif
   :width: 45%

.. |pic2| image:: _images/pipeline.jpg
   :width: 45%

The supported attributes for generating MFCC features can be seen by investigating the related function:

.. code-block:: python

    def mfcc(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, num_cepstral=13,
             num_filters=40, fft_length=512, low_frequency=0, high_frequency=None, dc_elimination=True):
        """Compute MFCC features from an audio signal.
        :param signal: the audio signal from which to compute features. Should be an N x 1 array.
        :param sampling_frequency: the sampling frequency of the signal we are working with.
        :param frame_length: the length of each frame in seconds. Default is 0.020s.
        :param frame_stride: the step between successive frames in seconds. Default is 0.01s.
        :param num_filters: the number of filters in the filterbank. Default is 40.
        :param fft_length: number of FFT points. Default is 512.
        :param low_frequency: lowest band edge of mel filters. In Hz, default is 0.
        :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2.
        :param num_cepstral: number of cepstral coefficients. Default is 13.
        :param dc_elimination: whether the first DC component should be eliminated or not.
        :returns: A numpy array of size (num_frames x num_cepstral) containing MFCC features.
        """
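
As a minimal usage sketch (``sample.wav`` is a placeholder path standing in for any mono WAV file readable by ``scipy.io.wavfile``):

.. code-block:: python

    import scipy.io.wavfile as wav
    import speechpy

    # 'sample.wav' is a placeholder; replace it with a real mono WAV file.
    fs, signal = wav.read('sample.wav')

    # 13 cepstral coefficients from 20 ms frames with a 10 ms stride.
    mfcc_features = speechpy.feature.mfcc(signal, sampling_frequency=fs,
                                          frame_length=0.020, frame_stride=0.01,
                                          num_cepstral=13, num_filters=40,
                                          fft_length=512)
    print('MFCC feature shape=', mfcc_features.shape)  # (num_frames, 13)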

~~~~~~~~~~~~~~~~~~~~~~~~~~~
Filterbank Energy Features
~~~~~~~~~~~~~~~~~~~~~~~~~~~


.. code-block:: python

    def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
            num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
        """Compute Mel-filterbank energy features from an audio signal.
        :param signal: the audio signal from which to compute features. Should be an N x 1 array.
        :param sampling_frequency: the sampling frequency of the signal we are working with.
        :param frame_length: the length of each frame in seconds. Default is 0.020s.
        :param frame_stride: the step between successive frames in seconds. Default is 0.01s.
        :param num_filters: the number of filters in the filterbank. Default is 40.
        :param fft_length: number of FFT points. Default is 512.
        :param low_frequency: lowest band edge of mel filters. In Hz, default is 0.
        :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2.
        :returns:
                  features: the energy of the filterbank: num_frames x num_filters
                  frame_energies: the energy of each frame: num_frames x 1
        """

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Log Filterbank Energy Features
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The attributes for ``lmfe`` (log filterbank energies) are the same as those for ``mfe`` (filterbank energies).

.. code-block:: python

    def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
             num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
        """Compute log Mel-filterbank energy features from an audio signal.
        :param signal: the audio signal from which to compute features. Should be an N x 1 array.
        :param sampling_frequency: the sampling frequency of the signal we are working with.
        :param frame_length: the length of each frame in seconds. Default is 0.020s.
        :param frame_stride: the step between successive frames in seconds. Default is 0.01s.
        :param num_filters: the number of filters in the filterbank. Default is 40.
        :param fft_length: number of FFT points. Default is 512.
        :param low_frequency: lowest band edge of mel filters. In Hz, default is 0.
        :param high_frequency: highest band edge of mel filters. In Hz, default is samplerate/2.
        :returns:
                  features: the log energy of the filterbank: num_frames x num_filters
                  frame_log_energies: the log energy of each frame: num_frames x 1
        """
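
A hedged sketch of calling the two functions above (``sample.wav`` is again a placeholder path; per the docstring, ``mfe`` returns the filterbank energies together with the per-frame energies, while the example later in this README assigns the ``lmfe`` result to a single array):

.. code-block:: python

    import scipy.io.wavfile as wav
    import speechpy

    fs, signal = wav.read('sample.wav')  # placeholder path

    # Mel-filterbank energies plus one energy value per frame.
    features, frame_energies = speechpy.feature.mfe(signal, sampling_frequency=fs,
                                                    frame_length=0.020, frame_stride=0.01,
                                                    num_filters=40, fft_length=512)
    print('filterbank energies shape=', features.shape)  # (num_frames, 40)

    # The log variant takes the same parameters.
    log_features = speechpy.feature.lmfe(signal, sampling_frequency=fs,
                                         frame_length=0.020, frame_stride=0.01,
                                         num_filters=40, fft_length=512)
    print('log filterbank energies shape=', log_features.shape)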

~~~~~~~~~~~~
Stack Frames
~~~~~~~~~~~~

The ``stack_frames`` function generates the stack of frames from the signal.

.. code-block:: python

    def stack_frames(sig, sampling_frequency, frame_length=0.020, frame_stride=0.020,
                     filter=lambda x: numpy.ones((x,)), zero_padding=True):
        """Frame a signal into overlapping frames.
        :param sig: The audio signal to frame, of size (N,).
        :param sampling_frequency: The sampling frequency of the signal.
        :param frame_length: The length of each frame in seconds.
        :param frame_stride: The stride between successive frames in seconds.
        :param filter: The time-domain filter applied to each frame. By default it is all ones, so nothing is changed.
        :param zero_padding: If the number of samples is not a multiple of the frame length (in samples), the signal
            is zero-padded to generate the last frame.
        :returns: Array of frames, of size (number_of_frames x frame_length).
        """
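
A minimal sketch of framing a signal directly (the default all-ones filter is used, so the frames are unweighted; ``sample.wav`` is a placeholder path):

.. code-block:: python

    import scipy.io.wavfile as wav
    import speechpy

    fs, signal = wav.read('sample.wav')  # placeholder path

    # 20 ms frames with a 10 ms stride (50% overlap); the tail is zero-padded.
    frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs,
                                              frame_length=0.020, frame_stride=0.010,
                                              zero_padding=True)
    print('frames shape=', frames.shape)  # (number_of_frames, samples_per_frame)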

---------------------
Post Processing
---------------------

Some post-processing operations are supported in ``speechpy``.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Global cepstral mean and variance normalization (CMVN)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This function performs global cepstral mean and variance normalization
(CMVN) to remove the channel effects. The code assumes that there is one
observation per row.

.. code-block:: python

    def cmvn(vec, variance_normalization=False):
        """
        This function performs global ``cepstral mean and variance normalization``
        (CMVN) on the input feature vector "vec". The code assumes that there is one observation per row.

        :param vec: input feature matrix (size: (num_observation, num_features)).
        :param variance_normalization: whether variance normalization should be performed or not.
        :return: The mean (or mean + variance) normalized feature vector.
        """


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Local cepstral mean and variance normalization (CMVN) over a sliding window
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This function performs local cepstral mean and variance normalization
(CMVN) over sliding windows. The code assumes that there is one
observation per row.

.. code-block:: python

    def cmvnw(vec, win_size=301, variance_normalization=False):
        """
        This function performs local cepstral mean and variance normalization (CMVN)
        on the input feature vector "vec" over a sliding window. The code assumes that there is one observation per row.

        :param vec: input feature matrix (size: (num_observation, num_features)).
        :param win_size: the size of the sliding window for local normalization; it should be odd.
            The default is 301, which is around 3s at a 100 Hz frame rate (i.e., a 10 ms frame stride).
        :param variance_normalization: whether variance normalization should be performed or not.
        :return: The mean (or mean + variance) normalized feature vector.
        """
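
As a hedged sketch, both normalizations apply to any feature matrix with one observation per row (a random matrix stands in for real features here):

.. code-block:: python

    import numpy as np
    import speechpy

    # A stand-in feature matrix: 500 frames of 13-dimensional features.
    features = np.random.randn(500, 13)

    # Global mean and variance normalization.
    features_cmvn = speechpy.processing.cmvn(features, variance_normalization=True)

    # Local normalization over a sliding window; win_size must be odd.
    features_cmvnw = speechpy.processing.cmvnw(features, win_size=301,
                                               variance_normalization=True)
    print(features_cmvn.shape, features_cmvnw.shape)  # both (500, 13)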

-----
Tests
-----

SpeechPy includes some unit tests. To run the tests, ``cd`` into the
``speechpy/tests`` directory and run:

.. code-block:: shell

    python -m pytest

The only requirement you need to install for the tests is ``pytest``.

------------
Example
------------

A test example (see the scripts in the ``example`` directory) is shown below:

.. code-block:: python

    import scipy.io.wavfile as wav
    import numpy as np
    import speechpy
    import os

    file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
    fs, signal = wav.read(file_name)
    signal = signal[:,0]

    # Example of pre-emphasizing.
    signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98)

    # Example of stacking frames
    frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                                              filter=lambda x: np.ones((x,)), zero_padding=True)

    # Example of extracting the power spectrum
    power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512)
    print('power spectrum shape=', power_spectrum.shape)

    ############# Extract MFCC features #############
    mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                                 num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
    mfcc_cmvn = speechpy.processing.cmvnw(mfcc, win_size=301, variance_normalization=True)
    print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape)

    mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc)
    print('mfcc feature cube shape=', mfcc_feature_cube.shape)

    ############# Extract log-energy features #############
    logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01,
                                      num_filters=40, fft_length=512, low_frequency=0, high_frequency=None)
    logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy)
    print('logenergy features=', logenergy.shape)

To extract the features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked-frames collection.

---------------------
Dependencies
---------------------

``SciPy`` and ``NumPy`` are the two required dependencies, and they will be installed automatically by running the ``setup.py`` file.

---------------------
Acknowledgements
---------------------

This work is based upon work supported by the Center for Identification Technology Research and the National Science Foundation under Grant #1650474.


---------------------
Contributing
---------------------

When contributing to this repository, you are more than welcome to discuss your feedback with any of the owners of this repository. *For typos, please do not create a pull request. Instead, declare them in issues or email the repository owner*. For technical and conceptual questions please feel free to **directly contact the repository owner**. Before asking general questions related to the concepts and techniques provided in this project, **please make sure to read and understand its associated paper**.

~~~~~~~~~~~~~~~~~~~~~~~~
Pull Request Process
~~~~~~~~~~~~~~~~~~~~~~~~

Please consider the following criteria so that you can help us more effectively:

1. The pull request is mainly expected to be a code script suggestion or improvement.
2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section.
3. Ensure any install or build dependencies are removed before the end of the layer when doing a build and creating a pull request.
4. Add comments with details of changes to the interface; this includes new environment variables, exposed ports, useful file locations and container parameters.
5. You may merge the pull request once you have the sign-off of at least one other developer. If you do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed.

~~~~~~~~~~~~~~~~~~~~~~~~
Declaring issues
~~~~~~~~~~~~~~~~~~~~~~~~

To declare issues, you can directly email the repository owner. However, please prefer to create an issue, as other followers of the repository may encounter the same problem; that way, the question will be answered for other developers as well.

~~~~~~~~~~~~~~~~~~~~~~~~
Final Note
~~~~~~~~~~~~~~~~~~~~~~~~

We are looking forward to your kind feedback. Please help us improve this open source project and make our work better. To contribute, please create a pull request and we will investigate it promptly. Once again, we appreciate your kind feedback and elaborate code inspections.



---------------------
Disclaimer
---------------------

Although it has since undergone dramatic changes, some portion of this library was inspired by the `python speech features`_ library.

.. _python speech features: https://github.com/jameslyons/python_speech_features

We claim the following advantages for our library:

1. More accurate operations have been performed for the mel-frequency calculations.
2. The package supports different ``Python`` versions.
3. The features are generated in a more organized way, as cubic features.
4. The package is well-tested and integrated.
5. The package is up-to-date and actively developed.
6. The package has been used for research purposes.
7. Exceptions and extreme cases are handled in this library.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/__init__.py
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
theme: jekyll-theme-architect
--------------------------------------------------------------------------------
/_images/Speech_GIF.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/Speech_GIF.gif
--------------------------------------------------------------------------------
/_images/follow-twitter.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/follow-twitter.gif
--------------------------------------------------------------------------------
/_images/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/pipeline.jpg
--------------------------------------------------------------------------------
/_images/readme.rst:
--------------------------------------------------------------------------------
The images used for this repository.
2 | -------------------------------------------------------------------------------- /_images/speech.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/speech.gif -------------------------------------------------------------------------------- /_images/speechpy_logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/speechpy_logo.gif -------------------------------------------------------------------------------- /_images/stackframes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/_images/stackframes.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = SpeechPy 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | -e git://github.com/snide/sphinx_rtd_theme.git#egg=sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /docs/source/_static/img/08063416.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/08063416.pdf -------------------------------------------------------------------------------- /docs/source/_static/img/Speech_GIF.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/Speech_GIF.gif -------------------------------------------------------------------------------- /docs/source/_static/img/installation_logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/installation_logo.gif -------------------------------------------------------------------------------- /docs/source/_static/img/installation_logo.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/installation_logo.jpg -------------------------------------------------------------------------------- /docs/source/_static/img/speech.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speech.gif -------------------------------------------------------------------------------- /docs/source/_static/img/speech.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speech.jpg -------------------------------------------------------------------------------- /docs/source/_static/img/speechpy_logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speechpy_logo.gif -------------------------------------------------------------------------------- /docs/source/_static/img/speechpy_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/speechpy_logo.jpg -------------------------------------------------------------------------------- /docs/source/_static/img/stackframes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/docs/source/_static/img/stackframes.png -------------------------------------------------------------------------------- /docs/source/_templates/breadcrumbs.html: -------------------------------------------------------------------------------- 1 | {# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #} 2 | 3 | {% if page_source_suffix %} 4 | {% set suffix = page_source_suffix %} 5 | {% else %} 6 | {% set suffix = source_suffix %} 7 | {% endif %} 8 | 9 | {% if meta is defined and meta is not none %} 10 | {% set check_meta = True %} 11 | {% else %} 12 | {% set check_meta = False %} 13 | {% endif %} 14 | 15 | {% if check_meta and 'github_url' in meta %} 16 | {% set display_github = True %} 17 | {% endif %} 18 | 19 | {% if check_meta and 'bitbucket_url' in meta %} 20 | {% set display_bitbucket = True %} 21 | {% endif %} 22 | 23 | {% if check_meta and 'gitlab_url' in meta %} 24 | {% set display_gitlab = True %} 25 | {% endif %} 26 | 27 |
28 | 29 | 70 | 71 | {% if (theme_prev_next_buttons_location == 'top' or theme_prev_next_buttons_location == 'both') and (next or prev) %} 72 | 80 | {% endif %} 81 |
82 |
83 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # SpeechPy documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Nov 22 14:40:49 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | sys.path.insert(0, os.path.abspath('../../')) 22 | import speechpy 23 | import numpy 24 | import sphinx_rtd_theme 25 | 26 | 27 | # -- General configuration ------------------------------------------------ 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.autosummary', 39 | 'sphinx.ext.doctest', 40 | 'sphinx.ext.intersphinx', 41 | 'sphinx.ext.todo', 42 | 'sphinx.ext.coverage', 43 | 'sphinx.ext.mathjax', 44 | 'sphinx.ext.napoleon', 45 | 'sphinx.ext.viewcode', 46 | # 'sphinxcontrib.googleanalytics', 47 | ] 48 | 49 | # True to use the :ivar: role for instance variables. False to use the .. attribute:: directive instead. Defaults to False. 50 | # Refer to http://www.sphinx-doc.org/en/stable/ext/napoleon.html 51 | napoleon_use_ivar = True 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | # source_suffix = ['.rst', '.md'] 60 | source_suffix = '.rst' 61 | 62 | # The master toctree document. 63 | master_doc = 'index' 64 | 65 | # General information about the project. 66 | project = u'SpeechPy' 67 | copyright = u'2017, Amirsina Torfi' 68 | author = u'Amirsina Torfi' 69 | 70 | # The version info for the project you're documenting, acts as replacement for 71 | # |version| and |release|, also used in various other places throughout the 72 | # built documents. 73 | # 74 | version = 'master (' + '2.3.0' + ' )' 75 | # The full version, including alpha/beta/rc tags. 76 | # TODO: verify this works as expected 77 | release = 'master' 78 | 79 | # The language for content autogenerated by Sphinx. Refer to documentation 80 | # for a list of supported languages. 81 | # 82 | # This is also used if you do content translation via gettext catalogs. 83 | # Usually you set "language" from the command line for these cases. 84 | language = None 85 | 86 | # List of patterns, relative to source directory, that match files and 87 | # directories to ignore when looking for source files. 88 | # This patterns also effect to html_static_path and html_extra_path 89 | exclude_patterns = [] 90 | 91 | # The name of the Pygments (syntax highlighting) style to use. 
92 | pygments_style = 'sphinx' 93 | 94 | # If true, `todo` and `todoList` produce output, else they produce nothing. 95 | todo_include_todos = False 96 | 97 | 98 | # -- Options for HTML output ---------------------------------------------- 99 | 100 | # The theme to use for HTML and HTML Help pages. See the documentation for 101 | # a list of builtin themes. 102 | # 103 | html_theme = 'sphinx_rtd_theme' 104 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 105 | 106 | # Theme options are theme-specific and customize the look and feel of a theme 107 | # further. For a list of options available for each theme, see the 108 | # documentation. 109 | # 110 | # html_theme_options = {} 111 | 112 | html_theme_options = { 113 | 'collapse_navigation': False, 114 | 'display_version': True, 115 | 'logo_only': True, 116 | 'sticky_navigation': False 117 | } 118 | 119 | html_context = { 120 | "display_github": True, # Add 'Edit on Github' link instead of 'View page source' 121 | "last_updated": True, 122 | "commit": False, 123 | } 124 | 125 | html_logo = '_static/img/speechpy_logo.gif' 126 | 127 | # Add any paths that contain custom static files (such as style sheets) here, 128 | # relative to this directory. They are copied after the builtin static files, 129 | # so a file named "default.css" will overwrite the builtin "default.css". 130 | html_static_path = ['_static'] 131 | 132 | # -- Options for HTMLHelp output ------------------------------------------ 133 | 134 | # Output file base name for HTML help builder. 135 | htmlhelp_basename = 'SpeechPydoc' 136 | 137 | 138 | # -- Options for LaTeX output --------------------------------------------- 139 | 140 | # -- Options for LaTeX output --------------------------------------------- 141 | 142 | # latex_engine = 'pdflatex' 143 | 144 | # latex_engine = 'lualatex' 145 | # latex_elements = { 146 | 147 | # 'papersize': 'a4paper', 148 | # 'releasename':" ", 149 | # 'figure_align':'htbp', 150 | # 'pointsize': '12pt', 151 | # 'fontpkg': r''' 152 | # \setmainfont{Times New Roman} 153 | # \setsansfont{Times New Roman} 154 | # \setmonofont{Times New Roman} 155 | # ''', 156 | # 'preamble': r''' 157 | # \usepackage[titles]{tocloft} 158 | # \cftsetpnumwidth {1.25cm}\cftsetrmarg{1.5cm} 159 | # \setlength{\cftchapnumwidth}{0.75cm} 160 | # \setlength{\cftsecindent}{\cftchapnumwidth} 161 | # \setlength{\cftsecnumwidth}{1.25cm} 162 | # ''', 163 | # 'fncychap': r'\usepackage[Bjornstrup]{fncychap}', 164 | # 'printindex': r'\footnotesize\raggedright\printindex', 165 | # } 166 | 167 | 168 | 169 | 170 | latex_elements = { 171 | # The paper size ('letterpaper' or 'a4paper'). 172 | # 173 | 'papersize': 'letterpaper', 174 | 175 | # The font size ('10pt', '11pt' or '12pt'). 176 | # 177 | 'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | # 181 | 'preamble': '', 182 | 183 | # Latex figure (float) alignment 184 | # 185 | 'figure_align': 'htbp', 186 | } 187 | 188 | # Grouping the document tree into LaTeX files. List of tuples 189 | # (source start file, target name, title, 190 | # author, documentclass [howto, manual, or own class]). 191 | latex_documents = [ 192 | (master_doc, 'test.tex', u'test Documentation', 193 | u'test', 'manual'), 194 | ] 195 | 196 | # The name of an image file (relative to this directory) to place at the top of 197 | # the title page. 198 | # 199 | # latex_logo = None 200 | 201 | # If true, show page references after internal links. 
202 | # 203 | # latex_show_pagerefs = False 204 | 205 | # If true, show URL addresses after external links. 206 | # 207 | # latex_show_urls = False 208 | 209 | # Documents to append as an appendix to all manuals. 210 | # 211 | # latex_appendices = [] 212 | 213 | # If false, no module index is generated. 214 | # 215 | # latex_domain_indices = True 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | latex_logo = '_static/img/speechpy_logo.jpg' 232 | 233 | 234 | # latex_elements = { 235 | # # The paper size ('letterpaper' or 'a4paper'). 236 | # # 237 | # 'papersize': 'a4paper', 238 | # 'releasename':" ", 239 | # # Sonny, Lenny, Glenn, Conny, Rejne, Bjarne and Bjornstrup 240 | # # 'fncychap': '\\usepackage[Lenny]{fncychap}', 241 | # 'fncychap': '\\usepackage{fncychap}', 242 | # # 'fontpkg': ' ', 243 | # 244 | # 'figure_align':'htbp', 245 | # # The font size ('10pt', '11pt' or '12pt'). 246 | # # 247 | # 'pointsize': '14pt', 248 | # 249 | # # Additional stuff for the LaTeX preamble. 250 | # 251 | # # 'preamble': r''' 252 | # # %%%%%%%%%%%%%%%%%%%% Sina %%%%%%%%%%%%%%%%%% 253 | # # %%%add number to subsubsection 2=subsection, 3=subsubsection 254 | # # %%% below subsubsection is not good idea. 255 | # # \setcounter{secnumdepth}{3} 256 | # # % 257 | # # %%%% Table of content upto 2=subsection, 3=subsubsection 258 | # # %\setcounter{tocdepth}{2} 259 | # # 260 | # # \usepackage{amsmath,amsfonts,amssymb,amsthm} 261 | # # \usepackage{graphicx} 262 | # # 263 | # # %\usepackage{minted} 264 | # # %\fvset{breaklines=true} 265 | # # 266 | # # %%% reduce spaces for Table of contents, figures and tables 267 | # # %%% it is used "\addtocontents{toc}{\vskip -1.2cm}" etc. in the document 268 | # # \usepackage[notlot,nottoc,notlof]{} 269 | # # 270 | # # \usepackage{color} 271 | # # \usepackage{transparent} 272 | # # \usepackage{eso-pic} 273 | # # \usepackage{lipsum} 274 | # # 275 | # # \usepackage{footnotebackref} %%link at the footnote to go to the place of footnote in the text 276 | # # 277 | # # %% spacing between line 278 | # # \usepackage{setspace} 279 | # # %%%%\onehalfspacing 280 | # # %%%%\doublespacing 281 | # # %\singlespacing 282 | # # 283 | # # 284 | # # %%%%%%%%%%% datetime 285 | # # \usepackage{datetime} 286 | # # 287 | # # \newdateformat{MonthYearFormat}{% 288 | # # \monthname[\THEMONTH], \THEYEAR} 289 | # # 290 | # # 291 | # # %% RO, LE will not work for 'oneside' layout. 292 | # # %% Change oneside to twoside in document class 293 | # # %\usepackage{fancyhdr} 294 | # # %\pagestyle{fancy} 295 | # # %\fancyhf{} 296 | # # 297 | # # %%% Alternating Header for oneside 298 | # # %\fancyhead[L]{\ifthenelse{\isodd{\value{page}}}{ \small \nouppercase{\leftmark} }{}} 299 | # # %\fancyhead[R]{\ifthenelse{\isodd{\value{page}}}{}{ \small \nouppercase{\rightmark} }} 300 | # # 301 | # # %%% Alternating Header for two side 302 | # # %\fancyhead[RO]{\small \nouppercase{\rightmark}} 303 | # # %\fancyhead[LE]{\small \nouppercase{\leftmark}} 304 | # # 305 | # # %% for oneside: change footer at right side. If you want to use Left and right then use same as header defined above. 
306 | # # %\fancyfoot[R]{\ifthenelse{\isodd{\value{page}}}{{\tiny Amirsina Torfi} }{\href{https://github.com/astorfi/speechpy}{\tiny SpeechPy}}} 307 | # # 308 | # # %%% Alternating Footer for two side 309 | # # %\fancyfoot[RO, RE]{\scriptsize Amirsina Torfi (amirsina.torfi@gmail.com)} 310 | # # 311 | # # %%% page number 312 | # # %\fancyfoot[CO, CE]{\thepage} 313 | # # 314 | # # %\renewcommand{\headrulewidth}{0.5pt} 315 | # # %\renewcommand{\footrulewidth}{0.5pt} 316 | # # 317 | # # %\RequirePackage{tocbibind} %%% comment this to remove page number for following 318 | # # %\addto\captionsenglish{\renewcommand{\contentsname}{Table of contents}} 319 | # # %\addto\captionsenglish{\renewcommand{\listfigurename}{List of figures}} 320 | # # %\addto\captionsenglish{\renewcommand{\listtablename}{List of tables}} 321 | # # %\addto\captionsenglish{\renewcommand{\listtablename}{List of tables}} %%% Heading for TOC 322 | # # 323 | # # 324 | # # %%reduce spacing for itemize 325 | # # \usepackage{enumitem} 326 | # # %\setlist{nosep} 327 | # # 328 | # # %%%%%%%%%%% Quote Styles at the top of chapter 329 | # # %\usepackage{epigraph} 330 | # # %\setlength{\epigraphwidth}{0.8\columnwidth} 331 | # # %\newcommand{\chapterquote}[2]{\epigraphhead[60]{\epigraph{\textit{#1}}{\textbf {\textit{--#2}}}}} 332 | # # %%%%%%%%%%% Quote for all places except Chapter 333 | # # %\newcommand{\sectionquote}[2]{{\quote{\textit{``#1''}}{\textbf {\textit{--#2}}}}} 334 | # # ''', 335 | # # 336 | # # 337 | # # 'maketitle': r''' 338 | # # \pagenumbering{Roman} %%% to avoid page 1 conflict with actual page 1 339 | # # 340 | # # \begin{titlepage} 341 | # # \centering 342 | # # 343 | # # \vspace*{40mm} %%% * is used to give space from top 344 | # # \textbf{\Huge {SpeechPy: Speech Recognition Library}} 345 | # # 346 | # # \vspace{0mm} 347 | # # \begin{figure}[!h] 348 | # # \centering 349 | # # \includegraphics[scale=0.8]{speechpy_logo.jpg} 350 | # # \end{figure} 351 | # # 352 | # # \vspace{0mm} 353 | # # \Large \textbf{{Amirsina Torfi}} 354 | # # 355 | # # % \small Created on : Octorber, 2017 356 | # # 357 | # # \vspace*{0mm} 358 | # # \small Last updated : \MonthYearFormat\today 359 | # # 360 | # # 361 | # # %% \vfill adds at the bottom 362 | # # \vfill 363 | # # \small \textit{Please refer to project repository at }{\href{https://github.com/astorfi/speechpy}{SpeechPy}} 364 | # # \end{titlepage} 365 | # # 366 | # # \clearpage 367 | # # \pagenumbering{roman} 368 | # # \tableofcontents 369 | # # % \listoffigures 370 | # # % \listoftables 371 | # # \clearpage 372 | # # \pagenumbering{english} 373 | # # 374 | # # ''', 375 | # # Latex figure (float) alignment 376 | # # 377 | # # 'figure_align': 'htbp', 378 | # # 'sphinxsetup': \ 379 | # # #'hmargin={0.7in,0.7in}, vmargin={1in,1in}, \ 380 | # # 'verbatimwithframe=true, \ 381 | # # TitleColor={rgb}{0,0,0}', 382 | # # 'tableofcontents':' ', 383 | # 384 | # } 385 | 386 | 387 | 388 | # Grouping the document tree into LaTeX files. List of tuples 389 | # (source start file, target name, title, 390 | # author, documentclass [howto, manual, or own class]). 391 | latex_documents = [ 392 | (master_doc, 'speechpy.tex', 'SpeechPy Documentation', 393 | 'Amirsina Torfi', 'manual'), 394 | ] 395 | 396 | 397 | # -- Options for manual page output --------------------------------------- 398 | 399 | # One entry per manual page. List of tuples 400 | # (source start file, name, description, authors, manual section). 
401 | man_pages = [ 402 | (master_doc, 'speechpy', u'SpeechPy Documentation', 403 | [author], 1) 404 | ] 405 | 406 | 407 | # -- Options for Texinfo output ------------------------------------------- 408 | 409 | # Grouping the document tree into Texinfo files. List of tuples 410 | # (source start file, target name, title, author, 411 | # dir menu entry, description, category) 412 | texinfo_documents = [ 413 | (master_doc, 'SpeechPy', u'SpeechPy Documentation', 414 | author, 'SpeechPy', 'A library for Speech Recognition and Feature Extraction.', 415 | 'Miscellaneous'), 416 | ] 417 | 418 | 419 | # Example configuration for intersphinx: refer to the Python standard library. 420 | intersphinx_mapping = { 421 | 'python': ('https://docs.python.org/3/', None), 422 | 'numpy': ('https://docs.scipy.org/doc/numpy/', None), 423 | } 424 | 425 | 426 | -------------------------------------------------------------------------------- /docs/source/content/features.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | Features 5 | ========= 6 | 7 | .. automodule:: speechpy.feature 8 | .. currentmodule:: speechpy.feature 9 | 10 | 11 | :hidden:`MFCC` 12 | ~~~~~~~~~~~~~~ 13 | 14 | .. autofunction:: speechpy.feature.mfcc 15 | 16 | 17 | :hidden:`Mel Frequency Energy` 18 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. autofunction:: speechpy.feature.mfe 21 | 22 | 23 | :hidden:`Log Mel Frequency Energy` 24 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | .. autofunction:: speechpy.feature.lmfe 27 | 28 | 29 | :hidden:`Extract Derivative Features` 30 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 31 | 32 | .. autofunction:: speechpy.feature.extract_derivative_feature 33 | -------------------------------------------------------------------------------- /docs/source/content/postprocessing.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | postprocessing 6 | ============== 7 | 8 | .. automodule:: speechpy.processing 9 | .. currentmodule:: speechpy.processing 10 | 11 | 12 | :hidden:`Global Cepstral Mean and Variance Normalization` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autofunction:: speechpy.processing.cmvn 16 | 17 | 18 | :hidden:`Local Cepstral Mean and Variance Normalization over Sliding Window` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autofunction:: speechpy.processing.cmvnw 22 | -------------------------------------------------------------------------------- /docs/source/content/preprocessing.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | 4 | 5 | Preprocessing 6 | ============= 7 | 8 | .. automodule:: speechpy.processing 9 | .. currentmodule:: speechpy.processing 10 | 11 | :hidden:`Pre-emphasis` 12 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | .. autofunction:: speechpy.processing.preemphasis 14 | 15 | :hidden:`Stacking` 16 | ~~~~~~~~~~~~~~~~~~ 17 | .. autofunction:: speechpy.processing.stack_frames 18 | 19 | :hidden:`FFT Spectrum` 20 | ~~~~~~~~~~~~~~~~~~~~~~ 21 | .. autofunction:: speechpy.processing.fft_spectrum 22 | 23 | :hidden:`Power Spectrum` 24 | ~~~~~~~~~~~~~~~~~~~~~~~~ 25 | .. autofunction:: speechpy.processing.power_spectrum 26 | 27 | :hidden:`Power Spectrum Log` 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | .. 
autofunction:: speechpy.processing.log_power_spectrum 30 | 31 | :hidden:`Derivative Extraction` 32 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | .. autofunction:: speechpy.processing.derivative_extraction 35 | -------------------------------------------------------------------------------- /docs/source/epilogue/CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | 2 | ======================== 3 | Contributing 4 | ======================== 5 | 6 | When contributing to this repository, you are more than welcome to discuss your feedback with any of the owners of this repository. *For typos, please do not create a pull request. Instead, declare them in issues or email the repository owner*. For technical and conceptual questions please feel free to **directly contact the repository owner**. Before asking general questions related to the concepts and techniques provided in this project, **please make sure to read and understand its associated paper**. 7 | 8 | Please note we have a code of conduct; please follow it in all your interactions with the project. 9 | 10 | ---------------------- 11 | Pull Request Process 12 | ---------------------- 13 | 14 | Please consider the following criteria to help us review your contribution effectively: 15 | 16 | 1. The pull request is mainly expected to be a code script suggestion or improvement. 17 | 2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section. 18 | 3. Ensure any install or build dependencies are removed before the end of the layer when doing a 19 | build and creating a pull request. 20 | 4. Add comments with details of changes to the interface; this includes new environment 21 | variables, exposed ports, useful file locations and container parameters. 22 | 5. You may merge the pull request once you have the sign-off of at least one other developer, or, if you 23 | do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed. 24 | 25 | ---------------------- 26 | Final Note 27 | ---------------------- 28 | 29 | We look forward to your kind feedback. Please help us improve this open source project and make our work better. 30 | For contribution, please create a pull request and we will review it promptly. Once again, we appreciate 31 | your kind feedback and thorough code inspections. 32 | -------------------------------------------------------------------------------- /docs/source/epilogue/finalnote.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Epilogue 3 | ================== 4 | 5 | ------------- 6 | Disclaimer 7 | ------------- 8 | 9 | Although dramatically changed, some portions of this library are inspired by the `python speech features`_ library. 10 | 11 | .. _python speech features: https://github.com/jameslyons/python_speech_features 12 | 13 | We claim the following advantages for our library: 14 | 15 | 1. More accurate operations have been performed for the mel-frequency calculations. 16 | 2. The package supports different ``Python`` versions. 17 | 3. The features are generated in a more organized way, as cubic features. 18 | 4. The package is well-tested and integrated. 19 | 5. The package is up-to-date and actively developed. 20 | 6. The package has been used for research purposes. 21 | 7. 
Exceptions and extreme cases are handled in this library. 22 | 23 | 24 | ------------- 25 | Contributing 26 | ------------- 27 | 28 | When contributing to this repository, please first discuss the change you wish to make via issue, 29 | email, or any other method with the owners of this repository before making a change. *For typos, please 30 | do not create a pull request. Instead, declare them in issues or email the repository owner*. 31 | 32 | Please note we have a code of conduct; please follow it in all your interactions with the project. 33 | 34 | -------------------------- 35 | Pull Request Process 36 | -------------------------- 37 | 38 | Please consider the following criteria to help us review your contribution effectively: 39 | 40 | 1. The pull request is mainly expected to be a code script suggestion or improvement. 41 | 2. A pull request related to non-code-script sections is expected to make a significant difference in the documentation. Otherwise, it is expected to be announced in the issues section. 42 | 3. Ensure any install or build dependencies are removed before the end of the layer when doing a 43 | build and creating a pull request. 44 | 4. Add comments with details of changes to the interface; this includes new environment 45 | variables, exposed ports, useful file locations and container parameters. 46 | 5. You may merge the pull request once you have the sign-off of at least one other developer, or, if you 47 | do not have permission to do that, you may request the owner to merge it for you once you believe all checks have passed. 48 | 49 | ------------- 50 | Final Note 51 | ------------- 52 | 53 | We look forward to your kind feedback. Please help us improve this open source project and make our work better. 54 | For contribution, please create a pull request and we will review it promptly. Once again, we appreciate 55 | your kind feedback and thorough code inspections. 56 | -------------------------------------------------------------------------------- /docs/source/epilogue/test.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | test 3 | ============ 4 | 5 | ------------- 6 | Test Package 7 | ------------- 8 | Once the package has been installed, a test file can be run directly to show the results. 9 | The test example can be seen in ``example/test_package.py`` as below: 10 | 11 | .. code-block:: python 12 | 13 | import scipy.io.wavfile as wav 14 | import numpy as np 15 | import speechpy 16 | import os 17 | 18 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav') 19 | fs, signal = wav.read(file_name) 20 | signal = signal[:,0] 21 | 22 | # Example of pre-emphasizing.
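# (Pre-emphasis applies the first-order filter y[n] = x[n] - cof * x[n-1],
# boosting the high frequencies that are attenuated in speech production;
# cof is typically chosen in the 0.95-0.98 range.)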
23 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98) 24 | 25 | # Example of stacking frames 26 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, filter=lambda x: np.ones((x,)), 27 | zero_padding=True) 28 | 29 | # Example of extracting power spectrum 30 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512) 31 | print('power spectrum shape=', power_spectrum.shape) 32 | 33 | ############# Extract MFCC features ############# 34 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, 35 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) 36 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True) 37 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape) 38 | 39 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc) 40 | print('mfcc feature cube shape=', mfcc_feature_cube.shape) 41 | 42 | ############# Extract logenergy features ############# 43 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, 44 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) 45 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy) 46 | print('logenergy features=', logenergy.shape) 47 | 48 | 49 | 50 | 51 | 52 | 53 | ----------- 54 | Test Local 55 | ----------- 56 | 57 | There is an alternative, local way of testing without the need to install the package. 58 | The local test example can be found in ``example/test_local.py`` as follows: 59 | 60 | .. code-block:: python 61 | 62 | import scipy.io.wavfile as wav 63 | import numpy as np 64 | import os 65 | import sys 66 | lib_path = os.path.abspath(os.path.join('..')) 67 | print(lib_path) 68 | sys.path.append(lib_path) 69 | import speechpy 70 | import os 71 | 72 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav') 73 | fs, signal = wav.read(file_name) 74 | signal = signal[:,0] 75 | 76 | # Example of pre-emphasizing.
77 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98) 78 | 79 | # Example of stacking frames 80 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, filter=lambda x: np.ones((x,)), 81 | zero_padding=True) 82 | 83 | # Example of extracting power spectrum 84 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512) 85 | print('power spectrum shape=', power_spectrum.shape) 86 | 87 | ############# Extract MFCC features ############# 88 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, 89 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) 90 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301,variance_normalization=True) 91 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape) 92 | 93 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc) 94 | print('mfcc feature cube shape=', mfcc_feature_cube.shape) 95 | 96 | ############# Extract logenergy features ############# 97 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, frame_length=0.020, frame_stride=0.01, 98 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None) 99 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy) 100 | print('logenergy features=', logenergy.shape) 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | To extract the features, the signal samples are first stacked into frames. The features are then computed for each frame in the stacked-frames collection. 117 | 118 | ------------- 119 | Dependencies 120 | ------------- 121 | 122 | The two packages ``SciPy`` and ``NumPy`` are the required dependencies, and they will be installed automatically by running the ``setup.py`` file. 123 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. SpeechPy documentation master file, created by 2 | sphinx-quickstart on Wed Nov 22 14:40:49 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :github_url: https://github.com/astorfi/speechpy 7 | 8 | Welcome to SpeechPy's documentation! 9 | ==================================== 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :caption: Preface 14 | 15 | intro/introductions 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | :caption: Package Reference 20 | 21 | content/preprocessing 22 | content/features 23 | content/postprocessing 24 | 25 | .. toctree:: 26 | :maxdepth: 2 27 | :caption: Epilogue 28 | 29 | epilogue/test 30 | epilogue/CONTRIBUTING 31 | 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | -------------------------------------------------------------------------------- /docs/source/intro/introductions.rst: -------------------------------------------------------------------------------- 1 | 2 | ============ 3 | Introduction 4 | ============ 5 | 6 | ------------------------- 7 | Foreword 8 | ------------------------- 9 | 10 | The purpose of this project is to provide a package for speech processing and 11 | feature extraction. This library provides the most frequently used speech features, including MFCCs and filterbank energies, along with the log-energy of the filterbanks.
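As a quick illustration, the following is a minimal sketch of basic usage (the 16 kHz synthetic noise signal here merely stands in for a real recording):

.. code-block:: python

    import numpy as np
    import speechpy

    fs = 16000  # assumed sampling frequency for this synthetic example
    signal = np.random.uniform(low=-1.0, high=1.0, size=fs)  # 1 s of noise

    # 13 MFCCs per 20 ms frame with a 10 ms stride.
    mfccs = speechpy.feature.mfcc(signal, sampling_frequency=fs,
                                  frame_length=0.020, frame_stride=0.01)

    # Log Mel-filterbank energies over the same framing.
    log_energies = speechpy.feature.lmfe(signal, sampling_frequency=fs,
                                         frame_length=0.020, frame_stride=0.01)
    print(mfccs.shape, log_energies.shape)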
12 | 13 | 14 | .. image:: ../_static/img/speech.jpg 15 | :height: 200px 16 | :width: 400 px 17 | :scale: 100 % 18 | :alt: alternate text 19 | :align: center 20 | 21 | ------------------------- 22 | Motivation 23 | ------------------------- 24 | 25 | There are different motivations for this open source project. 26 | 27 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 | Deep Learning application 29 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 30 | 31 | One of the main reasons for creating this package was to provide the necessary features for deep learning applications such as ASR (Automatic Speech Recognition) or SR (Speaker Recognition). 32 | As a result, most of the necessary features are provided here. 33 | 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | Pythonic Packaging 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | 38 | Another reason for creating this package was to have a Pythonic environment for 39 | speech recognition and feature extraction, due to the fact that the Python language 40 | is becoming ubiquitous! 41 | 42 | 43 | ------------------------- 44 | How to Install? 45 | ------------------------- 46 | 47 | .. image:: ../_static/img/installation_logo.jpg 48 | :height: 100 px 49 | :width: 200 px 50 | :scale: 80 % 51 | :alt: alternate text 52 | :align: center 53 | 54 | 55 | There are two possible ways to install this package: local installation and PyPI. 56 | 57 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 58 | Local Installation 59 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 60 | 61 | For local installation, the repository must first be cloned:: 62 | 63 | git clone https://github.com/astorfi/speech_feature_extraction.git 64 | 65 | 66 | After cloning the repository, navigate to the repository root directory and then execute:: 67 | 68 | python setup.py develop 69 | 70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 71 | PyPI 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 73 | 74 | The package is available on PyPI. For direct installation, simply execute the following: 75 | 76 | 77 | .. code-block:: shell 78 | 79 | pip install speechpy 80 | 81 | ------------------------- 82 | Citation 83 | ------------------------- 84 | 85 | If you use this package, please cite it as follows: 86 | 87 | .. code:: bash 88 | 89 | @misc{amirsina_torfi_2017_840395, 90 | author = {Amirsina Torfi}, 91 | title = {{SpeechPy: Speech recognition and feature extraction}}, 92 | month = aug, 93 | year = 2017, 94 | doi = {10.5281/zenodo.840395}, 95 | url = {https://doi.org/10.5281/zenodo.840395} 96 | } 97 | -------------------------------------------------------------------------------- /example/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/example/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav -------------------------------------------------------------------------------- /example/test_local.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example is provided to test the package locally. 3 | There is no need to install the package using pip. 4 | Only cloning the project repository is required.
5 | """ 6 | 7 | import scipy.io.wavfile as wav 8 | import numpy as np 9 | import os 10 | import sys 11 | lib_path = os.path.abspath(os.path.join('..')) 12 | print(lib_path) 13 | sys.path.append(lib_path) 14 | from speechpy import processing 15 | from speechpy import feature 16 | import os 17 | 18 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)),'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav') 19 | fs, signal = wav.read(file_name) 20 | signal = signal[:,0] 21 | 22 | # Pre-emphasizing. 23 | signal_preemphasized = processing.preemphasis(signal, cof=0.98) 24 | 25 | # Staching frames 26 | frames = processing.stack_frames(signal, sampling_frequency=fs, 27 | frame_length=0.020, 28 | frame_stride=0.01, 29 | filter=lambda x: np.ones((x,)), 30 | zero_padding=True) 31 | 32 | # Extracting power spectrum 33 | power_spectrum = processing.power_spectrum(frames, fft_points=512) 34 | print('power spectrum shape=', power_spectrum.shape) 35 | 36 | ############# Extract MFCC features ############# 37 | mfcc = feature.mfcc(signal, sampling_frequency=fs, 38 | frame_length=0.020, frame_stride=0.01, 39 | num_filters=40, fft_length=512, low_frequency=0, 40 | high_frequency=None) 41 | 42 | # Cepstral mean variance normalization. 43 | mfcc_cmvn = processing.cmvn(mfcc,variance_normalization=True) 44 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape) 45 | 46 | # Extracting derivative features 47 | mfcc_feature_cube = feature.extract_derivative_feature(mfcc) 48 | print('mfcc feature cube shape=', mfcc_feature_cube.shape) 49 | 50 | ############# Extract logenergy features ############# 51 | logenergy = feature.lmfe(signal, sampling_frequency=fs, 52 | frame_length=0.020, frame_stride=0.01, 53 | num_filters=40, fft_length=512, 54 | low_frequency=0, high_frequency=None) 55 | logenergy_feature_cube = feature.extract_derivative_feature(logenergy) 56 | print('logenergy features=', logenergy.shape) 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /example/test_package.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example is provided to test the installed package. 3 | The package should be installed from PyPi using pip install speechpy. 4 | """ 5 | 6 | import scipy.io.wavfile as wav 7 | import numpy as np 8 | import speechpy 9 | import os 10 | 11 | # Reading the sample wave file 12 | file_name = os.path.join(os.path.dirname(os.path.abspath(__file__)), 13 | 'Alesis-Sanctuary-QCard-AcoustcBas-C2.wav') 14 | fs, signal = wav.read(file_name) 15 | signal = signal[:,0] 16 | 17 | # Pre-emphasizing. 
18 | signal_preemphasized = speechpy.processing.preemphasis(signal, cof=0.98) 19 | 20 | # Staching frames 21 | frames = speechpy.processing.stack_frames(signal, sampling_frequency=fs, 22 | frame_length=0.020, 23 | frame_stride=0.01, 24 | filter=lambda x: np.ones((x,)), 25 | zero_padding=True) 26 | 27 | # Extracting power spectrum 28 | power_spectrum = speechpy.processing.power_spectrum(frames, fft_points=512) 29 | print('power spectrum shape=', power_spectrum.shape) 30 | 31 | ############# Extract MFCC features ############# 32 | mfcc = speechpy.feature.mfcc(signal, sampling_frequency=fs, 33 | frame_length=0.020, frame_stride=0.01, 34 | num_filters=40, fft_length=512, low_frequency=0, 35 | high_frequency=None) 36 | mfcc_cmvn = speechpy.processing.cmvnw(mfcc,win_size=301, 37 | variance_normalization=True) 38 | print('mfcc(mean + variance normalized) feature shape=', mfcc_cmvn.shape) 39 | 40 | # Extracting derivative features 41 | mfcc_feature_cube = speechpy.feature.extract_derivative_feature(mfcc) 42 | print('mfcc feature cube shape=', mfcc_feature_cube.shape) 43 | 44 | ############# Extract logenergy features ############# 45 | logenergy = speechpy.feature.lmfe(signal, sampling_frequency=fs, 46 | frame_length=0.020, frame_stride=0.01, 47 | num_filters=40, fft_length=512, 48 | low_frequency=0, high_frequency=None) 49 | logenergy_feature_cube = speechpy.feature.extract_derivative_feature(logenergy) 50 | print('logenergy features=', logenergy.shape) 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{hirsch2000aurora, 2 | title={The Aurora experimental framework for the performance evaluation of speech recognition systems under noisy conditions}, 3 | author={Hirsch, Hans-G{\"u}nter and Pearce, David}, 4 | booktitle={ASR2000-Automatic Speech Recognition: Challenges for the new Millenium ISCA Tutorial and Research Workshop (ITRW)}, 5 | year={2000} 6 | } 7 | 8 | @book{guyon2008feature, 9 | title={Feature extraction: foundations and applications}, 10 | author={Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lofti A}, 11 | volume={207}, 12 | year={2008}, 13 | publisher={Springer} 14 | } 15 | 16 | @article{furui1986speaker, 17 | title={Speaker-independent isolated word recognition using dynamic features of speech spectrum}, 18 | author={Furui, Sadaoki}, 19 | journal={IEEE Transactions on Acoustics, Speech, and Signal Processing}, 20 | volume={34}, 21 | number={1}, 22 | pages={52--59}, 23 | year={1986}, 24 | publisher={IEEE} 25 | } 26 | 27 | @book{yu2016automatic, 28 | title={AUTOMATIC SPEECH RECOGNITION.}, 29 | author={Yu, Dong and Deng, Li}, 30 | year={2016}, 31 | publisher={Springer} 32 | } 33 | 34 | @book{rabiner1993fundamentals, 35 | title={Fundamentals of speech recognition}, 36 | author={Rabiner, Lawrence R and Juang, Biing-Hwang}, 37 | volume={14}, 38 | year={1993}, 39 | publisher={PTR Prentice Hall Englewood Cliffs} 40 | } 41 | 42 | @article{campbell1997speaker, 43 | title={Speaker recognition: A tutorial}, 44 | author={Campbell, Joseph P}, 45 | journal={Proceedings of the IEEE}, 46 | volume={85}, 47 | number={9}, 48 | pages={1437--1462}, 49 | year={1997}, 50 | publisher={IEEE} 51 | } 52 | 53 | 54 | @inproceedings{deng2013recent, 55 | title={Recent advances in deep learning for speech research at Microsoft}, 56 | author={Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, 
Frank and Seltzer, Michael and Zweig, Geoff and He, Xiaodong and Williams, Jason and others}, 57 | booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2013 IEEE International Conference on}, 58 | pages={8604--8608}, 59 | year={2013}, 60 | organization={IEEE} 61 | } 62 | 63 | @inproceedings{lee2009unsupervised, 64 | title={Unsupervised feature learning for audio classification using convolutional deep belief networks}, 65 | author={Lee, Honglak and Pham, Peter and Largman, Yan and Ng, Andrew Y}, 66 | booktitle={Advances in neural information processing systems}, 67 | pages={1096--1104}, 68 | year={2009} 69 | } 70 | 71 | @inproceedings{yu2011improved, 72 | title={Improved bottleneck features using pretrained deep neural networks}, 73 | author={Yu, Dong and Seltzer, Michael L}, 74 | booktitle={Twelfth Annual Conference of the International Speech Communication Association}, 75 | year={2011} 76 | } 77 | 78 | @article{giannakopoulos2015pyaudioanalysis, 79 | title={pyAudioAnalysis: An Open-Source Python Library for Audio Signal Analysis}, 80 | author={Giannakopoulos, Theodoros}, 81 | journal={PloS one}, 82 | volume={10}, 83 | number={12}, 84 | year={2015}, 85 | publisher={Public Library of Science} 86 | } 87 | 88 | @article{torfi2017text, 89 | title={Text-independent speaker verification using 3d convolutional neural networks}, 90 | author={Torfi, Amirsina and Nasrabadi, Nasser M and Dawson, Jeremy}, 91 | journal={arXiv preprint arXiv:1705.09422}, 92 | year={2017} 93 | } 94 | 95 | @article{torfi20173d, 96 | title={3D Convolutional Neural Networks for Cross Audio-Visual Matching Recognition}, 97 | author={Torfi, Amirsina and Iranmanesh, Seyed Mehdi and Nasrabadi, Nasser and Dawson, Jeremy}, 98 | journal={IEEE Access}, 99 | volume={5}, 100 | pages={22081--22091}, 101 | year={2017}, 102 | publisher={IEEE} 103 | } 104 | 105 | @article{prechelt2000empirical, 106 | title={An empirical comparison of c, c++, java, perl, python, rexx and tcl}, 107 | author={Prechelt, Lutz}, 108 | journal={IEEE Computer}, 109 | volume={33}, 110 | number={10}, 111 | pages={23--29}, 112 | year={2000} 113 | } 114 | 115 | @misc{torfispeechpy, 116 | author = {Amirsina Torfi}, 117 | title = {{SpeechPy: Speech recognition and feature extraction}}, 118 | month = aug, 119 | year = 2017, 120 | doi = {10.5281/zenodo.810391}, 121 | url = {https://doi.org/10.5281/zenodo.810391}} 122 | 123 | @article{torfi2017coupled, 124 | title={Coupled 3D Convolutional Neural Networks for Audio-Visual Recognition}, 125 | author={Torfi, Amirsina and Iranmanesh, Seyed Mehdi and Nasrabadi, Nasser M and Dawson, Jeremy}, 126 | journal={arXiv preprint arXiv:1706.05739}, 127 | year={2017} 128 | } 129 | 130 | @article{torfi2017construction, 131 | title={On the Construction of Polar Codes for Achieving the Capacity of Marginal Channels}, 132 | author={Torfi, Amisina and Soleymani, Sobhan and Vakili, Vahid Tabataba}, 133 | journal={arXiv preprint arXiv:1707.04512}, 134 | year={2017} 135 | } 136 | 137 | @article{shannon2001mathematical, 138 | title={A mathematical theory of communication}, 139 | author={Shannon, Claude Elwood}, 140 | journal={ACM SIGMOBILE Mobile Computing and Communications Review}, 141 | volume={5}, 142 | number={1}, 143 | pages={3--55}, 144 | year={2001}, 145 | publisher={ACM} 146 | } 147 | 148 | @article{gurban2009information, 149 | title={Information theoretic feature extraction for audio-visual speech recognition}, 150 | author={Gurban, Mihai and Thiran, Jean-Philippe}, 151 | journal={IEEE Transactions on signal processing}, 152 
| volume={57}, 153 | number={12}, 154 | pages={4765--4776}, 155 | year={2009}, 156 | publisher={IEEE} 157 | } 158 | 159 | @inproceedings{variani2014deep, 160 | title={Deep neural networks for small footprint text-dependent speaker verification}, 161 | author={Variani, Ehsan and Lei, Xin and McDermott, Erik and Moreno, Ignacio Lopez and Gonzalez-Dominguez, Javier}, 162 | booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2014 IEEE International Conference on}, 163 | pages={4052--4056}, 164 | year={2014}, 165 | organization={IEEE} 166 | } 167 | 168 | @article{hinton2012deep, 169 | title={Deep neural networks for acoustic modeling in speech recognition: The shared views of four research groups}, 170 | author={Hinton, Geoffrey and Deng, Li and Yu, Dong and Dahl, George E and Mohamed, Abdel-rahman and Jaitly, Navdeep and Senior, Andrew and Vanhoucke, Vincent and Nguyen, Patrick and Sainath, Tara N and others}, 171 | journal={IEEE Signal Processing Magazine}, 172 | volume={29}, 173 | number={6}, 174 | pages={82--97}, 175 | year={2012}, 176 | publisher={IEEE} 177 | } 178 | 179 | @article{lecun2015deep, 180 | title={Deep learning}, 181 | author={LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey}, 182 | journal={nature}, 183 | volume={521}, 184 | number={7553}, 185 | pages={436}, 186 | year={2015}, 187 | publisher={Nature Publishing Group} 188 | } 189 | 190 | @article{liu2015deep, 191 | title={Deep feature for text-dependent speaker verification}, 192 | author={Liu, Yuan and Qian, Yanmin and Chen, Nanxin and Fu, Tianfan and Zhang, Ya and Yu, Kai}, 193 | journal={Speech Communication}, 194 | volume={73}, 195 | pages={1--13}, 196 | year={2015}, 197 | publisher={Elsevier} 198 | } 199 | 200 | @article{torfi2018attention, 201 | title={Attention-Based Guided Structured Sparsity of Deep Neural Networks}, 202 | author={Torfi, Amirsina and Shirvani, Rouzbeh A}, 203 | journal={arXiv preprint arXiv:1802.09902}, 204 | year={2018} 205 | } 206 | 207 | 208 | -------------------------------------------------------------------------------- /paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'SpeechPy - A Library for Speech Processing and Recognition' 3 | tags: 4 | - Python 5 | authors: 6 | - name: Amirsina Torfi 7 | orcid: 0000-0003-2282-4361 8 | affiliation: "1" 9 | affiliations: 10 | - name: Virginia Tech, Department of Computer Science 11 | index: 1 12 | date: 15 May 2018 13 | bibliography: paper.bib 14 | --- 15 | 16 | # Abstract 17 | SpeechPy is an open source Python package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides most frequent used speech features including MFCCs and filterbank energies alongside with the log-energy of filter-banks. The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification. 18 | 19 | # Introduction 20 | Automatic Speech Recognition (ASR) requires three main components for 21 | further analysis: Preprocessing, feature extraction, and 22 | post-processing. Feature extraction, in an abstract meaning, is 23 | extracting descriptive features from raw signal for speech 24 | classification purposes. Due to the high 25 | dimensionality, the raw signal can be less informative compared to 26 | extracted higher level features. 
Feature extraction comes to our rescue 27 | for turning the high-dimensional signal into a lower-dimensional and yet 28 | more informative version of it for sound recognition and 29 | classification [@furui1986speaker; @guyon2008feature; @hirsch2000aurora]. 30 | 31 | ![Scheme of speech recognition](test/_imgs/Scheme_of_speech_recognition_system.png) 32 | 33 | Feature extraction, in essence, should be done considering the specific 34 | application at hand. For example, in ASR applications, the linguistic 35 | characteristics of the raw signal are of great importance and the other 36 | characteristics must be 37 | ignored [@yu2016automatic; @rabiner1993fundamentals]. On the other hand, 38 | in the Speaker Recognition (SR) task, solely voice-associated information 39 | must be contained in the extracted feature [@campbell1997speaker]. So the 40 | feature extraction goal is to extract the relevant feature from the raw 41 | signal and map it to a lower dimensional feature space. The problem of 42 | feature extraction has been investigated in pattern classification aimed 43 | at preventing the curse of dimensionality. There are some feature 44 | extraction approaches based on information theory 45 | [@torfi2017construction; @shannon2001mathematical] applied to multimodal 46 | signals that demonstrated promising results [@gurban2009information]. 47 | 48 | The speech features can be categorized into two general types of 49 | acoustic and linguistic features. The former is mainly related to 50 | non-verbal sounds and the latter is associated with ASR and SR 51 | systems for which the verbal part has the major role. Perhaps one of the most 52 | famous linguistic features, and one that is hard to beat, is the Mel-Frequency 53 | Cepstral Coefficients (MFCC). It uses raw speech frames in the range 54 | from 20ms to 40ms, over which the signal has stationary 55 | characteristics [@rabiner1993fundamentals]. MFCC is widely used for both 56 | ASR and SR tasks and more recently in the associated deep learning 57 | applications as the input to the network rather than directly feeding 58 | the signal [@deng2013recent; @lee2009unsupervised; @yu2011improved]. 59 | With the advent of deep learning [@lecun2015deep; @torfi2018attention], 60 | major improvements have been achieved by using deep neural networks 61 | rather than traditional methods for speech recognition 62 | applications [@variani2014deep; @hinton2012deep; @liu2015deep]. 63 | 64 | Although free software for speech recognition such as 65 | VOICEBOX is available, most of these packages are Matlab-based, which limits 66 | their reproducibility due to commercial issues. Another great package is 67 | PyAudioAnalysis [@giannakopoulos2015pyaudioanalysis], which is a 68 | comprehensive package developed in Python. However, the issue with 69 | PyAudioAnalysis is its complexity and verbosity when 70 | extracting simple features, and its current version also lacks some important 71 | preprocessing and post-processing operations. 72 | 73 | Considering the recent advent of deep learning in ASR and SR and the 74 | importance of accurate speech feature extraction, here are the 75 | motivations behind the SpeechPy package: 76 | 77 | * Developing a free open source package which covers important 78 | preprocessing techniques, speech features, and post-processing 79 | operations required for ASR and SR applications. 80 | 81 | * A simple package with a minimum degree of complexity should be 82 | available for beginners.
83 | 84 | * A well-tested and continuously integrated package for future 85 | developments should be developed. 86 | 87 | SpeechPy has been developed to satisfy the aforementioned needs. It 88 | contains the most important preprocessing and post-processing operations 89 | and a selection of frequently used speech features. The package is free 90 | and released as open source software. Continuous integration, 91 | for instant error checking and validation of changes, has been deployed 92 | for SpeechPy. Moreover, prior to the latest official release of 93 | SpeechPy, the package has successfully been utilized for research 94 | purposes [@torfi20173d; @torfi2017text]. 95 | 96 | # Package Eco-system 97 | 98 | 99 | SpeechPy has been developed using the Python language for its interface and 100 | backend as well. An empirical study demonstrated that Python, as a 101 | scripting language, is more effective and productive than conventional 102 | languages for some programming problems, and memory consumption is 103 | often “better than Java and not much worse than C or 104 | C++” [@prechelt2000empirical]. We chose Python due to its simplicity and 105 | popularity. Third-party libraries are avoided except *Numpy* and *Scipy* 106 | for handling data and numeric computations. 107 | 108 | ## Complexity 109 | 110 | As the user should not and does not even need to manipulate the internal 111 | package structure, object-oriented programming is mostly used for 112 | package development, which provides an easier interface for the user at a 113 | sacrifice to the simplicity of the code. However, the internal code 114 | complexity of the package does not affect the user experience since the 115 | modules can easily be called with the associated arguments. SpeechPy is 116 | a library with a collection of sub-modules. 117 | 118 | ## Code Style and Documentation 119 | 120 | SpeechPy is constructed based on the PEP 8 style guide for Python code. 121 | Moreover, it is extensively documented using formatted docstrings 122 | and Sphinx, allowing automatic regeneration of the documentation in 123 | case of changes to internal modules. The full documentation of the project 124 | is generated in HTML and PDF format using Sphinx and is hosted 125 | online. The official releases of the project are hosted on Zenodo as 126 | well [@torfispeechpy]. 127 | 128 | ![A general view of the package](test/_imgs/packageview.png) 129 | 130 | ## Continuous Testing and Extensibility 131 | 132 | The output of each function has been evaluated using different 133 | tests and compared against other existing standard packages. For continuous 134 | testing, the code is hosted on GitHub and integrated with Travis CI. 135 | Each modification to the code must pass the unit tests defined for the 136 | continuous integration. This will ensure the package does not break with 137 | unadapted code scripts. However, the validity of the modifications 138 | should always be investigated with the owner or authorized collaborators 139 | of the project. The code is tested at each modification for 140 | Python versions *“2.7”*, *“3.4”* and *“3.5”*. In the future, these 141 | versions are subject to change. 142 | 143 | ![Travis CI web interface after testing SpeechPy against a new change](test/_imgs/travicCI.png) 144 | 145 | # Availability 146 | 147 | ## Operating system {#operating-system .unnumbered} 148 | 149 | Tested on Ubuntu 14.04 and 16.04 LTS Linux, Apple Mac OS X 10.9.5, and 150 | Microsoft Windows 7 & 10.
We expect that SpeechPy works on any 151 | distribution as long as Python and the package dependencies are 152 | installed. 153 | 154 | ## Programming language {#programming-language .unnumbered} 155 | 156 | The package has been tested with Python 2.7, 3.4 and 3.5. However, using 157 | Python 3.5 is suggested. 158 | 159 | ## Additional system requirements & dependencies {#additional-system-requirements-dependencies .unnumbered} 160 | 161 | SpeechPy is a light package and small computational power would be 162 | enough for running it. Although the speed of the execution is totally 163 | dependent on the system architecture. The dependencies are as follows: 164 | 165 | * Numpy 166 | 167 | * SciPy 168 | 169 | # Acknowledgement 170 | 171 | This work has been completed with computational resources provided by the West Virginia University and Virginia Tech and is based upon a work 172 | supported by the Center for Identification Technology Research (CITeR) and the National Science Foundation (NSF) under Grant \#1650474. 173 | I would like to thank professor Nasser Nasrabadi for supporting me through this project and for his valuable supervision regarding my research in speech technology. 174 | 175 | # References 176 | -------------------------------------------------------------------------------- /paper/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/paper.pdf -------------------------------------------------------------------------------- /paper/test/_imgs/Scheme_of_speech_recognition_system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/test/_imgs/Scheme_of_speech_recognition_system.png -------------------------------------------------------------------------------- /paper/test/_imgs/packageview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/test/_imgs/packageview.png -------------------------------------------------------------------------------- /paper/test/_imgs/travicCI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/paper/test/_imgs/travicCI.png -------------------------------------------------------------------------------- /paper/test/test.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'SpeechPy - A Library for Speech Processing and Recognition' 3 | tags: 4 | - Python 5 | authors: 6 | - name: Amirsina Torfi 7 | orcid: 0000-0003-2282-4361 8 | affiliation: "1" 9 | affiliations: 10 | - name: Virginia Tech, Department of Computer Science 11 | index: 1 12 | date: 15 May 2018 13 | bibliography: paper.bib 14 | --- 15 | 16 | # Abstract 17 | SpeechPy is an open source Python package that contains speech preprocessing techniques, speech features, and important post-processing operations. It provides most frequent used speech features including MFCCs and filterbank energies alongside with the log-energy of filter-banks. 
The aim of the package is to provide researchers with a simple tool for speech feature extraction and processing purposes in applications such as Automatic Speech Recognition and Speaker Verification. 18 | 19 | # Introduction 20 | Automatic Speech Recognition (ASR) requires three main components for 21 | further analysis: Preprocessing, feature extraction, and 22 | post-processing. Feature extraction, in an abstract meaning, is 23 | extracting descriptive features from raw signal for speech 24 | classification purposes. Due to the high 25 | dimensionality, the raw signal can be less informative compared to 26 | extracted higher level features. Feature extraction comes to our rescue 27 | for turning the high dimensional signal to a lower dimensional and yet 28 | more informative version of that for sound recognition and 29 | classification [@furui1986speaker; @guyon2008feature; @hirsch2000aurora]. 30 | 31 | ![Scheme of speech recognition](_imgs/Scheme_of_speech_recognition_system.png) 32 | 33 | Feature extraction, in essence, should be done considering the specific 34 | application at hand. For example, in ASR applications, the linguistic 35 | characteristics of the raw signal are of great importance and the other 36 | characteristics must be 37 | ignored [@yu2016automatic; @rabiner1993fundamentals]. On the other hand, 38 | in Speaker Recognition (SR) task, solely voice-associated information 39 | must be contained in extracted feature [@campbell1997speaker]. So the 40 | feature extraction goal is to extract the relevant feature from the raw 41 | signal and map it to a lower dimensional feature space. The problem of 42 | feature extraction has been investigated in pattern classification aimed 43 | at preventing the curse of dimensionality. There are some feature 44 | extraction approaches based on information theory 45 | [@torfi2017construction; @shannon2001mathematical] applied to multimodal 46 | signals and demonstrated promising results [@gurban2009information]. 47 | 48 | The speech features can be categorized into two general types of 49 | acoustic and linguistic features. The former one is mainly related to 50 | non-verbal sounds and the later one is associated with ASR and SR 51 | systems for which verbal part has the major role. Perhaps one the most 52 | famous linguistic feature which is hard to beat is the Mel-Frequency 53 | Cepstral Coefficients (MFCC). It uses speech raw frames in the range 54 | from 20ms to 40ms for having stationary 55 | characteristics [@rabiner1993fundamentals]. MFCC is widely used for both 56 | ASR and SR tasks and more recently in the associated deep learning 57 | applications as the input to the network rather than directly feeding 58 | the signal [@deng2013recent; @lee2009unsupervised; @yu2011improved]. 59 | With the advent of deep learning [@lecun2015deep; @torfi2018attention], 60 | major improvements have been achieved by using deep neural networks 61 | rather than traditional methods for speech recognition 62 | applications [@variani2014deep; @hinton2012deep; @liu2015deep]. 63 | 64 | With the availability of free software for speech recognition such as 65 | VOICEBOX[^1], most of these softwares are Matlab-based which limits 66 | their reproducibility due to commercial issues. Another great package is 67 | PyAudioAnalysis [@giannakopoulos2015pyaudioanalysis], which is a 68 | comprehensive package developed in Python. 
However, the issue with 69 | PyAudioAnalysis is that its complexity and being too verbose for 70 | extracting simple features and it also lacks some important 71 | preprocessing and post-processing operations for its current version. 72 | 73 | Considering the recent advent of deep learning in ASR and SR and the 74 | importance of the accurate speech feature extraction, here are the 75 | motivations behind SpeechPy package: 76 | 77 | * Developing a free open source package which covers important 78 | preprocessing techniques, speech features, and post-processing 79 | operations required for ASR and SR applications. 80 | 81 | * A simple package with a minimum degree of complexity should be 82 | available for beginners. 83 | 84 | * A well-tested and continuously integrated package for future 85 | developments should be developed. 86 | 87 | SpeechPy has been developed to satisfy the aforementioned needs. It 88 | contains the most important preprocessing and post-processing operations 89 | and a selection of frequently used speech features. The package is free 90 | and released as an open source software[^2]. Continuous integration 91 | using for instant error check and validity of changes has been deployed 92 | for SpeechPy. Moreover, prior to the latest official release of 93 | SpeechPy, the package has successfully been utilized for research 94 | purposes [@torfi20173d; @torfi2017text]. 95 | 96 | # Package Eco-system 97 | 98 | 99 | SpeechPy has been developed using Python language for its interface and 100 | backed as well. An empirical study demonstrated that Python as a 101 | scripting language, is more effective and productive than conventional 102 | languages[^3] for some programming problems and memory consumption is 103 | often “better than Java and not much worse than C or 104 | C++” [@prechelt2000empirical]. We chose Python due to its simplicity and 105 | popularity. Third-party libraries are avoided except *Numpy* and *Scipy* 106 | for handling data and numeric computations. 107 | 108 | ## Complexity 109 | 110 | As the user should not and does not even need to manipulate the internal 111 | package structure, object-oriented programming is mostly used for 112 | package development which provides easier interface for the user with a 113 | sacrifice to the simplicity of the code. However, the internal code 114 | complexity of the package does not affect the user experience since the 115 | modules can easily be called with the associated arguments. SpeechPy is 116 | a library with a collection of sub-modules. 117 | 118 | ## Code Style and Documentation 119 | 120 | SpeechPy is constructed based on PEP 8 style guide for Python codes. 121 | Moreover, it is extensively documented using the formatted docstrings 122 | and Sphinx[^4] for further automatic modifications to the document in 123 | case of changing internal modules. The full documentation of the project 124 | will be generated in HTML and PDF format using Sphinx and is hosted 125 | online. The official releases of the project are hosted on the Zenodo as 126 | well[^5] [@torfispeechpy]. 127 | 128 | ![A general view of the package](_imgs/packageview.png) 129 | 130 | ## Continuous Testing and Extensibility 131 | 132 | The output of each function has been evaluated as well using different 133 | tests as opposed to the other existing standard packages. For continuous 134 | testing, the code is hosted on GitHub and integrated with Travis CI. 
135 | Each modification to the code must pass the unit tests defined for the 136 | continuous integration. This will ensure the package does not break with 137 | unadapted code scripts. However, the validity of the modifications 138 | should always be investigated with the owner or authorized collaborators 139 | of the project. The code is tested at each modification for 140 | Python versions *“2.7”*, *“3.4”* and *“3.5”*. In the future, these 141 | versions are subject to change. 142 | 143 | ![Travis CI web interface after testing SpeechPy against a new change](_imgs/travicCI.png) 144 | 145 | # Availability 146 | 147 | ## Operating system {#operating-system .unnumbered} 148 | 149 | Tested on Ubuntu 14.04 and 16.04 LTS Linux, Apple Mac OS X 10.9.5, and 150 | Microsoft Windows 7 & 10. We expect that SpeechPy works on any 151 | distribution as long as Python and the package dependencies are 152 | installed. 153 | 154 | ## Programming language {#programming-language .unnumbered} 155 | 156 | The package has been tested with Python 2.7, 3.4 and 3.5. However, using 157 | Python 3.5 is suggested. 158 | 159 | ## Additional system requirements & dependencies {#additional-system-requirements-dependencies .unnumbered} 160 | 161 | SpeechPy is a light package, and small computational power is 162 | enough for running it, although the speed of execution is totally 163 | dependent on the system architecture. The dependencies are as follows: 164 | 165 | * Numpy 166 | 167 | * SciPy 168 | 169 | # Acknowledgement 170 | 171 | This work has been completed in part with computational resources 172 | provided by West Virginia University and is based upon work 173 | supported by the Center for Identification Technology Research (CITeR) 174 | and the National Science Foundation (NSF) under Grant \#1650474. 175 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | speechpy 4 | python-coveralls 5 | pytest 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='speechpy', 4 | version='2.4', 5 | description='The Python package for extracting speech features.', 6 | author='Amirsina Torfi', 7 | author_email='amirsina.torfi@gmail.com', 8 | url='https://github.com/astorfi/speechpy', 9 | download_url = 'https://github.com/astorfi/speechpy/archive/2.4.zip', 10 | packages=find_packages(exclude=('tests', 'docs')), 11 | include_package_data=True, 12 | install_requires=[ 13 | 'scipy', 14 | 'numpy', 15 | ], 16 | zip_safe=False) 17 | -------------------------------------------------------------------------------- /speechpy/__init__.py: -------------------------------------------------------------------------------- 1 | from . import feature 2 | from . import processing 3 | -------------------------------------------------------------------------------- /speechpy/feature.py: -------------------------------------------------------------------------------- 1 | """feature module.
2 | 3 | This module provides functions for calculating the main speech 4 | features that the package aims to extract, as well as the required 5 | elements. 6 | 7 | 8 | Functions: 9 | 10 | filterbanks: Compute the Mel-filterbanks 11 | The filterbanks must be created for extracting 12 | speech features such as MFCC. 13 | 14 | mfcc: Extracting Mel Frequency Cepstral Coefficient feature. 15 | 16 | mfe: Extracting Mel Energy feature. 17 | 18 | lmfe: Extracting Log Mel Energy feature. 19 | 20 | extract_derivative_feature: Extract the first and second derivative 21 | features. This function directly uses the ``derivative_extraction`` 22 | function in the ``processing`` module. 23 | 24 | """ 25 | 26 | from __future__ import division 27 | import numpy as np 28 | from . import processing 29 | from scipy.fftpack import dct 30 | from . import functions 31 | 32 | 33 | def filterbanks( 34 | num_filter, 35 | coefficients, 36 | sampling_freq, 37 | low_freq=None, 38 | high_freq=None): 39 | """Compute the Mel-filterbanks. Each filter will be stored in one row. 40 | The columns correspond to fft bins. 41 | 42 | Args: 43 | num_filter (int): the number of filters in the filterbank, default 20. 44 | coefficients (int): (fftpoints//2 + 1). Default is 257. 45 | sampling_freq (float): the samplerate of the signal we are working 46 | with. It affects mel spacing. 47 | low_freq (float): lowest band edge of mel filters, default 300 Hz 48 | high_freq (float): highest band edge of mel filters, 49 | default samplerate/2 50 | 51 | Returns: 52 | array: A numpy array of size num_filter x (fftpoints//2 + 1) 53 | which contains the filterbanks 54 | """ 55 | high_freq = high_freq or sampling_freq / 2 56 | low_freq = low_freq or 300 57 | s = "High frequency cannot be greater than half of the sampling frequency!" 58 | assert high_freq <= sampling_freq / 2, s 59 | assert low_freq >= 0, "low frequency cannot be less than zero!" 60 | 61 | # Computing the Mel filterbank 62 | # converting the upper and lower frequencies to Mels. 63 | # num_filter + 2 is because for num_filter filterbanks we need 64 | # num_filter + 2 points. 65 | mels = np.linspace( 66 | functions.frequency_to_mel(low_freq), 67 | functions.frequency_to_mel(high_freq), 68 | num_filter + 2) 69 | 70 | # we should convert Mels back to Hertz because the start and end-points 71 | # should be at the desired frequencies. 72 | hertz = functions.mel_to_frequency(mels) 73 | 74 | # The frequency resolution required to put filters at the 75 | # exact points calculated above should be obtained. 76 | # So we should round those frequencies to the closest FFT bin. 77 | freq_index = ( 78 | np.floor( 79 | (coefficients + 80 | 1) * 81 | hertz / 82 | sampling_freq)).astype(int) 83 | 84 | # Initial definition 85 | filterbank = np.zeros([num_filter, coefficients]) 86 | 87 | # The triangular function for each filter 88 | for i in range(0, num_filter): 89 | left = int(freq_index[i]) 90 | middle = int(freq_index[i + 1]) 91 | right = int(freq_index[i + 2]) 92 | z = np.linspace(left, right, num=right - left + 1) 93 | filterbank[i, 94 | left:right + 1] = functions.triangle(z, 95 | left=left, 96 | middle=middle, 97 | right=right) 98 | 99 | return filterbank 100 | 101 | 102 | def mfcc( 103 | signal, 104 | sampling_frequency, 105 | frame_length=0.020, 106 | frame_stride=0.01, 107 | num_cepstral=13, 108 | num_filters=40, 109 | fft_length=512, 110 | low_frequency=0, 111 | high_frequency=None, 112 | dc_elimination=True): 113 | """Compute MFCC features from an audio signal.
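    The computation follows the standard MFCC pipeline: frame the
    signal, compute the power spectrum of each frame, apply the Mel
    filterbank, take the logarithm, and apply a DCT, keeping the first
    ``num_cepstral`` coefficients.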
114 | 115 | Args: 116 | 117 | signal (array): the audio signal from which to compute features. 118 | Should be an N x 1 array 119 | sampling_frequency (int): the sampling frequency of the signal 120 | we are working with. 121 | frame_length (float): the length of each frame in seconds. 122 | Default is 0.020s 123 | frame_stride (float): the step between successive frames in seconds. 124 | Default is 0.01s (a stride equal to the frame length means no overlap) 125 | num_filters (int): the number of filters in the filterbank, 126 | default 40. 127 | fft_length (int): number of FFT points. Default is 512. 128 | low_frequency (float): lowest band edge of mel filters. 129 | In Hz, default is 0. 130 | high_frequency (float): highest band edge of mel filters. 131 | In Hz, default is samplerate/2 132 | num_cepstral (int): Number of cepstral coefficients. 133 | dc_elimination (bool): If the first DC component should 134 | be eliminated or not. 135 | 136 | Returns: 137 | array: A numpy array of size (num_frames x num_cepstral) containing mfcc features. 138 | """ 139 | feature, energy = mfe(signal, sampling_frequency=sampling_frequency, 140 | frame_length=frame_length, frame_stride=frame_stride, 141 | num_filters=num_filters, fft_length=fft_length, 142 | low_frequency=low_frequency, 143 | high_frequency=high_frequency) 144 | if len(feature) == 0: 145 | return np.empty((0, num_cepstral)) 146 | feature = np.log(feature) 147 | feature = dct(feature, type=2, axis=-1, norm='ortho')[:, :num_cepstral] 148 | 149 | # replace first cepstral coefficient with log of frame energy for DC 150 | # elimination. 151 | if dc_elimination: 152 | feature[:, 0] = np.log(energy) 153 | return feature 154 | 155 | 156 | def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01, 157 | num_filters=40, fft_length=512, low_frequency=0, high_frequency=None): 158 | """Compute Mel-filterbank energy features from an audio signal. 159 | 160 | Args: 161 | signal (array): the audio signal from which to compute features. 162 | Should be an N x 1 array 163 | sampling_frequency (int): the sampling frequency of the signal 164 | we are working with. 165 | frame_length (float): the length of each frame in seconds. 166 | Default is 0.020s 167 | frame_stride (float): the step between successive frames in seconds. 168 | Default is 0.01s (a stride equal to the frame length means no overlap) 169 | num_filters (int): the number of filters in the filterbank, 170 | default 40. 171 | fft_length (int): number of FFT points. Default is 512. 172 | low_frequency (float): lowest band edge of mel filters. 173 | In Hz, default is 0. 174 | high_frequency (float): highest band edge of mel filters. 175 | In Hz, default is samplerate/2 176 | 177 | Returns: 178 | array: features - the energy of the filterbank of size num_frames x num_filters, and the energy of each frame: num_frames x 1 179 | """ 180 | 181 | # Convert to float 182 | signal = signal.astype(float) 183 | 184 | # Stack frames 185 | frames = processing.stack_frames( 186 | signal, 187 | sampling_frequency=sampling_frequency, 188 | frame_length=frame_length, 189 | frame_stride=frame_stride, 190 | filter=lambda x: np.ones( 191 | (x, 192 | )), 193 | zero_padding=False) 194 | 195 | # getting the high frequency 196 | high_frequency = high_frequency or sampling_frequency / 2 197 | 198 | # calculation of the power spectrum 199 | power_spectrum = processing.power_spectrum(frames, fft_length) 200 | coefficients = power_spectrum.shape[1] 201 | # this stores the total energy in each frame 202 | frame_energies = np.sum(power_spectrum, 1) 203 | 204 | # Handling zero energies.
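    # (np.log is later applied to these energies, so exact zeros are
    # replaced with a tiny machine epsilon to avoid -inf values.)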
156 | def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
157 |         num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
158 |     """Compute Mel-filterbank energy features from an audio signal.
159 | 
160 |     Args:
161 |         signal (array): the audio signal from which to compute features.
162 |             Should be an N x 1 array.
163 |         sampling_frequency (int): the sampling frequency of the signal
164 |             we are working with.
165 |         frame_length (float): the length of each frame in seconds.
166 |             Default is 0.020s.
167 |         frame_stride (float): the step between successive frames in seconds.
168 |             Default is 0.01s (which means 50% overlap for 0.020s frames).
169 |         num_filters (int): the number of filters in the filterbank,
170 |             default 40.
171 |         fft_length (int): number of FFT points. Default is 512.
172 |         low_frequency (float): lowest band edge of mel filters.
173 |             In Hz, default is 0.
174 |         high_frequency (float): highest band edge of mel filters.
175 |             In Hz, default is samplerate/2.
176 | 
177 |     Returns:
178 |         array: features - the filterbank energies of size num_frames x num_filters, and the energy of each frame of size num_frames x 1.
179 |     """
180 | 
181 |     # Convert to float
182 |     signal = signal.astype(float)
183 | 
184 |     # Stack frames
185 |     frames = processing.stack_frames(
186 |         signal,
187 |         sampling_frequency=sampling_frequency,
188 |         frame_length=frame_length,
189 |         frame_stride=frame_stride,
190 |         filter=lambda x: np.ones(
191 |             (x,)
192 |         ),
193 |         zero_padding=False)
194 | 
195 |     # Getting the high frequency
196 |     high_frequency = high_frequency or sampling_frequency / 2
197 | 
198 |     # Calculation of the power spectrum
199 |     power_spectrum = processing.power_spectrum(frames, fft_length)
200 |     coefficients = power_spectrum.shape[1]
201 |     # This stores the total energy in each frame
202 |     frame_energies = np.sum(power_spectrum, 1)
203 | 
204 |     # Handling zero energies.
205 |     frame_energies = functions.zero_handling(frame_energies)
206 | 
207 |     # Extracting the filterbank
208 |     filter_banks = filterbanks(
209 |         num_filters,
210 |         coefficients,
211 |         sampling_frequency,
212 |         low_frequency,
213 |         high_frequency)
214 | 
215 |     # Filterbank energies
216 |     features = np.dot(power_spectrum, filter_banks.T)
217 |     features = functions.zero_handling(features)
218 | 
219 |     return features, frame_energies
220 | 
221 | 
222 | def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
223 |          num_filters=40, fft_length=512, low_frequency=0, high_frequency=None):
224 |     """Compute log Mel-filterbank energy features from an audio signal.
225 | 
226 | 
227 |     Args:
228 |         signal (array): the audio signal from which to compute features.
229 |             Should be an N x 1 array.
230 |         sampling_frequency (int): the sampling frequency of the signal
231 |             we are working with.
232 |         frame_length (float): the length of each frame in seconds.
233 |             Default is 0.020s.
234 |         frame_stride (float): the step between successive frames in seconds.
235 |             Default is 0.01s (which means 50% overlap for 0.020s frames).
236 |         num_filters (int): the number of filters in the filterbank,
237 |             default 40.
238 |         fft_length (int): number of FFT points. Default is 512.
239 |         low_frequency (float): lowest band edge of mel filters.
240 |             In Hz, default is 0.
241 |         high_frequency (float): highest band edge of mel filters.
242 |             In Hz, default is samplerate/2.
243 | 
244 |     Returns:
245 |         array: features - the log filterbank energies of size num_frames x num_filters.
246 |     """
247 | 
248 |     feature, frame_energies = mfe(signal,
249 |                                   sampling_frequency=sampling_frequency,
250 |                                   frame_length=frame_length,
251 |                                   frame_stride=frame_stride,
252 |                                   num_filters=num_filters,
253 |                                   fft_length=fft_length,
254 |                                   low_frequency=low_frequency,
255 |                                   high_frequency=high_frequency)
256 |     feature = np.log(feature)
257 | 
258 |     return feature
259 | 
260 | 
261 | def extract_derivative_feature(feature):
262 |     """
263 |     This function extracts the temporal derivative features, i.e., the
264 |     first and second derivatives.
265 | 
266 |     Args:
267 |         feature (array): The feature vector, whose size is N x M.
268 | 
269 |     Return:
270 |         array: The feature cube, which contains the static, first and second derivative features, of size N x M x 3.
271 |     """
272 |     first_derivative_feature = processing.derivative_extraction(
273 |         feature, DeltaWindows=2)
274 |     second_derivative_feature = processing.derivative_extraction(
275 |         first_derivative_feature, DeltaWindows=2)
276 | 
277 |     # Creating the feature cube for each file
278 |     feature_cube = np.concatenate(
279 |         (feature[:, :, None], first_derivative_feature[:, :, None],
280 |          second_derivative_feature[:, :, None]),
281 |         axis=2)
282 |     return feature_cube
283 | 
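To append dynamic information, the MFCC matrix can be passed through extract_derivative_feature; a minimal sketch (the signal setup is assumed as in the mfcc example above):

import numpy as np
from speechpy import feature

fs = 16000
signal = np.random.normal(0, 0.1, fs)  # illustrative stand-in for real audio
mfcc_feat = feature.mfcc(signal, sampling_frequency=fs)
cube = feature.extract_derivative_feature(mfcc_feat)
print(cube.shape)  # (num_frames, 13, 3): static, first, second derivative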
-------------------------------------------------------------------------------- /speechpy/functions.py: --------------------------------------------------------------------------------
1 | """functions module.
2 | 
3 | This module contains the helper functions needed for calculating the
4 | features in the ``feature`` module.
5 | 
6 | 
7 | Attributes:
8 | 
9 |     frequency_to_mel: Converting frequency to the Mel scale.
10 |         This is necessary for filterbank energy calculation.
11 |     mel_to_frequency: Converting the Mel scale to frequency.
12 |         This is necessary for filterbank energy calculation.
13 |     triangle: Creating a triangle function for the filterbanks.
14 |         This is necessary for filterbank energy calculation.
15 |     zero_handling: Handling zero values due to the possible
16 |         issues regarding the log functions.
17 | """
18 | 
19 | from __future__ import division
20 | import numpy as np
21 | 
22 | 
23 | def frequency_to_mel(f):
24 |     """Converting from frequency to the Mel scale.
25 | 
26 |     :param f: The frequency values (or a single frequency) in Hz.
27 |     :returns: The Mel scale values (or a single Mel value).
28 |     """
29 |     return 1127 * np.log(1 + f / 700.)
30 | 
31 | 
32 | def mel_to_frequency(mel):
33 |     """Converting from the Mel scale to frequency.
34 | 
35 |     :param mel: The Mel scale values (or a single Mel value).
36 |     :returns: The frequency values (or a single frequency) in Hz.
37 |     """
38 |     return 700 * (np.exp(mel / 1127.0) - 1)
39 | 
40 | 
41 | def triangle(x, left, middle, right):
42 |     """Evaluate a triangular window on the points in x: zero outside
43 |     [left, right], rising to one at middle and falling back to zero.
44 | 
45 |     :param x: The points at which the triangle is evaluated.
46 |     :param left: The left edge (value zero).
47 |     :param middle: The peak position (value one).
48 |     :param right: The right edge (value zero).
49 |     :returns: The triangle values at the points of x.
50 |     """
51 |     out = np.zeros(x.shape)
52 |     out[x <= left] = 0
53 |     out[x >= right] = 0
54 |     first_half = np.logical_and(left < x, x <= middle)
55 |     out[first_half] = (x[first_half] - left) / (middle - left)
56 |     second_half = np.logical_and(middle <= x, x < right)
57 |     out[second_half] = (right - x[second_half]) / (right - middle)
58 |     return out
59 | 
60 | 
61 | def zero_handling(x):
62 |     """Handle zero values before they are passed to a log function.
63 | 
64 |     :param x: The vector.
65 |     :return: The vector with zeros substituted with epsilon values.
66 |     """
67 |     return np.where(x == 0, np.finfo(float).eps, x)
68 | 
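Since mel_to_frequency is the exact inverse of frequency_to_mel, a quick round-trip check illustrates both (the sample frequencies are arbitrary):

import numpy as np
from speechpy import functions

f = np.array([300.0, 1000.0, 4000.0])     # Hz
mels = functions.frequency_to_mel(f)      # 1127 * ln(1 + f / 700)
back = functions.mel_to_frequency(mels)   # 700 * (exp(mel / 1127) - 1)
print(np.allclose(f, back))               # True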
-------------------------------------------------------------------------------- /speechpy/processing.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Processing module for signal processing operations.
3 | 
4 | This module provides the signal processing functions which are
5 | required as internal computations in the package.
6 | 
7 | 
8 | Attributes:
9 | 
10 |     preemphasis: Pre-emphasizing the signal. This is a preprocessing step.
11 | 
12 |     stack_frames: Create stacked frames from the raw signal.
13 | 
14 |     fft_spectrum: Calculation of the Fast Fourier Transform.
15 | 
16 |     power_spectrum: Power spectrum calculation.
17 | 
18 |     log_power_spectrum: Log power spectrum calculation.
19 | 
20 |     derivative_extraction: Calculation of the derivative of the extracted features.
21 | 
22 |     cmvn: Cepstral mean variance normalization. This is a post-processing operation.
23 | 
24 |     cmvnw: Cepstral mean variance normalization over a sliding window. This is a post-processing operation.
25 | 
26 | """
27 | 
28 | __license__ = "MIT"
29 | __author__ = "Amirsina Torfi"
30 | __docformat__ = 'reStructuredText'
31 | 
32 | import decimal
33 | import numpy as np
34 | import math
35 | 
36 | 
37 | # 1.4 becomes 1 and 1.6 becomes 2. Special case: 1.5 becomes 2.
38 | def round_half_up(number):
39 |     return int(
40 |         decimal.Decimal(number).quantize(
41 |             decimal.Decimal('1'),
42 |             rounding=decimal.ROUND_HALF_UP))
43 | 
44 | 
45 | def preemphasis(signal, shift=1, cof=0.98):
46 |     """Pre-emphasizing the signal.
47 | 
48 |     Args:
49 |         signal (array): The input signal.
50 |         shift (int): The shift step.
51 |         cof (float): The pre-emphasis coefficient. 0 equals no filtering.
52 | 
53 |     Returns:
54 |         array: The pre-emphasized signal.
55 |     """
56 | 
57 |     rolled_signal = np.roll(signal, shift)
58 |     return signal - cof * rolled_signal
59 | 
60 | 
61 | def stack_frames(
62 |         sig,
63 |         sampling_frequency,
64 |         frame_length=0.020,
65 |         frame_stride=0.020,
66 |         filter=lambda x: np.ones(
67 |             (x,)
68 |         ),
69 |         zero_padding=True):
70 |     """Frame a signal into overlapping frames.
71 | 
72 |     Args:
73 |         sig (array): The audio signal to frame, of size (N,).
74 |         sampling_frequency (int): The sampling frequency of the signal.
75 |         frame_length (float): The length of each frame in seconds.
76 |         frame_stride (float): The stride between successive frames in seconds.
77 |         filter (array): The time-domain filter to apply to each frame.
78 |             By default it is all ones, so nothing is changed.
79 |         zero_padding (bool): If the number of samples is not a multiple of
80 |             the frame length (in samples), zero padding is applied to
81 |             generate the last frame.
82 | 
83 |     Returns:
84 |         array: Array of stacked frames of size (number_of_frames x frame_sample_length).
85 | 
86 |     """
87 | 
88 |     # Check dimension
89 |     s = "Signal dimension should be of the format (N,) but it is %s instead"
90 |     assert sig.ndim == 1, s % str(sig.shape)
91 | 
92 |     # Initial necessary values
93 |     length_signal = sig.shape[0]
94 |     frame_sample_length = int(
95 |         np.round(
96 |             sampling_frequency *
97 |             frame_length))  # Defined by the number of samples
98 |     frame_stride = float(np.round(sampling_frequency * frame_stride))
99 | 
100 |     # Zero padding is done for allocating space for the last frame.
101 |     if zero_padding:
102 |         # Calculation of number of frames
103 |         numframes = (int(math.ceil((length_signal
104 |                                     - frame_sample_length) / frame_stride)))
105 | 
106 |         # Zero padding
107 |         len_sig = int(numframes * frame_stride + frame_sample_length)
108 |         additive_zeros = np.zeros((len_sig - length_signal,))
109 |         signal = np.concatenate((sig, additive_zeros))
110 | 
111 |     else:
112 |         # No zero padding! The last frame, which does not have enough
113 |         # samples (remaining samples <= frame_sample_length), is dropped!
114 |         numframes = int(math.floor((length_signal
115 |                                     - frame_sample_length) / frame_stride))
116 | 
117 |         # New length
118 |         len_sig = int((numframes - 1) * frame_stride + frame_sample_length)
119 |         signal = sig[0:len_sig]
120 | 
121 | 
122 |     # Getting the indices of all frames.
123 |     indices = (
124 |         np.tile(np.arange(0, frame_sample_length),
125 |                 (numframes, 1))
126 |         + np.tile(
127 |             np.arange(0, numframes * frame_stride, frame_stride),
128 |             (frame_sample_length, 1),
129 |         ).T
130 |     )
131 |     indices = np.array(indices, dtype=np.int32)
132 | 
133 |     # Extracting the frames based on the allocated indices.
134 |     frames = signal[indices]
135 | 
136 |     # Apply the window function
137 |     window = np.tile(filter(frame_sample_length), (numframes, 1))
138 |     Extracted_Frames = frames * window
139 |     return Extracted_Frames
140 | 
141 | 
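A minimal sketch of the two preprocessing steps together, with the same parameter values the package's own tests use (the one-second noise signal is an assumption for illustration):

import numpy as np
from speechpy import processing

fs = 16000
signal = np.random.normal(0, 0.1, fs)  # 1 s of noise

emphasized = processing.preemphasis(signal, cof=0.98)
frames = processing.stack_frames(emphasized, sampling_frequency=fs,
                                 frame_length=0.020, frame_stride=0.020,
                                 zero_padding=True)
print(frames.shape)  # (49, 320): ceil((16000 - 320) / 320) frames of 320 samples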
142 | def fft_spectrum(frames, fft_points=512):
143 |     """This function computes the one-dimensional n-point discrete Fourier
144 |     Transform (DFT) of a real-valued array by means of an efficient algorithm
145 |     called the Fast Fourier Transform (FFT). Please refer to
146 |     https://docs.scipy.org/doc/numpy/reference/generated/numpy.fft.rfft.html
147 |     for further details.
148 | 
149 |     Args:
150 |         frames (array): The frame array in which each row is a frame.
151 |         fft_points (int): The length of the FFT. If fft_points is greater than frame_len, the frames are zero-padded.
152 | 
153 |     Returns:
154 |         array: The FFT spectrum.
155 |             If frames is a num_frames x sample_per_frame matrix, the output
156 |             is of size num_frames x (fft_points // 2 + 1).
157 |     """
158 |     SPECTRUM_VECTOR = np.fft.rfft(frames, n=fft_points, axis=-1, norm=None)
159 |     return np.absolute(SPECTRUM_VECTOR)
160 | 
161 | 
162 | def power_spectrum(frames, fft_points=512):
163 |     """Power spectrum of each frame.
164 | 
165 |     Args:
166 |         frames (array): The frame array in which each row is a frame.
167 |         fft_points (int): The length of the FFT. If fft_points is greater than frame_len, the frames are zero-padded.
168 | 
169 |     Returns:
170 |         array: The power spectrum.
171 |             If frames is a num_frames x sample_per_frame matrix, the output
172 |             is of size num_frames x (fft_points // 2 + 1).
173 |     """
174 |     return 1.0 / fft_points * np.square(fft_spectrum(frames, fft_points))
175 | 
176 | 
177 | def log_power_spectrum(frames, fft_points=512, normalize=True):
178 |     """Log power spectrum of each frame in frames.
179 | 
180 |     Args:
181 |         frames (array): The frame array in which each row is a frame.
182 |         fft_points (int): The length of the FFT. If fft_points is greater than
183 |             frame_len, the frames are zero-padded.
184 |         normalize (bool): If normalize=True, the log power spectrum
185 |             is normalized so that its maximum is zero.
186 | 
187 |     Returns:
188 |         array: The log power spectrum.
189 |             If frames is a num_frames x sample_per_frame matrix, the output
190 |             is of size num_frames x (fft_points // 2 + 1).
191 |     """
192 |     power_spec = power_spectrum(frames, fft_points)
193 |     power_spec[power_spec <= 1e-20] = 1e-20
194 |     log_power_spec = 10 * np.log10(power_spec)
195 |     if normalize:
196 |         return log_power_spec - np.max(log_power_spec)
197 |     else:
198 |         return log_power_spec
199 | 
200 | 
201 | def derivative_extraction(feat, DeltaWindows):
202 |     """This function computes the derivative features.
203 | 
204 |     Args:
205 |         feat (array): The main feature vector (for returning the second
206 |             order derivative it can be the first-order derivative).
207 |         DeltaWindows (int): The value of DeltaWindows is set using
208 |             the configuration parameter DELTAWINDOW.
209 | 
210 |     Returns:
211 |         array: Derivative feature vector - a NUMFRAMES x NUMFEATURES numpy
212 |             array which is the derivative features along the features.
213 |     """
214 | 
215 |     # Getting the shape of the vector.
216 |     rows, cols = feat.shape
217 | 
218 |     # Defining the vector of differences.
219 |     DIF = np.zeros(feat.shape, dtype=feat.dtype)
220 |     Scale = 0
221 | 
222 |     # Pad only along features in the vector.
223 |     FEAT = np.lib.pad(feat, ((0, 0), (DeltaWindows, DeltaWindows)), 'edge')
224 |     for i in range(DeltaWindows):
225 |         # Start index
226 |         offset = DeltaWindows
227 | 
228 |         # The dynamic range
229 |         Range = i + 1
230 | 
231 |         dif = Range * (FEAT[:, offset + Range:offset + Range + cols]
232 |                        - FEAT[:, offset - Range:offset - Range + cols])
233 |         Scale += 2 * np.power(Range, 2)
234 |         DIF += dif
235 | 
236 |     return DIF / Scale
237 | 
238 | 
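A short sketch chaining the spectrum helpers on stacked frames (the frame matrix shape follows the stack_frames example above; random values are a stand-in for real frames):

import numpy as np
from speechpy import processing

frames = np.random.normal(0, 0.1, (49, 320))   # stand-in for stacked frames
power = processing.power_spectrum(frames, fft_points=512)
log_power = processing.log_power_spectrum(frames, fft_points=512,
                                          normalize=True)
print(power.shape)       # (49, 257): 512 // 2 + 1 rFFT bins per frame
print(log_power.max())   # 0.0, since normalization subtracts the maximum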
239 | def cmvn(vec, variance_normalization=False):
240 |     """This function performs global cepstral mean and
241 |     variance normalization (CMVN) on the input feature vector "vec".
242 |     The code assumes that there is one observation per row.
243 | 
244 |     Args:
245 |         vec (array): input feature matrix
246 |             (size: (num_observation, num_features))
247 |         variance_normalization (bool): If the variance
248 |             normalization should be performed or not.
249 | 
250 |     Return:
251 |         array: The mean (or mean+variance) normalized feature vector.
252 |     """
253 |     eps = 2**-30
254 |     rows, cols = vec.shape
255 | 
256 |     # Mean calculation
257 |     norm = np.mean(vec, axis=0)
258 |     norm_vec = np.tile(norm, (rows, 1))
259 | 
260 |     # Mean subtraction
261 |     mean_subtracted = vec - norm_vec
262 | 
263 |     # Variance normalization
264 |     if variance_normalization:
265 |         stdev = np.std(mean_subtracted, axis=0)
266 |         stdev_vec = np.tile(stdev, (rows, 1))
267 |         output = mean_subtracted / (stdev_vec + eps)
268 |     else:
269 |         output = mean_subtracted
270 | 
271 |     return output
272 | 
273 | 
274 | def cmvnw(vec, win_size=301, variance_normalization=False):
275 |     """This function performs local cepstral mean and
276 |     variance normalization (CMVN) over a sliding window. The code assumes
277 |     that there is one observation per row.
278 | 
279 |     Args:
280 |         vec (array): input feature matrix
281 |             (size: (num_observation, num_features))
282 |         win_size (int): The size of the sliding window for local
283 |             normalization. Default=301, which is around 3 s if a 100 Hz
284 |             frame rate is considered (i.e., a 10 ms frame stride).
285 |         variance_normalization (bool): If the variance normalization should
286 |             be performed or not.
287 | 
288 |     Return:
289 |         array: The mean (or mean+variance) normalized feature vector.
290 |     """
291 |     # Get the shapes
292 |     eps = 2**-30
293 |     rows, cols = vec.shape
294 | 
295 |     # Window size must be odd.
296 |     assert isinstance(win_size, int), "Size must be of type 'int'!"
297 |     assert win_size % 2 == 1, "Window size must be odd!"
298 | 
299 |     # Padding and initial definitions
300 |     pad_size = int((win_size - 1) / 2)
301 |     vec_pad = np.lib.pad(vec, ((pad_size, pad_size), (0, 0)), 'symmetric')
302 |     mean_subtracted = np.zeros(np.shape(vec), dtype=np.float32)
303 | 
304 |     for i in range(rows):
305 |         window = vec_pad[i:i + win_size, :]
306 |         window_mean = np.mean(window, axis=0)
307 |         mean_subtracted[i, :] = vec[i, :] - window_mean
308 | 
309 |     # Variance normalization
310 |     if variance_normalization:
311 | 
312 |         # Initial definitions.
313 |         variance_normalized = np.zeros(np.shape(vec), dtype=np.float32)
314 |         vec_pad_variance = np.lib.pad(
315 |             mean_subtracted, ((pad_size, pad_size), (0, 0)), 'symmetric')
316 | 
317 |         # Looping over all observations.
318 |         for i in range(rows):
319 |             window = vec_pad_variance[i:i + win_size, :]
320 |             window_variance = np.std(window, axis=0)
321 |             variance_normalized[i, :] \
322 |                 = mean_subtracted[i, :] / (window_variance + eps)
323 |         output = variance_normalized
324 |     else:
325 |         output = mean_subtracted
326 | 
327 |     return output
328 | 
329 | 
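A minimal sketch of both normalizers on a random feature matrix (the 50 x 100 shape and the 11-sample window are illustrative assumptions; the cmvn call mirrors the package's own tests):

import numpy as np
from speechpy import processing

feats = np.random.rand(50, 100)                # (num_observations, num_features)
global_norm = processing.cmvn(feats, variance_normalization=True)
local_norm = processing.cmvnw(feats, win_size=11, variance_normalization=True)

print(np.allclose(global_norm.mean(axis=0), 0.0))  # True: zero mean per feature
print(np.allclose(global_norm.std(axis=0), 1.0))   # True: ~unit variance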
330 | # def resample_Fn(wave, fs, f_new=16000):
331 | #     """This function resamples the data to an arbitrary frequency.
332 | #     :param fs: Frequency of the sound file.
333 | #     :param wave: The sound file itself.
334 | #     :returns:
335 | #         f_new: The new frequency.
336 | #         signal_new: The new signal samples at the new frequency.
337 | #
338 | #     dependency: from scikits.samplerate import resample
339 | #     """
340 | #
341 | #     # Resampling using interpolation (there are other
342 | #     # methods than 'sinc_best')
343 | #     signal_new = resample(wave, float(f_new) / fs, 'sinc_best')
344 | #
345 | #     # Necessary data conversion for saving the .wav file using scipy.
346 | #     signal_new = np.asarray(signal_new, dtype=np.int16)
347 | #
348 | #     # # Uncomment if you want to save the audio file
349 | #     # # Save using the new format
350 | #     # wav.write(filename='resample_rainbow_16k.wav', rate=f_new, data=signal_new)
351 | #     return signal_new, f_new
352 | 
-------------------------------------------------------------------------------- /tests/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astorfi/speechpy/b317746f042d02e03124518625be6d9327b71e4a/tests/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav
-------------------------------------------------------------------------------- /tests/test_speechpy.py: --------------------------------------------------------------------------------
1 | import scipy.io.wavfile as wav
2 | import numpy as np
3 | import os
4 | import sys
5 | lib_path = os.path.abspath(os.path.join('..'))
6 | print(lib_path)
7 | sys.path.append(lib_path)
8 | from speechpy import processing
9 | from speechpy import feature
10 | from speechpy import functions
11 | 
12 | # Random signal generation for testing
13 | mu, sigma = 0, 0.1  # mean and standard deviation
14 | signal = np.random.normal(mu, sigma, 1000000)
15 | fs = 16000
16 | 
17 | # Generating stacked frames with SpeechPy
18 | frame_length = 0.02
19 | frame_stride = 0.02
20 | num_filters = 40
21 | 
22 | 
23 | class Test_Methods_Exists(object):
24 |     def test_processing(self):
25 | 
26 |         # Checking the availability of functions in the chosen module
27 |         assert hasattr(processing, 'preemphasis')
28 |         assert hasattr(processing, 'stack_frames')
29 |         assert hasattr(processing, 'fft_spectrum')
30 |         assert hasattr(processing, 'power_spectrum')
31 |         assert hasattr(processing, 'log_power_spectrum')
32 |         assert hasattr(processing, 'derivative_extraction')
33 |         assert hasattr(processing, 'cmvn')
34 |         assert hasattr(processing, 'cmvnw')
35 | 
36 |     def test_feature(self):
37 | 
38 |         # Checking the availability of functions in the chosen module
39 |         assert hasattr(feature, 'filterbanks')
40 |         assert hasattr(feature, 'mfcc')
41 |         assert hasattr(feature, 'mfe')
42 |         assert hasattr(feature, 'lmfe')
43 |         assert hasattr(feature, 'extract_derivative_feature')
44 | 
45 |     def test_functions(self):
46 | 
47 |         # Checking the availability of functions in the chosen module
48 |         assert hasattr(functions, 'frequency_to_mel')
49 |         assert hasattr(functions, 'mel_to_frequency')
50 |         assert hasattr(functions, 'triangle')
51 |         assert hasattr(functions, 'zero_handling')
52 | 
53 | 
54 | class Test_Processing(object):
55 | 
56 |     def test_preemphasis(self):
57 | 
58 |         # Performing the operation on the generated signal.
59 |         signal_preemphasized = processing.preemphasis(signal, cof=0.98)
60 | 
61 |         # Shape matcher
62 |         assert signal_preemphasized.ndim == 1
63 |         assert signal_preemphasized.shape == signal.shape
64 | 
65 |     def test_stack_frames(self):
66 | 
67 |         frames = processing.stack_frames(signal, sampling_frequency=fs,
68 |                                          frame_length=frame_length,
69 |                                          frame_stride=frame_stride,
70 |                                          filter=lambda x: np.ones((x,)),
71 |                                          zero_padding=True)
72 | 
73 |         # Direct calculation using numpy
74 |         window = int(np.round(frame_length * fs))
75 |         step = int(np.round(frame_stride * fs))
76 |         all_frames = int(np.ceil((signal.shape[0] - window) / step))
77 | 
78 |         # Shape matching of stacked frames
79 |         assert all_frames == frames.shape[0]
80 | 
81 |     def test_cmvn(self):
82 | 
83 |         feature_vector = np.random.rand(50, 100)
84 |         normalized_feature = processing.cmvn(
85 |             feature_vector, variance_normalization=True)
86 | 
87 |         # Shape match
88 |         assert normalized_feature.shape == feature_vector.shape
89 | 
90 |         # Check the std and mean of the output vector
91 |         assert np.allclose(np.mean(normalized_feature, axis=0),
92 |                            np.zeros((1, normalized_feature.shape[1])))
93 |         assert np.allclose(np.std(normalized_feature, axis=0),
94 |                            np.ones((1, normalized_feature.shape[1])))
95 | 
96 | 
97 | class Test_feature(object):
98 | 
99 |     def test_mfcc(self):
100 | 
101 |         num_cepstral = 13
102 |         mfcc = feature.mfcc(signal, sampling_frequency=fs,
103 |                             frame_length=0.020, num_cepstral=num_cepstral,
104 |                             frame_stride=0.01, num_filters=num_filters,
105 |                             fft_length=512, low_frequency=0,
106 |                             high_frequency=None)
107 | 
108 |         # Shape matcher
109 |         assert mfcc.shape[1] == num_cepstral
110 | 
--------------------------------------------------------------------------------
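As a closing illustration, a sketch tying the modules together on the bundled test recording (the mono/stereo check is an assumption about the file; everything else follows the APIs above):

import scipy.io.wavfile as wav
import numpy as np
from speechpy import feature, processing

fs, signal = wav.read('tests/Alesis-Sanctuary-QCard-AcoustcBas-C2.wav')
if signal.ndim > 1:        # keep one channel if the file happens to be stereo
    signal = signal[:, 0]

mfcc_feat = feature.mfcc(signal, sampling_frequency=fs, frame_length=0.020,
                         frame_stride=0.01, num_cepstral=13,
                         num_filters=40, fft_length=512)
mfcc_cmvn = processing.cmvn(mfcc_feat, variance_normalization=True)
mfcc_cube = feature.extract_derivative_feature(mfcc_feat)
print(mfcc_feat.shape, mfcc_cmvn.shape, mfcc_cube.shape)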