├── LICENSE ├── MANIFEST ├── README.rst ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ └── index.rst ├── english.wav ├── example.py ├── python_speech_features ├── __init__.py ├── base.py ├── base_orig.py ├── sigproc.py └── sigproc_orig.py ├── requirements.txt ├── setup.py └── test └── test_sigproc.py /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 James Lyons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.py 3 | python_speech_features\__init__.py 4 | python_speech_features\base.py 5 | python_speech_features\sigproc.py 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | forked from ``_ 4 | 5 | check the readme therein for the usages 6 | 7 | It has been modified to produce the same results as with the compute-mfcc-feats and compute-fbank-feats (check their default parameters first) commands in Kaldi. 8 | 9 | ------------------------------- 10 | 11 | The compute-mfcc-feats pipeline: 12 | 13 | src/featbin/Compute-mfcc-feats.cc 14 | 15 | Mfcc mfcc(mfcc_opts) --> src/feat/Feature-mfcc.h 16 | 17 | struct MfccOptions 18 | 19 | typedef OfflineFeatureTpl Mfcc --> src/feat/Feature-common.h 20 | 21 | MfccComputer() --> src/feat/Feature-mfcc.cc 22 | 23 | ComputeDctMatrix() --> src/matrix/Matrix-functions.cc 24 | 25 | ComputeLifterCoeffs() --> src/feat/Mel-computations.cc 26 | 27 | 28 | for each utterance: 29 | mfcc.ComputeFeatures() 30 | 31 | src/feat/Feature-common-inl.h 32 | 33 |    OfflineFeatureTpl::ComputeFeatures() 34 | 35 | Compute() 36 | 37 | ExtractWindow() --> src/feat/Feature-window.cc 38 | 39 | ProcessWindow() 40 | 41 | Dither, remove_dc_offset, log_energy_pre_window, Preemphasize, window 42 | 43 |            computer_.Compute() --> src/feat/Feature-mfcc.cc 44 | 45 | MfccComputer::Compute() 46 | 47 |                                         const MelBanks &mel_banks --> Mel-computations.cc 48 | 49 |                                          srfft_ 50 |                                         51 |                                         ComputerPowerSpectrum() 52 | 53 | mel_banks.Compute() 54 | 55 | mel_energies_.ApplyLog() 56 | 57 | dct, cepstral_lifter 58 | 59 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | @echo 36 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 37 | 38 | dirhtml: 39 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 40 | @echo 41 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 42 | 43 | pickle: 44 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 45 | @echo 46 | @echo "Build finished; now you can process the pickle files." 47 | 48 | json: 49 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 50 | @echo 51 | @echo "Build finished; now you can process the JSON files." 52 | 53 | htmlhelp: 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in $(BUILDDIR)/htmlhelp." 58 | 59 | qthelp: 60 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 61 | @echo 62 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 63 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 64 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_speech_features.qhcp" 65 | @echo "To view the help file:" 66 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_speech_features.qhc" 67 | 68 | latex: 69 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 70 | @echo 71 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 72 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 73 | "run these through (pdf)latex." 74 | 75 | changes: 76 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 77 | @echo 78 | @echo "The overview file is in $(BUILDDIR)/changes." 79 | 80 | linkcheck: 81 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 82 | @echo 83 | @echo "Link check complete; look for any errors in the above output " \ 84 | "or in $(BUILDDIR)/linkcheck/output.txt." 85 | 86 | doctest: 87 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 88 | @echo "Testing of doctests in the sources finished, look at the " \ 89 | "results in $(BUILDDIR)/doctest/output.txt." 90 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | set SPHINXBUILD=sphinx-build 6 | set BUILDDIR=build 7 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 8 | if NOT "%PAPER%" == "" ( 9 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 10 | ) 11 | 12 | if "%1" == "" goto help 13 | 14 | if "%1" == "help" ( 15 | :help 16 | echo.Please use `make ^` where ^ is one of 17 | echo. html to make standalone HTML files 18 | echo. dirhtml to make HTML files named index.html in directories 19 | echo. pickle to make pickle files 20 | echo. json to make JSON files 21 | echo. htmlhelp to make HTML files and a HTML help project 22 | echo. qthelp to make HTML files and a qthelp project 23 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 24 | echo. changes to make an overview over all changed/added/deprecated items 25 | echo. linkcheck to check all external links for integrity 26 | echo. doctest to run all doctests embedded in the documentation if enabled 27 | goto end 28 | ) 29 | 30 | if "%1" == "clean" ( 31 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 32 | del /q /s %BUILDDIR%\* 33 | goto end 34 | ) 35 | 36 | if "%1" == "html" ( 37 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 38 | echo. 39 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 40 | goto end 41 | ) 42 | 43 | if "%1" == "dirhtml" ( 44 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 45 | echo. 46 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 47 | goto end 48 | ) 49 | 50 | if "%1" == "pickle" ( 51 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 52 | echo. 53 | echo.Build finished; now you can process the pickle files. 54 | goto end 55 | ) 56 | 57 | if "%1" == "json" ( 58 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 59 | echo. 60 | echo.Build finished; now you can process the JSON files. 61 | goto end 62 | ) 63 | 64 | if "%1" == "htmlhelp" ( 65 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 66 | echo. 67 | echo.Build finished; now you can run HTML Help Workshop with the ^ 68 | .hhp project file in %BUILDDIR%/htmlhelp. 69 | goto end 70 | ) 71 | 72 | if "%1" == "qthelp" ( 73 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 74 | echo. 75 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 76 | .qhcp project file in %BUILDDIR%/qthelp, like this: 77 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_speech_features.qhcp 78 | echo.To view the help file: 79 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_speech_features.ghc 80 | goto end 81 | ) 82 | 83 | if "%1" == "latex" ( 84 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 85 | echo. 86 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 87 | goto end 88 | ) 89 | 90 | if "%1" == "changes" ( 91 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 92 | echo. 93 | echo.The overview file is in %BUILDDIR%/changes. 94 | goto end 95 | ) 96 | 97 | if "%1" == "linkcheck" ( 98 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 99 | echo. 100 | echo.Link check complete; look for any errors in the above output ^ 101 | or in %BUILDDIR%/linkcheck/output.txt. 102 | goto end 103 | ) 104 | 105 | if "%1" == "doctest" ( 106 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 107 | echo. 108 | echo.Testing of doctests in the sources finished, look at the ^ 109 | results in %BUILDDIR%/doctest/output.txt. 110 | goto end 111 | ) 112 | 113 | :end 114 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # python_speech_features documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Oct 31 16:49:58 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | import mock 17 | 18 | MOCK_MODULES = ['numpy', 'scipy', 'scipy.fftpack'] 19 | for mod_name in MOCK_MODULES: 20 | sys.modules[mod_name] = mock.Mock() 21 | 22 | # If extensions (or modules to document with autodoc) are in another directory, 23 | # add these directories to sys.path here. If the directory is relative to the 24 | # documentation root, use os.path.abspath to make it absolute, like shown here. 25 | sys.path.insert(0,os.path.abspath('../..')) 26 | 27 | # -- General configuration ----------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be extensions 30 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 31 | extensions = ['sphinx.ext.autodoc'] 32 | 33 | # Add any paths that contain templates here, relative to this directory. 34 | templates_path = ['_templates'] 35 | 36 | # The suffix of source filenames. 37 | source_suffix = '.rst' 38 | 39 | # The encoding of source files. 40 | #source_encoding = 'utf-8' 41 | 42 | # The master toctree document. 43 | master_doc = 'index' 44 | 45 | # General information about the project. 46 | project = u'python_speech_features' 47 | copyright = u'2013, James Lyons' 48 | 49 | # The version info for the project you're documenting, acts as replacement for 50 | # |version| and |release|, also used in various other places throughout the 51 | # built documents. 52 | # 53 | # The short X.Y version. 54 | version = '0.1.0' 55 | # The full version, including alpha/beta/rc tags. 56 | release = '0.1.0' 57 | 58 | # The language for content autogenerated by Sphinx. Refer to documentation 59 | # for a list of supported languages. 60 | #language = None 61 | 62 | # There are two options for replacing |today|: either, you set today to some 63 | # non-false value, then it is used: 64 | #today = '' 65 | # Else, today_fmt is used as the format for a strftime call. 66 | #today_fmt = '%B %d, %Y' 67 | 68 | # List of documents that shouldn't be included in the build. 69 | #unused_docs = [] 70 | 71 | # List of directories, relative to source directory, that shouldn't be searched 72 | # for source files. 73 | exclude_trees = [] 74 | 75 | # The reST default role (used for this markup: `text`) to use for all documents. 76 | #default_role = None 77 | 78 | # If true, '()' will be appended to :func: etc. cross-reference text. 79 | #add_function_parentheses = True 80 | 81 | # If true, the current module name will be prepended to all description 82 | # unit titles (such as .. function::). 83 | #add_module_names = True 84 | 85 | # If true, sectionauthor and moduleauthor directives will be shown in the 86 | # output. They are ignored by default. 87 | #show_authors = False 88 | 89 | # The name of the Pygments (syntax highlighting) style to use. 90 | pygments_style = 'sphinx' 91 | 92 | # A list of ignored prefixes for module index sorting. 93 | #modindex_common_prefix = [] 94 | 95 | 96 | # -- Options for HTML output --------------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. Major themes that come with 99 | # Sphinx are currently 'default' and 'sphinxdoc'. 100 | html_theme = 'default' 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 105 | #html_theme_options = {} 106 | 107 | # Add any paths that contain custom themes here, relative to this directory. 108 | #html_theme_path = [] 109 | 110 | # The name for this set of Sphinx documents. If None, it defaults to 111 | # " v documentation". 112 | #html_title = None 113 | 114 | # A shorter title for the navigation bar. Default is the same as html_title. 115 | #html_short_title = None 116 | 117 | # The name of an image file (relative to this directory) to place at the top 118 | # of the sidebar. 119 | #html_logo = None 120 | 121 | # The name of an image file (within the static path) to use as favicon of the 122 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 123 | # pixels large. 124 | #html_favicon = None 125 | 126 | # Add any paths that contain custom static files (such as style sheets) here, 127 | # relative to this directory. They are copied after the builtin static files, 128 | # so a file named "default.css" will overwrite the builtin "default.css". 129 | html_static_path = ['_static'] 130 | 131 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 132 | # using the given strftime format. 133 | #html_last_updated_fmt = '%b %d, %Y' 134 | 135 | # If true, SmartyPants will be used to convert quotes and dashes to 136 | # typographically correct entities. 137 | #html_use_smartypants = True 138 | 139 | # Custom sidebar templates, maps document names to template names. 140 | #html_sidebars = {} 141 | 142 | # Additional templates that should be rendered to pages, maps page names to 143 | # template names. 144 | #html_additional_pages = {} 145 | 146 | # If false, no module index is generated. 147 | #html_use_modindex = True 148 | 149 | # If false, no index is generated. 150 | #html_use_index = True 151 | 152 | # If true, the index is split into individual pages for each letter. 153 | #html_split_index = False 154 | 155 | # If true, links to the reST sources are added to the pages. 156 | #html_show_sourcelink = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = '' 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'python_speech_featuresdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | # The paper size ('letter' or 'a4'). 173 | #latex_paper_size = 'letter' 174 | 175 | # The font size ('10pt', '11pt' or '12pt'). 176 | #latex_font_size = '10pt' 177 | 178 | # Grouping the document tree into LaTeX files. List of tuples 179 | # (source start file, target name, title, author, documentclass [howto/manual]). 180 | latex_documents = [ 181 | ('index', 'python_speech_features.tex', u'python\\_speech\\_features Documentation', 182 | u'James Lyons', 'manual'), 183 | ] 184 | 185 | # The name of an image file (relative to this directory) to place at the top of 186 | # the title page. 187 | #latex_logo = None 188 | 189 | # For "manual" documents, if this is true, then toplevel headings are parts, 190 | # not chapters. 191 | #latex_use_parts = False 192 | 193 | # Additional stuff for the LaTeX preamble. 194 | #latex_preamble = '' 195 | 196 | # Documents to append as an appendix to all manuals. 197 | #latex_appendices = [] 198 | 199 | # If false, no module index is generated. 200 | #latex_use_modindex = True 201 | 202 | autodoc_member_order = 'bysource' 203 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. python_speech_features documentation master file, created by 2 | sphinx-quickstart on Thu Oct 31 16:49:58 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to python_speech_features's documentation! 7 | ================================================== 8 | 9 | This library provides common speech features for ASR including MFCCs and filterbank energies. 10 | If you are not sure what MFCCs are, and would like to know more have a look at this MFCC tutorial: 11 | http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/. 12 | 13 | You will need numpy and scipy to run these files. The code for this project is available at https://github.com/jameslyons/python_speech_features . 14 | 15 | Supported features: 16 | 17 | - :py:meth:`python_speech_features.mfcc` - Mel Frequency Cepstral Coefficients 18 | - :py:meth:`python_speech_features.fbank` - Filterbank Energies 19 | - :py:meth:`python_speech_features.logfbank` - Log Filterbank Energies 20 | - :py:meth:`python_speech_features.ssc` - Spectral Subband Centroids 21 | 22 | To use MFCC features:: 23 | 24 | from python_speech_features import mfcc 25 | from python_speech_features import logfbank 26 | import scipy.io.wavfile as wav 27 | 28 | (rate,sig) = wav.read("file.wav") 29 | mfcc_feat = mfcc(sig,rate) 30 | fbank_feat = logfbank(sig,rate) 31 | 32 | print(fbank_feat[1:3,:]) 33 | 34 | From here you can write the features to a file etc. 35 | 36 | Functions provided in python_speech_features module 37 | ------------------------------------- 38 | 39 | .. automodule:: python_speech_features.base 40 | :members: 41 | 42 | 43 | Functions provided in sigproc module 44 | ------------------------------------ 45 | .. automodule:: python_speech_features.sigproc 46 | :members: 47 | 48 | 49 | Indices and tables 50 | ================== 51 | 52 | * :ref:`genindex` 53 | * :ref:`search` 54 | 55 | -------------------------------------------------------------------------------- /english.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZitengWang/python_kaldi_features/fc1bd6240c2008412ab64dc25045cd872f5e126c/english.wav -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from python_speech_features import mfcc 4 | from python_speech_features import delta 5 | from python_speech_features import logfbank 6 | import scipy.io.wavfile as wav 7 | 8 | (rate,sig) = wav.read("english.wav") 9 | 10 | # note that generally nfilt=40 is used for speech recognition 11 | fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey') 12 | 13 | # the computed fbank coefficents of english.wav with dimension [110,23] 14 | # [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899 15 | # 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112 16 | # ... 17 | # ... 18 | # the same with that using kaldi commands: compute-fbank-feats --dither=0.0 19 | 20 | 21 | mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey') 22 | 23 | # the computed mfcc coefficents of english.wav with dimension [110,13] 24 | # [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292 25 | # 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298 26 | # ... 27 | # ... 28 | # the same with that using kaldi commands: compute-mfcc-feats --dither=0.0 29 | 30 | -------------------------------------------------------------------------------- /python_speech_features/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | -------------------------------------------------------------------------------- /python_speech_features/base.py: -------------------------------------------------------------------------------- 1 | # calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications 2 | # Author: James Lyons 2012 3 | from __future__ import division 4 | import numpy 5 | from python_speech_features import sigproc 6 | from scipy.fftpack import dct 7 | 8 | def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, 9 | nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97, 10 | ceplifter=22,useEnergy=True,wintype='povey'): 11 | """Compute MFCC features from an audio signal. 12 | 13 | :param signal: the audio signal from which to compute features. Should be an N*1 array 14 | :param samplerate: the samplerate of the signal we are working with. 15 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 16 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 17 | :param numcep: the number of cepstrum to return, default 13 18 | :param nfilt: the number of filters in the filterbank, default 26. 19 | :param nfft: the FFT size. Default is 512. 20 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 21 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 22 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 23 | :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. 24 | :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. 25 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming 26 | :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. 27 | """ 28 | feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype) 29 | feat = numpy.log(feat) 30 | feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] 31 | feat = lifter(feat,ceplifter) 32 | if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy 33 | return feat 34 | 35 | def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, 36 | nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, 37 | wintype='hamming'): 38 | """Compute Mel-filterbank energy features from an audio signal. 39 | 40 | :param signal: the audio signal from which to compute features. Should be an N*1 array 41 | :param samplerate: the samplerate of the signal we are working with. 42 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 43 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 44 | :param nfilt: the number of filters in the filterbank, default 26. 45 | :param nfft: the FFT size. Default is 512. 46 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 47 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 48 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 49 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming 50 | winfunc=lambda x:numpy.ones((x,)) 51 | :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The 52 | second return value is the energy in each frame (total energy, unwindowed) 53 | """ 54 | highfreq= highfreq or samplerate/2 55 | frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype) 56 | pspec = sigproc.powspec(frames,nfft) # nearly the same until this part 57 | energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame 58 | energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log 59 | 60 | fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) 61 | feat = numpy.dot(pspec,fb.T) # compute the filterbank energies 62 | feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log 63 | 64 | return feat,energy 65 | 66 | def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, 67 | nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'): 68 | """Compute log Mel-filterbank energy features from an audio signal. 69 | 70 | :param signal: the audio signal from which to compute features. Should be an N*1 array 71 | :param samplerate: the samplerate of the signal we are working with. 72 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 73 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 74 | :param nfilt: the number of filters in the filterbank, default 26. 75 | :param nfft: the FFT size. Default is 512. 76 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 77 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 78 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 79 | :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 80 | """ 81 | feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype) 82 | return numpy.log(feat) 83 | 84 | def hz2mel(hz): 85 | """Convert a value in Hertz to Mels 86 | 87 | :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. 88 | :returns: a value in Mels. If an array was passed in, an identical sized array is returned. 89 | """ 90 | return 1127 * numpy.log(1+hz/700.0) 91 | 92 | 93 | def mel2hz(mel): 94 | """Convert a value in Mels to Hertz 95 | 96 | :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. 97 | :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. 98 | """ 99 | return 700 * (numpy.exp(mel/1127.0)-1) 100 | 101 | def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): 102 | """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond 103 | to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) 104 | 105 | :param nfilt: the number of filters in the filterbank, default 20. 106 | :param nfft: the FFT size. Default is 512. 107 | :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. 108 | :param lowfreq: lowest band edge of mel filters, default 0 Hz 109 | :param highfreq: highest band edge of mel filters, default samplerate/2 110 | :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 111 | """ 112 | highfreq= highfreq or samplerate/2 113 | assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" 114 | 115 | # compute points evenly spaced in mels 116 | lowmel = hz2mel(lowfreq) 117 | highmel = hz2mel(highfreq) 118 | 119 | # check kaldi/src/feat/Mel-computations.h 120 | fbank = numpy.zeros([nfilt,nfft//2+1]) 121 | mel_freq_delta = (highmel-lowmel)/(nfilt+1) 122 | for j in range(0,nfilt): 123 | leftmel = lowmel+j*mel_freq_delta 124 | centermel = lowmel+(j+1)*mel_freq_delta 125 | rightmel = lowmel+(j+2)*mel_freq_delta 126 | for i in range(0,nfft//2): 127 | mel=hz2mel(i*samplerate/nfft) 128 | if mel>leftmel and mel 0: 143 | nframes,ncoeff = numpy.shape(cepstra) 144 | n = numpy.arange(ncoeff) 145 | lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) 146 | return lift*cepstra 147 | else: 148 | # values of L <= 0, do nothing 149 | return cepstra 150 | 151 | def delta(feat, N): 152 | """Compute delta features from a feature vector sequence. 153 | 154 | :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. 155 | :param N: For each frame, calculate delta features based on preceding and following N frames 156 | :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 157 | """ 158 | if N < 1: 159 | raise ValueError('N must be an integer >= 1') 160 | NUMFRAMES = len(feat) 161 | denominator = 2 * sum([i**2 for i in range(1, N+1)]) 162 | delta_feat = numpy.empty_like(feat) 163 | padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat 164 | for t in range(NUMFRAMES): 165 | delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] 166 | return delta_feat 167 | -------------------------------------------------------------------------------- /python_speech_features/base_orig.py: -------------------------------------------------------------------------------- 1 | # calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications 2 | # Author: James Lyons 2012 3 | from __future__ import division 4 | import numpy 5 | from python_speech_features import sigproc 6 | from scipy.fftpack import dct 7 | 8 | def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, 9 | nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, 10 | winfunc=lambda x:numpy.ones((x,))): 11 | """Compute MFCC features from an audio signal. 12 | 13 | :param signal: the audio signal from which to compute features. Should be an N*1 array 14 | :param samplerate: the samplerate of the signal we are working with. 15 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 16 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 17 | :param numcep: the number of cepstrum to return, default 13 18 | :param nfilt: the number of filters in the filterbank, default 26. 19 | :param nfft: the FFT size. Default is 512. 20 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 21 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 22 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 23 | :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. 24 | :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. 25 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming 26 | :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. 27 | """ 28 | feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc) 29 | feat = numpy.log(feat) 30 | feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] 31 | feat = lifter(feat,ceplifter) 32 | if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy 33 | return feat 34 | 35 | def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, 36 | nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, 37 | winfunc=lambda x:numpy.ones((x,))): 38 | """Compute Mel-filterbank energy features from an audio signal. 39 | 40 | :param signal: the audio signal from which to compute features. Should be an N*1 array 41 | :param samplerate: the samplerate of the signal we are working with. 42 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 43 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 44 | :param nfilt: the number of filters in the filterbank, default 26. 45 | :param nfft: the FFT size. Default is 512. 46 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 47 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 48 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 49 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming 50 | :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The 51 | second return value is the energy in each frame (total energy, unwindowed) 52 | """ 53 | highfreq= highfreq or samplerate/2 54 | signal = sigproc.preemphasis(signal,preemph) 55 | frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) 56 | pspec = sigproc.powspec(frames,nfft) 57 | energy = numpy.sum(pspec,1) # this stores the total energy in each frame 58 | energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log 59 | 60 | fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) 61 | feat = numpy.dot(pspec,fb.T) # compute the filterbank energies 62 | feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log 63 | 64 | return feat,energy 65 | 66 | def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, 67 | nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): 68 | """Compute log Mel-filterbank energy features from an audio signal. 69 | 70 | :param signal: the audio signal from which to compute features. Should be an N*1 array 71 | :param samplerate: the samplerate of the signal we are working with. 72 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 73 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 74 | :param nfilt: the number of filters in the filterbank, default 26. 75 | :param nfft: the FFT size. Default is 512. 76 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 77 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 78 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 79 | :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 80 | """ 81 | feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) 82 | return numpy.log(feat) 83 | 84 | def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, 85 | nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, 86 | winfunc=lambda x:numpy.ones((x,))): 87 | """Compute Spectral Subband Centroid features from an audio signal. 88 | 89 | :param signal: the audio signal from which to compute features. Should be an N*1 array 90 | :param samplerate: the samplerate of the signal we are working with. 91 | :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) 92 | :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) 93 | :param nfilt: the number of filters in the filterbank, default 26. 94 | :param nfft: the FFT size. Default is 512. 95 | :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 96 | :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 97 | :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 98 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming 99 | :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 100 | """ 101 | highfreq= highfreq or samplerate/2 102 | signal = sigproc.preemphasis(signal,preemph) 103 | frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) 104 | pspec = sigproc.powspec(frames,nfft) 105 | pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems 106 | 107 | fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) 108 | feat = numpy.dot(pspec,fb.T) # compute the filterbank energies 109 | R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1)) 110 | 111 | return numpy.dot(pspec*R,fb.T) / feat 112 | 113 | def hz2mel(hz): 114 | """Convert a value in Hertz to Mels 115 | 116 | :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. 117 | :returns: a value in Mels. If an array was passed in, an identical sized array is returned. 118 | """ 119 | return 2595 * numpy.log10(1+hz/700.) 120 | 121 | def mel2hz(mel): 122 | """Convert a value in Mels to Hertz 123 | 124 | :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. 125 | :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. 126 | """ 127 | return 700*(10**(mel/2595.0)-1) 128 | 129 | def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): 130 | """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond 131 | to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) 132 | 133 | :param nfilt: the number of filters in the filterbank, default 20. 134 | :param nfft: the FFT size. Default is 512. 135 | :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. 136 | :param lowfreq: lowest band edge of mel filters, default 0 Hz 137 | :param highfreq: highest band edge of mel filters, default samplerate/2 138 | :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 139 | """ 140 | highfreq= highfreq or samplerate/2 141 | assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" 142 | 143 | # compute points evenly spaced in mels 144 | lowmel = hz2mel(lowfreq) 145 | highmel = hz2mel(highfreq) 146 | melpoints = numpy.linspace(lowmel,highmel,nfilt+2) 147 | # our points are in Hz, but we use fft bins, so we have to convert 148 | # from Hz to fft bin number 149 | bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate) 150 | 151 | fbank = numpy.zeros([nfilt,nfft//2+1]) 152 | for j in range(0,nfilt): 153 | for i in range(int(bin[j]), int(bin[j+1])): 154 | fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) 155 | for i in range(int(bin[j+1]), int(bin[j+2])): 156 | fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]) 157 | return fbank 158 | 159 | def lifter(cepstra, L=22): 160 | """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the 161 | magnitude of the high frequency DCT coeffs. 162 | 163 | :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size. 164 | :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter. 165 | """ 166 | if L > 0: 167 | nframes,ncoeff = numpy.shape(cepstra) 168 | n = numpy.arange(ncoeff) 169 | lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) 170 | return lift*cepstra 171 | else: 172 | # values of L <= 0, do nothing 173 | return cepstra 174 | 175 | def delta(feat, N): 176 | """Compute delta features from a feature vector sequence. 177 | 178 | :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. 179 | :param N: For each frame, calculate delta features based on preceding and following N frames 180 | :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 181 | """ 182 | if N < 1: 183 | raise ValueError('N must be an integer >= 1') 184 | NUMFRAMES = len(feat) 185 | denominator = 2 * sum([i**2 for i in range(1, N+1)]) 186 | delta_feat = numpy.empty_like(feat) 187 | padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat 188 | for t in range(NUMFRAMES): 189 | delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] 190 | return delta_feat 191 | -------------------------------------------------------------------------------- /python_speech_features/sigproc.py: -------------------------------------------------------------------------------- 1 | # This file includes routines for basic signal processing including framing and computing power spectra. 2 | # Author: James Lyons 2012 3 | import decimal 4 | 5 | import numpy 6 | import math 7 | import logging 8 | 9 | 10 | def round_half_up(number): 11 | return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) 12 | 13 | 14 | def rolling_window(a, window, step=1): 15 | # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick 16 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 17 | strides = a.strides + (a.strides[-1],) 18 | return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] 19 | 20 | 21 | def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True): 22 | """Frame a signal into overlapping frames. 23 | 24 | :param sig: the audio signal to frame. 25 | :param frame_len: length of each frame measured in samples. 26 | :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 27 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. 28 | :param stride_trick: use stride trick to compute the rolling window and window multiplication faster 29 | :returns: an array of frames. Size is NUMFRAMES by frame_len. 30 | """ 31 | slen = len(sig) 32 | frame_len = int(round_half_up(frame_len)) 33 | frame_step = int(round_half_up(frame_step)) 34 | if slen <= frame_len: 35 | numframes = 1 36 | else: 37 | numframes = 1 + (( slen - frame_len) // frame_step) 38 | 39 | # check kaldi/src/feat/feature-window.h 40 | padsignal = sig[:(numframes-1)*frame_step+frame_len] 41 | if wintype is 'povey': 42 | win = numpy.empty(frame_len) 43 | for i in range(frame_len): 44 | win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85 45 | else: # the hamming window 46 | win = numpy.hamming(frame_len) 47 | 48 | if stride_trick: 49 | frames = rolling_window(padsignal, window=frame_len, step=frame_step) 50 | else: 51 | indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( 52 | numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T 53 | indices = numpy.array(indices, dtype=numpy.int32) 54 | frames = padsignal[indices] 55 | win = numpy.tile(win, (numframes, 1)) 56 | 57 | frames = frames.astype(numpy.float32) 58 | raw_frames = numpy.zeros(frames.shape) 59 | for frm in range(frames.shape[0]): 60 | frames[frm,:] = do_dither(frames[frm,:], dither) # dither 61 | frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset 62 | raw_frames[frm,:] = frames[frm,:] 63 | frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize 64 | 65 | return frames * win, raw_frames 66 | 67 | def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): 68 | """Does overlap-add procedure to undo the action of framesig. 69 | 70 | :param frames: the array of frames. 71 | :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. 72 | :param frame_len: length of each frame measured in samples. 73 | :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 74 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. 75 | :returns: a 1-D signal. 76 | """ 77 | frame_len = round_half_up(frame_len) 78 | frame_step = round_half_up(frame_step) 79 | numframes = numpy.shape(frames)[0] 80 | assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' 81 | 82 | indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( 83 | numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T 84 | indices = numpy.array(indices, dtype=numpy.int32) 85 | padlen = (numframes - 1) * frame_step + frame_len 86 | 87 | if siglen <= 0: siglen = padlen 88 | 89 | rec_signal = numpy.zeros((padlen,)) 90 | window_correction = numpy.zeros((padlen,)) 91 | win = winfunc(frame_len) 92 | 93 | for i in range(0, numframes): 94 | window_correction[indices[i, :]] = window_correction[ 95 | indices[i, :]] + win + 1e-15 # add a little bit so it is never zero 96 | rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] 97 | 98 | rec_signal = rec_signal / window_correction 99 | return rec_signal[0:siglen] 100 | 101 | 102 | def magspec(frames, NFFT): 103 | """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 104 | 105 | :param frames: the array of frames. Each row is a frame. 106 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 107 | :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. 108 | """ 109 | if numpy.shape(frames)[1] > NFFT: 110 | logging.warn( 111 | 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', 112 | numpy.shape(frames)[1], NFFT) 113 | complex_spec = numpy.fft.rfft(frames, NFFT) 114 | return numpy.absolute(complex_spec) 115 | 116 | 117 | def powspec(frames, NFFT): 118 | """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 119 | 120 | :param frames: the array of frames. Each row is a frame. 121 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 122 | :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. 123 | """ 124 | return numpy.square(magspec(frames, NFFT)) 125 | 126 | 127 | def logpowspec(frames, NFFT, norm=1): 128 | """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 129 | 130 | :param frames: the array of frames. Each row is a frame. 131 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 132 | :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. 133 | :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. 134 | """ 135 | ps = powspec(frames, NFFT); 136 | ps[ps <= 1e-30] = 1e-30 137 | lps = 10 * numpy.log10(ps) 138 | if norm: 139 | return lps - numpy.max(lps) 140 | else: 141 | return lps 142 | 143 | def do_dither(signal, dither_value=1.0): 144 | signal += numpy.random.normal(size=signal.shape) * dither_value 145 | return signal 146 | 147 | def do_remove_dc_offset(signal): 148 | signal -= numpy.mean(signal) 149 | return signal 150 | 151 | def do_preemphasis(signal, coeff=0.97): 152 | """perform preemphasis on the input signal. 153 | 154 | :param signal: The signal to filter. 155 | :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. 156 | :returns: the filtered signal. 157 | """ 158 | return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1]) 159 | -------------------------------------------------------------------------------- /python_speech_features/sigproc_orig.py: -------------------------------------------------------------------------------- 1 | # This file includes routines for basic signal processing including framing and computing power spectra. 2 | # Author: James Lyons 2012 3 | import decimal 4 | 5 | import numpy 6 | import math 7 | import logging 8 | 9 | 10 | def round_half_up(number): 11 | return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) 12 | 13 | 14 | def rolling_window(a, window, step=1): 15 | # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick 16 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 17 | strides = a.strides + (a.strides[-1],) 18 | return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] 19 | 20 | 21 | def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True): 22 | """Frame a signal into overlapping frames. 23 | 24 | :param sig: the audio signal to frame. 25 | :param frame_len: length of each frame measured in samples. 26 | :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 27 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. 28 | :param stride_trick: use stride trick to compute the rolling window and window multiplication faster 29 | :returns: an array of frames. Size is NUMFRAMES by frame_len. 30 | """ 31 | slen = len(sig) 32 | frame_len = int(round_half_up(frame_len)) 33 | frame_step = int(round_half_up(frame_step)) 34 | if slen <= frame_len: 35 | numframes = 1 36 | else: 37 | numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) 38 | 39 | padlen = int((numframes - 1) * frame_step + frame_len) 40 | 41 | zeros = numpy.zeros((padlen - slen,)) 42 | padsignal = numpy.concatenate((sig, zeros)) 43 | if stride_trick: 44 | win = winfunc(frame_len) 45 | frames = rolling_window(padsignal, window=frame_len, step=frame_step) 46 | else: 47 | indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( 48 | numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T 49 | indices = numpy.array(indices, dtype=numpy.int32) 50 | frames = padsignal[indices] 51 | win = numpy.tile(winfunc(frame_len), (numframes, 1)) 52 | 53 | return frames * win 54 | 55 | 56 | def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): 57 | """Does overlap-add procedure to undo the action of framesig. 58 | 59 | :param frames: the array of frames. 60 | :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. 61 | :param frame_len: length of each frame measured in samples. 62 | :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 63 | :param winfunc: the analysis window to apply to each frame. By default no window is applied. 64 | :returns: a 1-D signal. 65 | """ 66 | frame_len = round_half_up(frame_len) 67 | frame_step = round_half_up(frame_step) 68 | numframes = numpy.shape(frames)[0] 69 | assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' 70 | 71 | indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( 72 | numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T 73 | indices = numpy.array(indices, dtype=numpy.int32) 74 | padlen = (numframes - 1) * frame_step + frame_len 75 | 76 | if siglen <= 0: siglen = padlen 77 | 78 | rec_signal = numpy.zeros((padlen,)) 79 | window_correction = numpy.zeros((padlen,)) 80 | win = winfunc(frame_len) 81 | 82 | for i in range(0, numframes): 83 | window_correction[indices[i, :]] = window_correction[ 84 | indices[i, :]] + win + 1e-15 # add a little bit so it is never zero 85 | rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] 86 | 87 | rec_signal = rec_signal / window_correction 88 | return rec_signal[0:siglen] 89 | 90 | 91 | def magspec(frames, NFFT): 92 | """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 93 | 94 | :param frames: the array of frames. Each row is a frame. 95 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 96 | :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. 97 | """ 98 | if numpy.shape(frames)[1] > NFFT: 99 | logging.warn( 100 | 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', 101 | numpy.shape(frames)[1], NFFT) 102 | complex_spec = numpy.fft.rfft(frames, NFFT) 103 | return numpy.absolute(complex_spec) 104 | 105 | 106 | def powspec(frames, NFFT): 107 | """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 108 | 109 | :param frames: the array of frames. Each row is a frame. 110 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 111 | :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. 112 | """ 113 | return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) 114 | 115 | 116 | def logpowspec(frames, NFFT, norm=1): 117 | """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). 118 | 119 | :param frames: the array of frames. Each row is a frame. 120 | :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 121 | :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. 122 | :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. 123 | """ 124 | ps = powspec(frames, NFFT); 125 | ps[ps <= 1e-30] = 1e-30 126 | lps = 10 * numpy.log10(ps) 127 | if norm: 128 | return lps - numpy.max(lps) 129 | else: 130 | return lps 131 | 132 | 133 | def preemphasis(signal, coeff=0.95): 134 | """perform preemphasis on the input signal. 135 | 136 | :param signal: The signal to filter. 137 | :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. 138 | :returns: the filtered signal. 139 | """ 140 | return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) 141 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mock 2 | scipy 3 | numpy 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup #enables develop 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | setup(name='python_speech_features', 7 | version='0.6', 8 | description='Python Speech Feature extraction', 9 | author='James Lyons', 10 | author_email='james.lyons0@gmail.com', 11 | license='MIT', 12 | url='https://github.com/jameslyons/python_speech_features', 13 | packages=['python_speech_features'], 14 | ) 15 | -------------------------------------------------------------------------------- /test/test_sigproc.py: -------------------------------------------------------------------------------- 1 | from python_speech_features import sigproc 2 | import unittest 3 | import numpy as np 4 | import time 5 | 6 | 7 | class test_case(unittest.TestCase): 8 | def test_frame_sig(self): 9 | n = 10000124 10 | frame_len = 37 11 | frame_step = 13 12 | x = np.random.rand(n) 13 | t0 = time.time() 14 | y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False) 15 | t1 = time.time() 16 | y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True) 17 | t_new = time.time() - t1 18 | t_old = t1 - t0 19 | self.assertTupleEqual(y_old.shape, y_new.shape) 20 | np.testing.assert_array_equal(y_old, y_new) 21 | self.assertLess(t_new, t_old) 22 | print('new run time %3.2f < %3.2f sec' % (t_new, t_old)) 23 | 24 | def test_rolling(self): 25 | x = np.arange(10) 26 | y = sigproc.rolling_window(x, window=4, step=3) 27 | y_expected = np.array([[0, 1, 2, 3], 28 | [3, 4, 5, 6], 29 | [6, 7, 8, 9]] 30 | ) 31 | y = np.testing.assert_array_equal(y, y_expected) 32 | --------------------------------------------------------------------------------