├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data └── 1_0_0_0_0_0_0_0.wav ├── include ├── add-deltas.h ├── am-diag-gmm.h ├── common.h ├── compressed-matrix.h ├── compute-cmvn-stats.h ├── feature-mfcc.h ├── fstreader.h ├── simple-decoder.h ├── srfft.h ├── transition-model.h └── wavereader.h ├── model ├── HCLG.fst └── final.mdl └── src ├── add-deltas.cpp ├── am-diag-gmm.cpp ├── common.cpp ├── compressed-matrix.cpp ├── compute-cmvn-stats.cpp ├── decode.cpp ├── feature-mfcc.cpp ├── fstreader.cpp ├── simple-decoder.cpp ├── srfft.cpp ├── transition-model.cpp └── wavereader.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -std=c++11 --debug 3 | INCLUDE = -I/usr/local/include -I./include 4 | LDFLAGS = -std=c++11 5 | LDLIBS = -lpthread -lm 6 | EXECUTABLE= bin/main 7 | SOURCES = $(wildcard src/*.cpp) 8 | HEADERS = $(wildcard includes/*.h) 9 | OBJECTS = $(patsubst src/%.cpp, obj/%.o, $(SOURCES)) 10 | 11 | BASE = $(USER) 12 | 13 | all: $(EXECUTABLE) 14 | 15 | $(EXECUTABLE): $(OBJECTS) 16 | $(CXX) $(LDFLAGS) $(LDLIBS) $^ -o $@ 17 | 18 | obj/%.o: src/%.cpp 19 | $(CXX) -c $(CXXFLAGS) $(INCLUDE) $< -o $@ 20 | 21 | clean: 22 | rm $(OBJECTS) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # asr-decode 2 | 3 | ## 从 [Kaldi](https://github.com/kaldi-asr/kaldi) 中裁剪的解码推理框架 4 | 5 | ## 实现 6 | 1. 不依赖OpenFST、OpenBLAS等库实现全部计算,便于学习和移植 7 | 2. 
重现了基础的Viterbi解码(https://github.com/kaldi-asr/kaldi/blob/master/src/gmmbin/gmm-decode-simple.cc) 8 | 9 | ## 使用 10 | ```shell 11 | ./bin/main ./model/final.mdl ./model/HCLG.fst ./data/1_0_0_0_0_0_0_0.wav 12 | ``` 13 | 备注: 14 | 1. model文件来源于yesno的基础示例 15 | 2. 从音频计算feature的过程等价于下面过程 16 | ``` 17 | #从wave计算mfcc(包含一次compress) 18 | kaldi/src/featbin/compute-mfcc-feats --config=conf/mfcc.conf scp:data/test_yesno/wav.scp ark:- | kaldi/src/featbin/copy-feats --compress=true ark:- ark,scp:test_yesno.ark,test_yesno.scp 19 | 20 | #从mfcc计算cmvn 21 | kaldi/src/featbin/compute-cmvn-stats --spk2utt=ark:data/test_yesno/spk2utt scp:test_yesno.scp ark,scp:cmvn_test_yesno.ark,cmvn_test_yesno.scp 22 | 23 | #应用cmvn到mfcc feature(包含一次add deltas) 24 | kaldi/src/featbin/apply-cmvn --utt2spk=ark:data/test_yesno/split1/1/utt2spk scp:cmvn_test_yesno.scp scp:test_yesno.scp ark:- | kaldi/src/featbin/add-deltas ark:- ark:feat.ark 25 | ``` 26 | 27 | ## Todo 28 | 1. 其他解码方式和声学模型并优化,实现[vosk-api](https://github.com/alphacep/vosk-api)的完整功能 29 | -------------------------------------------------------------------------------- /data/1_0_0_0_0_0_0_0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ma-Dan/asr-decode/e83e8ede576bbcddd84d6d3dc2204d2639086d31/data/1_0_0_0_0_0_0_0.wav -------------------------------------------------------------------------------- /include/add-deltas.h: -------------------------------------------------------------------------------- 1 | #ifndef ADD_DELTAS 2 | #define ADD_DELTAS 3 | 4 | #include "common.h" 5 | 6 | struct DeltaFeaturesOptions { 7 | int32 order; 8 | int32 window; // e.g. 2; controls window size (window size is 2*window + 1) 9 | // the behavior at the edges is to replicate the first or last frame. 10 | // this is not configurable. 
11 | 12 | DeltaFeaturesOptions(int32 order = 2, int32 window = 2): 13 | order(order), window(window) { } 14 | }; 15 | 16 | class DeltaFeatures { 17 | public: 18 | // This class provides a low-level function to compute delta features. 19 | // The function takes as input a matrix of features and a frame index 20 | // that it should compute the deltas on. It writes its output to the 21 | // output_frame buffer, of size (original-feature-dimension) * (opts.order+1). 22 | // This is not the most efficient way to do the computation, but it's 23 | // state-free and thus easier to understand 24 | 25 | explicit DeltaFeatures(const DeltaFeaturesOptions &opts); 26 | 27 | void Process(const P_Matrix input_feats, 28 | int32 frame, 29 | BaseFloat *output_frame) const; 30 | private: 31 | DeltaFeaturesOptions opts_; 32 | std::vector > scales_; // a scaling window for each 33 | // of the orders, including zero: multiply the features for each 34 | // dimension by this window. 35 | }; 36 | 37 | void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, 38 | const P_Matrix input_features, 39 | P_Matrix output_features); 40 | 41 | #endif -------------------------------------------------------------------------------- /include/am-diag-gmm.h: -------------------------------------------------------------------------------- 1 | #ifndef AM_DIAG_GMM 2 | #define AM_DIAG_GMM 3 | 4 | #include "common.h" 5 | 6 | class DiagGmm 7 | { 8 | public: 9 | vector gconsts; 10 | bool valid_gconsts; 11 | vector weights; 12 | Matrix inv_vars; 13 | Matrix means_invvars; 14 | }; 15 | 16 | class AmDiagGmm { 17 | public: 18 | void Read(FILE *fp); 19 | ~AmDiagGmm(); 20 | DiagGmm& GetPdf(int32 pdf_index) const; 21 | 22 | private: 23 | vector densities; 24 | DiagGmm* ReadDiagGmm(FILE *fp); 25 | int32 ComputeGconsts(DiagGmm* diaggmm); 26 | int32 NumGauss(DiagGmm* diaggmm) const; 27 | int32 Dim(DiagGmm* diaggmm) const; 28 | }; 29 | 30 | #endif 
-------------------------------------------------------------------------------- /include/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H 2 | #define COMMON_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace std; 14 | 15 | typedef unsigned char uint8; 16 | typedef signed char int8; 17 | typedef unsigned short uint16; 18 | typedef signed short int16; 19 | typedef int int32; 20 | typedef unsigned int uint32; 21 | typedef long long int64; 22 | typedef unsigned long long uint64; 23 | typedef float BaseFloat; 24 | 25 | #define SAFE_FREE(x) if(x) {free(x); x=NULL;} 26 | 27 | #ifndef M_2PI 28 | #define M_2PI 6.283185307179586476925286766559005 29 | #endif 30 | 31 | #ifndef M_LOG_2PI 32 | #define M_LOG_2PI 1.8378770664093454835606594728112 33 | #endif 34 | 35 | #ifndef FLT_EPSILON 36 | #define FLT_EPSILON 1.19209290e-7f 37 | #endif 38 | 39 | typedef struct tagMatrix 40 | { 41 | int32 cols; 42 | int32 rows; 43 | int32 stride; 44 | vector data; 45 | } Matrix, *P_Matrix; 46 | 47 | typedef struct tagMatrixDouble 48 | { 49 | int32 cols; 50 | int32 rows; 51 | int32 stride; 52 | vector data; 53 | } MatrixDouble, *P_MatrixDouble; 54 | 55 | void ReadToken(FILE *fp, char* s); 56 | void ReadIntegerVector(FILE *fp, vector *v); 57 | void ReadBasicType(FILE *fp, int32 *t); 58 | void ReadBasicType(FILE *fp, BaseFloat *t); 59 | void ReadFloatVectors(FILE *fp, vector *v); 60 | void ReadFloatMatrix(FILE *fp, P_Matrix m); 61 | BaseFloat ReadMatrix(P_Matrix m, int32 row, int32 col); 62 | 63 | #endif -------------------------------------------------------------------------------- /include/compressed-matrix.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPRESSED_MATRIX 2 | #define COMPRESSED_MATRIX 3 | 4 | #include "common.h" 5 | 6 | struct GlobalHeader { 7 | int32 format; // Represents the enum 
DataFormat. 8 | float min_value; // min_value and range represent the ranges of the integer 9 | // data in the kTwoByte and kOneByte formats, and the 10 | // range of the PerColHeader uint16's in the 11 | // kOneByteWithColHeaders format. 12 | float range; 13 | int32 num_rows; 14 | int32 num_cols; 15 | }; 16 | 17 | struct PerColHeader { 18 | uint16 percentile_0; 19 | uint16 percentile_25; 20 | uint16 percentile_75; 21 | uint16 percentile_100; 22 | }; 23 | 24 | class CompressedMatrix { 25 | public: 26 | CompressedMatrix(): data_(NULL) { } 27 | ~CompressedMatrix() { Clear(); } 28 | 29 | void Clear(); 30 | 31 | void CopyFromMat(const P_Matrix mat); 32 | void ComputeGlobalHeader(const P_Matrix mat, GlobalHeader *header); 33 | static int32 DataSize(const GlobalHeader &header); 34 | static void* AllocateData(int32 num_bytes); 35 | void CopyToMat(P_Matrix mat) const; 36 | 37 | private: 38 | void GetMinMax(const P_Matrix mat, BaseFloat* pMin, BaseFloat* pMax); 39 | static void CompressColumn(const GlobalHeader &global_header, 40 | const BaseFloat *data, int32 stride, 41 | int32 num_rows, PerColHeader *header, 42 | uint8 *byte_data); 43 | static void ComputeColHeader(const GlobalHeader &global_header, 44 | const BaseFloat *data, int32 stride, 45 | int32 num_rows, PerColHeader *header); 46 | 47 | static inline uint16 FloatToUint16(const GlobalHeader &global_header, 48 | float value); 49 | 50 | static inline float Uint16ToFloat(const GlobalHeader &global_header, 51 | uint16 value); 52 | 53 | // this is used only in the kOneByteWithColHeaders compression format. 54 | static inline uint8 FloatToChar(float p0, float p25, 55 | float p75, float p100, 56 | float value); 57 | 58 | // this is used only in the kOneByteWithColHeaders compression format. 
59 | static inline float CharToFloat(float p0, float p25, 60 | float p75, float p100, 61 | uint8 value); 62 | 63 | void *data_; 64 | }; 65 | 66 | #endif -------------------------------------------------------------------------------- /include/compute-cmvn-stats.h: -------------------------------------------------------------------------------- 1 | #ifndef COMPUTE_CMVN_STATS 2 | #define COMPUTE_CMVN_STATS 3 | 4 | #include "common.h" 5 | 6 | void InitCmvnStats(int32 dim, P_MatrixDouble stats); 7 | void AccCmvnStats(const P_Matrix feats, P_MatrixDouble stats); 8 | void ApplyCmvn(const P_MatrixDouble stats, bool var_norm, P_Matrix feats); 9 | 10 | #endif -------------------------------------------------------------------------------- /include/feature-mfcc.h: -------------------------------------------------------------------------------- 1 | #ifndef FEATURE_MFCC 2 | #define FEATURE_MFCC 3 | 4 | #include "common.h" 5 | #include "srfft.h" 6 | 7 | int32 RoundUpToNearestPowerOfTwo(int32 n); 8 | 9 | enum WindowsType 10 | { 11 | hanning = 0, 12 | sine, 13 | hamming, 14 | povey, 15 | rectangular, 16 | blackman 17 | }; 18 | 19 | struct FrameExtractionOptions 20 | { 21 | BaseFloat samp_freq; 22 | BaseFloat frame_shift_ms; // in milliseconds. 23 | BaseFloat frame_length_ms; // in milliseconds. 24 | BaseFloat dither; // Amount of dithering, 0.0 means no dither. 25 | BaseFloat preemph_coeff; // Preemphasis coefficient. 26 | bool remove_dc_offset; // Subtract mean of wave before FFT. 
27 | vector window; 28 | WindowsType window_type; 29 | BaseFloat blackman_coeff; 30 | 31 | FrameExtractionOptions(): 32 | samp_freq(8000), 33 | frame_shift_ms(10.0), 34 | frame_length_ms(25.0), 35 | dither(1.0), 36 | preemph_coeff(0.97), 37 | remove_dc_offset(true), 38 | window_type(povey), 39 | blackman_coeff(0.42){}; 40 | 41 | int32 WindowShift() const { 42 | return static_cast(samp_freq * 0.001 * frame_shift_ms); 43 | } 44 | int32 WindowSize() const { 45 | return static_cast(samp_freq * 0.001 * frame_length_ms); 46 | } 47 | int32 PaddedWindowSize() const { 48 | return RoundUpToNearestPowerOfTwo(WindowSize()); 49 | } 50 | }; 51 | 52 | struct MelBanksOptions { 53 | int32 num_bins; // e.g. 25; number of triangular bins 54 | BaseFloat low_freq; // e.g. 20; lower frequency cutoff 55 | BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative 56 | // ->added to the Nyquist frequency to get the cutoff. 57 | BaseFloat vtln_low; // vtln lower cutoff of warping function. 58 | BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added 59 | // to the Nyquist frequency to get the cutoff. 
60 | 61 | explicit MelBanksOptions(int num_bins = 25) 62 | : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), 63 | vtln_high(-500) {} 64 | }; 65 | 66 | class MelBanks { 67 | public: 68 | 69 | static inline BaseFloat InverseMelScale(BaseFloat mel_freq) { 70 | return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f); 71 | } 72 | 73 | static inline BaseFloat MelScale(BaseFloat freq) { 74 | return 1127.0f * logf (1.0f + freq / 700.0f); 75 | } 76 | 77 | static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff, 78 | BaseFloat vtln_high_cutoff, // discontinuities in warp func 79 | BaseFloat low_freq, 80 | BaseFloat high_freq, // upper+lower frequency cutoffs in 81 | // the mel computation 82 | BaseFloat vtln_warp_factor, 83 | BaseFloat freq); 84 | 85 | static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, 86 | BaseFloat vtln_high_cutoff, 87 | BaseFloat low_freq, 88 | BaseFloat high_freq, 89 | BaseFloat vtln_warp_factor, 90 | BaseFloat mel_freq); 91 | 92 | 93 | MelBanks(const MelBanksOptions &opts, 94 | const FrameExtractionOptions &frame_opts, 95 | BaseFloat vtln_warp_factor); 96 | 97 | /// Compute Mel energies (note: not log energies). 98 | /// At input, "fft_energies" contains the FFT energies (not log). 99 | void Compute(const vector &fft_energies, 100 | vector &mel_energies_out) const; 101 | 102 | int32 NumBins() const { return bins_.size(); } 103 | 104 | // returns vector of central freq of each bin; needed by plp code. 105 | const vector &GetCenterFreqs() const { return center_freqs_; } 106 | 107 | const std::vector > >& GetBins() const { 108 | return bins_; 109 | } 110 | 111 | private: 112 | 113 | // center frequencies of bins, numbered from 0 ... num_bins-1. 114 | // Needed by GetCenterFreqs(). 115 | vector center_freqs_; 116 | 117 | // the "bins_" vector is a vector, one for each bin, of a pair: 118 | // (the first nonzero fft-bin), (the vector of weights). 
119 | std::vector > > bins_; 120 | }; 121 | 122 | struct MfccOptions 123 | { 124 | MelBanksOptions mel_opts; 125 | int num_ceps; // e.g. 13: num cepstral coeffs, counting zero. 126 | BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility. 127 | // if 0.0, no liftering is done. 128 | 129 | MfccOptions() 130 | : mel_opts(23), 131 | num_ceps(13), 132 | cepstral_lifter(22.0) 133 | {}; 134 | }; 135 | 136 | struct RandomState { 137 | RandomState(); 138 | unsigned seed; 139 | }; 140 | 141 | class MfccComputer 142 | { 143 | public: 144 | MfccComputer(); 145 | ~MfccComputer(); 146 | void ComputeFeatures(const vector &wave, BaseFloat sample_freq, BaseFloat vtln_warp, P_Matrix output); 147 | 148 | private: 149 | int32 NumFrames(int64 num_samples); 150 | void ExtractWindow(const vector &wave, int32 f, BaseFloat vtln_warp, vector &window, BaseFloat* output); 151 | void ProcessWindow(vector window, BaseFloat vtln_warp, BaseFloat* output); 152 | const MelBanks *GetMelBanks(BaseFloat vtln_warp); 153 | 154 | MfccOptions mfccOptions; 155 | FrameExtractionOptions frameOptions; 156 | std::map mel_banks_; // BaseFloat is VTLN coefficient. 157 | SplitRadixRealFft *srfft; 158 | 159 | vector mel_energies_; 160 | Matrix dct_matrix; 161 | vector lifter_coeffs_; 162 | }; 163 | 164 | #endif -------------------------------------------------------------------------------- /include/fstreader.h: -------------------------------------------------------------------------------- 1 | #ifndef FST_READER 2 | #define FST_READER 3 | 4 | #include "common.h" 5 | 6 | class FstHeader { 7 | public: 8 | bool Read(const char* fileName); 9 | ~FstHeader(); 10 | 11 | char* fsttype; // E.g. "vector". 12 | char* arctype; // E.g. "standard". 13 | int32 version; // Type version number. 14 | int32 flags; // File format bits. 15 | uint64 properties; // FST property bits. 16 | int64 start; // Start state. 17 | int64 numstates; // # of states. 18 | int64 numarcs; // # of arcs. 
19 | private: 20 | void ReadString(char **buf, FILE *fp); 21 | void ReadInt(void *buf, int bytes, FILE *fp); 22 | }; 23 | 24 | typedef struct tagArc 25 | { 26 | int ilabel; 27 | int olabel; 28 | BaseFloat weight; 29 | int nextstate; 30 | } Arc, *P_Arc; 31 | 32 | typedef struct tagState 33 | { 34 | BaseFloat weight; 35 | int field1; 36 | int arcNum; 37 | int field3; 38 | int field4; 39 | P_Arc arc; 40 | } State, *P_State; 41 | 42 | class FstReader { 43 | public: 44 | bool Read(const char* fileName); 45 | int Start(); 46 | ~FstReader(); 47 | //private: 48 | FstHeader hdr; 49 | P_State state; 50 | P_Arc arc; 51 | }; 52 | 53 | #endif -------------------------------------------------------------------------------- /include/simple-decoder.h: -------------------------------------------------------------------------------- 1 | #ifndef SIMPLE_DECODER 2 | #define SIMPLE_DECODER 3 | 4 | #include "common.h" 5 | #include "transition-model.h" 6 | #include "am-diag-gmm.h" 7 | #include "fstreader.h" 8 | 9 | typedef int StateId; 10 | 11 | typedef struct tagDecodeArc 12 | { 13 | int ilabel; 14 | int olabel; 15 | BaseFloat weight1; 16 | BaseFloat weight2; 17 | int nextstate; 18 | } DecodeArc, *P_DecodeArc; 19 | 20 | typedef struct Token 21 | { 22 | DecodeArc arc; 23 | Token *prev; 24 | int32 ref_count; 25 | double cost; 26 | } *P_Token; 27 | 28 | class SimpleDecoder 29 | { 30 | public: 31 | SimpleDecoder(TransitionModel *transmodel, AmDiagGmm *amgmm, FstReader *fst, BaseFloat beam); 32 | bool Decode(P_Matrix feature, BaseFloat acoustic_scale); 33 | vector GetBestPath(); 34 | 35 | void InitDecoding(); 36 | void AdvanceDecoding(P_Matrix feature, BaseFloat acoustic_scale); 37 | 38 | private: 39 | class TransitionModel *m_transmodel; 40 | class AmDiagGmm *m_amgmm; 41 | map cur_toks; 42 | map prev_toks; 43 | class FstReader *m_fst; 44 | BaseFloat m_beam; 45 | int32 num_frames_decoded; 46 | 47 | void ProcessEmitting(P_Matrix feature, BaseFloat acoustic_scale); 48 | void 
ProcessNonemitting(); 49 | 50 | BaseFloat LogLikelihood(P_Matrix feature, int32 frame, int32 tid); 51 | 52 | static void ClearToks(map &toks); 53 | static void PruneToks(BaseFloat beam, map *toks); 54 | }; 55 | 56 | #endif -------------------------------------------------------------------------------- /include/srfft.h: -------------------------------------------------------------------------------- 1 | #ifndef SRFFT_H 2 | #define SRFFT_H 3 | 4 | #include "common.h" 5 | 6 | class SplitRadixComplexFft 7 | { 8 | public: 9 | SplitRadixComplexFft(int32 N); 10 | ~SplitRadixComplexFft(); 11 | 12 | // Does the FFT computation, given pointers to the real and 13 | // imaginary parts. If "forward", do the forward FFT; else 14 | // do the inverse FFT (without the 1/N factor). 15 | // xr and xi are pointers to zero-based arrays of size N, 16 | // containing the real and imaginary parts 17 | // respectively. 18 | void Compute(BaseFloat *xr, BaseFloat *xi, bool forward) const; 19 | 20 | // This version of Compute takes a single array of size N*2, 21 | // containing [ r0 im0 r1 im1 ... ]. Otherwise its behavior is the 22 | // same as the version above. 23 | void Compute(BaseFloat *x, bool forward); 24 | 25 | 26 | // This version of Compute is const; it operates on an array of size N*2 27 | // containing [ r0 im0 r1 im1 ... ], but it uses the argument "temp_buffer" as 28 | // temporary storage instead of a class-member variable. It will allocate it if 29 | // needed. 30 | void Compute(BaseFloat *x, bool forward, std::vector *temp_buffer) const; 31 | 32 | private: 33 | void ComputeTables(); 34 | void ComputeRecursive(BaseFloat *xr, BaseFloat *xi, int32 logn) const; 35 | void BitReversePermute(BaseFloat *x, int32 logn) const; 36 | 37 | int32 N_; 38 | int32 logn_; // log(N) 39 | 40 | int32 *brseed_; 41 | // brseed is Evans' seed table, ref: (Ref: D. M. W. 42 | // Evans, "An improved digit-reversal permutation algorithm ...", 43 | // IEEE Trans. ASSP, Aug. 1987, pp. 1120-1125). 
44 | BaseFloat **tab_; // Tables of butterfly coefficients. 45 | protected: 46 | std::vector temp_buffer_; 47 | }; 48 | 49 | class SplitRadixRealFft: private SplitRadixComplexFft { 50 | public: 51 | SplitRadixRealFft(int32 N): // will fail unless N>=4 and N is a power of 2. 52 | SplitRadixComplexFft (N/2), N_(N) { } 53 | 54 | /// If forward == true, this function transforms from a sequence of N real points to its complex fourier 55 | /// transform; otherwise it goes in the reverse direction. If you call it 56 | /// in the forward and then reverse direction and multiply by 1.0/N, you 57 | /// will get back the original data. 58 | /// The interpretation of the complex-FFT data is as follows: the array 59 | /// is a sequence of complex numbers C_n of length N/2 with (real, im) format, 60 | /// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...]. 61 | void Compute(BaseFloat *x, bool forward); 62 | 63 | 64 | /// This is as the other Compute() function, but it is a const version that 65 | /// uses a user-supplied buffer. 
66 | void Compute(BaseFloat *x, bool forward, std::vector *temp_buffer) const; 67 | 68 | private: 69 | int N_; 70 | }; 71 | 72 | #endif -------------------------------------------------------------------------------- /include/transition-model.h: -------------------------------------------------------------------------------- 1 | #ifndef TRANSITION_MODEL 2 | #define TRANSITION_MODEL 3 | 4 | #include "common.h" 5 | 6 | typedef struct tagHmmState 7 | { 8 | int32 forward_pdf_class; 9 | int32 self_loop_pdf_class; 10 | vector > transitions; 11 | } HmmState, *P_HmmState; 12 | 13 | typedef vector TopologyEntry; 14 | 15 | typedef struct tagHmmTopology 16 | { 17 | vector phones; 18 | vector phone2idx; 19 | vector entries; 20 | } HmmTopology, *P_HmmTopology; 21 | 22 | typedef struct tagTuple 23 | { 24 | int32 phone; 25 | int32 hmm_state; 26 | int32 forward_pdf; 27 | int32 self_loop_pdf; 28 | } Tuple, *P_Tuple; 29 | 30 | class TransitionModel 31 | { 32 | public: 33 | void Read(FILE *fp); 34 | int32 TransitionIdToPdf(int32 trans_id) const; 35 | 36 | private: 37 | HmmTopology topo; 38 | vector tuples; 39 | vector state2id; 40 | vector id2state; 41 | vector id2pdf_id; 42 | vector log_probs; 43 | vector non_self_loop_log_probs; 44 | int32 num_pdfs; 45 | 46 | void ReadTopo(FILE *fp); 47 | 48 | void ComputeDerived(); 49 | bool IsSelfLoop(int32 trans_id) const; 50 | const TopologyEntry& TopologyForPhone(int32 phone) const; 51 | void ComputeDerivedOfProbs(); 52 | int32 SelfLoopOf(int32 trans_state) const; 53 | BaseFloat GetTransitionLogProb(int32 trans_id) const; 54 | int32 PairToTransitionId(int32 trans_state, int32 trans_index) const; 55 | int32 NumTransitionStates(); 56 | }; 57 | 58 | #endif -------------------------------------------------------------------------------- /include/wavereader.h: -------------------------------------------------------------------------------- 1 | #ifndef WAVE_READER 2 | #define WAVE_READER 3 | 4 | #include "common.h" 5 | 6 | typedef struct 
tagWaveHeader 7 | { 8 | uint8 chunk_id[4]; //'RIFF' 9 | uint32 chunk_size; 10 | uint8 format[4]; //'WAVE' 11 | uint8 subchunk1_id[4]; //'FMT' 12 | uint32 subchunk1_size; //PCM = 16 13 | uint16 audio_format; //PCM = 1 14 | uint16 channels; 15 | uint32 sample_rate; 16 | uint32 byte_rate; 17 | uint16 block_align; //NumChannels * BitsPerSample / 8 18 | uint16 bit_per_sample; 19 | uint8 subchunk2_id[4]; //'DATA' 20 | uint32 subchunk2_size; 21 | } WaveHeader, *P_WaveHeader; 22 | 23 | typedef struct tagWaveFile 24 | { 25 | WaveHeader header; 26 | vector data; 27 | } WaveFile, *P_WaveFile; 28 | 29 | class WaveReader 30 | { 31 | public: 32 | WaveReader(); 33 | ~WaveReader(); 34 | void ReadWaveFile(const char* fileName); 35 | 36 | WaveFile m_wavefile; 37 | vector m_waveData; 38 | 39 | private: 40 | }; 41 | 42 | #endif -------------------------------------------------------------------------------- /model/HCLG.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ma-Dan/asr-decode/e83e8ede576bbcddd84d6d3dc2204d2639086d31/model/HCLG.fst -------------------------------------------------------------------------------- /model/final.mdl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ma-Dan/asr-decode/e83e8ede576bbcddd84d6d3dc2204d2639086d31/model/final.mdl -------------------------------------------------------------------------------- /src/add-deltas.cpp: -------------------------------------------------------------------------------- 1 | #include "add-deltas.h" 2 | 3 | DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts): opts_(opts) { 4 | scales_.resize(opts.order+1); 5 | scales_[0].resize(1); 6 | scales_[0][0] = 1.0; // trivial window for 0th order delta [i.e. 
baseline feats] 7 | 8 | for (int32 i = 1; i <= opts.order; i++) { 9 | vector &prev_scales = scales_[i-1], 10 | &cur_scales = scales_[i]; 11 | int32 window = opts.window; // this code is designed to still 12 | // work if instead we later make it an array and do opts.window[i-1], 13 | // or something like that. "window" is a parameter specifying delta-window 14 | // width which is actually 2*window + 1. 15 | int32 prev_offset = (static_cast(prev_scales.size()-1))/2, 16 | cur_offset = prev_offset + window; 17 | cur_scales.resize(prev_scales.size() + 2*window); // also zeros it. 18 | 19 | BaseFloat normalizer = 0.0; 20 | for (int32 j = -window; j <= window; j++) { 21 | normalizer += j*j; 22 | for (int32 k = -prev_offset; k <= prev_offset; k++) { 23 | cur_scales[j+k+cur_offset] += 24 | static_cast(j) * prev_scales[k+prev_offset]; 25 | } 26 | } 27 | for(int32 i=0; irows, 38 | feat_dim = input_feats->cols; 39 | for(int32 i=0; i<(opts_.order+1)*feat_dim; i++) 40 | { 41 | output_frame[i] = 0.0f; 42 | } 43 | for (int32 i = 0; i <= opts_.order; i++) { 44 | const vector &scales = scales_[i]; 45 | int32 max_offset = (scales.size() - 1) / 2; 46 | BaseFloat* output = output_frame + i*feat_dim; 47 | for (int32 j = -max_offset; j <= max_offset; j++) { 48 | // if asked to read 49 | int32 offset_frame = frame + j; 50 | if (offset_frame < 0) offset_frame = 0; 51 | else if (offset_frame >= num_frames) 52 | offset_frame = num_frames - 1; 53 | BaseFloat scale = scales[j + max_offset]; 54 | if (scale != 0.0) 55 | { 56 | for(int32 k=0; kdata[offset_frame*input_feats->cols+k]; 59 | } 60 | } 61 | } 62 | } 63 | } 64 | 65 | void ComputeDeltas(const DeltaFeaturesOptions &delta_opts, 66 | const P_Matrix input_features, 67 | P_Matrix output_features) { 68 | output_features->rows = input_features->rows; 69 | output_features->cols = input_features->cols*(delta_opts.order + 1); 70 | output_features->data.resize(output_features->rows * output_features->cols); 71 | DeltaFeatures delta(delta_opts); 72 
| for (int32 r = 0; r < static_cast(input_features->rows); r++) { 73 | BaseFloat* row = output_features->data.data() + r*output_features->cols; 74 | delta.Process(input_features, r, row); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/am-diag-gmm.cpp: -------------------------------------------------------------------------------- 1 | #include "am-diag-gmm.h" 2 | 3 | void AmDiagGmm::Read(FILE *fp) 4 | { 5 | int32 num_pdfs, dim; 6 | char token[128]; 7 | 8 | ReadToken(fp, token); // 9 | ReadBasicType(fp, &dim); 10 | ReadToken(fp, token); // 11 | ReadBasicType(fp, &num_pdfs); 12 | 13 | densities.reserve(num_pdfs); 14 | for (int32 i = 0; i < num_pdfs; i++) 15 | { 16 | densities.push_back(ReadDiagGmm(fp)); 17 | } 18 | } 19 | 20 | AmDiagGmm::~AmDiagGmm() 21 | { 22 | } 23 | 24 | DiagGmm& AmDiagGmm::GetPdf(int32 pdf_index) const 25 | { 26 | return *(densities[pdf_index]); 27 | } 28 | 29 | DiagGmm* AmDiagGmm::ReadDiagGmm(FILE *fp) 30 | { 31 | DiagGmm *diag_gmm = new DiagGmm(); 32 | char token[128]; 33 | 34 | ReadToken(fp, token); // or 35 | 36 | ReadToken(fp, token); 37 | if(0 == strcmp(token, "")) 38 | { 39 | ReadFloatVectors(fp, &diag_gmm->gconsts); 40 | } 41 | 42 | ReadToken(fp, token); 43 | if(0 == strcmp(token, "")) 44 | { 45 | ReadFloatVectors(fp, &diag_gmm->weights); 46 | } 47 | 48 | ReadToken(fp, token); // 49 | ReadFloatMatrix(fp, &diag_gmm->means_invvars); 50 | 51 | ReadToken(fp, token); // 52 | ReadFloatMatrix(fp, &diag_gmm->inv_vars); 53 | 54 | ReadToken(fp, token); // 55 | 56 | ComputeGconsts(diag_gmm); 57 | 58 | return diag_gmm; 59 | } 60 | 61 | int32 AmDiagGmm::ComputeGconsts(DiagGmm* diaggmm) { 62 | int32 num_mix = NumGauss(diaggmm); 63 | int32 dim = Dim(diaggmm); 64 | BaseFloat offset = -0.5 * M_LOG_2PI * dim; // constant term in gconst. 
65 | int32 num_bad = 0; 66 | 67 | // Resize if Gaussians have been removed during Update() 68 | if (num_mix != static_cast(diaggmm->gconsts.size())) 69 | { 70 | diaggmm->gconsts.resize(num_mix); 71 | } 72 | 73 | for (int32 mix = 0; mix < num_mix; mix++) 74 | { 75 | BaseFloat gc = logf(diaggmm->weights[mix]) + offset; // May be -inf if weights == 0 76 | for (int32 d = 0; d < dim; d++) 77 | { 78 | gc += 0.5 * logf(ReadMatrix(&diaggmm->inv_vars, mix, d)) - 0.5 * ReadMatrix(&diaggmm->means_invvars, mix, d) 79 | * ReadMatrix(&diaggmm->means_invvars, mix, d) / ReadMatrix(&diaggmm->inv_vars, mix, d); 80 | } 81 | // Change sign for logdet because var is inverted. Also, note that 82 | // mean_invvars(mix, d)*mean_invvars(mix, d)/inv_vars(mix, d) is the 83 | // mean-squared times inverse variance, since mean_invvars(mix, d) contains 84 | // the mean times inverse variance. 85 | // So gc is the likelihood at zero feature value. 86 | 87 | if (isnan(gc)) 88 | { // negative infinity is OK but NaN is not acceptable 89 | printf("At component %d not a number in gconst computation", mix); 90 | } 91 | if (isinf(gc)) 92 | { 93 | num_bad++; 94 | // If positive infinity, make it negative infinity. 95 | // Want to make sure the answer becomes -inf in the end, not NaN. 
96 | if (gc > 0) 97 | { 98 | gc = -gc; 99 | } 100 | } 101 | diaggmm->gconsts[mix] = gc; 102 | } 103 | 104 | diaggmm->valid_gconsts = true; 105 | return num_bad; 106 | } 107 | 108 | int32 AmDiagGmm::NumGauss(DiagGmm* diaggmm) const 109 | { 110 | return diaggmm->weights.size(); 111 | } 112 | 113 | int32 AmDiagGmm::Dim(DiagGmm* diaggmm) const 114 | { 115 | return diaggmm->means_invvars.cols; 116 | } -------------------------------------------------------------------------------- /src/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | void ReadToken(FILE *fp, char* s) 4 | { 5 | int index = 0; 6 | char c = '\0'; 7 | while(c != ' ') 8 | { 9 | fread(&c, 1, 1, fp); 10 | s[index] = c; 11 | index++; 12 | } 13 | 14 | s[index-1] = '\0'; 15 | } 16 | 17 | void ReadIntegerVector(FILE *fp, vector *v) 18 | { 19 | uint8 size = 0; 20 | fread(&size, sizeof(size), 1, fp); 21 | 22 | if(size != sizeof(int32)) 23 | { 24 | printf("vector size error!\n"); 25 | return; 26 | } 27 | 28 | uint32 vsize = 0; 29 | fread(&vsize, sizeof(vsize), 1, fp); 30 | 31 | int32 value; 32 | for(int i=0; ipush_back(value); 36 | } 37 | } 38 | 39 | void ReadBasicType(FILE *fp, int32 *t) 40 | { 41 | uint8 size = 0; 42 | fread(&size, sizeof(size), 1, fp); 43 | 44 | if(size != sizeof(int32)) 45 | { 46 | printf("int32 size error!\n"); 47 | return; 48 | } 49 | 50 | fread(t, sizeof(*t), 1, fp); 51 | } 52 | 53 | void ReadBasicType(FILE *fp, BaseFloat *t) 54 | { 55 | uint8 size = 0; 56 | fread(&size, sizeof(size), 1, fp); 57 | 58 | if(size != sizeof(BaseFloat)) 59 | { 60 | printf("float size error!\n"); 61 | return; 62 | } 63 | 64 | fread(t, sizeof(*t), 1, fp); 65 | } 66 | 67 | void ReadFloatVectors(FILE *fp, vector *v) 68 | { 69 | //TODO: Support other type, eg, double 70 | const char *my_token = "FV"; 71 | char token[128]; 72 | ReadToken(fp, token); //FV 73 | int32 size; 74 | ReadBasicType(fp, &size); 75 | v->resize(size); 76 | fread(v->data(), 
sizeof(BaseFloat), size, fp); 77 | } 78 | 79 | void ReadFloatMatrix(FILE *fp, P_Matrix m) 80 | { 81 | const char *my_token = "FM"; 82 | char token[128]; 83 | ReadToken(fp, token); //FM 84 | 85 | int32 rows, cols; 86 | ReadBasicType(fp, &rows); 87 | ReadBasicType(fp, &cols); 88 | 89 | m->rows = rows; 90 | m->cols = cols; 91 | 92 | int32 skip = ((16 / sizeof(BaseFloat)) - cols % (16 / sizeof(BaseFloat))) % (16 / sizeof(BaseFloat)); 93 | m->stride = cols + skip; 94 | 95 | int32 size = rows * cols; 96 | m->data.resize(size); 97 | fread(m->data.data(), sizeof(BaseFloat), size, fp); 98 | } 99 | 100 | BaseFloat ReadMatrix(P_Matrix m, int32 row, int32 col) 101 | { 102 | return m->data[m->cols*row+col]; 103 | } -------------------------------------------------------------------------------- /src/compressed-matrix.cpp: -------------------------------------------------------------------------------- 1 | #include "compressed-matrix.h" 2 | 3 | void CompressedMatrix::Clear() { 4 | if (data_ != NULL) { 5 | delete [] static_cast(data_); 6 | data_ = NULL; 7 | } 8 | } 9 | 10 | int32 CompressedMatrix::DataSize(const GlobalHeader &header) { 11 | return sizeof(GlobalHeader) + 12 | header.num_cols * (sizeof(PerColHeader) + header.num_rows); 13 | } 14 | 15 | void* CompressedMatrix::AllocateData(int32 num_bytes) { 16 | return reinterpret_cast(new float[(num_bytes/3) + 4]); 17 | } 18 | 19 | void CompressedMatrix::CopyFromMat(const P_Matrix mat) 20 | { 21 | Clear(); 22 | 23 | GlobalHeader global_header; 24 | ComputeGlobalHeader(mat, &global_header); 25 | 26 | int32 data_size = DataSize(global_header); 27 | 28 | data_ = AllocateData(data_size); 29 | 30 | *(reinterpret_cast(data_)) = global_header; 31 | 32 | PerColHeader *header_data = 33 | reinterpret_cast(static_cast(data_) + 34 | sizeof(GlobalHeader)); 35 | uint8 *byte_data = 36 | reinterpret_cast(header_data + global_header.num_cols); 37 | 38 | const BaseFloat *matrix_data = mat->data.data(); 39 | 40 | for (int32 col = 0; col < 
global_header.num_cols; col++) { 41 | CompressColumn(global_header, 42 | matrix_data + col, mat->cols, 43 | global_header.num_rows, 44 | header_data, byte_data); 45 | header_data++; 46 | byte_data += global_header.num_rows; 47 | } 48 | } 49 | 50 | void CompressedMatrix::CopyToMat(P_Matrix mat) const { 51 | GlobalHeader *h = reinterpret_cast(data_); 52 | int32 num_cols = h->num_cols, num_rows = h->num_rows; 53 | if (1) { 54 | PerColHeader *per_col_header = reinterpret_cast(h+1); 55 | uint8 *byte_data = reinterpret_cast(per_col_header + 56 | h->num_cols); 57 | for (int32 i = 0; i < num_cols; i++, per_col_header++) { 58 | float p0 = Uint16ToFloat(*h, per_col_header->percentile_0), 59 | p25 = Uint16ToFloat(*h, per_col_header->percentile_25), 60 | p75 = Uint16ToFloat(*h, per_col_header->percentile_75), 61 | p100 = Uint16ToFloat(*h, per_col_header->percentile_100); 62 | for (int32 j = 0; j < num_rows; j++, byte_data++) { 63 | float f = CharToFloat(p0, p25, p75, p100, *byte_data); 64 | mat->data[j*num_cols+i] = f; 65 | } 66 | } 67 | } 68 | } 69 | 70 | void CompressedMatrix::ComputeGlobalHeader(const P_Matrix mat, GlobalHeader *header) 71 | { 72 | header->num_rows = mat->rows; 73 | header->num_cols = mat->cols; 74 | 75 | BaseFloat min_value, max_value; 76 | GetMinMax(mat, &min_value, &max_value); 77 | 78 | header->min_value = min_value; 79 | header->range = max_value - min_value; 80 | } 81 | 82 | void CompressedMatrix::GetMinMax(const P_Matrix mat, BaseFloat* pMin, BaseFloat* pMax) 83 | { 84 | int32 total = mat->rows * mat->cols; 85 | 86 | *pMin = mat->data[0]; 87 | *pMax = mat->data[0]; 88 | 89 | for(int32 i=1; i mat->data[i]) 92 | { 93 | *pMin = mat->data[i]; 94 | } 95 | 96 | if(*pMax < mat->data[i]) 97 | { 98 | *pMax = mat->data[i]; 99 | } 100 | } 101 | } 102 | 103 | void CompressedMatrix::CompressColumn( 104 | const GlobalHeader &global_header, 105 | const BaseFloat *data, int32 stride, 106 | int32 num_rows, PerColHeader *header, 107 | uint8 *byte_data) { 108 | 
ComputeColHeader(global_header, data, stride, 109 | num_rows, header); 110 | 111 | float p0 = Uint16ToFloat(global_header, header->percentile_0), 112 | p25 = Uint16ToFloat(global_header, header->percentile_25), 113 | p75 = Uint16ToFloat(global_header, header->percentile_75), 114 | p100 = Uint16ToFloat(global_header, header->percentile_100); 115 | 116 | for (int32 i = 0; i < num_rows; i++) { 117 | BaseFloat this_data = data[i * stride]; 118 | byte_data[i] = FloatToChar(p0, p25, p75, p100, this_data); 119 | } 120 | } 121 | 122 | void CompressedMatrix::ComputeColHeader( 123 | const GlobalHeader &global_header, 124 | const BaseFloat *data, int32 stride, 125 | int32 num_rows, PerColHeader *header) { 126 | std::vector sdata(num_rows); // the sorted data. 127 | for (size_t i = 0, size = sdata.size(); i < size; i++) 128 | sdata[i] = data[i*stride]; 129 | 130 | if (num_rows >= 5) { 131 | int quarter_nr = num_rows/4; 132 | // std::sort(sdata.begin(), sdata.end()); 133 | // The elements at positions 0, quarter_nr, 134 | // 3*quarter_nr, and num_rows-1 need to be in sorted order. 135 | std::nth_element(sdata.begin(), sdata.begin() + quarter_nr, sdata.end()); 136 | // Now, sdata.begin() + quarter_nr contains the element that would appear 137 | // in sorted order, in that position. 138 | std::nth_element(sdata.begin(), sdata.begin(), sdata.begin() + quarter_nr); 139 | // Now, sdata.begin() and sdata.begin() + quarter_nr contain the elements 140 | // that would appear at those positions in sorted order. 141 | std::nth_element(sdata.begin() + quarter_nr + 1, 142 | sdata.begin() + (3*quarter_nr), sdata.end()); 143 | // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() + 144 | // 3*quarter_nr, contain the elements that would appear at those positions 145 | // in sorted order. 
146 | std::nth_element(sdata.begin() + (3*quarter_nr) + 1, sdata.end() - 1, 147 | sdata.end()); 148 | // Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() + 149 | // 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear 150 | // at those positions in sorted order. 151 | 152 | header->percentile_0 = 153 | std::min(FloatToUint16(global_header, sdata[0]), 65532); 154 | header->percentile_25 = 155 | std::min( 156 | std::max( 157 | FloatToUint16(global_header, sdata[quarter_nr]), 158 | header->percentile_0 + static_cast(1)), 65533); 159 | header->percentile_75 = 160 | std::min( 161 | std::max( 162 | FloatToUint16(global_header, sdata[3*quarter_nr]), 163 | header->percentile_25 + static_cast(1)), 65534); 164 | header->percentile_100 = std::max( 165 | FloatToUint16(global_header, sdata[num_rows-1]), 166 | header->percentile_75 + static_cast(1)); 167 | 168 | } 169 | } 170 | 171 | inline uint16 CompressedMatrix::FloatToUint16( 172 | const GlobalHeader &global_header, 173 | float value) { 174 | float f = (value - global_header.min_value) / 175 | global_header.range; 176 | if (f > 1.0) f = 1.0; // Note: this should not happen. 177 | if (f < 0.0) f = 0.0; // Note: this should not happen. 178 | return static_cast(f * 65535 + 0.499); // + 0.499 is to 179 | // round to closest int; avoids bias. 180 | } 181 | 182 | inline uint8 CompressedMatrix::FloatToChar( 183 | float p0, float p25, float p75, float p100, 184 | float value) { 185 | int ans; 186 | if (value < p25) { // range [ p0, p25 ) covered by 187 | // characters 0 .. 64. We round to the closest int. 188 | float f = (value - p0) / (p25 - p0); 189 | ans = static_cast(f * 64 + 0.5); 190 | // Note: the checks on the next two lines 191 | // are necessary in pathological cases when all the elements in a row 192 | // are the same and the percentile_* values are separated by one. 
193 | if (ans < 0) ans = 0; 194 | if (ans > 64) ans = 64; 195 | } else if (value < p75) { // range [ p25, p75 )covered 196 | // by characters 64 .. 192. We round to the closest int. 197 | float f = (value - p25) / (p75 - p25); 198 | ans = 64 + static_cast(f * 128 + 0.5); 199 | if (ans < 64) ans = 64; 200 | if (ans > 192) ans = 192; 201 | } else { // range [ p75, p100 ] covered by 202 | // characters 192 .. 255. Note: this last range 203 | // has fewer characters than the left range, because 204 | // we go up to 255, not 256. 205 | float f = (value - p75) / (p100 - p75); 206 | ans = 192 + static_cast(f * 63 + 0.5); 207 | if (ans < 192) ans = 192; 208 | if (ans > 255) ans = 255; 209 | } 210 | return static_cast(ans); 211 | } 212 | 213 | inline float CompressedMatrix::Uint16ToFloat( 214 | const GlobalHeader &global_header, 215 | uint16 value) { 216 | // the constant 1.52590218966964e-05 is 1/65535. 217 | return global_header.min_value 218 | + global_header.range * 1.52590218966964e-05F * value; 219 | } 220 | 221 | inline float CompressedMatrix::CharToFloat( 222 | float p0, float p25, float p75, float p100, 223 | uint8 value) { 224 | if (value <= 64) { 225 | return p0 + (p25 - p0) * value * (1/64.0); 226 | } else if (value <= 192) { 227 | return p25 + (p75 - p25) * (value - 64) * (1/128.0); 228 | } else { 229 | return p75 + (p100 - p75) * (value - 192) * (1/63.0); 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /src/compute-cmvn-stats.cpp: -------------------------------------------------------------------------------- 1 | #include "compute-cmvn-stats.h" 2 | 3 | void InitCmvnStats(int32 dim, P_MatrixDouble stats) { 4 | stats->rows = 2; 5 | stats->cols = dim+1; 6 | stats->data.resize(2*(dim+1)); 7 | } 8 | 9 | void AccCmvnStats(const BaseFloat* feats, int32 dim, BaseFloat weight, P_MatrixDouble stats) { 10 | // Remove these __restrict__ modifiers if they cause compilation problems. 11 | // It's just an optimization. 
12 | double *__restrict__ mean_ptr = stats->data.data(), 13 | *__restrict__ var_ptr = stats->data.data()+stats->cols, 14 | *__restrict__ count_ptr = mean_ptr + dim; 15 | const BaseFloat * __restrict__ feats_ptr = feats; 16 | *count_ptr += weight; 17 | // Careful-- if we change the format of the matrix, the "mean_ptr < count_ptr" 18 | // statement below might become wrong. 19 | for (; mean_ptr < count_ptr; mean_ptr++, var_ptr++, feats_ptr++) { 20 | *mean_ptr += *feats_ptr * weight; 21 | *var_ptr += *feats_ptr * *feats_ptr * weight; 22 | } 23 | } 24 | 25 | void AccCmvnStats(const P_Matrix feats, P_MatrixDouble stats) { 26 | int32 num_frames = feats->rows; 27 | for (int32 i = 0; i < num_frames; i++) { 28 | const BaseFloat* this_frame = feats->data.data() + i * feats->cols; 29 | BaseFloat weight = 1.0; 30 | if (weight != 0.0) 31 | AccCmvnStats(this_frame, feats->cols, weight, stats); 32 | } 33 | } 34 | 35 | void ApplyCmvn(const P_MatrixDouble stats, 36 | bool var_norm, 37 | P_Matrix feats) { 38 | int32 dim = stats->cols - 1; 39 | 40 | double count = stats->data[dim]; 41 | 42 | if (!var_norm) { 43 | vector offset; 44 | offset.resize(dim); 45 | for(int32 i=0; idata[i] / stats->data[dim]; 48 | } 49 | for(int32 i=0; irows; i++) 50 | { 51 | for(int32 j=0; jcols; j++) 52 | { 53 | feats->data[i*feats->cols+j] += offset[j]; 54 | } 55 | } 56 | return; 57 | } 58 | // norm(0, d) = mean offset; 59 | // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). 60 | Matrix norm; 61 | norm.rows = 2; 62 | norm.cols = dim; 63 | norm.data.resize(2*dim); 64 | for (int32 d = 0; d < dim; d++) { 65 | double mean, offset, scale; 66 | mean = stats->data[d]/count; 67 | double var = (stats->data[1*stats->cols + d]/count) - mean*mean, 68 | floor = 1.0e-20; 69 | scale = 1.0 / sqrt(var); 70 | offset = -(mean*scale); 71 | norm.data[d] = offset; 72 | norm.data[1*norm.cols+d] = scale; 73 | } 74 | // Apply the normalization. 
75 | //feats->MulColsVec(norm.Row(1)); 76 | //feats->AddVecToRows(1.0, norm.Row(0)); 77 | } 78 | -------------------------------------------------------------------------------- /src/decode.cpp: -------------------------------------------------------------------------------- 1 | #include "wavereader.h" 2 | #include "transition-model.h" 3 | #include "am-diag-gmm.h" 4 | #include "fstreader.h" 5 | #include "feature-mfcc.h" 6 | #include "compressed-matrix.h" 7 | #include "compute-cmvn-stats.h" 8 | #include "add-deltas.h" 9 | #include "simple-decoder.h" 10 | 11 | int main(int argc, char* argv[]) 12 | { 13 | if(argc < 4) 14 | { 15 | printf("arg error\n"); 16 | return -1; 17 | } 18 | 19 | char* mdlFileName = argv[1]; 20 | char* fstFileName = argv[2]; 21 | char* waveFileName = argv[3]; 22 | 23 | BaseFloat vtln_warp = 1.0; 24 | 25 | BaseFloat acoustic_scale = 0.083333; 26 | BaseFloat beam = 16.0; 27 | 28 | // Read Transition model and GMM AM model 29 | FILE *fpMdl = fopen(mdlFileName, "rb"); 30 | 31 | bool binary = false; 32 | char hdr[2]; 33 | fread(hdr, 2, 1, fpMdl); 34 | if(hdr[1] == 'B') 35 | { 36 | binary = true; 37 | } 38 | TransitionModel trans_model; 39 | trans_model.Read(fpMdl); 40 | 41 | AmDiagGmm am_gmm; 42 | am_gmm.Read(fpMdl); 43 | 44 | fclose(fpMdl); 45 | 46 | // Read HCLG fst 47 | FstReader fstReader; 48 | fstReader.Read(fstFileName); 49 | 50 | // Read wave file 51 | WaveReader waveReader; 52 | waveReader.ReadWaveFile(waveFileName); 53 | 54 | // Compute MFCC 55 | MfccComputer mfccComputer; 56 | Matrix feats; 57 | mfccComputer.ComputeFeatures(waveReader.m_waveData, waveReader.m_wavefile.header.sample_rate, vtln_warp, &feats); 58 | 59 | // Compress matrix 60 | CompressedMatrix compressedMatrix; 61 | compressedMatrix.CopyFromMat(&feats); 62 | compressedMatrix.CopyToMat(&feats); 63 | 64 | // Compute CMVN stats and apply 65 | MatrixDouble cmvn_stats; 66 | InitCmvnStats(feats.cols, &cmvn_stats); 67 | AccCmvnStats(&feats, &cmvn_stats); 68 | ApplyCmvn(&cmvn_stats, 
false, &feats); 69 | 70 | // Add deltas 71 | DeltaFeaturesOptions opts; 72 | Matrix feature; 73 | ComputeDeltas(opts, &feats, &feature); 74 | 75 | // Decode feature 76 | SimpleDecoder decoder(&trans_model, &am_gmm, &fstReader, beam); 77 | decoder.Decode(&feature, acoustic_scale); 78 | 79 | vector result = decoder.GetBestPath(); 80 | 81 | printf("Decoded result: "); 82 | for(int i=0; i> 1; 6 | n |= n >> 2; 7 | n |= n >> 4; 8 | n |= n >> 8; 9 | n |= n >> 16; 10 | return n+1; 11 | } 12 | 13 | int Rand(struct RandomState* state) { 14 | if (state) { 15 | return rand_r(&(state->seed)); 16 | } else { 17 | return rand(); 18 | } 19 | } 20 | 21 | RandomState::RandomState() { 22 | // we initialize it as Rand() + 27437 instead of just Rand(), because on some 23 | // systems, e.g. at the very least Mac OSX Yosemite and later, it seems to be 24 | // the case that rand_r when initialized with rand() will give you the exact 25 | // same sequence of numbers that rand() will give if you keep calling rand() 26 | // after that initial call. This can cause problems with repeated sequences. 27 | // For example if you initialize two RandomState structs one after the other 28 | // without calling rand() in between, they would give you the same sequence 29 | // offset by one (if we didn't have the "+ 27437" in the code). 27437 is just 30 | // a randomly chosen prime number. 31 | seed = unsigned(Rand(NULL)) + 27437; 32 | } 33 | 34 | /// Returns a random number strictly between 0 and 1. 
35 | inline float RandUniform(struct RandomState* state = NULL) { 36 | return static_cast((Rand(state) + 1.0) / (RAND_MAX+2.0)); 37 | } 38 | 39 | inline float RandGauss(struct RandomState* state = NULL) { 40 | return static_cast(sqrtf (-2 * logf(RandUniform(state))) 41 | * cosf(2*M_PI*RandUniform(state))); 42 | } 43 | 44 | void Dither(vector &waveform, int32 frame_length, BaseFloat dither_value) { 45 | if (dither_value == 0.0) 46 | { 47 | return; 48 | } 49 | BaseFloat *data = waveform.data(); 50 | RandomState rstate; 51 | for (int32 i = 0; i < frame_length; i++) 52 | { 53 | data[i] += RandGauss(&rstate) * dither_value; 54 | } 55 | } 56 | 57 | BaseFloat Sum(vector window) 58 | { 59 | BaseFloat sum = 0.0f; 60 | 61 | for(int i=0; i &waveform, int32 frame_length, BaseFloat preemph_coeff) 70 | { 71 | if (preemph_coeff == 0.0) 72 | { 73 | return; 74 | } 75 | for (int32 i = frame_length-1; i > 0; i--) 76 | { 77 | waveform[i] -= preemph_coeff * waveform[i-1]; 78 | } 79 | 80 | waveform[0] -= preemph_coeff * waveform[0]; 81 | } 82 | 83 | void ComputePowerSpectrum(vector &waveform) { 84 | int32 dim = waveform.size(); 85 | 86 | // no, letting it be non-power-of-two for now. 87 | // KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0)); // make sure a power of two.. actually my FFT code 88 | // does not require this (dan) but this is better in case we use different code [dan]. 89 | 90 | // RealFft(waveform, true); // true == forward (not inverse) FFT; makes no difference here, 91 | // as we just want power spectrum. 92 | 93 | // now we have in waveform, first half of complex spectrum 94 | // it's stored as [real0, realN/2, real1, im1, real2, im2, ...] 
95 | int32 half_dim = dim/2; 96 | BaseFloat first_energy = waveform[0] * waveform[0], 97 | last_energy = waveform[1] * waveform[1]; // handle this special case 98 | for (int32 i = 1; i < half_dim; i++) { 99 | BaseFloat real = waveform[i*2], im = waveform[i*2 + 1]; 100 | waveform[i] = real*real + im*im; 101 | } 102 | waveform[0] = first_energy; 103 | waveform[half_dim] = last_energy; // Will actually never be used, and anyway 104 | // if the signal has been bandlimited sensibly this should be zero. 105 | } 106 | 107 | void ApplyFloor(vector &v, BaseFloat floor_val) 108 | { 109 | for (int32 i = 0; i < v.size(); i++) { 110 | v[i] = std::max(v[i], floor_val); 111 | } 112 | } 113 | 114 | void ApplyLog(vector &v) 115 | { 116 | for (int32 i = 0; i < v.size(); i++) { 117 | v[i] = logf(v[i]); 118 | } 119 | } 120 | 121 | void PrepareMatrix(P_Matrix m, int32 rows, int32 cols) 122 | { 123 | m->rows = rows; 124 | m->cols = cols; 125 | 126 | m->data.resize(rows * cols); 127 | } 128 | 129 | void ComputeDctMatrix(P_Matrix M) { 130 | //KALDI_ASSERT(M->NumRows() == M->NumCols()); 131 | int32 K = M->rows; 132 | int32 N = M->cols; 133 | 134 | BaseFloat normalizer = sqrt(1.0 / static_cast(N)); // normalizer for 135 | // X_0. 136 | for (int32 j = 0; j < N; j++) M->data[0*M->cols + j] = normalizer; 137 | normalizer = sqrt(2.0 / static_cast(N)); // normalizer for other 138 | // elements. 139 | for (int32 k = 1; k < K; k++) 140 | for (int32 n = 0; n < N; n++) 141 | M->data[k*M->cols + n] = normalizer 142 | * cos( static_cast(M_PI)/N * (n + 0.5) * k ); 143 | } 144 | 145 | void ComputeLifterCoeffs(BaseFloat Q, vector &coeffs) { 146 | // Compute liftering coefficients (scaling on cepstral coeffs) 147 | // coeffs are numbered slightly differently from HTK: the zeroth 148 | // index is C0, which is not affected. 
149 | for (int32 i = 0; i < coeffs.size(); i++) 150 | coeffs[i] = 1.0 + 0.5 * Q * sin (M_PI * i / Q); 151 | } 152 | 153 | void PrepareFeatureWindowFunction(FrameExtractionOptions &opts) { 154 | int32 frame_length = opts.WindowSize(); 155 | opts.window.resize(frame_length); 156 | double a = M_2PI / (frame_length-1); 157 | for (int32 i = 0; i < frame_length; i++) { 158 | double i_fl = static_cast(i); 159 | if (opts.window_type == hanning) { 160 | opts.window[i] = 0.5 - 0.5*cos(a * i_fl); 161 | } else if (opts.window_type == sine) { 162 | // when you are checking ws wikipedia, please 163 | // note that 0.5 * a = M_PI/(frame_length-1) 164 | opts.window[i] = sin(0.5 * a * i_fl); 165 | } else if (opts.window_type == hamming) { 166 | opts.window[i] = 0.54 - 0.46*cos(a * i_fl); 167 | } else if (opts.window_type == povey) { // like hamming but goes to zero at edges. 168 | opts.window[i] = pow(0.5 - 0.5*cos(a * i_fl), 0.85); 169 | } else if (opts.window_type == rectangular) { 170 | opts.window[i] = 1.0; 171 | } else if (opts.window_type == blackman) { 172 | opts.window[i] = opts.blackman_coeff - 0.5*cos(a * i_fl) + 173 | (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl); 174 | } 175 | } 176 | } 177 | 178 | MelBanks::MelBanks(const MelBanksOptions &opts, 179 | const FrameExtractionOptions &frame_opts, 180 | BaseFloat vtln_warp_factor) { 181 | int32 num_bins = opts.num_bins; 182 | BaseFloat sample_freq = frame_opts.samp_freq; 183 | int32 window_length_padded = frame_opts.PaddedWindowSize(); 184 | int32 num_fft_bins = window_length_padded / 2; 185 | BaseFloat nyquist = 0.5 * sample_freq; 186 | 187 | BaseFloat low_freq = opts.low_freq, high_freq; 188 | if (opts.high_freq > 0.0) 189 | high_freq = opts.high_freq; 190 | else 191 | high_freq = nyquist + opts.high_freq; 192 | 193 | BaseFloat fft_bin_width = sample_freq / window_length_padded; 194 | // fft-bin width [think of it as Nyquist-freq / half-window-length] 195 | 196 | BaseFloat mel_low_freq = MelScale(low_freq); 197 | 
BaseFloat mel_high_freq = MelScale(high_freq); 198 | 199 | // divide by num_bins+1 in next line because of end-effects where the bins 200 | // spread out to the sides. 201 | BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1); 202 | 203 | BaseFloat vtln_low = opts.vtln_low, 204 | vtln_high = opts.vtln_high; 205 | if (vtln_high < 0.0) { 206 | vtln_high += nyquist; 207 | } 208 | 209 | bins_.resize(num_bins); 210 | center_freqs_.resize(num_bins); 211 | 212 | for (int32 bin = 0; bin < num_bins; bin++) { 213 | BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta, 214 | center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, 215 | right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; 216 | 217 | if (vtln_warp_factor != 1.0) { 218 | left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, 219 | vtln_warp_factor, left_mel); 220 | center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, 221 | vtln_warp_factor, center_mel); 222 | right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, 223 | vtln_warp_factor, right_mel); 224 | } 225 | center_freqs_[bin] = InverseMelScale(center_mel); 226 | // this_bin will be a vector of coefficients that is only 227 | // nonzero where this mel bin is active. 228 | vector this_bin(num_fft_bins); 229 | int32 first_index = -1, last_index = -1; 230 | for (int32 i = 0; i < num_fft_bins; i++) { 231 | BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft 232 | // bin. 
233 | BaseFloat mel = MelScale(freq); 234 | if (mel > left_mel && mel < right_mel) { 235 | BaseFloat weight; 236 | if (mel <= center_mel) 237 | weight = (mel - left_mel) / (center_mel - left_mel); 238 | else 239 | weight = (right_mel-mel) / (right_mel-center_mel); 240 | this_bin[i] = weight; 241 | if (first_index == -1) 242 | first_index = i; 243 | last_index = i; 244 | } 245 | } 246 | 247 | bins_[bin].first = first_index; 248 | int32 size = last_index + 1 - first_index; 249 | bins_[bin].second.resize(size); 250 | for(int32 i=0; i high_freq) return freq; // in case this gets called 291 | // for out-of-range frequencies, just return the freq. 292 | 293 | BaseFloat one = 1.0; 294 | BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor); 295 | BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor); 296 | BaseFloat scale = 1.0 / vtln_warp_factor; 297 | BaseFloat Fl = scale * l; // F(l); 298 | BaseFloat Fh = scale * h; // F(h); 299 | // slope of left part of the 3-piece linear function 300 | BaseFloat scale_left = (Fl - low_freq) / (l - low_freq); 301 | // [slope of center part is just "scale"] 302 | 303 | // slope of right part of the 3-piece linear function 304 | BaseFloat scale_right = (high_freq - Fh) / (high_freq - h); 305 | 306 | if (freq < l) { 307 | return low_freq + scale_left * (freq - low_freq); 308 | } else if (freq < h) { 309 | return scale * freq; 310 | } else { // freq >= h 311 | return high_freq + scale_right * (freq - high_freq); 312 | } 313 | } 314 | 315 | BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. 
316 | BaseFloat vtln_high_cutoff, 317 | BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation 318 | BaseFloat high_freq, 319 | BaseFloat vtln_warp_factor, 320 | BaseFloat mel_freq) { 321 | return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, 322 | low_freq, high_freq, 323 | vtln_warp_factor, InverseMelScale(mel_freq))); 324 | } 325 | 326 | 327 | // "power_spectrum" contains fft energies. 328 | void MelBanks::Compute(const vector &power_spectrum, 329 | vector &mel_energies_out) const { 330 | int32 num_bins = bins_.size(); 331 | 332 | for (int32 i = 0; i < num_bins; i++) { 333 | int32 offset = bins_[i].first; 334 | const vector &v(bins_[i].second); 335 | BaseFloat energy = 0.0f; 336 | for(int32 j=0; j &wave, BaseFloat sample_freq, BaseFloat vtln_warp, P_Matrix output) 377 | { 378 | int32 rows_out = NumFrames(wave.size()); 379 | int32 cols_out = mfccOptions.num_ceps; 380 | 381 | output->rows = rows_out; 382 | output->cols = cols_out; 383 | 384 | int32 skip = ((16 / sizeof(BaseFloat)) - cols_out % (16 / sizeof(BaseFloat))) % (16 / sizeof(BaseFloat)); 385 | output->stride = cols_out + skip; 386 | 387 | output->data.resize(rows_out * cols_out); 388 | 389 | vector window; // windowed waveform. 
390 | for (int32 frame = 0; frame < rows_out; ++frame) 391 | { 392 | ExtractWindow(wave, frame, vtln_warp, window, output->data.data()+frame*cols_out); 393 | } 394 | } 395 | 396 | int32 MfccComputer::NumFrames(int64 num_samples) 397 | { 398 | int64 frame_shift = frameOptions.WindowShift(); 399 | int64 frame_length = frameOptions.WindowSize(); 400 | 401 | if (num_samples < frame_length) 402 | { 403 | return 0; 404 | } 405 | else 406 | { 407 | return (1 + ((num_samples - frame_length) / frame_shift)); 408 | } 409 | } 410 | 411 | void MfccComputer::ExtractWindow(const vector &wave, int32 f, BaseFloat vtln_warp, vector &window, BaseFloat* output) 412 | { 413 | int32 frame_length = frameOptions.WindowSize(); 414 | int32 frame_length_padded = frameOptions.PaddedWindowSize(); 415 | 416 | if(window.size() != frame_length_padded) 417 | { 418 | window.resize(frame_length_padded); 419 | } 420 | 421 | memcpy(window.data(), wave.data()+f*frameOptions.WindowShift(), frame_length*sizeof(BaseFloat)); 422 | memset(window.data()+frame_length, 0, (frame_length_padded-frame_length)*sizeof(BaseFloat)); 423 | 424 | ProcessWindow(window, vtln_warp, output); 425 | } 426 | 427 | void MfccComputer::ProcessWindow(vector window, BaseFloat vtln_warp, BaseFloat* output) 428 | { 429 | int32 frame_length = frameOptions.WindowSize(); 430 | 431 | if (frameOptions.dither != 0.0) 432 | { 433 | Dither(window, frame_length, frameOptions.dither); 434 | } 435 | 436 | if (frameOptions.remove_dc_offset) 437 | { 438 | BaseFloat offset = -Sum(window) / frame_length; 439 | for(int i=0; iCompute(window.data(), true); 456 | 457 | ComputePowerSpectrum(window); 458 | 459 | const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); 460 | mel_banks.Compute(window, mel_energies_); 461 | 462 | ApplyFloor(mel_energies_, std::numeric_limits::epsilon()); 463 | ApplyLog(mel_energies_); 464 | 465 | for(int32 i=0; i::iterator iter = mel_banks_.find(vtln_warp); 486 | if (iter == mel_banks_.end()) { 487 | this_mel_banks = new 
MelBanks(mfccOptions.mel_opts, 488 | frameOptions, 489 | vtln_warp); 490 | mel_banks_[vtln_warp] = this_mel_banks; 491 | } else { 492 | this_mel_banks = iter->second; 493 | } 494 | return this_mel_banks; 495 | } -------------------------------------------------------------------------------- /src/fstreader.cpp: -------------------------------------------------------------------------------- 1 | #include "fstreader.h" 2 | 3 | const int32 fstMagicNumber = 2125659606; 4 | 5 | bool FstHeader::Read(const char* fileName) 6 | { 7 | FILE *fp; 8 | fp = fopen(fileName, "rb"); 9 | 10 | if(fp == NULL) 11 | { 12 | printf("Error opening fst file\n"); 13 | return false; 14 | } 15 | 16 | int32 magic_number = 0; 17 | ReadInt(&magic_number, sizeof(magic_number), fp); 18 | if (magic_number != fstMagicNumber) { 19 | printf("FstHeader::Read: Bad FST header\n"); 20 | return false; 21 | } 22 | 23 | ReadString(&fsttype, fp); 24 | ReadString(&arctype, fp); 25 | ReadInt(&version, sizeof(version), fp); 26 | ReadInt(&flags, sizeof(flags), fp); 27 | ReadInt(&properties, sizeof(properties), fp); 28 | ReadInt(&start, sizeof(start), fp); 29 | ReadInt(&numstates, sizeof(numstates), fp); 30 | ReadInt(&numarcs, sizeof(numarcs), fp); 31 | 32 | fclose(fp); 33 | return true; 34 | } 35 | 36 | void FstHeader::ReadString(char **buf, FILE *fp) 37 | { 38 | uint32 len = 0; 39 | fread(&len, sizeof(len), 1, fp); 40 | *buf = (char*)malloc(len+1); 41 | memset(*buf, 0, len+1); 42 | fread(*buf, len, 1, fp); 43 | } 44 | 45 | void FstHeader::ReadInt(void *buf, int bytes, FILE *fp) 46 | { 47 | fread(buf, bytes, 1, fp); 48 | } 49 | 50 | FstHeader::~FstHeader() 51 | { 52 | SAFE_FREE(fsttype); 53 | SAFE_FREE(arctype); 54 | } 55 | 56 | 57 | bool FstReader::Read(const char* fileName) 58 | { 59 | if(!hdr.Read(fileName)) 60 | { 61 | return false; 62 | } 63 | 64 | FILE *fp; 65 | fp = fopen(fileName, "rb"); 66 | 67 | if(fp == NULL) 68 | { 69 | printf("Error opening fst file\n"); 70 | return false; 71 | } 72 | 73 | //65 bytes 
header 74 | fseek(fp, 65, SEEK_SET); 75 | //Check the type of Arc 76 | 77 | //Read the FST 78 | //20 bytes per state 79 | state = (P_State)malloc(hdr.numstates * sizeof(State)); 80 | for(int64 i=0; iref_count == 0) 6 | { 7 | Token *prev = tok->prev; 8 | SAFE_FREE(tok); 9 | if(prev == NULL) 10 | { 11 | return; 12 | } 13 | else tok = prev; 14 | } 15 | } 16 | 17 | static BaseFloat GetDecodeArcWeight(P_DecodeArc arc) 18 | { 19 | return arc->weight1 + arc->weight2; 20 | } 21 | 22 | static P_Token newToken(P_DecodeArc arc, BaseFloat acoustic_cost, Token *prev) 23 | { 24 | P_Token token = (P_Token)malloc(sizeof(Token)); 25 | 26 | token->arc.ilabel = arc->ilabel; 27 | token->arc.olabel = arc->olabel; 28 | token->arc.weight1 = GetDecodeArcWeight(arc); 29 | token->arc.weight2 = acoustic_cost; 30 | token->arc.nextstate = arc->nextstate; 31 | 32 | token->prev = prev; 33 | token->ref_count = 1; 34 | 35 | if(prev) 36 | { 37 | prev->ref_count++; 38 | token->cost = prev->cost + (GetDecodeArcWeight(arc) + acoustic_cost); 39 | } 40 | else 41 | { 42 | token->cost = GetDecodeArcWeight(arc) + acoustic_cost; 43 | } 44 | 45 | return token; 46 | } 47 | 48 | static P_Token newToken(P_Arc arc, BaseFloat acoustic_cost, Token *prev) 49 | { 50 | P_Token token = (P_Token)malloc(sizeof(Token)); 51 | 52 | token->arc.ilabel = arc->ilabel; 53 | token->arc.olabel = arc->olabel; 54 | token->arc.weight1 = arc->weight; 55 | token->arc.weight2 = acoustic_cost; 56 | token->arc.nextstate = arc->nextstate; 57 | 58 | token->prev = prev; 59 | token->ref_count = 1; 60 | 61 | if(prev) 62 | { 63 | prev->ref_count++; 64 | token->cost = prev->cost + (arc->weight + acoustic_cost); 65 | } 66 | else 67 | { 68 | token->cost = arc->weight + acoustic_cost; 69 | } 70 | 71 | return token; 72 | } 73 | 74 | SimpleDecoder::SimpleDecoder(TransitionModel *transmodel, AmDiagGmm *amgmm, FstReader *fst, BaseFloat beam) 75 | { 76 | m_transmodel = transmodel; 77 | m_amgmm = amgmm; 78 | m_fst = fst; 79 | m_beam = beam; 80 | } 81 | 
82 | /* Drops tokens left from a previous run, seeds cur_toks with a zero-cost token on the FST start state, resets the frame counter and runs ProcessNonemitting(). */ void SimpleDecoder::InitDecoding() 83 | { 84 | // clean up from last time: 85 | ClearToks(cur_toks); 86 | ClearToks(prev_toks); 87 | // initialize decoding: 88 | StateId start_state = m_fst->Start(); 89 | 90 | DecodeArc dummy_arc; 91 | dummy_arc.ilabel = 0; 92 | dummy_arc.olabel = 0; 93 | dummy_arc.weight1 = 0; 94 | dummy_arc.weight2 = 0; 95 | dummy_arc.nextstate = start_state; 96 | 97 | cur_toks[start_state] = newToken(&dummy_arc, 0.0, NULL); 98 | 99 | num_frames_decoded = 0; 100 | ProcessNonemitting(); 101 | } 102 | 103 | /* Decodes all rows of `feature` from scratch; returns true when at least one token survived. */ bool SimpleDecoder::Decode(P_Matrix feature, BaseFloat acoustic_scale) 104 | { 105 | InitDecoding(); 106 | AdvanceDecoding(feature, acoustic_scale); 107 | return (!cur_toks.empty()); 108 | } 109 | 110 | /* Returns the non-zero output labels found along the lowest-cost token in cur_toks, ordered from the first token of the path to the last. Returns an empty vector when no token qualifies (cur_toks empty, or no cost below +infinity). */ vector SimpleDecoder::GetBestPath() 111 | { 112 | Token* best_token = NULL; /* stays NULL when the loop below selects nothing, so the walk is skipped instead of reading an uninitialized pointer */ 113 | BaseFloat best_cost = std::numeric_limits::infinity(); 114 | 115 | for(map::iterator iter = cur_toks.begin(); 116 | iter != cur_toks.end(); ++iter) 117 | { 118 | if(best_cost > iter->second->cost) 119 | { 120 | best_cost = iter->second->cost; 121 | best_token = iter->second; 122 | } 123 | } 124 | 125 | vector result_rev; 126 | Token* path = best_token; 127 | while(path != NULL) 128 | { 129 | if(path->arc.olabel != 0) 130 | { 131 | result_rev.push_back(path->arc.olabel); 132 | } 133 | path = path->prev; 134 | } 135 | 136 | vector result; 137 | for(int i=result_rev.size()-1; i>=0; i--) 138 | { 139 | result.push_back(result_rev[i]); 140 | } 141 | 142 | return result; 143 | } 144 | 145 | /* Consumes frames until num_frames_decoded reaches feature->rows: per frame the token maps are swapped, emitting then nonemitting arcs are processed, and cur_toks is pruned to m_beam. */ void SimpleDecoder::AdvanceDecoding(P_Matrix feature, BaseFloat acoustic_scale) 146 | { 147 | while (num_frames_decoded < feature->rows) 148 | { 149 | // note: ProcessEmitting() increments num_frames_decoded_ 150 | ClearToks(prev_toks); 151 | cur_toks.swap(prev_toks); 152 | ProcessEmitting(feature, acoustic_scale); 153 | ProcessNonemitting(); 154 | PruneToks(m_beam, &cur_toks); 155 | } 156 | } 157 | 158 | void SimpleDecoder::ProcessEmitting(P_Matrix feature, BaseFloat acoustic_scale) 159
| { 160 | int32 frame = num_frames_decoded; 161 | // Processes emitting arcs for one frame. Propagates from 162 | // prev_toks_ to cur_toks_. 163 | double cutoff = numeric_limits::infinity(); 164 | for(map::iterator iter = prev_toks.begin(); 165 | iter != prev_toks.end(); 166 | ++iter) 167 | { 168 | StateId state = iter->first; 169 | Token *tok = iter->second; 170 | for(int i=0; istate[state].arcNum; i++) 171 | { 172 | P_Arc arc = &m_fst->state[state].arc[i]; 173 | if(arc->ilabel != 0) 174 | { 175 | // propagate.. 176 | BaseFloat acoustic_cost = -acoustic_scale * LogLikelihood(feature, frame, arc->ilabel); 177 | double total_cost = tok->cost + arc->weight + acoustic_cost; 178 | 179 | if(total_cost >= cutoff) 180 | { 181 | continue; 182 | } 183 | if(total_cost + m_beam < cutoff) 184 | { 185 | cutoff = total_cost + m_beam; 186 | } 187 | 188 | Token *new_tok = newToken(arc, acoustic_cost, tok); 189 | map::iterator find_iter = cur_toks.find(arc->nextstate); 190 | if(find_iter == cur_toks.end()) 191 | { 192 | cur_toks[arc->nextstate] = new_tok; 193 | } 194 | else 195 | { 196 | if(find_iter->second->cost > new_tok->cost) 197 | { 198 | TokenDelete(find_iter->second); 199 | find_iter->second = new_tok; 200 | } 201 | else 202 | { 203 | TokenDelete(new_tok); 204 | } 205 | } 206 | } 207 | } 208 | } 209 | num_frames_decoded++; 210 | } 211 | 212 | void SimpleDecoder::ProcessNonemitting() 213 | { 214 | // Processes nonemitting arcs for one frame. Propagates within 215 | // cur_toks_. 
216 | vector queue; 217 | double infinity = std::numeric_limits::infinity(); 218 | double best_cost = infinity; 219 | 220 | for(map::iterator iter = cur_toks.begin(); 221 | iter != cur_toks.end(); 222 | ++iter) 223 | { 224 | queue.push_back(iter->first); 225 | best_cost = min(best_cost, iter->second->cost); 226 | } 227 | double cutoff = best_cost + m_beam; 228 | 229 | while(!queue.empty()) 230 | { 231 | StateId state = queue.back(); 232 | queue.pop_back(); 233 | Token *tok = cur_toks[state]; 234 | for(int i=0; istate[state].arcNum; i++) 235 | { 236 | P_Arc arc = &m_fst->state[state].arc[i]; 237 | 238 | if(arc->ilabel == 0) 239 | { // propagate nonemitting only... 240 | const BaseFloat acoustic_cost = 0.0; 241 | Token *new_tok = newToken(arc, acoustic_cost, tok); 242 | if(new_tok->cost > cutoff) 243 | { 244 | TokenDelete(new_tok); 245 | } 246 | else 247 | { 248 | map::iterator find_iter = cur_toks.find(arc->nextstate); 249 | if(find_iter == cur_toks.end()) 250 | { 251 | cur_toks[arc->nextstate] = new_tok; 252 | queue.push_back(arc->nextstate); 253 | } 254 | else 255 | { 256 | if(find_iter->second->cost > new_tok->cost) 257 | { 258 | TokenDelete(find_iter->second); 259 | find_iter->second = new_tok; 260 | queue.push_back(arc->nextstate); 261 | } 262 | else 263 | { 264 | TokenDelete(new_tok); 265 | } 266 | } 267 | } 268 | } 269 | } 270 | } 271 | } 272 | 273 | static const BaseFloat kMinLogDiffFloat = logf(FLT_EPSILON); 274 | 275 | BaseFloat LogSumExp(vector input, BaseFloat prune) 276 | { 277 | BaseFloat max_elem = input[0]; 278 | for(int i=1; i 0.0 && max_elem - prune > cutoff) // explicit pruning... 
288 | { 289 | cutoff = max_elem - prune; 290 | } 291 | 292 | double sum_relto_max_elem = 0.0; 293 | 294 | for(int i = 0; i < input.size(); i++) 295 | { 296 | BaseFloat f = input[i]; 297 | if (f >= cutoff) 298 | { 299 | sum_relto_max_elem += expf(f - max_elem); 300 | } 301 | } 302 | return max_elem + logf(sum_relto_max_elem); 303 | } 304 | 305 | BaseFloat SimpleDecoder::LogLikelihood(P_Matrix feature, int32 frame, int32 tid) 306 | { 307 | int32 state = m_transmodel->TransitionIdToPdf(tid); 308 | 309 | vector data; 310 | vector data_squared; 311 | for(int i=0; icols; i++) 312 | { 313 | BaseFloat v = ReadMatrix(feature, frame, i); 314 | data.push_back(v); 315 | data_squared.push_back(v*v); 316 | } 317 | 318 | DiagGmm& pdf = m_amgmm->GetPdf(state); 319 | 320 | vector loglikes; 321 | for(int i=0; i &toks) { 343 | for(map::iterator iter = toks.begin(); 344 | iter != toks.end(); ++iter) 345 | { 346 | TokenDelete(iter->second); 347 | } 348 | toks.clear(); 349 | } 350 | 351 | void SimpleDecoder::PruneToks(BaseFloat beam, map *toks) 352 | { 353 | if(toks->empty()) 354 | { 355 | printf("No tokens to prune.\n"); 356 | return; 357 | } 358 | double best_cost = numeric_limits::infinity(); 359 | for(map::iterator iter = toks->begin(); 360 | iter != toks->end(); ++iter) 361 | { 362 | best_cost = min(best_cost, iter->second->cost); 363 | } 364 | 365 | vector retained; 366 | double cutoff = best_cost + beam; 367 | for(map::iterator iter = toks->begin(); 368 | iter != toks->end(); ++iter) 369 | { 370 | if(iter->second->cost < cutoff) 371 | { 372 | retained.push_back(iter->first); 373 | } 374 | else 375 | { 376 | TokenDelete(iter->second); 377 | } 378 | } 379 | map tmp; 380 | for (size_t i = 0; i < retained.size(); i++) 381 | { 382 | tmp[retained[i]] = (*toks)[retained[i]]; 383 | } 384 | printf("Pruned to %lu toks.\n", retained.size()); 385 | tmp.swap(*toks); 386 | } -------------------------------------------------------------------------------- /src/srfft.cpp: 
-------------------------------------------------------------------------------- 1 | #include "srfft.h" 2 | 3 | SplitRadixComplexFft::SplitRadixComplexFft(int32 N) 4 | { 5 | N_ = N; 6 | logn_ = 0; 7 | while (N > 1) { 8 | N >>= 1; 9 | logn_ ++; 10 | } 11 | ComputeTables(); 12 | } 13 | 14 | SplitRadixComplexFft::~SplitRadixComplexFft() 15 | { 16 | delete [] brseed_; 17 | if (tab_ != NULL) { 18 | for (int32 i = 0; i < logn_-3; i++) 19 | { 20 | delete [] tab_[i]; 21 | } 22 | delete [] tab_; 23 | } 24 | } 25 | 26 | void SplitRadixComplexFft::Compute(BaseFloat *xr, BaseFloat *xi, bool forward) const { 27 | if (!forward) { // reverse real and imaginary parts for complex FFT. 28 | BaseFloat *tmp = xr; 29 | xr = xi; 30 | xi = tmp; 31 | } 32 | ComputeRecursive(xr, xi, logn_); 33 | if (logn_ > 1) { 34 | BitReversePermute(xr, logn_); 35 | BitReversePermute(xi, logn_); 36 | } 37 | } 38 | 39 | void SplitRadixComplexFft::Compute(BaseFloat *x, bool forward, 40 | std::vector *temp_buffer) const { 41 | if (temp_buffer->size() != N_) 42 | temp_buffer->resize(N_); 43 | BaseFloat *temp_ptr = &((*temp_buffer)[0]); 44 | for (int32 i = 0; i < N_; i++) { 45 | x[i] = x[i * 2]; // put the real part in the first half of x. 46 | temp_ptr[i] = x[i * 2 + 1]; // put the imaginary part in temp_buffer. 47 | } 48 | // copy the imaginary part back to the second half of x. 49 | memcpy(static_cast(x + N_), 50 | static_cast(temp_ptr), 51 | sizeof(BaseFloat) * N_); 52 | 53 | Compute(x, x + N_, forward); 54 | // Now change the format back to interleaved. 55 | memcpy(static_cast(temp_ptr), 56 | static_cast(x + N_), 57 | sizeof(BaseFloat) * N_); 58 | for (int32 i = N_-1; i > 0; i--) { // don't include 0, 59 | // in case MatrixIndexT is unsigned, the loop would not terminate. 60 | // Treat it as a special case. 61 | x[i*2] = x[i]; 62 | x[i*2 + 1] = temp_ptr[i]; 63 | } 64 | x[1] = temp_ptr[0]; // special case of i = 0. 
65 | } 66 | 67 | void SplitRadixComplexFft::Compute(BaseFloat *x, bool forward) { 68 | this->Compute(x, forward, &temp_buffer_); 69 | } 70 | 71 | void SplitRadixComplexFft::BitReversePermute(BaseFloat *x, int32 logn) const { 72 | int32 i, j, lg2, n; 73 | int32 off, fj, gno, *brp; 74 | BaseFloat tmp, *xp, *xq; 75 | 76 | lg2 = logn >> 1; 77 | n = 1 << lg2; 78 | if (logn & 1) lg2++; 79 | 80 | /* Unshuffling loop */ 81 | for (off = 1; off < n; off++) { 82 | fj = n * brseed_[off]; i = off; j = fj; 83 | tmp = x[i]; x[i] = x[j]; x[j] = tmp; 84 | xp = &x[i]; 85 | brp = &(brseed_[1]); 86 | for (gno = 1; gno < brseed_[off]; gno++) { 87 | xp += n; 88 | j = fj + *brp++; 89 | xq = x + j; 90 | tmp = *xp; *xp = *xq; *xq = tmp; 91 | } 92 | } 93 | } 94 | 95 | void SplitRadixComplexFft::ComputeRecursive(BaseFloat *xr, BaseFloat *xi, int32 logn) const { 96 | 97 | int32 m, m2, m4, m8, nel, n; 98 | BaseFloat *xr1, *xr2, *xi1, *xi2; 99 | BaseFloat *cn = nullptr, *spcn = nullptr, *smcn = nullptr, *c3n = nullptr, 100 | *spc3n = nullptr, *smc3n = nullptr; 101 | BaseFloat tmp1, tmp2; 102 | BaseFloat sqhalf = M_SQRT1_2; 103 | 104 | /* Compute trivial cases */ 105 | if (logn < 3) { 106 | if (logn == 2) { /* length m = 4 */ 107 | xr2 = xr + 2; 108 | xi2 = xi + 2; 109 | tmp1 = *xr + *xr2; 110 | *xr2 = *xr - *xr2; 111 | *xr = tmp1; 112 | tmp1 = *xi + *xi2; 113 | *xi2 = *xi - *xi2; 114 | *xi = tmp1; 115 | xr1 = xr + 1; 116 | xi1 = xi + 1; 117 | xr2++; 118 | xi2++; 119 | tmp1 = *xr1 + *xr2; 120 | *xr2 = *xr1 - *xr2; 121 | *xr1 = tmp1; 122 | tmp1 = *xi1 + *xi2; 123 | *xi2 = *xi1 - *xi2; 124 | *xi1 = tmp1; 125 | xr2 = xr + 1; 126 | xi2 = xi + 1; 127 | tmp1 = *xr + *xr2; 128 | *xr2 = *xr - *xr2; 129 | *xr = tmp1; 130 | tmp1 = *xi + *xi2; 131 | *xi2 = *xi - *xi2; 132 | *xi = tmp1; 133 | xr1 = xr + 2; 134 | xi1 = xi + 2; 135 | xr2 = xr + 3; 136 | xi2 = xi + 3; 137 | tmp1 = *xr1 + *xi2; 138 | tmp2 = *xi1 + *xr2; 139 | *xi1 = *xi1 - *xr2; 140 | *xr2 = *xr1 - *xi2; 141 | *xr1 = tmp1; 142 | *xi2 = tmp2; 
143 | return; 144 | } 145 | else if (logn == 1) { /* length m = 2 */ 146 | xr2 = xr + 1; 147 | xi2 = xi + 1; 148 | tmp1 = *xr + *xr2; 149 | *xr2 = *xr - *xr2; 150 | *xr = tmp1; 151 | tmp1 = *xi + *xi2; 152 | *xi2 = *xi - *xi2; 153 | *xi = tmp1; 154 | return; 155 | } 156 | else if (logn == 0) return; /* length m = 1 */ 157 | } 158 | 159 | /* Compute a few constants */ 160 | m = 1 << logn; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2; 161 | 162 | 163 | /* Step 1 */ 164 | xr1 = xr; xr2 = xr1 + m2; 165 | xi1 = xi; xi2 = xi1 + m2; 166 | for (n = 0; n < m2; n++) { 167 | tmp1 = *xr1 + *xr2; 168 | *xr2 = *xr1 - *xr2; 169 | xr2++; 170 | *xr1++ = tmp1; 171 | tmp2 = *xi1 + *xi2; 172 | *xi2 = *xi1 - *xi2; 173 | xi2++; 174 | *xi1++ = tmp2; 175 | } 176 | 177 | /* Step 2 */ 178 | xr1 = xr + m2; xr2 = xr1 + m4; 179 | xi1 = xi + m2; xi2 = xi1 + m4; 180 | for (n = 0; n < m4; n++) { 181 | tmp1 = *xr1 + *xi2; 182 | tmp2 = *xi1 + *xr2; 183 | *xi1 = *xi1 - *xr2; 184 | xi1++; 185 | *xr2++ = *xr1 - *xi2; 186 | *xr1++ = tmp1; 187 | *xi2++ = tmp2; 188 | // xr1++; xr2++; xi1++; xi2++; 189 | } 190 | 191 | /* Steps 3 & 4 */ 192 | xr1 = xr + m2; xr2 = xr1 + m4; 193 | xi1 = xi + m2; xi2 = xi1 + m4; 194 | if (logn >= 4) { 195 | nel = m4 - 2; 196 | cn = tab_[logn-4]; spcn = cn + nel; smcn = spcn + nel; 197 | c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel; 198 | } 199 | xr1++; xr2++; xi1++; xi2++; 200 | // xr1++; xi1++; 201 | for (n = 1; n < m4; n++) { 202 | if (n == m8) { 203 | tmp1 = sqhalf * (*xr1 + *xi1); 204 | *xi1 = sqhalf * (*xi1 - *xr1); 205 | *xr1 = tmp1; 206 | tmp2 = sqhalf * (*xi2 - *xr2); 207 | *xi2 = -sqhalf * (*xr2 + *xi2); 208 | *xr2 = tmp2; 209 | } else { 210 | tmp2 = *cn++ * (*xr1 + *xi1); 211 | tmp1 = *spcn++ * *xr1 + tmp2; 212 | *xr1 = *smcn++ * *xi1 + tmp2; 213 | *xi1 = tmp1; 214 | tmp2 = *c3n++ * (*xr2 + *xi2); 215 | tmp1 = *spc3n++ * *xr2 + tmp2; 216 | *xr2 = *smc3n++ * *xi2 + tmp2; 217 | *xi2 = tmp1; 218 | } 219 | xr1++; xr2++; xi1++; xi2++; 220 | } 221 | 222 | /* Call ssrec 
again with half DFT length */ 223 | ComputeRecursive(xr, xi, logn-1); 224 | 225 | /* Call ssrec again twice with one quarter DFT length. 226 | Constants have to be recomputed, because they are static! */ 227 | // m = 1 << logn; m2 = m / 2; 228 | ComputeRecursive(xr + m2, xi + m2, logn - 2); 229 | // m = 1 << logn; 230 | m4 = 3 * (m / 4); 231 | ComputeRecursive(xr + m4, xi + m4, logn - 2); 232 | } 233 | 234 | void SplitRadixComplexFft::ComputeTables() { 235 | int32 imax, lg2, i, j; 236 | int32 m, m2, m4, m8, nel, n; 237 | BaseFloat *cn, *spcn, *smcn, *c3n, *spc3n, *smc3n; 238 | BaseFloat ang, c, s; 239 | 240 | lg2 = logn_ >> 1; 241 | if (logn_ & 1) lg2++; 242 | brseed_ = new int32[1 << lg2]; 243 | brseed_[0] = 0; 244 | brseed_[1] = 1; 245 | for (j = 2; j <= lg2; j++) { 246 | imax = 1 << (j - 1); 247 | for (i = 0; i < imax; i++) { 248 | brseed_[i] <<= 1; 249 | brseed_[i + imax] = brseed_[i] + 1; 250 | } 251 | } 252 | 253 | if (logn_ < 4) { 254 | tab_ = NULL; 255 | } else { 256 | tab_ = new BaseFloat* [logn_-3]; 257 | for (i = logn_; i>=4 ; i--) { 258 | /* Compute a few constants */ 259 | m = 1 << i; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2; 260 | 261 | /* Allocate memory for tables */ 262 | nel = m4 - 2; 263 | 264 | tab_[i-4] = new BaseFloat[6*nel]; 265 | 266 | /* Initialize pointers */ 267 | cn = tab_[i-4]; spcn = cn + nel; smcn = spcn + nel; 268 | c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel; 269 | 270 | /* Compute tables */ 271 | for (n = 1; n < m4; n++) { 272 | if (n == m8) continue; 273 | ang = n * M_2PI / m; 274 | c = cos(ang); s = sin(ang); 275 | *cn++ = c; *spcn++ = - (s + c); *smcn++ = s - c; 276 | ang = 3 * n * M_2PI / m; 277 | c = cos(ang); s = sin(ang); 278 | *c3n++ = c; *spc3n++ = - (s + c); *smc3n++ = s - c; 279 | } 280 | } 281 | } 282 | } 283 | 284 | inline void ComplexMul(const BaseFloat &a_re, const BaseFloat &a_im, 285 | BaseFloat *b_re, BaseFloat *b_im) { 286 | BaseFloat tmp_re = (*b_re * a_re) - (*b_im * a_im); 287 | *b_im = *b_re * a_im + 
*b_im * a_re; 288 | *b_re = tmp_re; 289 | } 290 | 291 | inline void ComplexAddProduct(const BaseFloat &a_re, const BaseFloat &a_im, 292 | const BaseFloat &b_re, const BaseFloat &b_im, 293 | BaseFloat *c_re, BaseFloat *c_im) { 294 | *c_re += b_re*a_re - b_im*a_im; 295 | *c_im += b_re*a_im + b_im*a_re; 296 | } 297 | 298 | 299 | inline void ComplexImExp(BaseFloat x, BaseFloat *a_re, BaseFloat *a_im) { 300 | *a_re = cos(x); 301 | *a_im = sin(x); 302 | } 303 | 304 | void SplitRadixRealFft::Compute(BaseFloat *data, bool forward) { 305 | Compute(data, forward, &this->temp_buffer_); 306 | } 307 | 308 | 309 | // This code is mostly the same as the RealFft function. It would be 310 | // possible to replace it with more efficient code from Rico's book. 311 | void SplitRadixRealFft::Compute(BaseFloat *data, bool forward, 312 | std::vector *temp_buffer) const { 313 | int32 N = N_, N2 = N/2; 314 | 315 | if (forward) // call to base class 316 | SplitRadixComplexFft::Compute(data, true, temp_buffer); 317 | 318 | BaseFloat rootN_re, rootN_im; // exp(-2pi/N), forward; exp(2pi/N), backward 319 | int forward_sign = forward ? -1 : 1; 320 | ComplexImExp(static_cast(M_2PI/N *forward_sign), &rootN_re, &rootN_im); 321 | BaseFloat kN_re = -forward_sign, kN_im = 0.0; // exp(-2pik/N), forward; exp(-2pik/N), backward 322 | // kN starts out as 1.0 for forward algorithm but -1.0 for backward. 
323 | for (int32 k = 1; 2*k <= N2; k++) { 324 | ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im); 325 | 326 | BaseFloat Ck_re, Ck_im, Dk_re, Dk_im; 327 | // C_k = 1/2 (B_k + B_{N/2 - k}^*) : 328 | Ck_re = 0.5 * (data[2*k] + data[N - 2*k]); 329 | Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]); 330 | // re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})): 331 | Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]); 332 | // im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) 333 | Dk_im =-0.5 * (data[2*k] - data[N - 2*k]); 334 | // A_k = C_k + 1^(k/N) D_k: 335 | data[2*k] = Ck_re; // A_k <-- C_k 336 | data[2*k+1] = Ck_im; 337 | // now A_k += D_k 1^(k/N) 338 | ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1])); 339 | 340 | int32 kdash = N2 - k; 341 | if (kdash != k) { 342 | // Next we handle the index k' = N/2 - k. This is necessary 343 | // to do now, to avoid invalidating data that we will later need. 344 | // The quantities C_{k'} and D_{k'} are just the conjugates of C_k 345 | // and D_k, so the equations are simple modifications of the above, 346 | // replacing Ck_im and Dk_im with their negatives. 347 | data[2*kdash] = Ck_re; // A_k' <-- C_k' 348 | data[2*kdash+1] = -Ck_im; 349 | // now A_k' += D_k' 1^(k'/N) 350 | // We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^* 351 | // so it's the same as 1^(k/N) but with the real part negated. 352 | ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1])); 353 | } 354 | } 355 | 356 | { // Now handle k = 0. 357 | // In simple terms: after the complex fft, data[0] becomes the sum of real 358 | // parts input[0], input[2]... and data[1] becomes the sum of imaginary 359 | // pats input[1], input[3]... 360 | // "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2].. 361 | // and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... . 
362 | BaseFloat zeroth = data[0] + data[1], 363 | n2th = data[0] - data[1]; 364 | data[0] = zeroth; 365 | data[1] = n2th; 366 | if (!forward) { 367 | data[0] /= 2; 368 | data[1] /= 2; 369 | } 370 | } 371 | if (!forward) { // call to base class 372 | SplitRadixComplexFft::Compute(data, false, temp_buffer); 373 | for (int32 i = 0; i < N; i++) 374 | data[i] *= 2.0; 375 | // This is so we get a factor of N increase, rather than N/2 which we would 376 | // otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2. 377 | // It's for consistency with our normal FFT convensions. 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/transition-model.cpp: -------------------------------------------------------------------------------- 1 | #include "transition-model.h" 2 | 3 | const char* transmodel = ""; 4 | const char* topology = ""; 5 | const char* tuplesName = ""; 6 | const char* triplesName = ""; 7 | 8 | void TransitionModel::Read(FILE* fp) 9 | { 10 | char token[128]; 11 | ReadToken(fp, token); 12 | if(strcmp(transmodel, token) != 0) 13 | { 14 | printf("Model file type error!\n"); 15 | return; 16 | } 17 | 18 | ReadTopo(fp); 19 | 20 | //Read tuples 21 | ReadToken(fp, token); 22 | int32 size; 23 | ReadBasicType(fp, &size); 24 | tuples.resize(size); 25 | for (int32 i = 0; i < size; i++) 26 | { 27 | ReadBasicType(fp, &(tuples[i].phone)); 28 | ReadBasicType(fp, &(tuples[i].hmm_state)); 29 | ReadBasicType(fp, &(tuples[i].forward_pdf)); 30 | if (0 == strcmp(token, tuplesName)) 31 | { 32 | ReadBasicType(fp, &(tuples[i].self_loop_pdf)); 33 | } 34 | else if (0 == strcmp(token, triplesName)) 35 | { 36 | tuples[i].self_loop_pdf = tuples[i].forward_pdf; 37 | } 38 | } 39 | ReadToken(fp, token); 40 | //TODO: Check token is or 41 | ComputeDerived(); 42 | ReadToken(fp, token); // 43 | ReadFloatVectors(fp, &log_probs); 44 | ReadToken(fp, token); // 45 | ReadToken(fp, token); // 46 | ComputeDerivedOfProbs(); 47 
| //TODO: Check 48 | } 49 | 50 | int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const 51 | { 52 | return id2pdf_id[trans_id]; 53 | } 54 | 55 | void TransitionModel::ReadTopo(FILE *fp) 56 | { 57 | char token[128]; 58 | ReadToken(fp, token); 59 | if(strcmp(topology, token) != 0) 60 | { 61 | printf("Topology file type error!\n"); 62 | return; 63 | } 64 | 65 | ReadIntegerVector(fp, &topo.phones); 66 | ReadIntegerVector(fp, &topo.phone2idx); 67 | 68 | //Read Tuples 69 | int32 size; 70 | ReadBasicType(fp, &size); 71 | bool is_hmm = true; 72 | topo.entries.resize(size); 73 | for (int32 i = 0; i < size; i++) 74 | { 75 | int32 thist_sz; 76 | ReadBasicType(fp, &thist_sz); 77 | topo.entries[i].resize(thist_sz); 78 | for (int32 j = 0 ; j < thist_sz; j++) 79 | { 80 | ReadBasicType(fp, &(topo.entries[i][j].forward_pdf_class)); 81 | if(is_hmm) 82 | { 83 | topo.entries[i][j].self_loop_pdf_class = topo.entries[i][j].forward_pdf_class; 84 | } 85 | else 86 | { 87 | ReadBasicType(fp, &(topo.entries[i][j].self_loop_pdf_class)); 88 | } 89 | int32 thiss_sz; 90 | ReadBasicType(fp, &thiss_sz); 91 | topo.entries[i][j].transitions.resize(thiss_sz); 92 | for (int32 k = 0; k < thiss_sz; k++) 93 | { 94 | ReadBasicType(fp, &(topo.entries[i][j].transitions[k].first)); 95 | ReadBasicType(fp, &(topo.entries[i][j].transitions[k].second)); 96 | } 97 | } 98 | } 99 | ReadToken(fp, token); 100 | //TODO: Add check 101 | } 102 | 103 | void TransitionModel::ComputeDerived() 104 | { 105 | state2id.resize(tuples.size()+2); // indexed by transition-state, which 106 | // is one based, but also an entry for one past end of list. 107 | 108 | int32 cur_transition_id = 1; 109 | num_pdfs = 0; 110 | for (int32 tstate = 1; 111 | tstate <= static_cast(tuples.size()+1); // not a typo. 
112 | tstate++) 113 | { 114 | state2id[tstate] = cur_transition_id; 115 | if (static_cast(tstate) <= tuples.size()) 116 | { 117 | int32 phone = tuples[tstate-1].phone, 118 | hmm_state = tuples[tstate-1].hmm_state, 119 | forward_pdf = tuples[tstate-1].forward_pdf, 120 | self_loop_pdf = tuples[tstate-1].self_loop_pdf; 121 | num_pdfs = max(num_pdfs, 1 + forward_pdf); 122 | num_pdfs = max(num_pdfs, 1 + self_loop_pdf); 123 | const HmmState &state = TopologyForPhone(phone)[hmm_state]; 124 | int32 my_num_ids = static_cast(state.transitions.size()); 125 | cur_transition_id += my_num_ids; // # trans out of this state. 126 | } 127 | } 128 | 129 | id2state.resize(cur_transition_id); // cur_transition_id is #transition-ids+1. 130 | id2pdf_id.resize(cur_transition_id); 131 | for (int32 tstate = 1; tstate <= static_cast(tuples.size()); tstate++) 132 | { 133 | for (int32 tid = state2id[tstate]; tid < state2id[tstate+1]; tid++) 134 | { 135 | id2state[tid] = tstate; 136 | if (IsSelfLoop(tid)) 137 | { 138 | id2pdf_id[tid] = tuples[tstate-1].self_loop_pdf; 139 | } 140 | else 141 | { 142 | id2pdf_id[tid] = tuples[tstate-1].forward_pdf; 143 | } 144 | } 145 | } 146 | 147 | // The following statements put copies a large number in the region of memory 148 | // past the end of the id2pdf_id_ array, while leaving the array as it was 149 | // before. The goal of this is to speed up decoding by disabling a check 150 | // inside TransitionIdToPdf() that the transition-id was within the correct 151 | // range. 
152 | int32 num_big_numbers = min(2000, cur_transition_id); 153 | id2pdf_id.resize(cur_transition_id + num_big_numbers, 154 | std::numeric_limits::max()); 155 | id2pdf_id.resize(cur_transition_id); 156 | } 157 | 158 | bool TransitionModel::IsSelfLoop(int32 trans_id) const { 159 | int32 trans_state = id2state[trans_id]; 160 | int32 trans_index = trans_id - state2id[trans_state]; 161 | const Tuple &tuple = tuples[trans_state-1]; 162 | int32 phone = tuple.phone, hmm_state = tuple.hmm_state; 163 | const TopologyEntry &entry = TopologyForPhone(phone); 164 | return (static_cast(trans_index) < entry[hmm_state].transitions.size() 165 | && entry[hmm_state].transitions[trans_index].first == hmm_state); 166 | } 167 | 168 | const TopologyEntry& TransitionModel::TopologyForPhone(int32 phone) const 169 | { 170 | // Will throw if phone not covered. 171 | if (static_cast(phone) >= topo.phone2idx.size() || topo.phone2idx[phone] == -1) { 172 | printf("TopologyForPhone(), phone %d not covered.\n", phone); 173 | } 174 | return topo.entries[topo.phone2idx[phone]]; 175 | } 176 | 177 | void TransitionModel::ComputeDerivedOfProbs() 178 | { 179 | non_self_loop_log_probs.resize(NumTransitionStates()+1); // this array indexed 180 | // by transition-state with nothing in zeroth element. 181 | for (int32 tstate = 1; tstate <= NumTransitionStates(); tstate++) 182 | { 183 | int32 tid = SelfLoopOf(tstate); 184 | if (tid == 0) 185 | { // no self-loop 186 | non_self_loop_log_probs[tstate] = 0.0; // log(1.0) 187 | } 188 | else 189 | { 190 | BaseFloat self_loop_prob = expf(GetTransitionLogProb(tid)), 191 | non_self_loop_prob = 1.0 - self_loop_prob; 192 | if (non_self_loop_prob <= 0.0) 193 | { 194 | printf("ComputeDerivedOfProbs(): non-self-loop prob is %f\n", non_self_loop_prob); 195 | non_self_loop_prob = 1.0e-10; // just so we can continue... 196 | } 197 | non_self_loop_log_probs[tstate] = logf(non_self_loop_prob); // will be negative. 
198 | } 199 | } 200 | } 201 | 202 | int32 TransitionModel::SelfLoopOf(int32 trans_state) const 203 | { // returns the self-loop transition-id 204 | const Tuple &tuple = tuples[trans_state-1]; 205 | // or zero if does not exist. 206 | int32 phone = tuple.phone, hmm_state = tuple.hmm_state; 207 | const TopologyEntry &entry = TopologyForPhone(phone); 208 | 209 | for (int32 trans_index = 0; 210 | trans_index < static_cast(entry[hmm_state].transitions.size()); 211 | trans_index++) 212 | { 213 | if (entry[hmm_state].transitions[trans_index].first == hmm_state) 214 | { 215 | return PairToTransitionId(trans_state, trans_index); 216 | } 217 | } 218 | 219 | return 0; // invalid transition id. 220 | } 221 | 222 | BaseFloat TransitionModel::GetTransitionLogProb(int32 trans_id) const 223 | { 224 | return log_probs[trans_id]; 225 | } 226 | 227 | int32 TransitionModel::PairToTransitionId(int32 trans_state, int32 trans_index) const 228 | { 229 | return state2id[trans_state] + trans_index; 230 | } 231 | 232 | int32 TransitionModel::NumTransitionStates() 233 | { 234 | return tuples.size(); 235 | } -------------------------------------------------------------------------------- /src/wavereader.cpp: -------------------------------------------------------------------------------- 1 | #include "wavereader.h" 2 | 3 | WaveReader::WaveReader() 4 | { 5 | memset(&m_wavefile, 0, sizeof(WaveFile)); 6 | } 7 | 8 | WaveReader::~WaveReader() 9 | { 10 | m_wavefile.data.clear(); 11 | m_waveData.clear(); 12 | } 13 | 14 | void WaveReader::ReadWaveFile(const char* fileName) 15 | { 16 | FILE *fp = fopen(fileName, "rb"); 17 | 18 | if(!fp) 19 | { 20 | printf("Open wave file %s error\n", fileName); 21 | } 22 | 23 | //读取文件头 24 | fread(&m_wavefile, sizeof(WaveHeader), 1, fp); 25 | 26 | //读取数据 27 | int dataSize = m_wavefile.header.subchunk2_size; 28 | m_wavefile.data.resize(dataSize/2); 29 | fread(m_wavefile.data.data(), dataSize, 1, fp); 30 | 31 | m_waveData.clear(); 32 | for(int i=0; i(m_wavefile.data[i])); 
35 | } 36 | 37 | fclose(fp); 38 | } --------------------------------------------------------------------------------