├── .flake8 ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── .style.yapf ├── CMakeLists.txt ├── LICENSE ├── README.md ├── TODO ├── doc ├── README.md ├── adaptive_beamformer │ ├── README.md │ └── asset │ │ ├── egs.wav │ │ ├── gevd-ban.wav │ │ ├── gevd.wav │ │ ├── mask.jpg │ │ ├── mvdr.wav │ │ ├── pmwf-0-eig.wav │ │ ├── pmwf-0-gev.wav │ │ └── pmwf-0.wav ├── data_simu │ ├── README.md │ └── asset │ │ ├── 4ch-rir1.wav │ │ ├── 4ch-rir2.wav │ │ ├── 4ch-rir3.wav │ │ ├── iso.wav │ │ ├── noise.wav │ │ ├── spk1.wav │ │ └── spk2.wav ├── fixed_beamformer │ ├── README.md │ └── asset │ │ ├── ds.wav │ │ ├── egs.wav │ │ └── sd.wav ├── format_transform │ ├── README.md │ └── asset │ │ └── egs.ark ├── ns │ ├── README.md │ └── asset │ │ ├── egs.wav │ │ └── egs_ns.wav ├── rir │ ├── README.md │ └── asset │ │ ├── 1d_Room1.jpg │ │ ├── 1d_Room2.jpg │ │ ├── 1d_rir.json │ │ ├── 2d_Room1.jpg │ │ ├── 2d_Room2.jpg │ │ └── 2d_rir.json ├── spatial_clustering │ ├── README.md │ └── asset │ │ ├── 2spk.jpg │ │ ├── 2spk.wav │ │ ├── noisy.jpg │ │ └── noisy.wav ├── spatial_feature │ ├── README.md │ └── asset │ │ ├── cgmm_mask.jpg │ │ ├── df.jpg │ │ ├── egs.wav │ │ └── ipd_02_04_24.jpg ├── spectral_feature │ ├── README.md │ └── asset │ │ ├── 257dim-log-spectrogram.jpg │ │ ├── 80dim-log-fbank.jpg │ │ └── egs.wav ├── ssl │ ├── README.md │ └── asset │ │ ├── angular_spectrum.jpg │ │ └── egs.wav ├── steer_vector │ ├── README.md │ └── asset │ │ ├── 1d_6mic_sv.npy │ │ ├── 2d_4mic_sv.npy │ │ ├── beam_v1.npy │ │ └── beampattern_v1.png ├── tf_mask │ ├── README.md │ └── asset │ │ ├── clean.wav │ │ ├── enhan.wav │ │ ├── iam-cutoff-2.jpg │ │ ├── ibm.jpg │ │ ├── irm.jpg │ │ ├── noisy.wav │ │ ├── psa.jpg │ │ └── psm-cutoff-2.jpg ├── vad │ ├── README.md │ └── asset │ │ ├── utt.wav │ │ └── utt_vad.wav └── wpe │ ├── README.md │ └── asset │ ├── egs.wav │ ├── mask.jpg │ ├── wpd_egs.wav │ └── wpe_egs.wav ├── include ├── CMakeLists.txt ├── beamformer.cc ├── beamformer.h ├── cblas-cpl-wrappers.h ├── 
complex-base.h ├── complex-matrix.cc ├── complex-matrix.h ├── complex-vector.cc ├── complex-vector.h ├── rir-generator.cc ├── rir-generator.h ├── srp-phat.cc ├── srp-phat.h ├── stft.cc └── stft.h ├── path.sh ├── requirements.txt ├── scripts ├── compute_circular_srp.sh ├── compute_df_on_mask.sh ├── compute_ipd_and_linear_srp.sh ├── compute_librosa_fbank.sh ├── compute_librosa_spectrogram.sh ├── compute_oracle_mask.sh ├── get_wav_duration.sh ├── run_adapt_beamformer.sh ├── run_auxiva.sh ├── run_cacgmm.sh ├── run_cgmm.sh ├── run_ds_beamformer.sh ├── run_fixed_beamformer.sh ├── run_sd_beamformer.sh ├── run_ssl.sh ├── run_tf_masking.sh ├── run_vad.sh ├── run_wpe.sh └── sptk │ ├── README.md │ ├── apply_adaptive_beamformer.py │ ├── apply_auxiva.py │ ├── apply_classic_beamformer.py │ ├── apply_ds_beamformer.py │ ├── apply_fixed_beamformer.py │ ├── apply_ns.py │ ├── apply_sd_beamformer.py │ ├── apply_wpd.py │ ├── apply_wpe.py │ ├── compute_centroid.py │ ├── compute_circular_srp.py │ ├── compute_df_on_geometry.py │ ├── compute_df_on_mask.py │ ├── compute_dpcl_label.py │ ├── compute_fbank.py │ ├── compute_ipd_and_linear_srp.py │ ├── compute_mask.py │ ├── compute_sdr.py │ ├── compute_si_snr.py │ ├── compute_similar_score.py │ ├── compute_spectrogram.py │ ├── compute_steer_vector.py │ ├── compute_wer.py │ ├── copy_archive_to_mat.py │ ├── copy_complex_mat.py │ ├── copy_mat_to_archive.py │ ├── do_ssl.py │ ├── do_vad.py │ ├── estimate_cacgmm_masks.py │ ├── estimate_cgmm_masks.py │ ├── libs │ ├── __init__.py │ ├── beamformer.py │ ├── cluster.py │ ├── data_handler.py │ ├── exraw.py │ ├── kaldi_io.py │ ├── metric.py │ ├── ns.py │ ├── opts.py │ ├── sampler.py │ ├── spatial.py │ ├── ssl.py │ ├── utils.py │ └── wpe.py │ ├── oracle_separate.py │ ├── rir_generate_1d.py │ ├── rir_generate_2d.py │ ├── visualize_angular_spectrum.py │ ├── visualize_beampattern.py │ ├── visualize_pca.py │ ├── visualize_spectrogram.py │ ├── visualize_tf_matrix.py │ ├── wav_duration.py │ ├── wav_estimate.py │ 
├── wav_separate.py │ └── wav_simulate.py ├── src ├── CMakeLists.txt ├── apply-cmvn-perutt.cc ├── apply-fixed-beamformer.cc ├── apply-supervised-max-snr.cc ├── apply-supervised-mvdr.cc ├── compute-masks.cc ├── compute-srp-phat.cc ├── compute-stft-stats.cc ├── matrix-scale-elements.cc ├── matrix-scale-rows.cc ├── modify-feats.cc ├── rir-simulate.cc ├── wav-estimate.cc ├── wav-separate.cc └── wav-to-power.cc ├── steps ├── archive_wav.sh ├── compute_masks.sh ├── compute_stft_stats.sh ├── extract_segments.sh ├── mono_mask_enhance.sh ├── train_dnn_mask.sh └── train_rnn_mask.sh ├── test ├── CMakeLists.txt ├── test-beamformer.cc ├── test-complex.cc ├── test-srp-phat.cc ├── test-stft.cc └── test_rir_generator.sh └── utils ├── filter_scp.pl ├── parse_options.sh ├── queue.pl ├── run.pl └── split_scp.pl /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 80 3 | exclude = 4 | # git 5 | .git, 6 | # local mantained directory 7 | local/ 8 | cache/ 9 | examples/ 10 | per-file-ignores = 11 | # imported but unused 12 | # unable to detect undefined names 13 | __init__.py: F401, F403 14 | ignore = 15 | # unexpected spaces around keyword / parameter equals 16 | E251, 17 | # yapf has issues: closing bracket does not match visual indentation 18 | E124, 19 | # yapf has issues: continuation line over-indented for hanging indent 20 | E126, 21 | # ambiguous variable name 22 | E741, 23 | # line too long (in some cases, yapf can not fix this issue) 24 | E501, 25 | # do not assign a lambda expression, use a def 26 | E731 27 | # continuation line unaligned for hanging indent 28 | E131 29 | # line break before binary operator 30 | W503, 31 | # line break after binary operator 32 | W504 33 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | 3 | *.py text eol=lf 4 | *.sh text 
eol=lf 5 | 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # cmake build 2 | bin/ 3 | lib/ 4 | build/ 5 | # python cache 6 | __pycache__/ 7 | *.py[cod] 8 | # egs 9 | egs/*/exp/ 10 | egs/*/sps/ 11 | egs/*/data/ 12 | egs/*/stft/ 13 | egs/*/mask/ 14 | # figure dir 15 | figure/ 16 | # vscode configure 17 | .vscode/ 18 | # mypy cache 19 | .mypy_cache/ 20 | # egs scp 21 | *.scp 22 | *.log 23 | # python tmp dir 24 | scripts/sptk/create_* 25 | scripts/sptk/test_* 26 | scripts/sptk/\.*.py 27 | scripts/sptk/pb_perm_solver.py 28 | 29 | # old local dir 30 | local/ 31 | # clang format 32 | .clang-format 33 | 34 | # pyenv 35 | .python-version 36 | 37 | # MAC 38 | .DS_Store -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.0.1 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-ast 11 | - repo: https://github.com/pre-commit/mirrors-yapf 12 | rev: 'v0.31.0' 13 | hooks: 14 | - id: yapf 15 | language: python 16 | - repo: https://github.com/pre-commit/mirrors-clang-format 17 | rev: 'v13.0.0' 18 | hooks: 19 | - id: clang-format 20 | - repo: https://github.com/pycqa/flake8 21 | rev: '4.0.1' 22 | hooks: 23 | - id: flake8 24 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = google 3 | column_limit = 80 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project("SETK: Speech Enhancement Tookit on Kaldi") 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | 6 | MESSAGE(STATUS "detected OS: ${CMAKE_SYSTEM}") 7 | 8 | set(KALDI_ROOT $ENV{KALDI_ROOT}) 9 | set(OPENFST_ROOT $ENV{OPENFST_ROOT}) 10 | if(NOT DEFINED KALDI_ROOT) 11 | message(FATAL_ERROR "KALDI_ROOT not defined, export KALDI_ROOT=/path/to/kaldi") 12 | endif() 13 | if(NOT DEFINED OPENFST_ROOT) 14 | message(FATAL_ERROR "OPENFST_ROOT not defined, export OPENFST_ROOT=/path/to/openfst") 15 | endif() 16 | 17 | message(STATUS "defined KALDI_ROOT=" $ENV{KALDI_ROOT}) 18 | set(DEPEND_LIBS kaldi-base kaldi-util kaldi-matrix kaldi-feat kaldi-transform pthread) 19 | 20 | add_definitions(-O3 -g -std=c++11) 21 | include_directories(${KALDI_ROOT}/src ${OPENFST_ROOT}/include ${CMAKE_SOURCE_DIR}) 22 | link_directories(${KALDI_ROOT}/src/lib ${CMAKE_SOURCE_DIR}/lib) 23 | 24 | set(EXECUTABLE_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/bin) 25 | set(LIBRARY_OUTPUT_PATH ${CMAKE_SOURCE_DIR}/lib) 26 | 27 | if(APPLE) 28 | set(CMAKE_EXE_LINKER_FLAGS "-framework Accelerate") 29 | add_definitions(-DHAVE_CLAPACK) 30 | include_directories(${KALDI_ROOT}/tools/CLAPACK) 31 | elseif(UNIX) 32 | add_definitions(-DHAVE_OPENBLAS) 33 | set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT}) 34 | if(NOT DEFINED OPENBLAS_ROOT) 35 | message(FATAL_ERROR "OPENBLAS_ROOT not defined, export OPENBLAS_ROOT=/path/to/openblas") 36 | endif() 37 | include_directories(${OPENBLAS_ROOT}/include) 38 | link_directories(${OPENBLAS_ROOT}/lib) 39 | set(DEPEND_LIBS ${DEPEND_LIBS} openblas) 40 | message(STATUS "defined OPENBLAS_ROOT=" $ENV{OPENBLAS_ROOT}) 41 | endif() 42 | 43 | add_subdirectory(include) 44 | add_subdirectory(src) 45 | add_subdirectory(test) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 
SETK: Speech Enhancement Tools integrated with Kaldi 2 | 3 | Here are some speech enhancement/separation tools integrated with [Kaldi](https://github.com/kaldi-asr/kaldi). I use them for front-end's data processing. 4 | 5 | ### Python Scripts 6 | 7 | * Supervised (mask-based) adaptive beamformer (GEVD/MVDR/MCWF...) 8 | * Data conversion among MATLAB, Numpy and Kaldi 9 | * Data visualization (TF-mask, spatial/spectral features, beam pattern...) 10 | * Unified data and IO handlers for Kaldi's scripts, archives, wave and numpy's ndarray... 11 | * Unsupervised mask estimation (CGMM/CACGMM) 12 | * Spatial/Spectral feature computation 13 | * DS (delay and sum) beamformer, SD (super-directive) beamformer 14 | * AuxIVA, WPE & WPD, FB (Fixed Beamformer) 15 | * Mask computation (iam, irm, ibm, psm, crm) 16 | * RIR simulation (1D/2D arrays) 17 | * Single channel speech separation (TF spectral masking) 18 | * Si-SDR/SDR/WER evaluation 19 | * Pywebrtc vad wrapper 20 | * Mask-based source localization 21 | * Noise suppression 22 | * Data simulation 23 | * ... 24 | 25 | Please check out the following instructions for usage of the scripts. 
26 | 27 | * [Adaptive Beamformer](doc/adaptive_beamformer) 28 | * [Fixed Beamformer](doc/fixed_beamformer) 29 | * [Sound Source Localization](doc/ssl) 30 | * [Spectral Feature](doc/spectral_feature) 31 | * [Spatial Feature](doc/spatial_feature) 32 | * [VAD](doc/vad) 33 | * [Noise Suppression](doc/ns) 34 | * [Steer Vector](doc/steer_vector) 35 | * [Room Impulse Response](doc/rir) 36 | * [Spatial Clustering](doc/spatial_clustering) 37 | * [WPE & WPD](doc/wpe) 38 | * [Time-frequency Mask](doc/tf_mask) 39 | * [Format Transform](doc/format_transform) 40 | * [Data Simulation](doc/data_simu) 41 | 42 | ### Kaldi Commands 43 | 44 | * Compute time-frequency masks (ibm, irm etc) 45 | * Compute phase & magnitude spectrogram & complex STFT 46 | * Separate target component using input masks 47 | * Wave reconstruction from enhanced spectral features 48 | * Complex matrix/vector class 49 | * MVDR/GEVD beamformer (depend on T-F mask, not very stable) 50 | * Fixed beamformer 51 | * Compute angular spectrogram based on SRP-PHAT 52 | * RIR generator (reference from [RIR-Generator](https://github.com/ehabets/RIR-Generator)) 53 | 54 | To build the sources, you need to compile [Kaldi](https://github.com/kaldi-asr/kaldi) with `--shared` flags and patch `matrix/matrix-common.h` first 55 | ```c++ 56 | typedef enum { 57 | kTrans = 112, // CblasTrans 58 | kNoTrans = 111, // CblasNoTrans 59 | kConjTrans = 113, // CblasConjTrans 60 | kConjNoTrans = 114 // CblasConjNoTrans 61 | } MatrixTransposeType; 62 | ``` 63 | 64 | Then run 65 | ```bash 66 | mkdir build 67 | cd build 68 | export KALDI_ROOT=/path/to/kaldi/root 69 | export OPENFST_ROOT=/path/to/openfst/root 70 | # if on UNIX, need compile kaldi with openblas 71 | export OPENBLAS_ROOT=/path/to/openblas/root 72 | cmake .. 
73 | make -j 74 | ``` 75 | 76 | ***Now I mainly work on [sptk](scripts) package, development based on kaldi is stopped.*** 77 | 78 | For developers (who want to make commits or PRs), please remember to set up [pre-commit](https://pre-commit.com) for code style formatting. 79 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | TODO: 2 | 3 | 1. AGC, AEC, GSC 4 | 2. Conventional methods for speech enhancement/separation 5 | 3. ... 6 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | ## Document 2 | 3 | Document on usage of the scripts under [scripts/sptk](../scripts/sptk). 4 | 5 | * [Adaptive Beamformer](adaptive_beamformer) 6 | * [Fixed Beamformer](fixed_beamformer) 7 | * [Sound Source Localization](ssl) 8 | * [Spectral Feature](spectral_feature) 9 | * [Spatial Feature](spatial_feature) 10 | * [VAD](vad) 11 | * [Noise Suppression](ns) 12 | * [Steer Vector](steer_vector) 13 | * [Room Impulse Response](rir) 14 | * [Spatial Clustering](spatial_clustering) 15 | * [WPE & WPD](wpe) 16 | * [Time-frequency Mask](tf_mask) 17 | * [Format Transform](format_transform) 18 | * [Data Simulation](data_simu) -------------------------------------------------------------------------------- /doc/adaptive_beamformer/README.md: -------------------------------------------------------------------------------- 1 | ## Adaptive Beamformer 2 | 3 | Implementation of the mask-based adaptive beamformer (MVDR, GEVD, MCWF). 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/apply_adaptive_beamformer.py -h`. 
8 | 9 | ### Usage 10 | 11 | ```bash 12 | echo "egs asset/egs.wav" > wav.scp 13 | # estimate TF-masks 14 | ../../scripts/sptk/estimate_cgmm_masks.py \ 15 | --frame-len 512 \ 16 | --frame-hop 256 \ 17 | --num-iters 20 \ 18 | wav.scp mask 19 | # visualize and check 20 | ../../scripts/sptk/visualize_tf_matrix.py \ 21 | --input dir \ 22 | --cmap binary \ 23 | --frame-hop 256 \ 24 | mask 25 | echo "egs mask/egs.npy" > mask.scp 26 | # mvdr 27 | ../../scripts/sptk/apply_adaptive_beamformer.py \ 28 | --frame-len 512 \ 29 | --frame-hop 256 \ 30 | --mask-format numpy \ 31 | --beamformer mvdr \ 32 | wav.scp mask.scp mvdr 33 | # gevd 34 | ../../scripts/sptk/apply_adaptive_beamformer.py \ 35 | --frame-len 512 \ 36 | --frame-hop 256 \ 37 | --mask-format numpy \ 38 | --beamformer gevd \ 39 | wav.scp mask.scp gevd 40 | # gevd-ban 41 | ../../scripts/sptk/apply_adaptive_beamformer.py \ 42 | --frame-len 512 \ 43 | --frame-hop 256 \ 44 | --mask-format numpy \ 45 | --beamformer gevd \ 46 | --ban true \ 47 | wav.scp mask.scp gevd-ban 48 | # pmwf-0 49 | ../../scripts/sptk/apply_adaptive_beamformer.py \ 50 | --frame-len 512 \ 51 | --frame-hop 256 \ 52 | --mask-format numpy \ 53 | --beamformer pmwf-0 \ 54 | wav.scp mask.scp pmwf-0 55 | # pmwf-0-eig 56 | ../../scripts/sptk/apply_adaptive_beamformer.py \ 57 | --frame-len 512 \ 58 | --frame-hop 256 \ 59 | --mask-format numpy \ 60 | --beamformer pmwf-0 \ 61 | --rank1-appro eig \ 62 | wav.scp mask.scp pmwf-0-eig 63 | # pmwf-0-gev 64 | ../../scripts/sptk/apply_adaptive_beamformer.py \ 65 | --frame-len 512 \ 66 | --frame-hop 256 \ 67 | --mask-format numpy \ 68 | --beamformer pmwf-0 \ 69 | --rank1-appro gev \ 70 | wav.scp mask.scp pmwf-0-gev 71 | ``` 72 | 73 | ### Reference 74 | 75 | 1. J. Heymann, L. Drude, R. Haeb-Umbach. Neural Network Based Spectral Mask Estimation for Acoustic Beamforming[C]. 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2016:196–200. 76 | 2. Erdogan H, Hershey J R, Watanabe S, et al. 
Improved MVDR Beamforming Using Single-Channel Mask Prediction Networks[C]//Interspeech. 2016: 1981-1985. 77 | 3. Souden M, Benesty J, Affes S. On optimal frequency-domain multichannel linear filtering for noise reduction[J]. IEEE Transactions on audio, speech, and language processing, 2010, 18(2): 260-276. 78 | 4. E. Warsitz, R. Haeb-Umbach. Blind Acoustic Beamforming Based on Generalized Eigenvalue Decomposition[J]. IEEE Transactions on audio, speech, and language processing, 2007, 15(5):1529–1539. 79 | 5. Ziteng Wang, Emmanuel Vincent, Romain Serizel, and Yonghong Yan, “Rank-1 Constrained Multichannel Wiener Filter for Speech Recognition in Noisy Environments,” Jul 2017. 80 | -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/egs.wav -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/gevd-ban.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/gevd-ban.wav -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/gevd.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/gevd.wav -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/mask.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/mask.jpg -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/mvdr.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/mvdr.wav -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/pmwf-0-eig.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/pmwf-0-eig.wav -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/pmwf-0-gev.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/pmwf-0-gev.wav -------------------------------------------------------------------------------- /doc/adaptive_beamformer/asset/pmwf-0.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/adaptive_beamformer/asset/pmwf-0.wav -------------------------------------------------------------------------------- /doc/data_simu/README.md: -------------------------------------------------------------------------------- 1 | ## Data Simulation 2 | 3 | Add reverberation only, noise only or mix speakers with noises in close-talk & far-field scenarios. 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/wav_simulate.py -h` 8 | 9 | ### Usage 10 | 11 | 1. 
Add reverberation 12 | ```bash 13 | sox asset/4ch-rir1.wav asset/4ch-rir1-ch2.wav remix 2 14 | # spk1_reverb_{1,2}.wav are same 15 | ../../scripts/sptk/wav_simulate.py \ 16 | --src-spk asset/spk1.wav \ 17 | --src-rir asset/4ch-rir1-ch2.wav \ 18 | spk1_reverb_1.wav 19 | ../../scripts/sptk/wav_simulate.py \ 20 | --src-spk asset/spk1.wav \ 21 | --src-rir asset/4ch-rir1.wav \ 22 | --dump-channel 1 \ 23 | spk1_reverb_2.wav 24 | ``` 25 | 26 | 2. Add noise 27 | ```bash 28 | # close-talk + noise 29 | ../../scripts/sptk/wav_simulate.py \ 30 | --src-spk asset/spk1.wav \ 31 | --point-noise asset/noise.wav \ 32 | --point-noise-snr 5 \ 33 | spk1_noisy1.wav 34 | # far-field + pointsource noise 35 | ../../scripts/sptk/wav_simulate.py \ 36 | --src-spk asset/spk1.wav \ 37 | --src-rir asset/4ch-rir1.wav \ 38 | --point-noise asset/noise.wav \ 39 | --point-noise-snr 5 \ 40 | --point-noise-rir asset/4ch-rir3.wav \ 41 | spk1_noisy2.wav 42 | # far-field + pointsource noise + isotropic noise 43 | ../../scripts/sptk/wav_simulate.py \ 44 | --src-spk asset/spk1.wav \ 45 | --src-rir asset/4ch-rir1.wav \ 46 | --point-noise asset/noise.wav \ 47 | --point-noise-snr 5 \ 48 | --point-noise-rir asset/4ch-rir3.wav \ 49 | --isotropic-noise-snr 8 \ 50 | --isotropic-noise asset/iso.wav \ 51 | --isotropic-noise-offset 16000 \ 52 | spk1_noisy3.wav 53 | ``` 54 | 55 | 3. 
Mix speakers 56 | ```bash 57 | # close-talk (no noise) 58 | ../../scripts/sptk/wav_simulate.py \ 59 | --src-spk asset/spk1.wav,asset/spk2.wav \ 60 | --src-begin=32000,0 \ 61 | --src-sdr=3 \ 62 | 2spk_mix1.wav 63 | # close-talk (noise) 64 | ../../scripts/sptk/wav_simulate.py \ 65 | --src-spk asset/spk1.wav,asset/spk2.wav \ 66 | --src-begin=32000,0 \ 67 | --src-sdr=3 \ 68 | --point-noise asset/noise.wav \ 69 | --point-noise-snr 5 \ 70 | 2spk_mix2.wav 71 | # far-field 72 | ../../scripts/sptk/wav_simulate.py \ 73 | --src-spk asset/spk1.wav,asset/spk2.wav \ 74 | --src-rir asset/4ch-rir1.wav,asset/4ch-rir2.wav \ 75 | --src-begin=32000,0 \ 76 | --src-sdr=3 \ 77 | --point-noise asset/noise.wav \ 78 | --point-noise-snr 5 \ 79 | --point-noise-rir asset/4ch-rir3.wav \ 80 | --isotropic-noise-snr 8 \ 81 | --isotropic-noise asset/iso.wav \ 82 | --isotropic-noise-offset 16000 \ 83 | --dump-ref-dir ref \ 84 | 2spk_mix3.wav 85 | ``` 86 | 87 | -------------------------------------------------------------------------------- /doc/data_simu/asset/4ch-rir1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/4ch-rir1.wav -------------------------------------------------------------------------------- /doc/data_simu/asset/4ch-rir2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/4ch-rir2.wav -------------------------------------------------------------------------------- /doc/data_simu/asset/4ch-rir3.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/4ch-rir3.wav -------------------------------------------------------------------------------- 
/doc/data_simu/asset/iso.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/iso.wav -------------------------------------------------------------------------------- /doc/data_simu/asset/noise.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/noise.wav -------------------------------------------------------------------------------- /doc/data_simu/asset/spk1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/spk1.wav -------------------------------------------------------------------------------- /doc/data_simu/asset/spk2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/data_simu/asset/spk2.wav -------------------------------------------------------------------------------- /doc/fixed_beamformer/README.md: -------------------------------------------------------------------------------- 1 | ## Fixed Beamformer 2 | 3 | DS (delay and sum) beamformer, SD (super-directive) beamformer and other fixed beamformers. 
4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/apply_{ds,sd,fixed}_beamformer.py -h` 8 | 9 | ### Usage 10 | 11 | ```bash 12 | # Get steer vector 13 | ../../scripts/sptk/compute_steer_vector.py \ 14 | --num-doas 360 \ 15 | --num-bins 257 \ 16 | --geometry circular \ 17 | --circular-radius 0.05 \ 18 | --circular-around 4 \ 19 | 4mic_sv.npy 20 | # SSL (got 100 degree) 21 | echo "egs asset/egs.wav" | ../../scripts/sptk/do_ssl.py \ 22 | --frame-len 512 \ 23 | --frame-hop 256 \ 24 | --backend srp \ 25 | --srp-pair "0,2;1,3" \ 26 | --doa-range 0,360 \ 27 | --output degree \ 28 | - 4mic_sv.npy doa.scp 29 | # DS beamformer 30 | echo "egs asset/egs.wav" | ../../scripts/sptk/apply_ds_beamformer.py \ 31 | --frame-len 512 \ 32 | --frame-hop 256 \ 33 | --geometry circular \ 34 | --circular-around 4 \ 35 | --circular-radius 0.05 \ 36 | --utt2doa doa.scp \ 37 | --sr 16000 - ds 38 | # SD beamformer 39 | echo "egs asset/egs.wav" | ../../scripts/sptk/apply_sd_beamformer.py \ 40 | --frame-len 512 \ 41 | --frame-hop 256 \ 42 | --geometry circular \ 43 | --circular-around 4 \ 44 | --circular-radius 0.05 \ 45 | --utt2doa doa.scp \ 46 | --sr 16000 - sd 47 | ``` 48 | 49 | To use other fixed beamformers, pre-design the filter coefficients on each direction (in shape of `num_directions x num_bins x num_mics`) and run `./scripts/sptk/apply_fixed_beamformer.py`. 
-------------------------------------------------------------------------------- /doc/fixed_beamformer/asset/ds.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/fixed_beamformer/asset/ds.wav -------------------------------------------------------------------------------- /doc/fixed_beamformer/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/fixed_beamformer/asset/egs.wav -------------------------------------------------------------------------------- /doc/fixed_beamformer/asset/sd.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/fixed_beamformer/asset/sd.wav -------------------------------------------------------------------------------- /doc/format_transform/README.md: -------------------------------------------------------------------------------- 1 | ## Data Format 2 | 3 | Format transform between Kaldi, Numpy and Matlab. 
4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/copy_mat_to_archive.py -h` and `./scripts/sptk/copy_archive_to_mat.py` 8 | 9 | ### Usage 10 | 11 | ```bash 12 | # .ark to .npy 13 | ../../scripts/sptk/copy_archive_to_mat.py \ 14 | --src-format ark \ 15 | --dst-format npy \ 16 | asset/egs.ark npy 17 | # .ark to .mat 18 | ../../scripts/sptk/copy_archive_to_mat.py \ 19 | --src-format ark \ 20 | --dst-format mat \ 21 | asset/egs.ark mat 22 | # .npy to .ark 23 | find npy -name "*.npy" | awk -F '[/.]' '{print $2"."$3"\t"$0}' \ 24 | | ../../scripts/sptk/copy_mat_to_archive.py \ 25 | --src npy - npy.ark 26 | # .mat to .ark 27 | find mat -name "*.mat" | awk -F '[/.]' '{print $2"."$3"\t"$0}' \ 28 | | ../../scripts/sptk/copy_mat_to_archive.py \ 29 | --src mat - mat.ark 30 | ``` -------------------------------------------------------------------------------- /doc/format_transform/asset/egs.ark: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/format_transform/asset/egs.ark -------------------------------------------------------------------------------- /doc/ns/README.md: -------------------------------------------------------------------------------- 1 | ## Noise Suppression 2 | 3 | MCRA based noise suppression 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/apply_ns.py -h` 8 | 9 | ### Usage 10 | 11 | ```bash 12 | echo "egs asset/egs.wav" | ../../scripts/sptk/apply_ns.py --output wave - ns 13 | ``` 14 | 15 | The `--output` option controls the output type, audio or TF masks (also named TF gain). Note that this command is hard coded using iMCRA method. 16 | 17 | ### Reference 18 | 19 | 1. Cohen I, Berdugo B. Speech enhancement for non-stationary noise environments[J]. Signal processing, 2001, 81(11): 2403-2418. 20 | 2. Cohen I. Noise spectrum estimation in adverse environments: Improved minima controlled recursive averaging[J]. 
IEEE Transactions on speech and audio processing, 2003, 11(5): 466-475. -------------------------------------------------------------------------------- /doc/ns/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/ns/asset/egs.wav -------------------------------------------------------------------------------- /doc/ns/asset/egs_ns.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/ns/asset/egs_ns.wav -------------------------------------------------------------------------------- /doc/rir/README.md: -------------------------------------------------------------------------------- 1 | ## RIR simulation 2 | 3 | Generation of the room impulse response (RIR) using the image method. Now three optional backends are available: 4 | 5 | 1. rir-simulate (see setk/src/rir-simulate.cc) 6 | 2. [pyrirgen](https://github.com/Marvin182/rir-generator) 7 | 3. [gpuRIR](https://github.com/DavidDiazGuerra/gpuRIR) 8 | 9 | ### Cmd options 10 | 11 | See `./scripts/sptk/rir_generate_1d.py -h` or `./scripts/sptk/rir_generate_2d.py -h`. Use `--gpu true` to set [gpuRIR](https://github.com/DavidDiazGuerra/gpuRIR) as backend. 12 | 13 | ### Usage 14 | 15 | The following commands will generate `Room{1,2}-{1..25}.wav, rir.json, Room{1,2}.jpg` under directory `rir_egs`. See examples at [asset](asset). 16 | 17 | 1. 
1D (linear) arrays 18 | ```bash 19 | dump_dir=rir_egs 20 | num_room=2 21 | num_rirs=25 22 | # CPU version is slow, use --gpu true or run.pl to make parallelization 23 | ../../scripts/sptk/rir_generate_1d.py \ 24 | --num-rirs $num_rirs \ 25 | --dump-dir $dump_dir \ 26 | --array-height "1.2,1.8" \ 27 | --array-topo "0,0.05,0.1,0.15" \ 28 | --room-dim "4,7;4,7;2,3" \ 29 | --rt60 "0.2,0.5" \ 30 | --array-relx "0.4,0.6" \ 31 | --array-rely "0.1,0.2" \ 32 | --speaker-height "1,2" \ 33 | --source-distance "1.5,3" \ 34 | --rir-dur 0.5 \ 35 | --vertical-oriented false \ 36 | --dump-cfg true \ 37 | --gpu false \ 38 | $num_room 39 | ``` 40 | 41 | 2. 2D (circular) arrays 42 | ```bash 43 | dump_dir=rir_egs 44 | num_room=2 45 | num_rirs=25 46 | 47 | ../../scripts/sptk/rir_generate_2d.py \ 48 | --num-rirs $num_rirs \ 49 | --dump-dir $dump_dir \ 50 | --array-height "1.2,1.8" \ 51 | --array-topo "0,0.05;0.05,0;0,-0.05;-0.05,0" \ 52 | --room-dim "4,7;4,7;2,3" \ 53 | --rt60 "0.2,0.5" \ 54 | --array-relx "0.4,0.6" \ 55 | --array-rely "0.1,0.2" \ 56 | --speaker-height "1,2" \ 57 | --source-distance "1.5,3" \ 58 | --rir-dur 0.5 \ 59 | --dump-cfg true \ 60 | --gpu false \ 61 | $num_room 62 | ``` -------------------------------------------------------------------------------- /doc/rir/asset/1d_Room1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/rir/asset/1d_Room1.jpg -------------------------------------------------------------------------------- /doc/rir/asset/1d_Room2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/rir/asset/1d_Room2.jpg -------------------------------------------------------------------------------- /doc/rir/asset/2d_Room1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/rir/asset/2d_Room1.jpg -------------------------------------------------------------------------------- /doc/rir/asset/2d_Room2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/rir/asset/2d_Room2.jpg -------------------------------------------------------------------------------- /doc/spatial_clustering/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Spatial Clustering 3 | 4 | Time-frequency mask estimation algorithm based on spatial clustering. 5 | 6 | ### Cmd options 7 | 8 | See `./scripts/sptk/estimate_cacgmm_masks.py -h` and `./scripts/sptk/estimate_cgmm_masks.py -h`. 9 | 10 | ### Usage 11 | 12 | 1. Blind Speech Separation 13 | ```bash 14 | # choose cacgmm model 15 | echo "2spk asset/2spk.wav" | ../../scripts/sptk/estimate_cacgmm_masks.py \ 16 | --num-classes 3 \ 17 | --num-iters 20 \ 18 | --frame-len 512 \ 19 | - mask 20 | # choose cgmm model 21 | echo "2spk asset/2spk.wav" | ../../scripts/sptk/estimate_cgmm_masks.py \ 22 | --num-classes 3 \ 23 | --num-iters 20 \ 24 | --frame-len 512 \ 25 | --solve-permu true \ 26 | - mask 27 | ``` 28 | This command estimates the TF-masks of the three sources (2 active speakers and 1 for noise). The output order of the sources is random, and the masks are generated at `mask/2spk.npy`. You can use `./scripts/sptk/visualize_tf_matrix.py` for mask visualization, e.g., 29 | ```bash 30 | ../../scripts/sptk/visualize_tf_matrix.py \ 31 | --input dir --cmap binary \ 32 | --frame-hop 256 \ 33 | --cache-dir mask mask 34 | ``` 35 | 36 | 2. 
Speech Enhancement 37 | ```bash 38 | # use cgmm model 39 | echo "noisy asset/noisy.wav" | ../../scripts/sptk/estimate_cgmm_masks.py \ 40 | --num-iters 20 \ 41 | --frame-len 512 \ 42 | - mask 43 | # use cacgmm model 44 | echo "noisy asset/noisy.wav" | ../../scripts/sptk/estimate_cacgmm_masks.py \ 45 | --num-classes 2 \ 46 | --num-iters 20 \ 47 | --solve-permu false \ 48 | --cgmm-init true \ 49 | --update-alpha false \ 50 | --frame-len 512 \ 51 | - mask 52 | ``` 53 | This command estimates the TF-masks of the (one) source speaker, and the masks are generated at `mask/noisy.npy`. 54 | 55 | ### Reference 56 | 57 | 1. T. Higuchi, N. Ito, S. Araki, et al. Online Mvdr Beamformer Based on Complex Gaussian Mixture Model with Spatial Prior for Noise Robust Asr[J]. IEEE/ACM Transactions on Audio, Speech, and Language Processing, 2017, 25(4):780–793. 58 | 2. N. Ito, S. Araki, T. Nakatani. Complex Angular Central Gaussian Mixture Model for Directional Statistics in Mask-based Microphone Array Signal Processing[C]. 2016 24th European Signal Processing Conference (EUSIPCO), 2016:1153–1157. 
59 | -------------------------------------------------------------------------------- /doc/spatial_clustering/asset/2spk.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_clustering/asset/2spk.jpg -------------------------------------------------------------------------------- /doc/spatial_clustering/asset/2spk.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_clustering/asset/2spk.wav -------------------------------------------------------------------------------- /doc/spatial_clustering/asset/noisy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_clustering/asset/noisy.jpg -------------------------------------------------------------------------------- /doc/spatial_clustering/asset/noisy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_clustering/asset/noisy.wav -------------------------------------------------------------------------------- /doc/spatial_feature/README.md: -------------------------------------------------------------------------------- 1 | ## Spatial Features 2 | 3 | Computation of spatial features, e.g., IPD (inter-channel phase difference), DF (directional/angle feature). 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/compute_ipd_and_linear_srp.py -h` for IPD and `./scripts/sptk/compute_df_on_{geometry,mask}.py` for DF. 8 | 9 | ### Usage 10 | 11 | 1. 
IPD 12 | ```bash 13 | # compute cosIPD between channel 0,2 0,4 and 2,4 14 | echo "egs asset/egs.wav" | ../../scripts/sptk/compute_ipd_and_linear_srp.py \ 15 | --frame-len 512 \ 16 | --frame-hop 256 \ 17 | --ipd.pair "0,2;0,4;2,4" \ 18 | --type ipd \ 19 | --ipd.cos true \ 20 | - feats.ark 21 | # visualize and check 22 | ../../scripts/sptk/visualize_tf_matrix.py \ 23 | --input ark \ 24 | --split 3 \ 25 | --cmap jet \ 26 | --frame-hop 256 \ 27 | feats.ark 28 | ``` 29 | 30 | 2. DF 31 | 32 | We provide two methods to compute directional features: one is based on a given steer vector and the other is based on TF-masks. The following shows the usage of `./scripts/sptk/compute_df_on_mask.py`. 33 | 34 | ```bash 35 | # estimate TF-mask of the source speaker 36 | echo "egs asset/egs.wav" | ../../scripts/sptk/estimate_cgmm_masks.py \ 37 | --frame-len 512 \ 38 | --num-iters 20 \ 39 | - mask 40 | # compute DF 41 | echo "egs mask/egs.npy" > mask.scp 42 | df_pair="0,1;0,2;0,3;0,4;1,2;1,3;1,4;2,3;2,4;3,4" 43 | echo "egs asset/egs.wav" | ../../scripts/sptk/compute_df_on_mask.py \ 44 | --frame-len 512 \ 45 | --mask-format numpy \ 46 | --df-pair $df_pair \ 47 | - mask.scp feats.ark 48 | # visualize and check 49 | ../../scripts/sptk/visualize_tf_matrix.py \ 50 | --input ark \ 51 | --cmap jet \ 52 | --frame-hop 256 \ 53 | feats.ark 54 | ``` 55 | 56 | ### Reference 57 | 58 | 1. Z. Chen, X. Xiao, T. Yoshioka, et al. Multi-channel Overlapped Speech Recognition with Location Guided Speech Extraction Network[C]. 2018 IEEE Spoken Language Technology Workshop (SLT), 2018:558–565. 59 | 2. Z.-Q. Wang, D. Wang. Integrating Spectral and Spatial Features for Multi-channel Speaker Separation.[C]. Interspeech, 2018:2718–2722. 60 | 3. Z.-Q. Wang, D. Wang. All-neural Multi-channel Speech Enhancement.[C]. Interspeech, 2018:3234–3238. 
-------------------------------------------------------------------------------- /doc/spatial_feature/asset/cgmm_mask.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_feature/asset/cgmm_mask.jpg -------------------------------------------------------------------------------- /doc/spatial_feature/asset/df.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_feature/asset/df.jpg -------------------------------------------------------------------------------- /doc/spatial_feature/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_feature/asset/egs.wav -------------------------------------------------------------------------------- /doc/spatial_feature/asset/ipd_02_04_24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spatial_feature/asset/ipd_02_04_24.jpg -------------------------------------------------------------------------------- /doc/spectral_feature/README.md: -------------------------------------------------------------------------------- 1 | ## Spectral Feature 2 | 3 | Computation of the spectrogram & mel-filter bank features. 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/compute_{spectrogram,fbank}.py -h` 8 | 9 | ### Usage 10 | 11 | 1. 
Spectrogram 12 | ```bash 13 | # 257-dim log spectrogram 14 | echo "egs asset/egs.wav" | ../../scripts/sptk/compute_spectrogram.py \ 15 | --frame-len 400 \ 16 | --frame-hop 256 \ 17 | --round-power-of-two true \ 18 | --center true \ 19 | --apply-log true \ 20 | - feats.ark 21 | # visualize and check 22 | ../../scripts/sptk/visualize_tf_matrix.py \ 23 | --input ark --cmap jet --frame-hop 256 \ 24 | feats.ark 25 | ``` 26 | 27 | 2. Fbank 28 | ```bash 29 | # 80-dim log fbank 30 | echo "egs asset/egs.wav" | ../../scripts/sptk/compute_fbank.py \ 31 | --frame-len 400 \ 32 | --frame-hop 256 \ 33 | --round-power-of-two true \ 34 | --center true \ 35 | --apply-log true \ 36 | --num-bins 80 \ 37 | - feats.ark 38 | # visualize and check 39 | ../../scripts/sptk/visualize_tf_matrix.py \ 40 | --input ark --cmap jet --frame-hop 256 \ 41 | feats.ark 42 | ``` -------------------------------------------------------------------------------- /doc/spectral_feature/asset/257dim-log-spectrogram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spectral_feature/asset/257dim-log-spectrogram.jpg -------------------------------------------------------------------------------- /doc/spectral_feature/asset/80dim-log-fbank.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spectral_feature/asset/80dim-log-fbank.jpg -------------------------------------------------------------------------------- /doc/spectral_feature/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/spectral_feature/asset/egs.wav -------------------------------------------------------------------------------- /doc/ssl/README.md: 
-------------------------------------------------------------------------------- 1 | ## SSL (Sound Source Localization) 2 | 3 | SSL implementation. 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/do_ssl.py -h` for SSL and `./scripts/sptk/compute_ipd_and_linear_srp.py`, `./scripts/sptk/compute_circular_srp.py` for SRP angular spectrum computation. 8 | 9 | ### Usage 10 | 11 | ```bash 12 | echo "egs asset/egs.wav" > wav.scp 13 | # srp matrices 14 | ../../scripts/sptk/compute_circular_srp.py \ 15 | --frame-len 512 \ 16 | --frame-hop 256 \ 17 | --n 16 \ 18 | --d 0.1 \ 19 | --diag-pair "0,8;1,9;2,10;3,11;4,12;5,13;6,14;7,15" \ 20 | --num-doa 361 \ 21 | wav.scp srp.ark 22 | # visualize and check (found peak around 60 degree) 23 | ../../scripts/sptk/visualize_angular_spectrum.py --frame-hop 16 srp.ark 24 | # compute steer vector 25 | ../../scripts/sptk/compute_steer_vector.py \ 26 | --num-doas 360 \ 27 | --num-bins 257 \ 28 | --sr 16000 \ 29 | --geometry circular \ 30 | --circular-radius 0.05 \ 31 | --circular-around 16 \ 32 | 16mic_sv.npy 33 | # run srp-based SSL (got 59 degree) 34 | ../../scripts/sptk/do_ssl.py \ 35 | --frame-len 512 \ 36 | --frame-hop 256 \ 37 | --backend srp \ 38 | --doa-range 0,360 \ 39 | --output degree \ 40 | --srp-pair "0,8;1,9;2,10;3,11;4,12;5,13;6,14;7,15" \ 41 | wav.scp 16mic_sv.npy doa.scp 42 | # run SSL using ml backend (also got 59 degree) 43 | ../../scripts/sptk/do_ssl.py \ 44 | --frame-len 512 \ 45 | --frame-hop 256 \ 46 | --backend ml \ 47 | --doa-range 0,360 \ 48 | --output degree \ 49 | wav.scp 16mic_sv.npy doa.scp 50 | # run SSL using music backend (59 degree) 51 | ../../scripts/sptk/do_ssl.py \ 52 | --frame-len 512 \ 53 | --frame-hop 256 \ 54 | --backend music \ 55 | --doa-range 0,360 \ 56 | --output degree \ 57 | wav.scp 16mic_sv.npy doa.scp 58 | ``` -------------------------------------------------------------------------------- /doc/ssl/asset/angular_spectrum.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/ssl/asset/angular_spectrum.jpg -------------------------------------------------------------------------------- /doc/ssl/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/ssl/asset/egs.wav -------------------------------------------------------------------------------- /doc/steer_vector/README.md: -------------------------------------------------------------------------------- 1 | ## Steer Vector 2 | 3 | Computation of the steer vector matrices (used in SSL and directional features). The matrix has shape `num_directions x num_mics x num_bins`. 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/compute_steer_vector.py -h`. 8 | 9 | ### Usage 10 | 11 | 1. Linear array 12 | ```bash 13 | # DoA in [0, 180] 14 | # output shape: 181 x 6 x 257 15 | # 0 deg * * * * * * 180 deg 16 | ../../scripts/sptk/compute_steer_vector.py \ 17 | --sr 16000 \ 18 | --num-doas 181 \ 19 | --num-bins 257 \ 20 | --geometry linear \ 21 | --linear-topo "0,0.01,0.02,0.03,0.04,0.05" \ 22 | asset/1d_6mic_sv.npy 23 | ``` 24 | 25 | 2. Circular array 26 | ```bash 27 | # DoA in [0, 360) 28 | # output shape: 360 x 4 x 257 29 | # * 30 | # 180 deg * * 0 deg 31 | # * 32 | ../../scripts/sptk/compute_steer_vector.py \ 33 | --sr 16000 \ 34 | --num-doas 360 \ 35 | --num-bins 257 \ 36 | --geometry circular \ 37 | --circular-radius 0.05 \ 38 | --circular-around 4 \ 39 | --circular-center false \ 40 | asset/2d_4mic_sv.npy 41 | ``` 42 | 43 | 3. 
Beam pattern 44 | ```bash 45 | ../../scripts/sptk/visualize_beampattern.py --doa-range 360 \ 46 | asset/beam_v1.npy asset/2d_4mic_sv.npy 47 | ``` -------------------------------------------------------------------------------- /doc/steer_vector/asset/1d_6mic_sv.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/steer_vector/asset/1d_6mic_sv.npy -------------------------------------------------------------------------------- /doc/steer_vector/asset/2d_4mic_sv.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/steer_vector/asset/2d_4mic_sv.npy -------------------------------------------------------------------------------- /doc/steer_vector/asset/beam_v1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/steer_vector/asset/beam_v1.npy -------------------------------------------------------------------------------- /doc/steer_vector/asset/beampattern_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/steer_vector/asset/beampattern_v1.png -------------------------------------------------------------------------------- /doc/tf_mask/README.md: -------------------------------------------------------------------------------- 1 | ## Time-frequency Mask 2 | 3 | Computation of the time-frequency mask (PSM, IRM, IBM, IAM, ...) as the neural network training labels. 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/compute_mask.py -h` 8 | 9 | ### Usage 10 | 11 | 1. 
IBM & IRM computation 12 | ```bash 13 | # prepare scp 14 | echo "egs asset/clean.wav" > clean.scp 15 | echo "egs asset/noisy.wav" > noisy.scp 16 | # computation 17 | ../../scripts/sptk/compute_mask.py \ 18 | --mask irm clean.scp noisy.scp irm.ark 19 | # visualize and check 20 | ../../scripts/sptk/visualize_tf_matrix.py \ 21 | --input ark \ 22 | --cmap jet \ 23 | --cache-dir irm \ 24 | irm.ark 25 | ``` 26 | 27 | 2. PSM & IAM (FFT-mask or SMM) computation 28 | ```bash 29 | # add cutoff as they are unbounded 30 | ../../scripts/sptk/compute_mask.py \ 31 | --mask psm \ 32 | --cutoff 2 \ 33 | clean.scp noisy.scp psm.ark 34 | # visualize and check 35 | ../../scripts/sptk/visualize_tf_matrix.py \ 36 | --input ark \ 37 | --cmap jet \ 38 | --cache-dir psm \ 39 | psm.ark 40 | ``` 41 | 42 | 3. Restore audio using TF-masks 43 | ```bash 44 | # psm as example 45 | ../../scripts/sptk/compute_mask.py \ 46 | --mask psm \ 47 | --cutoff 2 \ 48 | --scp mask.scp \ 49 | clean.scp noisy.scp mask.ark 50 | # do TF masking (using noisy phase) 51 | ../../scripts/sptk/wav_separate.py \ 52 | --mask-format kaldi \ 53 | noisy.scp mask.scp enh 54 | ``` 55 | The enhancement output is under directory `enh`. See `../../scripts/sptk/wav_separate.py -h` for more command options. 
-------------------------------------------------------------------------------- /doc/tf_mask/asset/clean.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/clean.wav -------------------------------------------------------------------------------- /doc/tf_mask/asset/enhan.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/enhan.wav -------------------------------------------------------------------------------- /doc/tf_mask/asset/iam-cutoff-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/iam-cutoff-2.jpg -------------------------------------------------------------------------------- /doc/tf_mask/asset/ibm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/ibm.jpg -------------------------------------------------------------------------------- /doc/tf_mask/asset/irm.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/irm.jpg -------------------------------------------------------------------------------- /doc/tf_mask/asset/noisy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/noisy.wav -------------------------------------------------------------------------------- /doc/tf_mask/asset/psa.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/psa.jpg -------------------------------------------------------------------------------- /doc/tf_mask/asset/psm-cutoff-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/tf_mask/asset/psm-cutoff-2.jpg -------------------------------------------------------------------------------- /doc/vad/README.md: -------------------------------------------------------------------------------- 1 | ## VAD 2 | 3 | Removing the silence from the given utterance. 4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/do_vad.py -h` 8 | 9 | ### Usage 10 | 11 | ```bash 12 | # - means stdin 13 | echo "utt asset/utt.wav" | ../../scripts/sptk/do_vad.py --mode 3 --sr 16000 - vad 14 | ``` 15 | The processed audio are generated under directory `vad`. 16 | 17 | ### Dependency 18 | 19 | * [webrtcvad](https://github.com/wiseman/py-webrtcvad) 20 | -------------------------------------------------------------------------------- /doc/vad/asset/utt.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/vad/asset/utt.wav -------------------------------------------------------------------------------- /doc/vad/asset/utt_vad.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/vad/asset/utt_vad.wav -------------------------------------------------------------------------------- /doc/wpe/README.md: -------------------------------------------------------------------------------- 1 | ## WPE 2 | 3 | Weighted Prediction Error for speech dereverberation. 
4 | 5 | ### Cmd options 6 | 7 | See `./scripts/sptk/apply_wpe.py -h` 8 | 9 | ### Usage 10 | 11 | ```bash 12 | echo "egs asset/egs.wav" | ../../scripts/sptk/apply_wpe.py \ 13 | --frame-len 512 \ 14 | --frame-hop 128 \ 15 | - wpe 16 | ``` 17 | 18 | To use nara-wpe: 19 | ```bash 20 | echo "egs asset/egs.wav" | ../../scripts/sptk/apply_wpe.py \ 21 | --frame-len 512 \ 22 | --frame-hop 128 \ 23 | --nara-wpe true \ 24 | - wpe 25 | ``` 26 | 27 | To run WPD, use the command 28 | ```bash 29 | echo "egs asset/egs.wav" | ../../scripts/sptk/apply_wpd.py \ 30 | --frame-len 512 \ 31 | --taps 10 --delay 3 --context 1 \ 32 | --wpd-iters 2 --cgmm-iters 10 \ 33 | --update-alpha false \ 34 | --dump-mask true - wpd 35 | ``` 36 | which will generate TF-masks and dereverberated & enhanced audio simultaneously. 37 | 38 | ### Reference 39 | 40 | 1. [nara_wpe](https://github.com/fgnt/nara_wpe) 41 | 2. Yoshioka, Takuya, and Tomohiro Nakatani. "Generalization of multi-channel linear prediction methods for blind MIMO impulse response shortening." IEEE Transactions on Audio, Speech, and Language Processing 20.10 (2012): 2707-2720. 42 | 3. Nakatani, Tomohiro, and Keisuke Kinoshita. "A unified convolutional beamformer for simultaneous denoising and dereverberation." IEEE Signal Processing Letters 26.6 (2019): 903-907. 43 | 4. Boeddeker, Christoph, et al. "Jointly optimal dereverberation and beamforming." ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2020. 44 | 5. Nakatani, Tomohiro, and Keisuke Kinoshita. "Maximum likelihood convolutional beamformer for simultaneous denoising and dereverberation." 2019 27th European Signal Processing Conference (EUSIPCO). IEEE, 2019. 
-------------------------------------------------------------------------------- /doc/wpe/asset/egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/wpe/asset/egs.wav -------------------------------------------------------------------------------- /doc/wpe/asset/mask.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/wpe/asset/mask.jpg -------------------------------------------------------------------------------- /doc/wpe/asset/wpd_egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/wpe/asset/wpd_egs.wav -------------------------------------------------------------------------------- /doc/wpe/asset/wpe_egs.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funcwj/setk/50e4da07c4e7fce7439da9be2b0bb1a0079491c3/doc/wpe/asset/wpe_egs.wav -------------------------------------------------------------------------------- /include/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | set(SETK_SRC ${CMAKE_SOURCE_DIR}/include/complex-matrix.cc 4 | ${CMAKE_SOURCE_DIR}/include/complex-vector.cc 5 | ${CMAKE_SOURCE_DIR}/include/stft.cc 6 | ${CMAKE_SOURCE_DIR}/include/srp-phat.cc 7 | ${CMAKE_SOURCE_DIR}/include/rir-generator.cc 8 | ${CMAKE_SOURCE_DIR}/include/beamformer.cc) 9 | if(APPLE) 10 | set(CMAKE_SHARED_LINKER_FLAGS "-framework Accelerate") 11 | endif() 12 | 13 | add_library(setk SHARED ${SETK_SRC}) 14 | target_link_libraries(setk ${DEPEND_LIBS}) 15 | -------------------------------------------------------------------------------- 
/include/beamformer.h: -------------------------------------------------------------------------------- 1 | // include/beamformer.h 2 | // wujian@2018 3 | 4 | // Copyright 2018 Jian Wu 5 | 6 | // See ../../COPYING for clarification regarding multiple authors 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // 12 | // http://www.apache.org/licenses/LICENSE-2.0 13 | // 14 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 16 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 17 | // MERCHANTABLITY OR NON-INFRINGEMENT. 18 | // See the Apache 2 License for the specific language governing permissions and 19 | // limitations under the License. 20 | 21 | #ifndef BEAMFORMER_H_ 22 | #define BEAMFORMER_H_ 23 | 24 | #include "include/complex-base.h" 25 | #include "include/complex-matrix.h" 26 | #include "include/complex-vector.h" 27 | 28 | namespace kaldi { 29 | 30 | // Cast CMatrix into Matrix, in Realfft format, to reconstruct speech 31 | // The Realfft format is space efficient, so I refused to use CMatrix in stft.h 32 | void CastIntoRealfft(const CMatrixBase &cstft, 33 | Matrix *rstft); 34 | 35 | // src_stft: (num_frames, num_bins x num_channels) or 36 | // (num_frames x num_channels, num_bins) 37 | // dst_stft: (num_bins x num_frames, num_channels) 38 | // Shape multiple complex stft from shape num_frames x [num_bins * num_channels] 39 | // or [num_frames x num_channels] x num_bins into [num_bins * num_frames] x 40 | // num_channels 41 | // for convenience of psd estimate and beamforming 42 | void TrimStft(const int32 num_bins, const int32 num_channels, 43 | const CMatrixBase &src_stft, 44 | CMatrix *dst_stft); 45 | 46 | // 47 | // src_stft: (num_bins x num_frames, num_channels) 48 | // 
target_mask: (num_frames, num_bins) 49 | // target_psd: (num_bins x num_channels, num_channels) 50 | // 51 | void EstimatePsd(const CMatrixBase &src_stft, 52 | const MatrixBase &target_mask, 53 | CMatrix *target_psd, 54 | CMatrix *second_psd); 55 | 56 | // target_psd: (num_bins x num_channels, num_channels) 57 | // steer_vector:(num_bins, num_channels) 58 | // using maximum eigen vector as estimation of steer vector 59 | void EstimateSteerVector(const CMatrixBase &target_psd, 60 | CMatrix *steer_vector); 61 | 62 | // target_psd: (num_bins x num_channels, num_channels) 63 | // steer_vector:(num_bins, num_channels) 64 | // beam_weights:(num_bins, num_channels) 65 | // NOTE mvdr: 66 | // numerator = psd_inv * steer_vector 67 | // denumerator = numerator * steer_vector^H 68 | // weight = numerator / denumerator 69 | void ComputeMvdrBeamWeights(const CMatrixBase &noise_psd, 70 | const CMatrixBase &steer_vector, 71 | CMatrix *beam_weights); 72 | 73 | // target_psd: (num_bins x num_channels, num_channels) 74 | // noise_psd: (num_bins x num_channels, num_channels) 75 | // beam_weights:(num_bins, num_channels) 76 | void ComputeGevdBeamWeights(const CMatrixBase &target_psd, 77 | const CMatrixBase &noise_psd, 78 | CMatrix *beam_weights); 79 | 80 | // src_stft: (num_bins x num_frames, num_channels) 81 | // weights: (num_bins, num_channels) 82 | // enh_stft: (num_frames, num_bins) 83 | // NOTE: 84 | // To avoid Transpose, using AddMatMat instead of: 85 | // enh_stft->Resize(num_bins, num_frames); 86 | // for (int32 f = 0; f < num_bins; f++) 87 | // enh_stft->Row(f).AddMatVec(1, 0, src_stft.RowRange(f * num_frames, 88 | // num_frames), kNoTrans, weights.Row(f), 0, 0); 89 | // enh_stft->Transpose(); 90 | 91 | void Beamform(const CMatrixBase &src_stft, 92 | const CMatrixBase &weights, 93 | CMatrix *enh_stft); 94 | } 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /include/complex-base.h: 
-------------------------------------------------------------------------------- 1 | // include/complex-base.h 2 | // wujian@2018 3 | 4 | // Copyright 2018 Jian Wu 5 | 6 | // See ../../COPYING for clarification regarding multiple authors 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // 12 | // http://www.apache.org/licenses/LICENSE-2.0 13 | // 14 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 16 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 17 | // MERCHANTABLITY OR NON-INFRINGEMENT. 18 | // See the Apache 2 License for the specific language governing permissions and 19 | // limitations under the License. 20 | 21 | #ifndef COMPLEX_BASE_H_ 22 | #define COMPLEX_BASE_H_ 23 | 24 | #include 25 | #include "matrix/cblas-wrappers.h" 26 | #include "matrix/kaldi-matrix.h" 27 | #include "matrix/kaldi-vector.h" 28 | #include "matrix/matrix-common.h" 29 | 30 | #include "include/cblas-cpl-wrappers.h" 31 | 32 | namespace kaldi { 33 | 34 | typedef enum { kReal, kImag } ComplexIndexType; 35 | 36 | typedef enum { 37 | kConj, 38 | kNoConj, 39 | } ConjugateType; 40 | 41 | // 42 | // typedef enum { 43 | // kNoTrans = 111, // CblasNoTrans 44 | // kTrans = 112, // CblasTrans 45 | // kConjTrans = 113, // CblasConjTrans 46 | // kConjNoTrans = 114 // CblasConjNoTrans 47 | // } CMatrixTransposeType; 48 | 49 | template 50 | struct Complex { 51 | Real real, imag; 52 | Complex(Real r, Real i) : real(r), imag(i) {} 53 | Complex() {} 54 | }; 55 | 56 | template 57 | class CVectorBase; 58 | template 59 | class CVector; 60 | template 61 | class SubCVector; 62 | 63 | template 64 | class CMatrixBase; 65 | template 66 | class CMatrix; 67 | template 68 | class SubCMatrix; 69 | 70 | template 71 | inline void 
ComplexDiv(const Real &a_re, const Real &a_im, Real *b_re, 72 | Real *b_im) { 73 | Real d = a_re * a_re + a_im * a_im; 74 | Real tmp_re = (*b_re * a_re) + (*b_im * a_im); 75 | *b_im = (*b_re * a_im - *b_im * a_re) / d; 76 | *b_re = tmp_re / d; 77 | } 78 | } 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /include/srp-phat.cc: -------------------------------------------------------------------------------- 1 | // include/srp-phat.cc 2 | // wujian@18.5.29 3 | 4 | // Copyright 2018 Jian Wu 5 | 6 | // See ../../COPYING for clarification regarding multiple authors 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // 12 | // http://www.apache.org/licenses/LICENSE-2.0 13 | // 14 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 16 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 17 | // MERCHANTABLITY OR NON-INFRINGEMENT. 18 | // See the Apache 2 License for the specific language governing permissions and 19 | // limitations under the License. 
20 | 21 | #include "include/srp-phat.h" 22 | 23 | namespace kaldi { 24 | 25 | // GCC(x, y) = conj(GCC(y, x)) 26 | void SrpPhatComputor::ComputeGccPhat(const CMatrixBase &L, 27 | const CMatrixBase &R, 28 | BaseFloat dist, 29 | CMatrixBase *gcc_phat) { 30 | KALDI_ASSERT(dist > 0); 31 | BaseFloat max_tdoa = dist / opts_.sound_speed; 32 | BaseFloat inc_tdoa = max_tdoa * 2 / (opts_.samp_rate - 1); 33 | for (int32 i = 0; i < opts_.samp_rate; i++) 34 | if (opts_.samp_tdoa) 35 | delay_axis_(i) = (max_tdoa - inc_tdoa * i) * 2 * M_PI; 36 | else 37 | delay_axis_(i) = 38 | std::cos(i * M_PI / opts_.samp_rate) * max_tdoa * 2 * M_PI; 39 | 40 | idtft_coef_.SetZero(); 41 | idtft_coef_.AddVecVec(1, frequency_axis_, delay_axis_); 42 | exp_idtft_coef_j_.Exp(idtft_coef_); 43 | 44 | CMatrix cor(L); 45 | cor.MulElements(R, kConj); 46 | cor.DivElements(L, kNoConj, true); 47 | cor.DivElements(R, kNoConj, true); 48 | // gcc_phat = gcc_phat + cor * coef 49 | gcc_phat->AddMatMat(1, 0, cor, kNoTrans, exp_idtft_coef_j_, kNoTrans, 1, 0); 50 | } 51 | 52 | void SrpPhatComputor::Compute(const CMatrixBase &stft, 53 | Matrix *spectra) { 54 | std::vector &topo = opts_.array_topo; 55 | int32 num_chs = topo.size(); 56 | KALDI_ASSERT(num_chs >= 2); 57 | MatrixIndexT num_frames = stft.NumRows() / num_chs, num_bins = stft.NumCols(); 58 | CMatrix coef(num_bins, delay_axis_.Dim()); 59 | CMatrix srp_phat(num_frames, delay_axis_.Dim()); 60 | spectra->Resize(num_frames, delay_axis_.Dim()); 61 | 62 | // GCC(x, y) = conj(GCC(y, x)) 63 | // GCC_PHAT(x, x) = I 64 | for (int32 i = 0; i < num_chs; i++) { 65 | for (int32 j = i + 1; j < num_chs; j++) { 66 | ComputeGccPhat(stft.RowRange(i * num_frames, num_frames), 67 | stft.RowRange(j * num_frames, num_frames), 68 | std::abs(topo[j] - topo[i]), &srp_phat); 69 | } 70 | } 71 | if (opts_.smooth_context) Smooth(&srp_phat); 72 | srp_phat.Part(spectra, kReal); 73 | spectra->ApplyFloor(0); 74 | } 75 | 76 | void SrpPhatComputor::Smooth(CMatrix *spectra) { 77 | int32 context = 
opts_.smooth_context; 78 | CMatrix smooth_spectra(spectra->NumRows(), spectra->NumCols()); 79 | for (int32 t = 0; t < spectra->NumRows(); t++) { 80 | for (int32 c = -context; c <= context; c++) { 81 | int32 index = std::min(std::max(t + c, 0), spectra->NumRows() - 1); 82 | SubCVector ctx(*spectra, index); 83 | smooth_spectra.Row(t).AddVec(1, 0, ctx); 84 | } 85 | } 86 | smooth_spectra.Scale(1.0 / (2 * context + 1), 0); 87 | spectra->CopyFromMat(smooth_spectra); 88 | } 89 | 90 | } // kaldi 91 | -------------------------------------------------------------------------------- /path.sh: -------------------------------------------------------------------------------- 1 | # only export path of current package 2 | 3 | ENHAN_PATH=$PWD 4 | 5 | for dir in bin utils; do 6 | [ -d $PWD/$dir ] && ENHAN_PATH="$ENHAN_PATH:$PWD/$dir" 7 | done 8 | 9 | export PATH=$ENHAN_PATH:$PATH -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | editdistance==0.6.0 2 | librosa==0.8.1 3 | matplotlib==3.5.0 4 | mir_eval==0.7 5 | nara_wpe==0.0.7 6 | numpy==1.22.3 7 | pypesq==1.2.4 8 | PyYAML==6.0 9 | scikit_learn==1.0.2 10 | scipy==1.7.3 11 | SoundFile==0.10.3.post1 12 | tqdm==4.62.3 13 | webrtcvad==2.0.10 14 | -------------------------------------------------------------------------------- /scripts/compute_circular_srp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2018 4 | 5 | set -eu 6 | 7 | nj=40 8 | cmd="run.pl" 9 | compress=true 10 | 11 | stft_conf=conf/stft.conf 12 | fs=16000 13 | num_doas=121 14 | d=0.07 15 | n=6 16 | diag_pair="0,3\;1,4\;2,5" 17 | 18 | echo "$0 $@" 19 | 20 | function usage { 21 | echo "Options:" 22 | echo " --nj # number of jobs to run parallel, (default=$nj)" 23 | echo " --cmd # how to run jobs, (default=$cmd)" 24 | echo " --compress # compress feature or not, 
(default=$compress)" 25 | echo " --stft-conf # stft configurations files, (default=$stft_conf)" 26 | echo " --fs # sample frequency for source wave, (default=$fs)" 27 | echo " --num-doas # doa resolution, (default=$num_doas)" 28 | echo " --d # diameter of circular array, (default=$d)" 29 | echo " --n # number of arrays, (default=$n)" 30 | echo " --diag-pair # diagonal pairs to compute gcc-phat, (default=$diag_pair)" 31 | } 32 | 33 | . ./path.sh 34 | . ./utils/parse_options.sh || exit 1 35 | 36 | [ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1 37 | 38 | src_dir=$(cd $1; pwd) 39 | dst_dir=$3 40 | 41 | for x in $src_dir/wav.scp $stft_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done 42 | 43 | srp_opts=$(cat $stft_conf | xargs) 44 | srp_opts="$srp_opts --n $n --d $d --sr $fs --num-doas $num_doas --diag-pair $diag_pair" 45 | 46 | echo "$0: Compute srp circular features..." 47 | 48 | exp_dir=$2 && mkdir -p $exp_dir 49 | mkdir -p $dst_dir && dst_dir=$(cd $3; pwd) 50 | 51 | wav_split_scp="" 52 | for n in $(seq $nj); do wav_split_scp="$wav_split_scp $exp_dir/wav.$n.scp"; done 53 | ./utils/split_scp.pl $src_dir/wav.scp $wav_split_scp || exit 1 54 | 55 | name=$(basename $src_dir) 56 | 57 | if $compress ; then 58 | $cmd JOB=1:$nj $exp_dir/log/compute_srp.JOB.log \ 59 | ./scripts/sptk/compute_circular_srp.py $srp_opts \ 60 | $exp_dir/wav.JOB.scp - \| \ 61 | copy-feats --compress=$compress ark:- \ 62 | ark,scp:$dst_dir/$name.srp.JOB.ark,$dst_dir/$name.srp.JOB.scp 63 | else 64 | $cmd JOB=1:$nj $exp_dir/log/compute_srp.JOB.log \ 65 | ./scripts/sptk/compute_circular_srp.py $srp_opts \ 66 | --scp $dst_dir/$name.srp.JOB.scp \ 67 | $exp_dir/wav.JOB.scp $dst_dir/$name.srp.JOB.ark 68 | fi 69 | 70 | cat $dst_dir/$name.srp.*.scp | sort -k1 > $src_dir/srp.scp 71 | 72 | echo "$0: Compute srp circular features done" 73 | 74 | -------------------------------------------------------------------------------- /scripts/compute_df_on_mask.sh: 
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# wujian@2018

# Computes directional features from TF-masks, in $nj parallel jobs.
# Positional arguments: $1 data dir containing wav.scp, $2 mask script or a
# directory of *.npy masks, $3 experiment dir for lists and logs, $4 output
# dir for the ark/scp files. The merged index is written to <$1>/df.scp.

set -eu

nj=40
cmd="run.pl"
compress=true

stft_conf=conf/stft.conf
mask_format="kaldi"

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=$nj)"
  echo " --cmd # how to run jobs, (default=$cmd)"
  echo " --compress # compress feature or not, (default=$compress)"
  echo " --stft-conf # stft configurations files, (default=$stft_conf)"
  echo " --mask-format # load masks from np.ndarray instead, (default=$mask_format)"
}

. ./path.sh
. ./utils/parse_options.sh || exit 1

[ $# -ne 4 ] && echo "Script format error: $0 " && usage && exit 1

src_dir=$(cd $1; pwd)
dst_dir=$4

for x in $src_dir/wav.scp $stft_conf; do [ ! -f $x ] && echo "$0: Missing file: $x..." && exit 1; done

echo "$0: Compute directional features for $1..."

exp_dir=$3 && mkdir -p $exp_dir
mkdir -p $dst_dir && dst_dir=$(cd $4; pwd)

mask_scp_or_dir=$2
if [ -d $mask_scp_or_dir ]; then
  [ $mask_format != "numpy" ] && echo "$0: $mask_scp_or_dir is a directory, expected to set --mask-format numpy" && exit 1
  # Build "<key>\t<path>" lines, the key being the file name without ".npy"
  # (sed only strips the first ".npy" of each line, i.e. the one in the key).
  find $mask_scp_or_dir -name "*.npy" | awk -F '/' '{printf("%s\t%s\n", $NF, $0)}' | \
    sed 's:\.npy::' | sort -k1 > $exp_dir/masks.scp
  echo "$0: Got $(cat $exp_dir/masks.scp | wc -l) numpy's masks"
else
  cp $mask_scp_or_dir $exp_dir/masks.scp
fi

# Keep only the utterances that have a mask.
awk '{print $1}' $exp_dir/masks.scp | ./utils/filter_scp.pl -f 1 - $src_dir/wav.scp | sort -k1 > $exp_dir/wav.scp
echo "$0: Reduce $(cat $src_dir/wav.scp | wc -l) utterances to $(cat $exp_dir/wav.scp | wc -l)"

wav_split_scp="" && for n in $(seq $nj); do wav_split_scp="$wav_split_scp $exp_dir/wav.$n.scp"; done
./utils/split_scp.pl $exp_dir/wav.scp $wav_split_scp

name="df"
dir=$(basename $src_dir)

df_opts=$(cat $stft_conf | xargs)
df_opts="$df_opts --mask-format $mask_format"

# The tool used to be called as scripts/sptk/compute_linear_df_mask.py, which
# is not part of the repository; the mask-based directional feature tool
# shipped in scripts/sptk is compute_df_on_mask.py.
# NOTE(review): the argument order is assumed unchanged -- confirm against the
# command line of compute_df_on_mask.py.
if $compress ; then
  $cmd JOB=1:$nj $exp_dir/log/compute_df_$dir.JOB.log \
    ./scripts/sptk/compute_df_on_mask.py \
    $df_opts $exp_dir/wav.JOB.scp \
    $exp_dir/masks.scp - \| copy-feats --compress=$compress ark:- \
    ark,scp:$dst_dir/$dir.$name.JOB.ark,$dst_dir/$dir.$name.JOB.scp
else
  $cmd JOB=1:$nj $exp_dir/log/compute_df_$dir.JOB.log \
    ./scripts/sptk/compute_df_on_mask.py $df_opts \
    --scp $dst_dir/$dir.$name.JOB.scp \
    $exp_dir/wav.JOB.scp $exp_dir/masks.scp \
    $dst_dir/$dir.$name.JOB.ark
fi

cat $dst_dir/$dir.$name.*.scp | sort -k1 > $src_dir/df.scp

echo "$0: Compute directional features done"

--------------------------------------------------------------------------------
/scripts/compute_ipd_and_linear_srp.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# wujian@2018

# Computes one kind of spatial feature (ipd, msc or srp) by running
# scripts/sptk/compute_ipd_and_linear_srp.py in $nj parallel jobs.
# Positional arguments: $1 data dir containing wav.scp, $2 experiment dir for
# split lists and logs, $3 output dir for the ark/scp files.
# The merged index is written to <$1>/<feats>.scp.

set -eu

nj=40
cmd="run.pl"
compress=true

stft_conf=conf/stft.conf
feats=ipd
# ipd cfg
ipd_index="0,1"
ipd_cos=true
ipd_sin=false
# msc cfg
msc_ctx=1
# srp cfg
srp_fs=16000
srp_num_doa=181
srp_topo=""
# The option is documented as --srp-sample-tdoa, so parse_options.sh needs a
# variable called srp_sample_tdoa. The variable used to be misspelled
# "src_sample_tdoa", which made the documented option unusable; the old
# spelling is kept as an alias so that existing callers keep working.
srp_sample_tdoa=false
src_sample_tdoa=""

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=40)"
  echo " --cmd # how to run jobs, (default=run.pl)"
  echo " --compress # compress feature or not, (default=true)"
  echo " --stft-conf # stft configurations files, (default=conf/stft.conf)"
  echo " --feats # type of spatial features, (default=ipd)"
  echo " --ipd-index # channel index to compute ipd, (default=0,1)"
  echo " --ipd-cos # compute cosIPD instead of raw IPD, (default=true)"
  echo " --ipd-sin # paste sinIPD to cosIPD features or not, (default=false)"
  echo " --msc-ctx # length of context for MSC computation, (default=1)"
  echo " --srp-fs # sample frequency for source wave, (default=16000)"
  echo " --srp-topo # microphone topo description, (default="")"
  echo " --srp-num-doa # doa resolution, (default=181)"
  echo " --srp-sample-tdoa # sample tdoa instead of doa, (default=false)"
}

. ./path.sh
. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1

# Honour the legacy spelling when it was given on the command line.
if [ -n "$src_sample_tdoa" ]; then srp_sample_tdoa=$src_sample_tdoa; fi

src_dir=$(cd $1; pwd)
dst_dir=$3

for x in $src_dir/wav.scp $stft_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done

# Options of the python tool: the content of the stft conf, flattened onto one
# line, followed by the options of the selected feature type.
spatial_opts=$(cat $stft_conf | xargs)
case $feats in
  "ipd" )
    spatial_opts="$spatial_opts --ipd.index $ipd_index"
    spatial_opts="$spatial_opts --ipd.cos $ipd_cos --ipd.sin $ipd_sin"
    ;;
  "msc" )
    spatial_opts="$spatial_opts --msc.ctx $msc_ctx"
    ;;
  "srp" )
    # An empty topology would leave "--srp.topo" without a value, so the tool
    # would take the next argument (the wav list) as the topology.
    [ -z "$srp_topo" ] && echo "$0: --srp-topo must be given when --feats is srp" && exit 1
    spatial_opts="$spatial_opts --srp.sample-tdoa $srp_sample_tdoa"
    spatial_opts="$spatial_opts --srp.num_doa $srp_num_doa --srp.sample-rate $srp_fs --srp.topo $srp_topo"
    ;;
  * )
    echo "$0: Unknown spatial feats type: $feats" && exit 1
    ;;
esac

echo "$0: Compute $feats spatial features..."

exp_dir=$2 && mkdir -p $exp_dir
mkdir -p $dst_dir && dst_dir=$(cd $3; pwd)

wav_split_scp=""
for n in $(seq $nj); do wav_split_scp="$wav_split_scp $exp_dir/wav.$n.scp"; done
./utils/split_scp.pl $src_dir/wav.scp $wav_split_scp || exit 1

name=$(basename $src_dir)

if $compress ; then
  # Pipe the features through copy-feats so that they are stored compressed.
  $cmd JOB=1:$nj $exp_dir/log/compute_$feats.JOB.log \
    ./scripts/sptk/compute_ipd_and_linear_srp.py \
    --type $feats $spatial_opts \
    $exp_dir/wav.JOB.scp - \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$dst_dir/$name.$feats.JOB.ark,$dst_dir/$name.$feats.JOB.scp
else
  $cmd JOB=1:$nj $exp_dir/log/compute_$feats.JOB.log \
    ./scripts/sptk/compute_ipd_and_linear_srp.py \
    --type $feats $spatial_opts \
    --scp $dst_dir/$name.$feats.JOB.scp \
    $exp_dir/wav.JOB.scp $dst_dir/$name.$feats.JOB.ark
fi

cat $dst_dir/$name.$feats.*.scp | sort -k1 > $src_dir/$feats.scp

echo "$0: Compute $feats spatial features done"

--------------------------------------------------------------------------------
/scripts/compute_librosa_fbank.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# wujian@2018

set -eu

# Computes fbank features by running scripts/sptk/compute_fbank.py in $nj
# parallel jobs.
# Positional arguments: $1 data dir containing wav.scp, $2 experiment dir for
# split lists and logs, $3 output dir for the ark/scp files.
# The merged index is written to <$1>/feats.scp.
cmd="run.pl"
nj=40
sample_normalize=true
apply_log=true
apply_pow=false
# egs:
# --frame-len 1024
# --frame-hop 256
# --window hann
# --num-bins 40
# --sample-frequency 16000
# --min-freq 0
# --max-freq 8000
# --center true
fbank_conf=conf/fbank_librosa.conf

compress=true

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=40)"
  echo " --cmd # how to run jobs, (default=run.pl)"
  echo " --compress # compress feature or not, (default=true)"
  echo " --apply-log # use log or linear fbank, (default=true)"
  echo " --apply-pow # use power or magnitude spectrogram, (default=false)"
  echo " --fbank-conf # stft configurations files, (default=conf/fbank_librosa.conf)"
  echo " --sample-normalize # normalize wav samples into [0, 1] or not, (default=true)"
}

. ./path.sh

. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1

src_dir=$(cd $1; pwd)
dst_dir=$3

for x in $src_dir/wav.scp $fbank_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done

fbank_opts=$(cat $fbank_conf | xargs)
fbank_opts="$fbank_opts --normalize-samples $sample_normalize"
fbank_opts="$fbank_opts --apply-log $apply_log --apply-pow $apply_pow"

exp_dir=$2 && mkdir -p $exp_dir
mkdir -p $dst_dir && dst_dir=$(cd $dst_dir; pwd)

wav_split_scp=""
for n in $(seq $nj); do wav_split_scp="$wav_split_scp $exp_dir/wav.$n.scp"; done

./utils/split_scp.pl $src_dir/wav.scp $wav_split_scp || exit 1

name="fbank"
dir=$(basename $src_dir)

if $compress ; then
  # Pipe the features through copy-feats so that they are stored compressed.
  $cmd JOB=1:$nj $exp_dir/log/compute_fbank_$dir.JOB.log \
    ./scripts/sptk/compute_fbank.py \
    $fbank_opts $exp_dir/wav.JOB.scp - \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$dst_dir/$dir.$name.JOB.ark,$dst_dir/$dir.$name.JOB.scp
else
  $cmd JOB=1:$nj $exp_dir/log/compute_fbank_$dir.JOB.log \
    ./scripts/sptk/compute_fbank.py $fbank_opts \
    --scp $dst_dir/$dir.$name.JOB.scp \
    $exp_dir/wav.JOB.scp $dst_dir/$dir.$name.JOB.ark
fi

cat $dst_dir/$dir.$name.*.scp | sort -k1 > $src_dir/feats.scp

echo "$0: Compute fbank using librosa done"

--------------------------------------------------------------------------------
/scripts/compute_librosa_spectrogram.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# wujian@2018

# Computes spectrogram features by running scripts/sptk/compute_spectrogram.py
# in $nj parallel jobs.
# Positional arguments: $1 data dir containing wav.scp, $2 experiment dir for
# split lists and logs, $3 output dir for the ark/scp files.
# The merged index is written to <$1>/feats.scp.

set -eu

cmd="run.pl"
nj=40

sample_normalize=true
apply_log=true
apply_pow=false
# egs:
# --frame-len 1024
# --frame-hop 256
# --center true
# --window hann
stft_conf=conf/stft.conf

compress=true

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=40)"
  echo " --cmd # how to run jobs, (default=run.pl)"
  echo " --compress # compress feature or not, (default=true)"
  echo " --apply-log # use log or linear spectrogram, (default=true)"
  echo " --apply-pow # use power or magnitude spectrogram, (default=false)"
  echo " --stft-conf # stft configurations files, (default=conf/stft.conf)"
  echo " --sample-normalize # normalize wav samples into [0, 1] or not, (default=true)"
}

. ./path.sh

. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1

src_dir=$(cd $1; pwd)
dst_dir=$3

for x in $src_dir/wav.scp $stft_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done

# This script rejects the log + power combination. The check used to run only
# after the directories and the split lists had been written, and its message
# read as if the script would carry on; fail before touching the disk and say
# that the options are rejected.
$apply_log && $apply_pow && echo "$0: --apply-log and --apply-pow can not both be true, use log-amplitude feature instead" && exit 1

spectrogram_opts=$(cat $stft_conf | xargs)

spectrogram_opts="$spectrogram_opts --normalize-samples $sample_normalize"
spectrogram_opts="$spectrogram_opts --apply-log $apply_log --apply-pow $apply_pow"

exp_dir=$2 && mkdir -p $exp_dir
mkdir -p $dst_dir && dst_dir=$(cd $dst_dir; pwd)

wav_split_scp=""
for n in $(seq $nj); do wav_split_scp="$wav_split_scp $exp_dir/wav.$n.scp"; done

./utils/split_scp.pl $src_dir/wav.scp $wav_split_scp || exit 1

# The archive name encodes the feature flavour; linear amplitude is what is
# left when neither flag is set.
name="linear_amp_spectrogram"
$apply_log && ! $apply_pow && name="log_amp_spectrogram"
! $apply_log && $apply_pow && name="linear_pow_spectrogram"

dir=$(basename $src_dir)

if $compress ; then
  # Pipe the features through copy-feats so that they are stored compressed.
  $cmd JOB=1:$nj $exp_dir/log/compute_spectrogram_$dir.JOB.log \
    ./scripts/sptk/compute_spectrogram.py \
    $spectrogram_opts $exp_dir/wav.JOB.scp - \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$dst_dir/$dir.$name.JOB.ark,$dst_dir/$dir.$name.JOB.scp
else
  $cmd JOB=1:$nj $exp_dir/log/compute_spectrogram_$dir.JOB.log \
    ./scripts/sptk/compute_spectrogram.py $spectrogram_opts \
    --scp $dst_dir/$dir.$name.JOB.scp \
    $exp_dir/wav.JOB.scp $dst_dir/$dir.$name.JOB.ark
fi

cat $dst_dir/$dir.$name.*.scp | sort -k1 > $src_dir/feats.scp

echo "$0: Compute spectrogram using librosa done"

--------------------------------------------------------------------------------
/scripts/compute_oracle_mask.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# Computes oracle TF-masks by running scripts/sptk/compute_mask.py in $nj
# parallel jobs.
# Positional arguments: $1 data dir containing clean.scp and the denominator
# script, $2 experiment dir for split lists and logs, $3 output dir for the
# ark/scp files. The merged index is written to <$1>/mask.scp.

set -eu

mask="irm"
# for iam(FFT-mask)/psm etc
cutoff=10
stft_conf=conf/stft.conf

compress=true
cmd="run.pl"
nj=40

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=$nj)"
  echo " --cmd # how to run jobs, (default=$cmd)"
  echo " --compress # compress feature or not, (default=$compress)"
  echo " --stft-conf # stft configurations files, (default=$stft_conf)"
  echo " --cutoff # values to cutoff when compute iam/psm, (default=$cutoff)"
  echo " --mask # type of TF-masks to compute, (default=$mask)"
}

. ./path.sh
. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1

data_dir=$(cd $1; pwd)
mask_dir=$3

# NOTE(review): both the default and the iam/psm/psa/crm branch select wav.scp,
# so the case statement currently only validates the mask type. Check whether
# ibm/irm were meant to use a different denominator script.
denominator_scp=wav.scp
case $mask in
  "iam"|"psm"|"psa"|"crm" )
    denominator_scp=wav.scp
    ;;
  "ibm"|"irm" )
    ;;
  * )
    echo "$0: Unknown type of mask: $mask" && exit 1
    ;;
esac

for f in clean.scp $denominator_scp; do
  [ ! -f $data_dir/$f ] && echo "$0: missing $f in $data_dir" && exit 1
done

exp_dir=$2 && mkdir -p $exp_dir
mkdir -p $mask_dir && mask_dir=$(cd $mask_dir; pwd)

split_speech_wav=""
for n in $(seq $nj); do split_speech_wav="$split_speech_wav $exp_dir/clean.$n.scp"; done

./utils/split_scp.pl $data_dir/clean.scp $split_speech_wav || exit 1

mask_opts=$(cat $stft_conf | xargs)
mask_opts="$mask_opts --mask $mask"
name=$(basename $data_dir)

if $compress ; then
  # Pipe the masks through copy-feats so that they are stored compressed.
  $cmd JOB=1:$nj $exp_dir/log/compute_mask_$name.JOB.log \
    ./scripts/sptk/compute_mask.py $mask_opts --cutoff $cutoff \
    $exp_dir/clean.JOB.scp $data_dir/$denominator_scp - \| \
    copy-feats --compress=$compress ark:- \
    ark,scp:$mask_dir/$name.$mask.JOB.ark,$mask_dir/$name.$mask.JOB.scp
else
  $cmd JOB=1:$nj $exp_dir/log/compute_mask_$name.JOB.log \
    ./scripts/sptk/compute_mask.py $mask_opts --cutoff $cutoff \
    --scp $mask_dir/$name.$mask.JOB.scp \
    $exp_dir/clean.JOB.scp $data_dir/$denominator_scp \
    $mask_dir/$name.$mask.JOB.ark
fi

cat $mask_dir/$name.$mask.*.scp | sort -k1 > $data_dir/mask.scp

echo "$0: Compute $mask done"


--------------------------------------------------------------------------------
/scripts/get_wav_duration.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# wujian@2020

set -eu

nj=20
cmd="run.pl"
output="sample"
10 | 11 | . ./path.sh 12 | . ./utils/parse_options.sh || exit 1 13 | 14 | [ $# -ne 2 ] && echo "Script format error: $0 " && exit 1 15 | 16 | data_dir=$(cd $1; pwd) 17 | log_dir=$2 && mkdir -p $log_dir 18 | 19 | [ ! -f $data_dir/wav.scp ] && echo "Missing $data_dir/wav.scp" && exit 1 20 | 21 | split_scp="" 22 | for n in $(seq $nj); do split_scp="$split_scp $log_dir/wav.$n.scp"; done 23 | 24 | ./utils/split_scp.pl $data_dir/wav.scp $split_scp || exit 1 25 | 26 | $cmd JOB=1:$nj $log_dir/log/get_wav_dur.JOB.log \ 27 | ./utils/wav_duration.py --output $output $log_dir/wav.JOB.scp $log_dir/dur.JOB 28 | 29 | cat $log_dir/dur.* | sort -k1 > $data_dir/utt2dur 30 | echo "$0: Get duration for $1 done" -------------------------------------------------------------------------------- /scripts/run_adapt_beamformer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2018 4 | 5 | set -eu 6 | 7 | nj=40 8 | cmd="run.pl" 9 | stft_conf=conf/stft.conf 10 | mask_format="kaldi" 11 | beamformer="mvdr" 12 | # do ban or not 13 | ban=false 14 | post_masking=false 15 | vad_proportion=1 16 | # online 17 | alpha=0.8 18 | chunk_size=-1 19 | channels=4 20 | pmwf_ref=-1 21 | pmwf_rank1_appro="none" 22 | itf_mask="" 23 | 24 | echo "$0 $@" 25 | 26 | function usage { 27 | echo "Options:" 28 | echo " --nj # number of jobs to run parallel, (default=$nj)" 29 | echo " --cmd # how to run jobs, (default=$cmd)" 30 | echo " --stft-conf # stft configurations files, (default=$stft_conf)" 31 | echo " --mask-format # load masks from np.ndarray instead, (default=$mask_format)" 32 | echo " --itf-mask # scripts of interfering masks, (default=$itf_mask)" 33 | echo " --beamformer # type of adaptive beamformer to apply, (default=$beamformer)" 34 | echo " --ban # do ban or not, (default=$ban)" 35 | echo " --pmwf-rank1-appro # weather to use rank1 approximation in PMWF, (default=$pmwf_rank1_appro)" 36 | echo " --post-masking # do TF-masking after 
beamforming or not, (default=$post_masking)" 37 | echo " --vad-proportion # vad proportion to filter silence masks, (default=$vad_proportion)" 38 | echo " --alpha # remember coefficient used in online version, (default=$alpha)" 39 | echo " --chunk-size # chunk size in online beamformer, (default=$chunk_size)" 40 | echo " --channels # number of channels, (default=$channels)" 41 | } 42 | 43 | . ./path.sh 44 | . ./utils/parse_options.sh || exit 1 45 | 46 | [ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1 47 | 48 | wav_scp=$1 49 | enhan_dir=$3 50 | 51 | for x in $wav_scp $stft_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done 52 | 53 | dirname=$(basename $enhan_dir) 54 | exp_dir=./exp/run_$beamformer/$dirname && mkdir -p $exp_dir 55 | 56 | # if second parameter is a directory 57 | if [ -d $2 ]; then 58 | [ $mask_format != "numpy" ] && echo "$0: $2 is a directory, expected to set --mask-format numpy" && exit 1 59 | find $2 -name "*.npy" | awk -F '/' '{printf("%s\t%s\n", $NF, $0)}' | \ 60 | sed 's:\.npy::' | sort -k1 > $exp_dir/masks.scp 61 | echo "$0: Got $(cat $exp_dir/masks.scp | wc -l) numpy's masks" 62 | else 63 | cp $2 $exp_dir/masks.scp 64 | fi 65 | 66 | awk '{print $1}' $exp_dir/masks.scp | ./utils/filter_scp.pl -f 1 - $wav_scp | sort -k1 > $exp_dir/wav.scp 67 | echo "$0: Reduce $(cat $wav_scp | wc -l) utterances to $(cat $exp_dir/wav.scp | wc -l)" 68 | 69 | wav_split_scp="" && for n in $(seq $nj); do wav_split_scp="$wav_split_scp $exp_dir/wav.$n.scp"; done 70 | ./utils/split_scp.pl $exp_dir/wav.scp $wav_split_scp 71 | 72 | beamformer_opts=$(cat $stft_conf | xargs) 73 | [ ! 
-z $itf_mask ] && beamformer_opts="$beamformer_opts --itf-mask $itf_mask" 74 | 75 | if [ $chunk_size -gt 0 ]; then 76 | beamformer_opts="$beamformer_opts --online.alpha $alpha --online.chunk-size $chunk_size --online.channels $channels" 77 | fi 78 | 79 | mkdir -p $enhan_dir 80 | $cmd JOB=1:$nj $exp_dir/log/run_$beamformer.JOB.log \ 81 | ./scripts/sptk/apply_adaptive_beamformer.py \ 82 | $beamformer_opts \ 83 | --beamformer $beamformer \ 84 | --mask-format $mask_format \ 85 | --pmwf-ref $pmwf_ref \ 86 | --ban $ban \ 87 | --rank1-appro $pmwf_rank1_appro \ 88 | --vad-proportion $vad_proportion \ 89 | --post-masking $post_masking \ 90 | $exp_dir/wav.JOB.scp \ 91 | $exp_dir/masks.scp \ 92 | $enhan_dir 93 | 94 | echo "$0: Run $beamformer done!" 95 | 96 | -------------------------------------------------------------------------------- /scripts/run_auxiva.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2018 4 | 5 | 6 | set -eu 7 | 8 | cmd="run.pl" 9 | nj=40 10 | epochs=20 11 | fs=16000 12 | stft_conf=conf/stft.conf 13 | 14 | echo "$0 $@" 15 | 16 | function usage { 17 | echo "Options:" 18 | echo " --nj # number of jobs to run parallel, (default=40)" 19 | echo " --cmd # how to run jobs, (default=run.pl)" 20 | echo " --stft-conf # stft configurations files, (default=conf/stft.conf)" 21 | echo " --epochs # number of epochs to run AuxIVA, (default=20)" 22 | echo " --fs # sample frequency for output wave, (default=16000)" 23 | } 24 | 25 | . ./path.sh || exit 1 26 | . ./utils/parse_options.sh || exit 1 27 | 28 | [ $# -ne 2 ] && echo "Script format error: $0 " && usage && exit 1 29 | 30 | wav_scp=$1 31 | dst_dir=$2 32 | 33 | [ ! 
-d $dst_dir ] && mkdir -p $dst_dir 34 | 35 | dirname=$(basename $dst_dir) 36 | exp_dir=exp/auxiva/$dirname && mkdir -p $exp_dir 37 | 38 | split_wav_scp="" 39 | for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done 40 | 41 | ./utils/split_scp.pl $wav_scp $split_wav_scp || exit 1 42 | 43 | stft_opts=$(cat $stft_conf | xargs) 44 | 45 | mkdir -p $dst_dir 46 | $cmd JOB=1:$nj $exp_dir/log/run_auxiva.JOB.log \ 47 | ./scripts/sptk/apply_auxiva.py \ 48 | --sample-frequency $fs \ 49 | --num-epochs $epochs \ 50 | $stft_opts \ 51 | $exp_dir/wav.JOB.scp \ 52 | $dst_dir 53 | 54 | 55 | echo "$0: Do auxiva for $wav_scp done" 56 | -------------------------------------------------------------------------------- /scripts/run_cacgmm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2019 4 | 5 | set -eu 6 | 7 | nj=40 8 | cmd="run.pl" 9 | iters=50 10 | stft_conf=conf/stft.conf 11 | init_mask= 12 | num_classes=3 13 | solve_permu=true 14 | mask_format="numpy" 15 | 16 | echo "$0 $@" 17 | 18 | function usage { 19 | echo "Options:" 20 | echo " --nj # number of jobs to run parallel, (default=$nj)" 21 | echo " --cmd # how to run jobs, (default=$cmd)" 22 | echo " --stft-conf # stft configurations files, (default=$stft_conf)" 23 | echo " --iters # number of interations to run cacgmm, (default=$iters)" 24 | echo " --num-classes # number of the cluster used in cacgmm model, (default=$num_classes)" 25 | echo " --solve-permu # solve frequency permutation or not, (default=$solve_permu)" 26 | echo " --init-mask # dir or script for mask initialization, (default=$init_mask)" 27 | echo " --mask-format # mask storage type, (default=$mask_format)" 28 | } 29 | 30 | . ./path.sh 31 | . ./utils/parse_options.sh || exit 1 32 | 33 | [ $# -ne 2 ] && echo "Script format error: $0 " && usage && exit 1 34 | 35 | wav_scp=$1 36 | dst_dir=$2 37 | 38 | for x in $wav_scp $stft_conf; do [ ! 
-f $x ] && echo "$0: missing file: $x" && exit 1; done 39 | 40 | dirname=$(basename $dst_dir) 41 | exp_dir=./exp/cacgmm/$dirname && mkdir -p $exp_dir 42 | stft_opts=$(cat $stft_conf | xargs) 43 | 44 | split_wav_scp="" && for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done 45 | 46 | ./utils/split_scp.pl $wav_scp $split_wav_scp 47 | 48 | cacgmm_opts="--num-iters $iters --num-classes $num_classes --solve-permu $solve_permu" 49 | [ ! -z $init_mask ] && cacgmm_opts="$cacgmm_opts --init-mask $init_mask --mask-format $mask_format" 50 | 51 | mkdir -p $dst_dir 52 | $cmd JOB=1:$nj $exp_dir/log/run_cacgmm.JOB.log \ 53 | ./scripts/sptk/estimate_cacgmm_masks.py \ 54 | $stft_opts $cacgmm_opts \ 55 | $exp_dir/wav.JOB.scp \ 56 | $dst_dir 57 | 58 | echo "$0: Estimate mask using Cacgmm (K = $num_classes) methods done" 59 | -------------------------------------------------------------------------------- /scripts/run_cgmm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2018 4 | 5 | set -eu 6 | 7 | nj=40 8 | cmd="run.pl" 9 | iters=20 10 | # stft.conf example: 11 | # --frame-length 1024 12 | # --frame-shift 256 13 | # --window hann 14 | # --center true 15 | stft_conf=conf/stft.conf 16 | init_mask= 17 | num_classes=2 18 | solve_permu=false 19 | mask_format="numpy" 20 | 21 | echo "$0 $@" 22 | 23 | function usage { 24 | echo "Options:" 25 | echo " --nj # number of jobs to run parallel, (default=$nj)" 26 | echo " --cmd # how to run jobs, (default=$cmd)" 27 | echo " --stft-conf # stft configurations files, (default=$stft_conf)" 28 | echo " --iters # number of iterations to run CGMM, (default=$iters)" 29 | echo " --num-classes # number of the cluster used in cgmm model, (default=$num_classes)" 30 | echo " --solve-permu # solve frequency permutation or not, (default=$solve_permu)" 31 | echo " --init-mask # dir or script for mask initialization, (default=$init_mask)" 32 | echo " 
--mask-format # mask storage type, (default=$mask_format)" 33 | } 34 | 35 | . ./path.sh 36 | . ./utils/parse_options.sh || exit 1 37 | 38 | [ $# -ne 2 ] && echo "Script format error: $0 " && usage && exit 1 39 | 40 | wav_scp=$1 41 | dst_dir=$2 42 | 43 | for x in $wav_scp $stft_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done 44 | 45 | dirname=$(basename $dst_dir) 46 | exp_dir=./exp/cgmm/$dirname && mkdir -p $exp_dir 47 | stft_opts=$(cat $stft_conf | xargs) 48 | 49 | split_wav_scp="" && for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done 50 | 51 | ./utils/split_scp.pl $wav_scp $split_wav_scp 52 | 53 | cgmm_opts="--num-iters $iters --num-classes $num_classes --solve-permu $solve_permu" 54 | [ ! -z $init_mask ] && cgmm_opts="$cgmm_opts --init-mask $init_mask --mask-format $mask_format" 55 | 56 | mkdir -p $dst_dir 57 | $cmd JOB=1:$nj $exp_dir/log/run_cgmm.JOB.log \ 58 | ./scripts/sptk/estimate_cgmm_masks.py \ 59 | $stft_opts $cgmm_opts \ 60 | $exp_dir/wav.JOB.scp \ 61 | $dst_dir 62 | 63 | echo "$0: Estimate mask using CGMM methods done" 64 | -------------------------------------------------------------------------------- /scripts/run_ds_beamformer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2018 4 | 5 | set -eu 6 | 7 | nj=40 8 | cmd="run.pl" 9 | fs=16000 10 | speed=343 11 | geometry="linear" 12 | linear_topo="0,0.2,0.4,0.6" 13 | circular_radius=0.5 14 | circular_center=false 15 | circular_around=6 16 | doa_list="30 70 110 150" 17 | utt2doa="" 18 | stft_conf=./conf/stft.conf 19 | 20 | echo "$0 $@" 21 | 22 | function usage { 23 | echo "Options:" 24 | echo " --nj # number of jobs to run parallel, (default=$nj)" 25 | echo " --cmd # how to run jobs, (default=$cmd)" 26 | echo " --stft-conf # stft configurations files, (default=$stft_conf)" 27 | echo " --fs # sample frequency for source signal, (default=$fs)" 28 | echo " --geometry # geometry of 
the array, (default=$geometry)" 29 | echo " --linear-topo # topology for linear microphone arrays, (default=$linear_topo)" 30 | echo " --circular-center
# is there a microphone in the center, (default=$circular_center)" 31 | echo " --circular-radius # radius of the array, (default=$circular_radius)" 32 | echo " --circular-around # number microphones around the center, (default=$circular_around)" 33 | echo " --doa-list # list of DoA to be processed, (default=$doa_list)" 34 | echo " --utt2doa # utt2doa file, (default=$utt2doa)" 35 | echo " --speed # sound speed, (default=$speed)" 36 | } 37 | 38 | . ./path.sh 39 | . ./utils/parse_options.sh || exit 1 40 | 41 | [ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1 42 | 43 | wav_scp=$1/wav.scp 44 | exp_dir=$2 45 | dst_dir=$3 46 | 47 | for x in $stft_conf $wav_scp; do [ ! -f $x ] && echo "$0: Missing file: $x" && exit 1; done 48 | [ ! -d $exp_dir ] && mkdir -p $exp_dir 49 | 50 | split_wav_scp="" 51 | for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done 52 | 53 | ./utils/split_scp.pl $wav_scp $split_wav_scp 54 | stft_opts=$(cat $stft_conf | xargs) 55 | beamformer_opts="--sr $fs --speed $speed --geometry $geometry" 56 | 57 | case $geometry in 58 | "linear" ) 59 | beamformer_opts="$beamformer_opts --linear-topo $linear_topo" 60 | ;; 61 | "circular" ) 62 | beamformer_opts="$beamformer_opts --circular-around $circular_around" 63 | beamformer_opts="$beamformer_opts --circular-radius $circular_radius" 64 | beamformer_opts="$beamformer_opts --circular_center $circular_center" 65 | ;; 66 | * ) 67 | echo "$0: Unknown type of geometry: $geometry" && exit 1 68 | ;; 69 | esac 70 | if [ ! -z $utt2doa ]; then 71 | echo "$0: Run DS beamformer on $utt2doa ..." 
72 | mkdir -p $dst_dir/doa${doa}_$dirname 73 | $cmd JOB=1:$nj $exp_dir/run_ds.JOB.log \ 74 | ./scripts/sptk/apply_ds_beamformer.py \ 75 | $stft_opts $beamformer_opts \ 76 | --utt2doa $utt2doa \ 77 | $exp_dir/wav.JOB.scp \ 78 | $dst_dir 79 | echo "$0: Run delay and sum beamformer -- $utt2doa done" 80 | else 81 | dirname=$(basename $1) 82 | for doa in $doa_list; do 83 | echo "$0: Run DS beamformer on DoA $doa ..." 84 | mkdir -p $dst_dir/doa${doa}_$dirname 85 | $cmd JOB=1:$nj $exp_dir/$dirname.$doa.ds.JOB.log \ 86 | ./scripts/sptk/apply_ds_beamformer.py \ 87 | $stft_opts $beamformer_opts \ 88 | --doa $doa \ 89 | $exp_dir/wav.JOB.scp \ 90 | $dst_dir/doa${doa}_$dirname 91 | done 92 | echo "$0: Run delay and sum beamformer -- $doa_list done" 93 | fi 94 | 95 | 96 | -------------------------------------------------------------------------------- /scripts/run_fixed_beamformer.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2018 4 | 5 | set -eu 6 | 7 | nj=40 8 | cmd="run.pl" 9 | stft_conf=conf/stft.conf 10 | beam="" 11 | 12 | echo "$0 $@" 13 | 14 | function usage { 15 | echo "Options:" 16 | echo " --nj # number of jobs to run parallel, (default=$nj)" 17 | echo " --cmd # how to run jobs, (default=$cmd)" 18 | echo " --stft-conf # stft configurations files, (default=$stft_conf)" 19 | echo " --beam # beam index to use in beamformer weights, (default=$beam)" 20 | } 21 | 22 | . ./path.sh 23 | . ./utils/parse_options.sh || exit 1 24 | 25 | [ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1 26 | 27 | wav_scp=$1 28 | weight=$2 29 | enhan_dir=$3 30 | 31 | for x in $wav_scp $weight $stft_conf; do [ ! 
#!/usr/bin/env bash

# wujian@2018

# Run the super-directive (SD) beamformer on every utterance of a data
# directory. Two modes are supported:
#   * --utt2doa given : one pass, each utterance uses its own DoA, results
#                       are written directly to the destination directory;
#   * otherwise       : one pass per DoA in --doa-list, results are written
#                       to <dst-dir>/doa<DoA>_<basename of data-dir>.
# Positional arguments: $1 = data directory holding wav.scp,
#                       $2 = experiment directory (scp splits and logs),
#                       $3 = destination directory.

set -eu

nj=40
cmd="run.pl"
fs=16000
speed=343
geometry="linear"
linear_topo="0,0.2,0.4,0.6"
circular_radius=0.5
circular_center=false
circular_around=6
doa_list="30 70 110 150"
utt2doa=""
stft_conf=./conf/stft.conf

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=$nj)"
  echo " --cmd # how to run jobs, (default=$cmd)"
  echo " --stft-conf # stft configurations files, (default=$stft_conf)"
  echo " --fs # sample frequency for source signal, (default=$fs)"
  echo " --geometry # geometry of the array, (default=$geometry)"
  echo " --linear-topo # topology for linear microphone arrays, (default=$linear_topo)"
  echo " --circular-center # is there a microphone in the center, (default=$circular_center)"
  echo " --circular-radius # radius of the array, (default=$circular_radius)"
  echo " --circular-around # number microphones around the center, (default=$circular_around)"
  echo " --doa-list # list of DoA to be processed, (default=$doa_list)"
  echo " --utt2doa # utt2doa file, (default=$utt2doa)"
  echo " --speed # sound speed, (default=$speed)"
}

. ./path.sh
. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1

wav_scp=$1/wav.scp
exp_dir=$2
dst_dir=$3

for x in $stft_conf $wav_scp; do [ ! -f $x ] && echo "$0: Missing file: $x" && exit 1; done
[ ! -d $exp_dir ] && mkdir -p $exp_dir

# one scp split per parallel job
split_wav_scp=""
for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done

./utils/split_scp.pl $wav_scp $split_wav_scp
stft_opts=$(cat $stft_conf | xargs)
beamformer_opts="--sr $fs --speed $speed --geometry $geometry"

case $geometry in
  "linear" )
    beamformer_opts="$beamformer_opts --linear-topo $linear_topo"
    ;;
  "circular" )
    beamformer_opts="$beamformer_opts --circular-around $circular_around"
    beamformer_opts="$beamformer_opts --circular-radius $circular_radius"
    # the python command declares "--circular-center" (dash, not underscore)
    beamformer_opts="$beamformer_opts --circular-center $circular_center"
    ;;
  * )
    echo "$0: Unknown type of geometry: $geometry" && exit 1
    ;;
esac

if [ -n "$utt2doa" ]; then
  echo "$0: Run supper-directive beamformer on $utt2doa ..."
  # results go straight to $dst_dir in this mode; $doa and $dirname are not
  # defined here, so referencing them would abort the script under "set -u"
  mkdir -p $dst_dir
  $cmd JOB=1:$nj $exp_dir/run_sd.JOB.log \
    ./scripts/sptk/apply_sd_beamformer.py \
    $stft_opts $beamformer_opts \
    --utt2doa $utt2doa \
    $exp_dir/wav.JOB.scp \
    $dst_dir
  echo "$0: Run supper-directive beamformer -- $utt2doa done"
else
  dirname=$(basename $1)
  for doa in $doa_list; do
    echo "$0: Run supper-directive beamformer on DoA $doa ..."
    mkdir -p $dst_dir/doa${doa}_$dirname
    $cmd JOB=1:$nj $exp_dir/$dirname.$doa.sd.JOB.log \
      ./scripts/sptk/apply_sd_beamformer.py \
      $stft_opts $beamformer_opts \
      --doa $doa \
      $exp_dir/wav.JOB.scp \
      $dst_dir/doa${doa}_$dirname
  done
  echo "$0: Run supper-directive beamformer -- $doa_list done"
fi
#!/usr/bin/env bash

# wujian@2018

# Apply TF-masks to mixture audio and dump the enhanced waves.
# Positional arguments: $1 = wave scp,
#                       $2 = mask scp, or a directory of *.npy masks,
#                       $3 = directory for enhanced output.

set -eu

nj=40
cmd="run.pl"

mask_format="numpy"
keep_length=false
fs=16000
stft_conf=conf/stft.conf
phase_ref=

echo "$0 $@"

function usage {
  echo "Options:"
  echo " --nj # number of jobs to run parallel, (default=$nj)"
  echo " --cmd # how to run jobs, (default=$cmd)"
  echo " --stft-conf # stft configurations files, (default=$stft_conf)"
  echo " --mask-format # load masks from np.ndarray instead, (default=$mask_format)"
  echo " --keep-length # keep same length as original or not, (default=$keep_length)"
  echo " --phase-ref # use phase reference or mixture, (default=$phase_ref)"
  echo " --fs # sample frequency for output wave, (default=$fs)"
}

. ./path.sh
. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "Script format error: $0 " && usage && exit 1

wav_scp=$1
enhan_dir=$3

for x in $wav_scp $stft_conf; do [ ! -f $x ] && echo "$0: missing file: $x" && exit 1; done

dirname=$(basename $enhan_dir)
exp_dir=exp/tf_masking/$dirname && mkdir -p $exp_dir

# if second parameter is a directory, index every *.npy under it as
# "<basename without .npy>\t<path>"
if [ -d $2 ]; then
  [ $mask_format != "numpy" ] && echo "$0: $2 is a directory, expected to set --mask-format numpy" && exit 1
  find $2 -name "*.npy" | awk -F '/' '{printf("%s\t%s\n", $NF, $0)}' | \
    sed 's:\.npy::' | sort -k1 > $exp_dir/masks.scp
  echo "$0: Got $(cat $exp_dir/masks.scp | wc -l) numpy's masks"
else
  cp $2 $exp_dir/masks.scp
fi

# keep only the utterances that actually have a mask
awk '{print $1}' $exp_dir/masks.scp | ./utils/filter_scp.pl -f 1 - $wav_scp | sort -k1 > $exp_dir/wav.scp
echo "$0: Reduce $(cat $wav_scp | wc -l) utterances to $(cat $exp_dir/wav.scp | wc -l)"

split_wav_scp=""
for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done

# split the filtered list; splitting the raw $wav_scp would hand the jobs
# utterances that have no mask
./utils/split_scp.pl $exp_dir/wav.scp $split_wav_scp || exit 1

mask_opts=$(cat $stft_conf | xargs)
mask_opts="$mask_opts --keep-length $keep_length"
[ -n "$phase_ref" ] && mask_opts="$mask_opts --phase-ref $phase_ref"

mkdir -p $enhan_dir
# per-job logs end in .log so they are not mistaken for scp files
$cmd JOB=1:$nj $exp_dir/log/wav_separate.JOB.log \
  ./scripts/sptk/wav_separate.py \
  --sample-frequency $fs \
  --mask-format $mask_format \
  $mask_opts \
  $exp_dir/wav.JOB.scp \
  $exp_dir/masks.scp \
  $enhan_dir

echo "$0: Run TF-masking done"
&& exit 1; done 37 | 38 | split_wav_scp="" 39 | for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done 40 | 41 | ./utils/split_scp.pl $wav_scp $split_wav_scp 42 | 43 | $cmd JOB=1:$nj $exp_dir/log/cut_silence.JOB.log \ 44 | ./scripts/sptk/do_vad.py \ 45 | --mode $mode \ 46 | --sr $sr \ 47 | --chunk-size $chunk_size \ 48 | --cache-size $cache_size \ 49 | $exp_dir/wav.JOB.scp \ 50 | $dst_dir 51 | 52 | echo "$0: done" 53 | 54 | -------------------------------------------------------------------------------- /scripts/run_wpe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2019 4 | 5 | set -eu 6 | 7 | nj=20 8 | cmd="run.pl" 9 | iters=3 10 | stft_conf=conf/wpe.conf 11 | delay=3 12 | taps=10 13 | context=1 14 | sr=16000 15 | nara_wpe=false 16 | 17 | echo "$0 $@" 18 | 19 | function usage { 20 | echo "Options:" 21 | echo " --nj # number of jobs to run parallel, (default=$nj)" 22 | echo " --cmd # how to run jobs, (default=$cmd)" 23 | echo " --stft-conf # stft configuration files, (default=$stft_conf)" 24 | echo " --iters # number of iters to run GWPE, (default=$iters)" 25 | echo " --delay # time delay in GWPE, (default=$delay)" 26 | echo " --taps # number of taps in GWPE, (default=$taps)" 27 | echo " --context # left/right context used in PSD matrix estimation, (default=$context)" 28 | echo " --sr # sample rate for source wave, (default=$sr)" 29 | echo " --nara-wpe # use nara-wpe or not, (default=$nara_wpe)" 30 | 31 | } 32 | 33 | . ./path.sh 34 | . ./utils/parse_options.sh || exit 1 35 | 36 | [ $# -ne 2 ] && echo "Script format error: $0 " && usage && exit 1 37 | 38 | wav_scp=$1 39 | dst_dir=$2 40 | 41 | for x in $wav_scp $stft_conf; do [ ! 
 -f $x ] && echo "$0: missing file: $x" && exit 1; done 42 | 43 | dirname=$(basename $dst_dir) 44 | exp_dir=./exp/wpe/$dirname && mkdir -p $exp_dir 45 | stft_opts=$(cat $stft_conf | xargs) 46 | 47 | split_wav_scp="" && for n in $(seq $nj); do split_wav_scp="$split_wav_scp $exp_dir/wav.$n.scp"; done 48 | 49 | ./utils/split_scp.pl $wav_scp $split_wav_scp 50 | 51 | mkdir -p $dst_dir 52 | $cmd JOB=1:$nj $exp_dir/log/run_wpe.JOB.log \ 53 | ./scripts/sptk/apply_wpe.py \ 54 | $stft_opts --num-iters $iters \ 55 | --sample-rate $sr \ 56 | --nara-wpe $nara_wpe \ 57 | --context $context \ 58 | --taps $taps --delay $delay \ 59 | $exp_dir/wav.JOB.scp \ 60 | $dst_dir 61 | 62 | echo "$0: Run wpe algorithm done" 63 | -------------------------------------------------------------------------------- /scripts/sptk/README.md: -------------------------------------------------------------------------------- 1 | Python scripts (Python 3.6+) for speech enhancement/separation, integrated with Kaldi, which can also be used 2 | independently. 3 | 4 | * Supervised (mask-based) adaptive beamformer (GEVD/MVDR/PMWF) 5 | * Data conversion among MATLAB, Numpy and Kaldi 6 | * Data visualization (TF-mask, spatial/spectral features, beam pattern...) 7 | * Unified data and IO handlers for Kaldi's scripts, archives, wave, spectrogram, numpy's ndarray... 8 | * Unsupervised mask estimation (CGMM/CACGMM) 9 | * Spatial/Spectral feature computation 10 | * DS (delay and sum) beamformer, SD (super-directive) beamformer 11 | * AuxIVA, WPE & WPD, FB (Fixed Beamformer) 12 | * Mask computation (iam, irm, ibm, psm, crm) 13 | * RIR simulation (1D/2D arrays) 14 | * Single channel speech separation (TF spectral masking) 15 | * Si-SDR/SDR/WER evaluation 16 | * Pywebrtc vad wrapper 17 | * Mask-based source localization 18 | * Noise suppression 19 | * Data simulation 20 | * ... 
def auxiva(X, epochs=20):
    """
    Blind source separation with AuxIVA (auxiliary-function based
    independent vector analysis).

    Arguments:
        X: complex mixture STFT, shape in N x T x F
        epochs: number of sweeps over all frequency bins and sources;
                with 0 the input is returned unchanged (identity demixing)
    Return
        Y: separated STFT, same shape as X
    """
    N, T, F = X.shape
    # X: F x T x N
    X = X.transpose([2, 1, 0])
    # F x N x N, one identity demixing matrix per frequency bin.
    # np.complex was removed in NumPy 1.24, so the dtype is spelled out.
    W = np.tile(np.eye(N, dtype=np.complex128), (F, 1, 1))
    I = np.eye(N)
    # Y: F x T x N
    Y = np.einsum("...tn,...nx->...tx", X, np.conj(W))

    for _ in range(epochs):
        # T x N, source activity summed over frequency
        R = np.sqrt(np.sum(np.abs(Y)**2, axis=0))
        # N x T
        Gr = 1 / (R.T + EPSILON)
        for f in range(F):
            for n in range(N):
                # compute V: weighted covariance of the mixture, N x N
                V = (np.dot(np.expand_dims(Gr[n], 0) * X[f].T, np.conj(
                    X[f]))) / T
                # update W
                w = np.linalg.solve(np.conj(W[f].T) @ V, I[n])
                # normalise so that w^H V w = 1; the update rule divides by
                # the square root of the quadratic form, not the form itself
                W[f, :, n] = w / np.sqrt(np.inner(np.conj(w), V @ w))

        Y = np.einsum("...tn,...nx->...tx", X, np.conj(W))
    # F x T x N => N x T x F
    Y = np.transpose(Y, [2, 1, 0])
    return Y
def run(args):
    """
    Apply the delay and sum beamformer.

    Sets args.beamformer to "ds" (the namespace is modified in place) and
    forwards it to the run() imported from apply_classic_beamformer.
    """
    args.beamformer = "ds"
    run_classic_beamformer(args)
def run(args):
    """
    Enhance each utterance with pre-designed fixed beamformer weights.

    The weight file is either F x N (a single beam, used for every
    utterance) or B x F x N (several beams); in the latter case --beam must
    name a script that maps each utterance key to the integer beam index.

    Raises:
        RuntimeError: multi-beam weights were given without --beam
    """
    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "window": args.window,
        "center": args.center,
        "transpose": False
    }
    spectrogram_reader = SpectrogramReader(
        args.wav_scp, round_power_of_two=args.round_power_of_two, **stft_kwargs)
    # F x N or B x F x N
    weights = np.load(args.weights)
    if weights.ndim == 2:
        beamformer = FixedBeamformer(weights)
        beam_index = None
    else:
        beamformer = [FixedBeamformer(w) for w in weights]
        if not args.beam:
            raise RuntimeError(
                "--beam must be assigned, as there are multiple beams")
        beam_index = ScpReader(args.beam, value_processor=int)
    with WaveWriter(args.dst_dir) as writer:
        for key, stft_mat in spectrogram_reader:
            logger.info(f"Processing utterance {key}...")
            # Branch on beam_index, not on the beamformer object: the
            # beamformer is set in both cases, so testing it would send
            # the single-beam case into beam_index[key] with beam_index None
            if beam_index is not None:
                beam = beam_index[key]
                stft_enh = beamformer[beam].run(stft_mat)
            else:
                stft_enh = beamformer.run(stft_mat)
            norm = spectrogram_reader.maxabs(key)
            samps = inverse_stft(stft_enh, **stft_kwargs, norm=norm)
            writer.write(key, samps)
    logger.info(f"Processed {len(spectrogram_reader):d} utterances")
def run(args):
    """
    Single channel noise suppression driven by the iMCRA estimator.

    Depending on --output, either the enhanced waveform ("wave") or the raw
    gain coefficients are dumped per utterance. Only 16kHz audio is accepted.
    """
    if args.sr != 16000:
        raise ValueError("Now only support audio in 16kHz")
    # STFT matrices come out as T x F, complex
    stft_kwargs = dict(frame_len=args.frame_len,
                       frame_hop=args.frame_hop,
                       window=args.window,
                       center=args.center)
    reader = SpectrogramReader(args.wav_scp,
                               round_power_of_two=args.round_power_of_two,
                               **stft_kwargs)

    # optional yaml file overrides the estimator defaults
    imcra_kwargs = {}
    if args.conf:
        with open(args.conf, "r") as conf_fd:
            imcra_kwargs = yaml.full_load(conf_fd)
    suppressor = iMCRA(**imcra_kwargs)

    dump_wave = args.output == "wave"
    if dump_wave:
        sink = WaveWriter(args.dst_dir, sr=args.sr)
    else:
        sink = NumpyWriter(args.dst_dir)
    with sink as writer:
        for key, stft in reader:
            logger.info(f"Processing utterance {key}...")
            gain = suppressor.run(stft)
            if dump_wave:
                # apply the gain and go back to the time domain
                writer.write(key, inverse_stft(gain * stft, **stft_kwargs))
            else:
                writer.write(key, gain)
    logger.info(f"Processed {len(reader):d} utterances done")
"__main__": 21 | parser = argparse.ArgumentParser( 22 | description= 23 | "Command to apply supperdirective beamformer (linear & circular array).", 24 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 25 | parents=[StftParser.parser]) 26 | parser.add_argument("wav_scp", 27 | type=str, 28 | help="Rspecifier for multi-channel wave file") 29 | parser.add_argument("dst_dir", 30 | type=str, 31 | help="Directory to dump enhanced results") 32 | parser.add_argument("--sr", 33 | type=int, 34 | default=16000, 35 | help="Sample rate of the input wave") 36 | parser.add_argument("--speed", 37 | type=float, 38 | default=343, 39 | help="Speed of sound") 40 | parser.add_argument("--geometry", 41 | type=str, 42 | choices=["linear", "circular"], 43 | default="linear", 44 | help="Geometry of the microphone array") 45 | parser.add_argument("--linear-topo", 46 | type=str2tuple, 47 | default=(), 48 | help="Topology of linear microphone arrays") 49 | parser.add_argument("--circular-around", 50 | type=int, 51 | default=6, 52 | help="Number of the micriphones in circular arrays") 53 | parser.add_argument("--circular-radius", 54 | type=float, 55 | default=0.05, 56 | help="Radius of circular array") 57 | parser.add_argument("--circular-center", 58 | type=strtobool, 59 | default=False, 60 | help="Is there a microphone put in the " 61 | "center of the circular array?") 62 | parser.add_argument("--utt2doa", 63 | type=str, 64 | default="", 65 | help="Given DoA for each utterances, in degrees") 66 | parser.add_argument("--doa", 67 | type=str, 68 | default="0", 69 | help="DoA for all utterances if " 70 | "--utt2doa is not assigned") 71 | parser.add_argument("--normalize", 72 | type=strtobool, 73 | default=False, 74 | help="Normalize stft after enhancement?") 75 | parser.add_argument("--chunk-len", 76 | type=int, 77 | default=-1, 78 | help="Number frames per chunk " 79 | "(for online setups)") 80 | args = parser.parse_args() 81 | run(args) 82 | 
def run(args):
    """
    Average numpy vectors/matrices and dump the means.

    Without --spk2utt every input must be a 2D matrix and its row mean is
    written under the same key; with --spk2utt every input must be a vector
    and one mean per speaker is written. With --normalize, rows/vectors are
    scaled to unit L2 norm before averaging.
    """
    reader = NumpyReader(args.npy_scp)

    spk2utt = None
    if args.spk2utt:
        spk2utt = parse_scps(args.spk2utt, num_tokens=-1)

    with NumpyWriter(args.dump_dir, args.scp) as writer:
        if spk2utt is not None:
            # per-speaker mean over the speaker's utterance vectors
            for spkid, uttlist in spk2utt.items():
                vectors = []
                for uttid in uttlist:
                    vec = reader[uttid]
                    if vec.ndim != 1:
                        raise RuntimeError(
                            "--spk2utt is not None, expect input as vector, got {:d}"
                            .format(vec.ndim))
                    if args.normalize:
                        vec = vec / np.linalg.norm(vec)
                    vectors.append(vec)
                writer.write(spkid, np.mean(np.stack(vectors), axis=0))
            logger.info("Processed {:d} speakers".format(len(spk2utt)))
        else:
            # per-entry mean over the rows of each matrix
            for key, mat in reader:
                if mat.ndim != 2:
                    raise RuntimeError(
                        "--spk2utt is None, so input ndarray must be 2D, got {:d}"
                        .format(mat.ndim))
                if args.normalize:
                    row_norm = np.linalg.norm(mat, ord=2, axis=1, keepdims=True)
                    mat = mat / row_norm
                writer.write(key, np.mean(mat, axis=0))
            logger.info("Processed {:d} speakers".format(len(reader)))
def run(args):
    """
    Compute the SRP angular spectrum of a circular array by averaging the
    GCC-PHAT responses of the diagonal microphone pairs in --diag-pair.

    Raises:
        RuntimeError: --diag-pair holds no pair, or a result contains NaN
    """
    # "i,j;k,l" -> [(i, j), (k, l)]; empty tokens are skipped so that an
    # empty option reaches the explicit check below instead of int("")
    srp_pair = [
        tuple(map(int, p.split(",")))
        for p in args.diag_pair.split(";")
        if p.strip()
    ]
    if not srp_pair:
        raise RuntimeError(
            f"Bad configurations with --diag-pair {args.diag_pair}")
    logger.info(f"Compute gcc with {srp_pair}")

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": True  # T x F
    }
    num_done = 0
    num_ffts = nextpow2(
        args.frame_len) if args.round_power_of_two else args.frame_len
    reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    with ArchiveWriter(args.srp_ark, args.scp) as writer:
        for key, stft_mat in reader:
            num_done += 1
            srp = []
            # N x T x F
            for (i, j) in srp_pair:
                srp.append(
                    gcc_phat_diag(stft_mat[i],
                                  stft_mat[j],
                                  min(i, j) * np.pi * 2 / args.n,
                                  args.d,
                                  num_bins=num_ffts // 2 + 1,
                                  sr=args.sr,
                                  num_doas=args.num_doas))
            # average the responses of all pairs
            srp = np.average(np.stack(srp), axis=0)
            nan = np.sum(np.isnan(srp))
            if nan:
                raise RuntimeError(f"Matrix {key} has nan ({nan:d}) items)")
            writer.write(key, srp)
            if not num_done % 1000:
                logger.info(f"Processed {num_done:d} utterances...")
    logger.info(f"Processd {len(reader):d} utterances done")
def run(args):
    """
    Compute directional features for each utterance in args.wav_scp, using
    a steer vector estimated from the mask-weighted speech covariance, and
    write them to args.dup_ark.

    Utterances without a TF-mask in args.mask_scp are skipped with a warning.

    Raises:
        RuntimeError: if --df-pair contains no microphone pair
    """
    # validate the pair configuration first, before any reader is opened
    # "0,1;2,3" => [(0, 1), (2, 3)], empty fields are skipped
    df_pair = [
        tuple(map(int, p.split(",")))
        for p in args.df_pair.split(";")
        if p.strip()
    ]
    if not df_pair:
        raise RuntimeError(
            f"Bad configurations with --df-pair {args.df_pair}")
    logger.info(f"Compute directional feature with {df_pair}")

    stft_kwargs = {
        "frame_len": args.frame_len,
        "frame_hop": args.frame_hop,
        "round_power_of_two": args.round_power_of_two,
        "window": args.window,
        "center": args.center,  # false to comparable with kaldi
        "transpose": False  # F x T
    }
    feat_reader = SpectrogramReader(args.wav_scp, **stft_kwargs)
    MaskReader = {"numpy": NumpyReader, "kaldi": ScriptReader}
    mask_reader = MaskReader[args.fmt](args.mask_scp)

    num_done = 0
    with ArchiveWriter(args.dup_ark, args.scp) as writer:
        for key, obs in feat_reader:
            if key in mask_reader:
                speech_masks = mask_reader[key]
                # make sure speech_masks in T x F
                _, F, _ = obs.shape
                if speech_masks.shape[0] == F:
                    speech_masks = np.transpose(speech_masks)
                # clip mask values to at most 1
                speech_masks = np.minimum(speech_masks, 1)
                # obs: N x F x T
                speech_covar = compute_covar(obs, speech_masks)
                sv = solve_pevd(speech_covar)
                df = directional_feats(obs, sv.T, df_pair=df_pair)
                writer.write(key, df)
                num_done += 1
                if not num_done % 1000:
                    logger.info(f"Processed {num_done:d} utterance...")
            else:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(f"Missing TF-mask for utterance {key}")
    logger.info(f"Processed {num_done:d} utterances over {len(feat_reader):d}")
"Command to compute directional features for arbitrary arrays, " 66 | "based on estimated TF-masks", 67 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 68 | parents=[StftParser.parser]) 69 | parser.add_argument("wav_scp", 70 | type=str, 71 | help="Multi-Channel wave scripts in kaldi format") 72 | parser.add_argument("mask_scp", 73 | type=str, 74 | help="Scripts of masks in kaldi's " 75 | "archive or numpy's ndarray") 76 | parser.add_argument("dup_ark", 77 | type=str, 78 | help="Location to dump features in kaldi's archives") 79 | parser.add_argument("--scp", 80 | type=str, 81 | default="", 82 | help="If assigned, generate corresponding " 83 | "feature scripts") 84 | parser.add_argument("--mask-format", 85 | dest="fmt", 86 | choices=["kaldi", "numpy"], 87 | default="kaldi", 88 | help="Define format of masks, in kaldi's " 89 | "archives or numpy's ndarray") 90 | parser.add_argument("--df-pair", 91 | type=str, 92 | default="0,1", 93 | help="Microphone pairs for directional " 94 | "feature computation") 95 | args = parser.parse_args() 96 | run(args) 97 | -------------------------------------------------------------------------------- /scripts/sptk/compute_dpcl_label.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # wujian@2019 4 | """ 5 | Compute labels for DC (Deep Clustering) training: 6 | -1 means silence 7 | 0...N for each speaker 8 | """ 9 | 10 | import argparse 11 | 12 | import numpy as np 13 | 14 | from libs.data_handler import SpectrogramReader, NumpyWriter 15 | from libs.opts import StftParser 16 | from libs.utils import get_logger, EPSILON 17 | 18 | logger = get_logger(__name__) 19 | 20 | 21 | def run(args): 22 | # shape: T x F 23 | stft_kwargs = { 24 | "frame_len": args.frame_len, 25 | "frame_hop": args.frame_hop, 26 | "round_power_of_two": args.round_power_of_two, 27 | "window": args.window, 28 | "center": args.center, 29 | "apply_abs": True, 30 | } 31 | spk_scps = 
args.spks.split(",") 32 | if len(spk_scps) < 2: 33 | raise RuntimeError("Please give at least 2 speakers") 34 | mix_reader = SpectrogramReader(args.mix, **stft_kwargs) 35 | spk_reader = [SpectrogramReader(spk, **stft_kwargs) for spk in spk_scps] 36 | 37 | with NumpyWriter(args.dir) as writer: 38 | for key, mix in mix_reader: 39 | T, F = mix.shape 40 | masks = np.zeros_like(mix, dtype=np.float32) 41 | # sil: -1 42 | mix_2db = 20 * np.log10(np.maximum(mix, EPSILON)) 43 | sil_idx = mix_2db < (np.max(mix_2db) - args.beta) 44 | masks[sil_idx] = -1 45 | logger.info("For {}, silence covered {:.2f}%".format( 46 | key, 47 | np.sum(sil_idx) * 100 / (T * F))) 48 | # for each speaker 49 | act_idx = ~sil_idx 50 | labels = np.argmax(np.stack([reader[key] for reader in spk_reader]), 51 | axis=0) 52 | masks[act_idx] = labels[act_idx] 53 | writer.write(key, masks) 54 | logger.info("Processed {:d} utterances done".format(len(mix_reader))) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser( 59 | description="Command to compute labels for DC (Deep Clustering) " 60 | "training, -1 means silence, 0..N for each speaker", 61 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 62 | parents=[StftParser.parser]) 63 | parser.add_argument("mix", type=str, help="Rspecifier for mixture") 64 | parser.add_argument("spks", 65 | type=str, 66 | help="Rspecifier for multiple speakers, " 67 | "separated by \',\', egs: spk1.scp,spk2.scp") 68 | parser.add_argument("dir", 69 | type=str, 70 | help="Directory to store computed labels") 71 | parser.add_argument("--beta", 72 | type=float, 73 | default=40, 74 | help="Threshold to discriminate silence bins (in dB)") 75 | args = parser.parse_args() 76 | run(args) 77 | -------------------------------------------------------------------------------- /scripts/sptk/compute_sdr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # wujian@2019 3 | 4 | import argparse 5 | 
class Report(object):
    """
    Accumulate per-utterance SDR values and print averages, overall and
    per class (when a key-to-class map is given).
    """

    def __init__(self, spk2class=None):
        # optional map: utterance key => class string
        self.s2c = ScpReader(spk2class) if spk2class else None
        # per-class sum of values and number of utterances
        self.snr = defaultdict(float)
        self.cnt = defaultdict(int)

    def add(self, key, val):
        """
        Add value of one utterance, "NG" is used when no class map is given
        """
        cls_str = "NG"
        if self.s2c:
            cls_str = self.s2c[key]
        self.snr[cls_str] += val
        self.cnt[cls_str] += 1

    def report(self):
        """
        Print total and per-class averages to stdout
        """
        print("SDR(dB) Report: ")
        tot_utt = sum(self.cnt.values())
        tot_snr = sum(self.snr.values())
        # nothing added yet: report nan instead of dividing by zero
        avg_snr = tot_snr / tot_utt if tot_utt else float("nan")
        print("Total: {:d}/{:.3f}".format(tot_utt, avg_snr))
        for cls_str in self.snr:
            cls_snr = self.snr[cls_str]
            num_utt = self.cnt[cls_str]
            print("\t{}: {:d}/{:.3f}".format(cls_str, num_utt,
                                             cls_snr / num_utt))


def run(args):
    """
    Evaluate SDR between separated and reference signals with
    bss_eval_sources, print the report and optionally dump per-utterance
    values (args.per_utt) and source alignments (args.utt_ali).
    """
    sep_reader = AudioReader(args.sep_scp)
    ref_reader = AudioReader(args.ref_scp)
    utt_snr, utt_ali = None, None
    try:
        utt_snr = open(args.per_utt, "w") if args.per_utt else None
        utt_ali = open(args.utt_ali, "w") if args.utt_ali else None
        reporter = Report(args.spk2class)
        # sep: N x S
        for key, sep in sep_reader:
            # ref: N x S
            ref = ref_reader[key]
            # keep same shape
            nsamps = min(sep.shape[-1], ref.shape[-1])
            sdr, _, _, ali = bss_eval_sources(ref[:, :nsamps],
                                              sep[:, :nsamps])
            # average over sources
            sdr = np.mean(sdr)
            reporter.add(key, sdr)
            if utt_snr:
                utt_snr.write("{}\t{:.2f}\n".format(key, sdr))
            if utt_ali:
                ali_str = " ".join(map(str, ali))
                utt_ali.write(f"{key}\t{ali_str}\n")
        reporter.report()
    finally:
        # close the output files even if evaluation fails halfway
        if utt_snr:
            utt_snr.close()
        if utt_ali:
            utt_ali.close()
def run(args):
    """
    Score each evaluation embedding against all enrollment embeddings using
    the dot product and print one "<score> target|nontarget" line per
    (utterance, enrolled speaker) pair to stdout.

    Raises:
        RuntimeError: if the speaker of an evaluation utterance is not
            found in the enrollment set
    """
    utt2spk = parse_scps(args.utt2spk)

    def Reader(scp, t):
        return NumpyReader(scp) if t == "numpy" else ScriptReader(scp)

    spks_reader = Reader(args.spks_scp, args.type)
    spks_keys, spks_embs = [], []
    for spkid, spkvec in spks_reader:
        spks_keys.append(spkid)
        spks_embs.append(spkvec)
    # one enrollment embedding per row
    spks_mat = np.stack(spks_embs)
    if args.normalize:
        # scale each row to unit L2 norm: divide by the norm, do not
        # replace the matrix with the norms
        spks_mat = spks_mat / np.linalg.norm(
            spks_mat, axis=1, ord=2, keepdims=True)
    logger.info("Load {:d} speakers from enrollment embeddings".format(
        len(spks_keys)))
    # set for constant-time membership test
    enrolled = set(spks_keys)

    eval_reader = Reader(args.eval_scp, args.type)
    for uttid, uttvec in eval_reader:
        spkid = utt2spk[uttid]
        if spkid not in enrolled:
            raise RuntimeError(
                "Seems speaker {} do not exist in enrollment set".format(spkid))
        if args.normalize:
            uttvec = uttvec / np.linalg.norm(uttvec)
        # dot product, equals cosine similarity when --normalize is true
        # one score per enrolled speaker
        score_mat = uttvec @ np.transpose(spks_mat)
        for index, cmpid in enumerate(spks_keys):
            print("{:.2f} {}".format(
                score_mat[index], "target" if cmpid == spkid else "nontarget"))
    logger.info("Compute scores for {:d} utterances done".format(
        len(eval_reader)))
def run(args):
    """
    Extract magnitude spectrograms (optionally power and/or log) for each
    utterance in args.wav_scp and dump them with the writer selected by
    args.format.
    """
    spectrogram_conf = dict(
        frame_len=args.frame_len,
        frame_hop=args.frame_hop,
        round_power_of_two=args.round_power_of_two,
        window=args.window,
        center=args.center,  # false to comparable with kaldi
        apply_log=args.apply_log,
        apply_pow=args.apply_pow,
        apply_abs=True,
        transpose=True,  # T x F
    )
    reader = SpectrogramReader(args.wav_scp, **spectrogram_conf)
    writer_types = {"kaldi": ArchiveWriter, "exraw": BinaryWriter}
    writer_cls = writer_types[args.format]
    with writer_cls(args.dup_ark, args.scp) as writer:
        for key, feats in reader:
            # a 3-D result is treated as multi-channel: keep the first
            # channel only
            single = feats[0] if feats.ndim == 3 else feats
            writer.write(key, single)
    logger.info("Process {:d} utterances".format(len(reader)))
def run(args):
    """
    Compute steer vectors for a set of candidate DoAs and save them to
    args.steer_vector (via np.save) in shape A x M x F.

    Linear arrays sample args.num_doas angles in [0, 180], circular arrays
    sample args.num_doas angles in [0, 360).

    Raises:
        RuntimeError: if --num-doas is not positive, or the geometry is
            linear and --linear-topo is empty
    """
    if args.num_doas <= 0:
        raise RuntimeError(
            f"--num-doas must be positive, got {args.num_doas}")
    if args.geometry == "linear":
        topo = np.array(args.linear_topo)
        if topo.size == 0:
            raise RuntimeError(
                "--linear-topo must be given for linear arrays")
        candidate_doa = np.linspace(0, 180, args.num_doas)
    else:
        topo = None
        # linspace guarantees exactly num_doas angles, while arange with a
        # non-integer step may produce one element more or less
        candidate_doa = np.linspace(0, 360, args.num_doas, endpoint=False)

    sv = []
    for doa in candidate_doa:
        if topo is None:
            sv.append(
                circular_steer_vector(args.circular_radius,
                                      args.circular_around,
                                      doa,
                                      args.num_bins,
                                      c=args.speed,
                                      sr=args.sr,
                                      center=args.circular_center))
        else:
            sv.append(
                linear_steer_vector(topo,
                                    doa,
                                    args.num_bins,
                                    c=args.speed,
                                    sr=args.sr))
    # A x F x M
    sv = np.stack(sv)
    # norm or not: scale by 1 / sqrt(size of last axis)
    if args.normalize:
        sv = sv / sv.shape[-1]**0.5
    # A x M x F
    sv = sv.transpose(0, 2, 1)
    np.save(args.steer_vector, sv)
class Report(object):
    """
    Accumulate edit errors and reference lengths, overall and per class
    (when a key-to-class map is given), and print WER.
    """

    def __init__(self, spk2class=None):
        # optional map: utterance key => class string
        self.s2c = ScpReader(spk2class) if spk2class else None
        # per-class number of errors and reference tokens
        self.err = defaultdict(float)
        self.tot = defaultdict(float)
        # number of utterances added
        self.cnt = 0

    @staticmethod
    def _rate(err, tot):
        """
        Error rate in percent; inf (errors without reference tokens) or
        nan (nothing at all) when the reference length is zero
        """
        if tot:
            return err * 100 / tot
        return float("inf") if err else float("nan")

    def add(self, key, err, tot):
        """
        Add statistics of one utterance, "NG" is used without class map
        """
        cls_str = "NG"
        if self.s2c:
            cls_str = self.s2c[key]
        self.err[cls_str] += err
        self.tot[cls_str] += tot
        self.cnt += 1

    def report(self):
        """
        Print total WER and, if there is more than one class, WER per class
        """
        print("WER(%) Report: ")
        sum_err = sum(self.err.values())
        sum_len = sum(self.tot.values())
        print(f"Total WER: {self._rate(sum_err, sum_len):.2f}%, " +
              f"{self.cnt} utterances")
        if len(self.err) != 1:
            for cls_str in self.err:
                cls_err = self.err[cls_str]
                cls_tot = self.tot[cls_str]
                print(f"  {cls_str}: {self._rate(cls_err, cls_tot):.2f}%")


def run(args):
    """
    Compute WER between hypothesis and reference transcriptions using
    permute_ed, optionally writing the per-utterance error rate to
    args.per_utt.

    Raises:
        RuntimeError: if hyp and ref have a different number of speakers
    """
    hyp_reader = TransReader(args.hyp)
    ref_reader = TransReader(args.ref)
    if len(hyp_reader) != len(ref_reader):
        raise RuntimeError("Looks number of speakers do not match in hyp & ref")
    each_utt = open(args.per_utt, "w") if args.per_utt else None
    try:
        reporter = Report(args.utt2class)
        for key, hyp in hyp_reader:
            ref = ref_reader[key]
            err = permute_ed(hyp, ref)
            ref_len = sum([len(r) for r in ref])
            if each_utt:
                if ref_len != 0:
                    each_utt.write("{}\t{:.3f}\n".format(key, err / ref_len))
                else:
                    each_utt.write("{}\tINF\n".format(key))
            reporter.add(key, err, ref_len)
        reporter.report()
    finally:
        # the per-utterance file was never closed before
        if each_utt:
            each_utt.close()
def run(args):
    """
    Copy every matrix from the input rspecifier (script or archive) into
    args.dst_dir, as .npy or .mat depending on args.dst, optionally
    transposing each matrix first.
    """
    if args.src == "scp":
        src_reader = ScriptReader(args.src_dec)
    else:
        src_reader = ArchiveReader(args.src_dec)
    num_done = 0
    writer_types = {"npy": NumpyWriter, "mat": MatWriter}
    writer_cls = writer_types[args.dst]
    with writer_cls(args.dst_dir, args.scp) as writer:
        for key, mat in src_reader:
            out = np.transpose(mat) if args.trans else mat
            writer.write(key, out)
            num_done += 1
    logger.info(f"Copy {num_done} matrices into directory {args.dst_dir}")
def write_complex_mat(fd, cmat):
    """
    Write a complex matrix to fd: a type token ("FCM" for complex64, "DCM"
    for complex128), the number of rows and columns as int32, then the raw
    bytes of the matrix
    Arguments:
        fd: file object opened for binary writing
        cmat: 2-D ndarray of dtype complex64 or complex128
    """
    single = cmat.dtype == np.complex64
    assert single or cmat.dtype == np.complex128
    if single:
        write_token(fd, 'FCM')
    else:
        write_token(fd, 'DCM')
    rows, cols = cmat.shape
    for dim in (rows, cols):
        write_int32(fd, dim)
    fd.write(cmat.tobytes())
supported_op = ["trans", "log", "minus", "stack"]


def run(args):
    """
    Copy .npy/.mat matrices listed in args.src_scp into the kaldi archive
    args.dst_ark, applying the operations given in args.op (separated by
    ",") to each matrix in order:
        trans: transpose
        log:   log(max(x, EPSILON))
        minus: 1 - x
        stack: vertically stack all matrices and write them as one entry,
               keyed by filekey(args.dst_ark)

    Raises:
        RuntimeError: if args.op contains an unknown operation
    """
    # validate operations first, before any reader/writer is opened
    ops = args.op.split(",")
    for op in ops:
        if op and op not in supported_op:
            raise RuntimeError(f"Unknown operation: {op}")
    stack = "stack" in ops

    src_reader = NumpyReader(args.src_scp) if args.src == "npy" else MatReader(
        args.src_scp, args.key)
    num_mat = 0
    mat_list = []
    with ArchiveWriter(args.dst_ark, args.scp) as writer:
        for key, mat in src_reader:
            for op in ops:
                if op == "trans":
                    mat = np.transpose(mat)
                elif op == "log":
                    mat = np.log(np.maximum(mat, EPSILON))
                elif op == "minus":
                    mat = 1 - mat
                # "stack" and empty tokens are handled outside this loop
            if stack:
                mat_list.append(mat)
            else:
                writer.write(key, mat)
            num_mat += 1
        if stack:
            if mat_list:
                mat = np.vstack(mat_list)
                writer.write(filekey(args.dst_ark), mat)
                logger.info("Merge {0} matrix into archive {1}, shape as "
                            "{2[0]}x{2[1]}".format(num_mat, args.dst_ark,
                                                   mat.shape))
            else:
                # np.vstack raises on an empty list: nothing to merge
                logger.warning(
                    f"No matrices found in {args.src_scp}, nothing to stack")
    if not stack:
        logger.info(f"Copy {num_mat} matrices into archive {args.dst_ark}")
def si_snr(x, s, eps=1e-8, remove_dc=True):
    """
    Scale-invariant SNR (in dB) of an estimate against a reference
    Arguments:
        x: vector, enhanced/separated signal
        s: vector, reference signal (ground truth)
        eps: small constant added to denominators and inside the log
        remove_dc: subtract the mean of both signals first
    """
    if remove_dc:
        x = x - np.mean(x)
        s = s - np.mean(s)
    # projection of x onto s (target part) and the residual (noise part)
    target = np.inner(x, s) * s / (np.linalg.norm(s, 2)**2 + eps)
    noise = x - target
    ratio = np.linalg.norm(target, 2) / (np.linalg.norm(noise, 2) + eps)
    return 20 * np.log10(ratio + eps)


def permute_si_snr(xlist, slist, align=False):
    """
    Best average Si-SNR over all assignments of references to estimates
    Arguments:
        xlist: list[vector], enhanced/separated signal
        slist: list[vector], reference signal (ground truth)
        align: if true, also return the best permutation of slist
    Return:
        best average Si-SNR, or (best average Si-SNR, permutation)
    """
    N = len(xlist)
    if N != len(slist):
        raise RuntimeError("size do not match between xlist " +
                           f"and slist: {N} vs {len(slist)}")
    orders = list(permutations(range(N)))
    scores = []
    for order in orders:
        total = sum([si_snr(x, slist[n]) for x, n in zip(xlist, order)])
        scores.append(total / len(xlist))
    best = max(scores)
    if align:
        return best, orders[np.argmax(scores)]
    return best
def str2tuple(string, sep=","):
    """
    Parse a sep-separated list of numbers into a tuple of floats,
    e.g. "1.0,2.0" => (1.0, 2.0)
    A single number gives a 1-tuple; a token that is not a valid float
    raises ValueError
    """
    return tuple(float(token) for token in string.split(sep))
"see scipy.signal.get_window") 50 | -------------------------------------------------------------------------------- /scripts/sptk/libs/sampler.py: -------------------------------------------------------------------------------- 1 | # wujian@2020 2 | 3 | import random 4 | 5 | from .opts import str2tuple 6 | 7 | 8 | class UniformSampler(object): 9 | """ 10 | A uniform sampler class 11 | """ 12 | 13 | def __init__(self, tuple_or_str): 14 | if isinstance(tuple_or_str, (list, tuple)): 15 | self.min, self.max = tuple_or_str 16 | else: 17 | self.min, self.max = str2tuple(tuple_or_str) 18 | 19 | def sample(self): 20 | return random.uniform(self.min, self.max) 21 | -------------------------------------------------------------------------------- /scripts/sptk/libs/ssl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # wujian@2019 4 | """ 5 | Sound Source Localization (SSL) Module 6 | """ 7 | import numpy as np 8 | 9 | from .utils import cmat_abs 10 | 11 | 12 | def ml_ssl(stft, sv, compression=0, eps=1e-8, norm=False, mask=None): 13 | """ 14 | Maximum likelihood SSL 15 | Arguments: 16 | stft: STFT transform result, M x T x F 17 | sv: steer vector in each directions, A x M x F 18 | norm: normalze STFT or not 19 | mask: TF-mask for source, T x F x (N) 20 | Return: 21 | index: DoA index 22 | """ 23 | _, T, F = stft.shape 24 | if mask is None: 25 | mask = np.ones([T, F]) 26 | # make sure sv is normalized 27 | sv = sv / np.linalg.norm(sv, axis=1, keepdims=True) 28 | if norm: 29 | stft = stft / np.maximum(cmat_abs(stft), eps) 30 | ssh_cor = np.abs(np.einsum("mtf,mtf->tf", stft, stft.conj())) 31 | ssv_cor = np.abs(np.einsum("amf,mtf->atf", sv, stft.conj()))**2 32 | # A x T x F 33 | delta = ssh_cor[None, ...] 
- ssv_cor / (1 + eps) 34 | if compression <= 0: 35 | tf_loglike = -np.log(np.maximum(delta, eps)) 36 | else: 37 | tf_loglike = -np.power(delta, compression) 38 | # masking 39 | if mask.ndim == 2: 40 | loglike = np.sum(mask[None, ...] * tf_loglike, (1, 2)) 41 | else: 42 | loglike = np.einsum("ntf,atf->na", mask, tf_loglike) 43 | return np.argmax(loglike, axis=-1) 44 | 45 | 46 | def srp_ssl(stft, sv, srp_pair=None, mask=None): 47 | """ 48 | Do SRP-PHAT based SSL 49 | Arguments: 50 | stft: STFT transform result, M x T x F 51 | sv: steer vector in each directions, A x M x F 52 | srp_pair: index pair to compute srp response 53 | mask: TF-mask for source, T x F 54 | Return: 55 | index: DoA index 56 | """ 57 | if srp_pair is None: 58 | raise ValueError("srp_pair cannot be None, (list, list)") 59 | _, T, F = stft.shape 60 | if mask is None: 61 | mask = np.ones([T, F]) 62 | index_l, index_r = srp_pair 63 | # M x T x F 64 | obs_pha = np.angle(stft) 65 | # A x M x F 66 | ora_pha = np.angle(sv) 67 | # observed ipd: P x T x F 68 | obs_ipd = obs_pha[index_l] - obs_pha[index_r] 69 | # oracle ipd: A x P x F 70 | ora_ipd = ora_pha[:, index_l] - ora_pha[:, index_r] 71 | # directional feature: A x P x T x F 72 | af = np.cos(obs_ipd[None, ...] 
def music_ssl(stft, sv, mask=None):
    """
    MUSIC based SSL: pick the direction whose steer vector has the
    smallest energy in the noise subspace (all eigenvectors of the
    spatial covariance except the principal one)
    Arguments:
        stft: STFT transform result, M x T x F
        sv: steer vector in each directions, A x M x F
        mask: TF-mask for source, T x F
    Return:
        index: DoA index
    """
    _, num_frames, num_bins = stft.shape
    tf_mask = np.ones([num_frames, num_bins]) if mask is None else mask
    # masked observation: F x M x T
    masked_obs = (stft * tf_mask).transpose(2, 0, 1)
    # spatial covariance: F x M x M
    spatial_covar = np.einsum("...at,...bt->...ab", masked_obs,
                              masked_obs.conj()) / num_frames
    # eigenvectors, sorted by ascending eigenvalue
    eig_vec = np.linalg.eigh(spatial_covar)[1]
    # drop the principal one: F x M x M - 1
    noise_basis = eig_vec[..., :-1]
    # F x M x M
    noise_proj = np.einsum("...at,...bt->...ab", noise_basis,
                           noise_basis.conj())
    # F x A x M
    steer = sv.transpose(2, 0, 1)
    # F x A
    spectrum = np.einsum("...a,...ab,...b->...", steer.conj(),
                         noise_proj[:, None], steer)
    # sum over frequency: A
    return np.argmin(np.abs(spectrum).sum(axis=0))
def run(args):
    """
    Plot the angular spectrum of every utterance found in the archive
    Arguments:
        args: parsed command line options, uses srp_ark, cache_dir,
              frame_hop (in ms) and tdoa
    """
    # exist_ok: do not fail if the directory shows up between an
    # existence check and the creation (e.g., parallel jobs)
    os.makedirs(args.cache_dir, exist_ok=True)

    ark_reader = ArchiveReader(args.srp_ark)
    for key, mat in ark_reader:
        # "." in the key is replaced by "-" to form the figure name
        dst = os.path.join(args.cache_dir, key.replace(".", "-"))
        # frame hop: ms => s
        save_figure(key,
                    mat,
                    dst,
                    hop=args.frame_hop * 1e-3,
                    samp_tdoa=args.tdoa)
def run(args):
    """
    Load the beamformer weight and the steer vectors, then show the
    beam pattern (DoA against frequency) of the selected beam
    Arguments:
        args: parsed command line options, uses weight, steer_vector,
              beam, doa_range and sr
    """
    # (B) x F x M
    weight = np.load(args.weight)
    # A x M x F
    steer_vector = np.load(args.steer_vector)
    num_doas, _, num_bins = steer_vector.shape
    # F x A x M
    steer_vector = np.transpose(steer_vector, (2, 0, 1))

    has_beams = weight.ndim == 3
    if has_beams and args.beam >= weight.shape[0]:
        raise RuntimeError("Beam index out of range: " +
                           f"{args.beam} vs {weight.shape[0]}")
    beam_weight = weight[args.beam] if has_beams else weight
    pattern = beam_pattern(beam_weight, steer_vector)

    # tick positions and their labels
    freq_pos = np.linspace(0, num_bins - 1, 6)
    freq_khz = np.linspace(0, args.sr // 2, 6) / 1000
    doa_pos = np.linspace(0, num_doas, 5)
    doa_val = doa_pos * args.doa_range / num_doas

    plt.imshow(pattern.T, cmap="jet", origin="lower")
    plt.xticks(freq_pos, [f"{t:.1f}" for t in freq_khz])
    plt.yticks(doa_pos, [f"{int(t)}" for t in doa_val])
    plt.ylabel("DoA (degree)")
    plt.xlabel("Frequency (kHz)")
    title = "BeamPattern"
    if has_beams:
        title = f"BeamPattern of Beam-{args.beam + 1}"
    plt.title(title)
    plt.show()
def run(args):
    """
    Fit a PCA transform on the embeddings and show a scatter plot of
    the leading components (3D if --dim >= 3, 2D if --dim == 2)
    Arguments:
        args: parsed command line options, uses rspec_or_dir and dim
    Raises:
        ValueError: if dim < 2 (nothing to plot)
        RuntimeError: if no embedding is found
    """
    # the original code split the result into exactly three columns,
    # which only works for dim == 3
    if args.dim < 2:
        raise ValueError("Need at least 2 components to plot, " +
                         f"got --dim={args.dim}")
    pca = PCA(n_components=args.dim)

    is_dir = os.path.isdir(args.rspec_or_dir)
    samples = []
    feats_reader = ArchiveReader(
        args.rspec_or_dir) if not is_dir else NumpyReader(args.rspec_or_dir)
    for _, feats in feats_reader:
        # matrix: average over the first axis to get one vector
        if feats.ndim != 1:
            feats = np.average(feats, 0)
        samples.append(feats)
    if not samples:
        raise RuntimeError(f"No features found in {args.rspec_or_dir}")
    org_mat = np.stack(samples)
    pca_mat = pca.fit_transform(org_mat)

    fig = plt.figure()
    if args.dim >= 3:
        # plot the first three components only
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(pca_mat[:, 0], pca_mat[:, 1], pca_mat[:, 2], s=2)
    else:
        ax = fig.add_subplot(111)
        ax.scatter(pca_mat[:, 0], pca_mat[:, 1], s=2)
    plt.show()
class _StdStream(object):
    """
    Proxy of sys.stdin/sys.stdout whose close() (and the exit of a
    "with" block) only flushes, so that the standard stream of the
    interpreter is still usable afterwards
    """

    def __init__(self, stream, writable):
        self._stream = stream
        self._writable = writable

    def __getattr__(self, name):
        # everything else (write, read, readline, ...) goes to the stream
        return getattr(self._stream, name)

    def __iter__(self):
        return iter(self._stream)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        if self._writable:
            self._stream.flush()


def ext_open(fd, mode):
    """
    Open a file, where "-" means stdin (mode "r") or stdout (mode "w")
    Arguments:
        fd: path of the file or "-"
        mode: "r" or "w"
    Return:
        a file-like object that can be used in a "with" block; for "-"
        the underlying standard stream is not closed on exit
    Raises:
        ValueError: if mode is not "r" or "w"
    """
    if mode not in ["r", "w"]:
        raise ValueError(f"Unsupported mode: {mode}")
    if fd == "-":
        # returning sys.stdout itself would close it at the end of the
        # caller's "with" block and break every later print()
        if mode == "w":
            return _StdStream(sys.stdout, True)
        return _StdStream(sys.stdin, False)
    else:
        return open(fd, mode)
" 50 | "We avoid to read whole utterance as it may slow down the speed", 51 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 52 | parser.add_argument("wav_scp", type=str, help="Input wave script") 53 | parser.add_argument("utt2dur", type=str, help="Output utt2dur file") 54 | parser.add_argument("--output", 55 | type=str, 56 | choices=["time", "sample"], 57 | default="sample", 58 | help="Output type of the script") 59 | args = parser.parse_args() 60 | run(args) 61 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | add_executable(compute-stft-stats compute-stft-stats.cc) 4 | add_executable(compute-masks compute-masks.cc) 5 | add_executable(compute-srp-phat compute-srp-phat.cc) 6 | add_executable(wav-separate wav-separate.cc) 7 | add_executable(wav-estimate wav-estimate.cc) 8 | add_executable(rir-simulate rir-simulate.cc) 9 | add_executable(apply-fixed-beamformer apply-fixed-beamformer.cc) 10 | add_executable(apply-supervised-mvdr apply-supervised-mvdr.cc) 11 | add_executable(apply-supervised-max-snr apply-supervised-max-snr.cc) 12 | add_executable(matrix-scale-elements matrix-scale-elements.cc) 13 | add_executable(matrix-scale-rows matrix-scale-rows.cc) 14 | add_executable(wav-to-power wav-to-power.cc) 15 | add_executable(modify-feats modify-feats.cc) 16 | add_executable(apply-cmvn-perutt apply-cmvn-perutt.cc) 17 | 18 | target_link_libraries(compute-stft-stats ${DEPEND_LIBS} setk) 19 | target_link_libraries(compute-masks ${DEPEND_LIBS} setk) 20 | target_link_libraries(compute-srp-phat ${DEPEND_LIBS} setk) 21 | target_link_libraries(wav-separate ${DEPEND_LIBS} setk) 22 | target_link_libraries(wav-estimate ${DEPEND_LIBS} setk) 23 | target_link_libraries(rir-simulate ${DEPEND_LIBS} setk) 24 | target_link_libraries(apply-fixed-beamformer ${DEPEND_LIBS} setk) 25 | 
target_link_libraries(apply-supervised-mvdr ${DEPEND_LIBS} setk) 26 | target_link_libraries(apply-supervised-max-snr ${DEPEND_LIBS} setk) 27 | target_link_libraries(matrix-scale-elements ${DEPEND_LIBS} setk) 28 | target_link_libraries(matrix-scale-rows ${DEPEND_LIBS} setk) 29 | target_link_libraries(wav-to-power ${DEPEND_LIBS} setk) 30 | target_link_libraries(modify-feats ${DEPEND_LIBS} setk) 31 | target_link_libraries(apply-cmvn-perutt ${DEPEND_LIBS} setk) 32 | 33 | -------------------------------------------------------------------------------- /src/matrix-scale-elements.cc: -------------------------------------------------------------------------------- 1 | // src/matrix-scale-elements.cc 2 | 3 | // Copyright 2018 Jian Wu 4 | 5 | // See ../../COPYING for clarification regarding multiple authors 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 15 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 16 | // MERCHANTABLITY OR NON-INFRINGEMENT. 
17 | // See the Apache 2 License for the spec 18 | 19 | #include "base/kaldi-common.h" 20 | #include "matrix/kaldi-matrix.h" 21 | #include "util/common-utils.h" 22 | 23 | int main(int argc, char *argv[]) { 24 | try { 25 | using namespace kaldi; 26 | 27 | const char *usage = 28 | "Compute hadamard product of matrix\n" 29 | "\n" 30 | "Usage: matrix-scale-elements [options] " 31 | " \n" 32 | "e.g.: matrix-scale-elements scp:src.scp scp:weights.scp ark:fix.ark\n"; 33 | 34 | ParseOptions po(usage); 35 | 36 | po.Read(argc, argv); 37 | 38 | if (po.NumArgs() != 3) { 39 | po.PrintUsage(); 40 | exit(1); 41 | } 42 | 43 | std::string input_rspecifier = po.GetArg(1); 44 | std::string scale_rspecifier = po.GetArg(2); 45 | std::string matrix_wspecifier = po.GetArg(3); 46 | 47 | SequentialBaseFloatMatrixReader input_reader(input_rspecifier); 48 | RandomAccessBaseFloatMatrixReader scale_reader(scale_rspecifier); 49 | BaseFloatMatrixWriter mat_writer(matrix_wspecifier); 50 | 51 | int32 num_done = 0, num_matrix = 0; 52 | 53 | for (; !input_reader.Done(); input_reader.Next()) { 54 | std::string key = input_reader.Key(); 55 | num_matrix++; 56 | 57 | if (!scale_reader.HasKey(key)) continue; 58 | 59 | Matrix scale(scale_reader.Value(key)); 60 | const Matrix &input = input_reader.Value(); 61 | scale.MulElements(input); 62 | mat_writer.Write(key, scale); 63 | num_done++; 64 | } 65 | 66 | KALDI_LOG << "Scaled " << num_done << " matrices, " << num_matrix 67 | << " matrix in total."; 68 | 69 | return (num_done != 0 ? 
0 : 1); 70 | } catch (const std::exception &e) { 71 | std::cerr << e.what(); 72 | return -1; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/matrix-scale-rows.cc: -------------------------------------------------------------------------------- 1 | // src/matrix-scale-rows.cc 2 | 3 | // Copyright 2018 Jian Wu 4 | 5 | // See ../../COPYING for clarification regarding multiple authors 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 15 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 16 | // MERCHANTABLITY OR NON-INFRINGEMENT. 17 | // See the Apache 2 License for the specific language governing permissions and 18 | // limitations under the License. 
19 | 20 | #include "base/kaldi-common.h" 21 | #include "matrix/kaldi-matrix.h" 22 | #include "util/common-utils.h" 23 | 24 | int main(int argc, char *argv[]) { 25 | try { 26 | using namespace kaldi; 27 | 28 | const char *usage = 29 | "Scale the rows of an input table of matrices and output the " 30 | "corresponding table of matrices\n" 31 | "\n" 32 | "Usage: matrix-scale-rows [options] " 33 | " \n" 34 | "e.g.: matrix-scale-rows ark:- scp:post.scp ark:weight_post.ark\n" 35 | "See also: matrix-sum, vector-sum\n"; 36 | 37 | ParseOptions po(usage); 38 | 39 | po.Read(argc, argv); 40 | 41 | if (po.NumArgs() != 3) { 42 | po.PrintUsage(); 43 | exit(1); 44 | } 45 | std::string vector_rspecifier = po.GetArg(1); 46 | std::string matrix_rspecifier = po.GetArg(2); 47 | std::string matrix_wspecifier = po.GetArg(3); 48 | 49 | SequentialBaseFloatVectorReader vec_reader(vector_rspecifier); 50 | RandomAccessBaseFloatMatrixReader mat_reader(matrix_rspecifier); 51 | BaseFloatMatrixWriter mat_writer(matrix_wspecifier); 52 | 53 | int32 num_done = 0, num_matrix = 0; 54 | 55 | for (; !vec_reader.Done(); vec_reader.Next()) { 56 | std::string key = vec_reader.Key(); 57 | num_matrix++; 58 | 59 | if (!mat_reader.HasKey(key)) continue; 60 | 61 | Matrix mat(mat_reader.Value(key)); 62 | const Vector &scale = vec_reader.Value(); 63 | 64 | int32 vector_dim = scale.Dim(), num_rows = mat.NumRows(); 65 | int32 num_scale_rows = vector_dim; 66 | if (vector_dim != num_rows) { 67 | num_scale_rows = std::min(vector_dim, num_rows); 68 | KALDI_VLOG(1) << "vector dim = " << vector_dim 69 | << ", matrix num_rows = " << num_rows << ", scale first " 70 | << num_scale_rows << " rows"; 71 | } 72 | 73 | mat.RowRange(0, num_scale_rows) 74 | .MulRowsVec(scale.Range(0, num_scale_rows)); 75 | mat_writer.Write(key, mat); 76 | num_done++; 77 | } 78 | 79 | KALDI_LOG << "Scaled " << num_done << " matrices, " << num_matrix 80 | << " matrix in total."; 81 | 82 | return (num_done != 0 ? 
0 : 1); 83 | } catch (const std::exception &e) { 84 | std::cerr << e.what(); 85 | return -1; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/rir-simulate.cc: -------------------------------------------------------------------------------- 1 | // src/rir-simulate.cc 2 | // wujian@2018.6.26 3 | 4 | // Copyright 2018 Jian Wu 5 | 6 | // See ../../COPYING for clarification regarding multiple authors 7 | // 8 | // Licensed under the Apache License, Version 2.0 (the "License"); 9 | // you may not use this file except in compliance with the License. 10 | // You may obtain a copy of the License at 11 | // 12 | // http://www.apache.org/licenses/LICENSE-2.0 13 | // 14 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 16 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 17 | // MERCHANTABLITY OR NON-INFRINGEMENT. 18 | // See the Apache 2 License for the specific language governing permissions and 19 | // limitations under the License. 
20 | 21 | #include "feat/wave-reader.h" 22 | #include "include/rir-generator.h" 23 | 24 | int main(int argc, char const* argv[]) { 25 | try { 26 | using namespace kaldi; 27 | 28 | const char* usage = 29 | "Computes the response of an acoustic source to one or more " 30 | "microphones " 31 | "in a reverberant room using the image method.\n" 32 | "Reference: https://github.com/ehabets/RIR-Generator\n" 33 | "\n" 34 | "Usage: rir-simulate [options] \n" 35 | "See also: wav-reverberate\n"; 36 | 37 | ParseOptions po(usage); 38 | 39 | bool report = false, normalize = false; 40 | po.Register("report", &report, "If true, output RirGenerator's statistics"); 41 | po.Register("normalize", &normalize, 42 | "If true, normalize output room impluse response"); 43 | 44 | RirGeneratorOptions generator_opts; 45 | generator_opts.Register(&po); 46 | 47 | po.Read(argc, argv); 48 | 49 | if (po.NumArgs() != 1) { 50 | po.PrintUsage(); 51 | exit(1); 52 | } 53 | 54 | RirGenerator generator(generator_opts); 55 | Matrix rir; 56 | BaseFloat int16_max = 57 | static_cast(std::numeric_limits::max()); 58 | 59 | generator.GenerateRir(&rir); 60 | 61 | if (normalize) { 62 | rir.Scale(1.0 / rir.LargestAbsElem()); 63 | } 64 | rir.Scale(int16_max); 65 | 66 | if (report) std::cout << generator.Report(); 67 | 68 | std::string target_rir = po.GetArg(1); 69 | Output ko(target_rir, true, false); 70 | WaveData rir_simu(generator.Frequency(), rir); 71 | rir_simu.Write(ko.Stream()); 72 | } catch (const std::exception& e) { 73 | std::cerr << e.what(); 74 | return -1; 75 | } 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /src/wav-to-power.cc: -------------------------------------------------------------------------------- 1 | // src/wav-to-power.cc 2 | 3 | // Copyright 2018 Jian Wu 4 | 5 | // See ../../COPYING for clarification regarding multiple authors 6 | // 7 | // Licensed under the Apache License, Version 2.0 (the "License"); 8 | // you may not use 
this file except in compliance with the License. 9 | // You may obtain a copy of the License at 10 | // 11 | // http://www.apache.org/licenses/LICENSE-2.0 12 | // 13 | // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 15 | // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 16 | // MERCHANTABLITY OR NON-INFRINGEMENT. 17 | // See the Apache 2 License for the specific language governing permissions and 18 | // limitations under the License. 19 | 20 | #include "base/kaldi-common.h" 21 | #include "feat/feature-mfcc.h" 22 | #include "feat/wave-reader.h" 23 | #include "util/common-utils.h" 24 | 25 | int main(int argc, char *argv[]) { 26 | try { 27 | using namespace kaldi; 28 | const char *usage = 29 | "Read wav files and output an archive consisting of a single float:\n" 30 | "the power of each one in dB.\n" 31 | "Usage: wav-to-power [options...] " 32 | "\n" 33 | "E.g.: wav-to-power scp:wav.scp ark,t:-\n" 34 | "See also: wav-copy extract-segments feat-to-len\n" 35 | "Currently this program may output a lot of harmless warnings " 36 | "regarding\n" 37 | "nonzero exit status of pipes\n"; 38 | 39 | ParseOptions po(usage); 40 | 41 | po.Read(argc, argv); 42 | 43 | if (po.NumArgs() != 2) { 44 | po.PrintUsage(); 45 | exit(1); 46 | } 47 | 48 | std::string wav_rspecifier = po.GetArg(1), power_wspecifier = po.GetArg(2); 49 | 50 | double sum_power = 0.0, 51 | min_power = std::numeric_limits::infinity(), 52 | max_power = 0; 53 | int32 num_done = 0; 54 | 55 | BaseFloatWriter power_writer(power_wspecifier); 56 | SequentialTableReader wav_reader(wav_rspecifier); 57 | for (; !wav_reader.Done(); wav_reader.Next()) { 58 | std::string key = wav_reader.Key(); 59 | const WaveData &wave_data = wav_reader.Value(); 60 | const Matrix &data = wave_data.Data(); 61 | 62 | BaseFloat power = VecVec(data.Row(0), data.Row(0)) / data.Row(0).Dim(); 63 | BaseFloat power_db = 
10 * log10(power); 64 | 65 | power_writer.Write(key, power_db); 66 | 67 | sum_power += power_db; 68 | min_power = std::min(min_power, power_db); 69 | max_power = std::max(max_power, power_db); 70 | num_done++; 71 | } 72 | 73 | KALDI_LOG << "Printed power for " << num_done << " audio files."; 74 | if (num_done > 0) { 75 | KALDI_LOG << "Mean power was " << (sum_power / num_done) 76 | << ", min and max power were " << min_power << ", " 77 | << max_power; 78 | } 79 | return (num_done != 0 ? 0 : 1); 80 | } catch (const std::exception &e) { 81 | std::cerr << e.what(); 82 | return -1; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /steps/archive_wav.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # wujian@2020 4 | 5 | # Archive wav.scp to wav.ark 6 | 7 | set -eu 8 | 9 | nj=32 10 | cmd="run.pl" 11 | 12 | . ./path.sh 13 | . ./utils/parse_options.sh || exit 1 14 | 15 | [ $# -ne 2 ] && echo "format error: $0 " && exit 1 16 | 17 | data_dir=$(cd $1; pwd) 18 | ark_dir=$2 19 | 20 | [ ! 
#!/bin/bash
# wujian@2018

# Run compute-masks on noise.scp/clean.scp of the data directory with $nj
# parallel jobs, archive the results in the mask directory and write the
# merged, sorted index to <data-dir>/masks.scp
# NOTE(review): $mask is only used to name the outputs here, presumably the
# actual mask type comes from $mask_config -- confirm

set -ue

nj=10
mask=irm
cmd=run.pl
mask_config=conf/mask.conf
compress=true
noise=false

. ./path.sh || exit 1

. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "format error: $0 <data-dir> <log-dir> <mask-dir>" && exit 1

data_dir=$1
logg_dir=$2

# create the output directories before resolving the absolute path:
# "cd" to a missing directory aborts the script under "set -e"
mkdir -p "$logg_dir" "$3"
mask_dir=$(cd "$3"; pwd)

name=$(basename "$data_dir")

for x in noise.scp clean.scp; do
  [ ! -f "$data_dir/$x" ] && echo "$data_dir/$x do not exists!" && exit 1
done

# split each script into $nj parts under the log directory
for x in noise clean; do
  dest_parts=$(for n in $(seq $nj); do echo $logg_dir/${name}_$x.$n.scp; done)
  ./utils/split_scp.pl "$data_dir/$x.scp" $dest_parts
done

# the two cases only differ in the order of the two input scripts
if [ "$noise" = true ]; then
  target=noise; first=clean; second=noise
else
  target=clean; first=noise; second=clean
fi

echo "$0: compute $mask for $target parts in $data_dir..."
$cmd JOB=1:$nj $logg_dir/compute_${name}_${mask}.JOB.log \
  compute-masks --verbose=2 --config=$mask_config scp:$logg_dir/${name}_$first.JOB.scp \
  scp:$logg_dir/${name}_$second.JOB.scp ark:- \| \
  copy-feats --compress=$compress ark:- \
  ark,scp:$mask_dir/${name}_${mask}.JOB.ark,$mask_dir/${name}_${mask}.JOB.scp || exit 1;

for n in $(seq $nj); do cat "$mask_dir/${name}_${mask}.$n.scp"; done | sort -k1 > "$data_dir/masks.scp"

echo "$0: Compute $mask for $data_dir done!"
#!/usr/bin/env bash
# wujian@2018

# Stage 1: compute TF-masks for the data directory with the nnet3 model
# Stage 2: estimate the enhanced waves with wav-separate using those masks

set -eu

nj=10
stage=1
iter=final
cmd=run.pl
keep_mask=true
mask_only=false
# using for wav-separate
mask_conf=conf/mask.conf
online_ivector_dir=
chunk_width=64
extra_left_context=0
extra_right_context=0
extra_left_context_initial=-1
extra_right_context_final=-1

. ./path.sh

. ./utils/parse_options.sh || exit 1

[ $# -ne 3 ] && echo "format error: $0 <src-dir> <mdl-dir> <dst-dir>" && exit 1

src_dir=$1
mdl_dir=$2
dst_dir=$3

mkdir -p "$dst_dir"

dst_dir=$(cd "$dst_dir"; pwd)

./utils/validate_data_dir.sh --no-text $src_dir || exit 1

# compute mask
if [ $stage -le 1 ]; then
  echo "$0: compute mask for $src_dir using nnet in $mdl_dir"
  ./steps/nnet3/compute_output.sh --nj $nj --frames-per-chunk $chunk_width \
    --extra-left-context $extra_left_context --extra-right-context $extra_right_context \
    --extra-left-context-initial $extra_left_context_initial \
    --extra-right-context-final $extra_right_context_final \
    --online-ivector-dir "$online_ivector_dir" --iter $iter \
    $src_dir $mdl_dir $dst_dir/mask
fi

# compare with the string "true": "[ $mask_only ]" only tests that the
# string is non-empty, so it was true even for mask_only=false and the
# script always stopped here
[ "$mask_only" = true ] && exit 0

sdata=$src_dir/split$nj

[ ! -d $sdata ] && echo "**error** $0: run stage 1 first" && exit 1

if [ $stage -le 2 ]; then
  echo "$0: estimate enhanced wave using masks in $dst_dir/mask"
  for x in $(seq $nj); do
    awk -v dst_dir=$dst_dir '{printf("%s\t%s/%s.wav\n", $1, dst_dir, $1)}' \
      $sdata/$x/wav.scp > $sdata/$x/enh.scp
  done
  $cmd JOB=1:$nj exp/mono_enhan/wav_separate.JOB.log \
    wav-separate --config=$mask_conf scp:$sdata/JOB/wav.scp \
    scp:$dst_dir/mask/output.JOB.scp scp:$sdata/JOB/enh.scp
  # same reason as above: "[ ! $keep_mask ]" was false for any non-empty
  # value, so the masks were never removed
  if [ "$keep_mask" != true ]; then
    rm -rf "$dst_dir/mask"
    echo "$0: clear mask in $dst_dir/mask"
  fi
fi

echo "$0: done!"
67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | add_executable(test-stft test-stft.cc) 4 | add_executable(test-srp-phat test-srp-phat.cc) 5 | add_executable(test-complex test-complex.cc) 6 | add_executable(test-beamformer test-beamformer.cc) 7 | 8 | target_link_libraries(test-stft ${DEPEND_LIBS} setk) 9 | target_link_libraries(test-srp-phat ${DEPEND_LIBS} setk) 10 | target_link_libraries(test-complex ${DEPEND_LIBS} setk) 11 | target_link_libraries(test-beamformer ${DEPEND_LIBS} setk) 12 | 13 | -------------------------------------------------------------------------------- /test/test-beamformer.cc: -------------------------------------------------------------------------------- 1 | // test-beamformer.cc 2 | // wujian@2018 3 | 4 | #include "util/common-utils.h" 5 | #include "include/beamformer.h" 6 | #include "include/complex-base.h" 7 | #include "include/complex-matrix.h" 8 | #include "include/complex-vector.h" 9 | 10 | using namespace kaldi; 11 | 12 | void CreateHermiteCmatrix(CMatrix *cm, MatrixIndexT s) { 13 | cm->Resize(s, s); 14 | cm->SetRandn(); 15 | for (MatrixIndexT i = 0; i < s; i++) { 16 | for (MatrixIndexT j = 0; j < i; j++) { 17 | (*cm)(j, i, kReal) = (*cm)(i, j, kReal); 18 | (*cm)(j, i, kImag) = -(*cm)(i, j, kImag); 19 | } 20 | (*cm)(i, i, kImag) = 0; 21 | } 22 | } 23 | 24 | void TestStringSpliter() { 25 | std::string scp = "scp:CH1.scp,CH2.scp,CH3.scp"; 26 | std::vector tokens; 27 | size_t found = scp.find_first_of(":", 0); 28 | if (found != std::string::npos) 29 | std::cout << scp.substr(0, found) << std::endl; 30 | SplitStringToVector(scp.substr(found + 1), ",", false, &tokens); 31 | for (std::string &s : tokens) std::cout << s << std::endl; 32 | } 33 | 34 | void TestEstimatePsd() { 35 | for (int32 i = 0; i < 10; i++) { 36 | int32 f = Rand() % 
6 + 4, t = Rand() % 6 + 4, c = Rand() % 5 + 3; 37 | CMatrix src_stft(f * t, c), psd; 38 | Matrix mask(t, f); 39 | src_stft.SetRandn(); 40 | mask.SetRandn(); 41 | EstimatePsd(src_stft, mask, &psd, NULL); 42 | std::cout << "f = " << f << ", t = " << t << ", c = " << c << std::endl; 43 | for (int32 j = 0; j < f; j++) { 44 | SubCMatrix covar(psd, j * c, c, 0, c); 45 | KALDI_ASSERT(covar.IsHermitian()); 46 | } 47 | } 48 | } 49 | 50 | void TestBeamform() { 51 | for (int32 i = 0; i < 10; i++) { 52 | int32 f = Rand() % 6 + 4, t = Rand() % 6 + 4, c = Rand() % 5 + 3; 53 | CMatrix src_stft(f * t, c), weights(f, c), enh_stft; 54 | src_stft.SetRandn(); 55 | weights.SetRandn(); 56 | weights.Conjugate(); 57 | Beamform(src_stft, weights, &enh_stft); 58 | std::cout << "f = " << f << ", t = " << t << ", c = " << c << std::endl; 59 | std::cout << enh_stft; 60 | } 61 | } 62 | 63 | void TestEstimateSteerVector() { 64 | for (int32 i = 0; i < 10; i++) { 65 | int32 f = Rand() % 6 + 4, t = Rand() % 6 + 4, c = Rand() % 5 + 3; 66 | CMatrix psd(f * c, c), hmat, sv; 67 | for (int32 j = 0; j < f; j++) { 68 | CreateHermiteCmatrix(&hmat, c); 69 | psd.RowRange(j * c, c).CopyFromMat(hmat); 70 | } 71 | std::cout << "f = " << f << ", t = " << t << ", c = " << c << std::endl; 72 | EstimateSteerVector(psd, &sv); 73 | std::cout << sv; 74 | } 75 | } 76 | 77 | void TestComputeMvdrBeamWeights() { 78 | for (int32 i = 0; i < 10; i++) { 79 | int32 f = Rand() % 6 + 4, t = Rand() % 6 + 4, c = Rand() % 5 + 3; 80 | CMatrix psd(f * c, c), hmat, weights, sv(f, c); 81 | sv.SetRandn(); 82 | for (int32 j = 0; j < f; j++) { 83 | CreateHermiteCmatrix(&hmat, c); 84 | psd.RowRange(j * c, c).CopyFromMat(hmat); 85 | } 86 | std::cout << "f = " << f << ", t = " << t << ", c = " << c << std::endl; 87 | ComputeMvdrBeamWeights(psd, sv, &weights); 88 | std::cout << weights; 89 | } 90 | } 91 | 92 | void TestTrimStft() { 93 | for (int32 i = 0; i < 10; i++) { 94 | int32 f = Rand() % 6 + 4, t = Rand() % 6 + 4, c = Rand() % 4 + 2; 95 
| CMatrix src_stft(t, f * c); 96 | src_stft.SetRandn(); 97 | for (int32 j = 0; j < c; j++) 98 | std::cout << "CH " << j << " :\n" << src_stft.ColRange(j * f, f); 99 | CMatrix dst_stft; 100 | TrimStft(f, c, src_stft, &dst_stft); 101 | std::cout << dst_stft; 102 | } 103 | } 104 | 105 | int main() { 106 | TestEstimatePsd(); 107 | TestBeamform(); 108 | TestEstimateSteerVector(); 109 | TestComputeMvdrBeamWeights(); 110 | TestTrimStft(); 111 | TestStringSpliter(); 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /test/test-srp-phat.cc: -------------------------------------------------------------------------------- 1 | // srp-phat.cc 2 | // wujian@18.5.29 3 | // 4 | 5 | #include "include/srp-phat.h" 6 | #include "include/stft.h" 7 | 8 | using namespace kaldi; 9 | 10 | void TestSrpPhat() { 11 | SrpPhatOptions srp_opts; 12 | srp_opts.topo_descriptor = "0,0.037,0.113,0.226"; 13 | srp_opts.smooth_context = 0; 14 | 15 | ShortTimeFTOptions stft_opts; 16 | stft_opts.window = "hanning"; 17 | 18 | SrpPhatComputor srp_computor(srp_opts, 16000, 513); 19 | 20 | bool binary; 21 | Input wave_in("srp_test.wav", &binary); 22 | WaveData wave; 23 | wave.Read(wave_in.Stream()); 24 | 25 | Matrix stft, srp_phat; 26 | ShortTimeFTComputer stft_computer(stft_opts); 27 | stft_computer.Compute(wave.Data(), &stft, NULL, NULL); 28 | 29 | CMatrix cstft(stft.NumRows(), stft.NumCols() / 2 + 1); 30 | cstft.CopyFromRealfft(stft); 31 | srp_computor.Compute(cstft, &srp_phat); 32 | 33 | Output ko("srp_test.mat", true); 34 | srp_phat.Write(ko.Stream(), true); 35 | // KALDI_LOG << srp_phat; 36 | } 37 | 38 | void TestMathOpts() { 39 | CMatrix A(4, 6), B(4, 6); 40 | A.SetRandn(); 41 | B.SetRandn(); 42 | CMatrix R(A); 43 | KALDI_LOG << "A = " << A; 44 | KALDI_LOG << "B = " << B; 45 | R.MulElements(B, kConj); 46 | KALDI_LOG << "A .* B = " << R; 47 | R.DivElements(A, kNoConj, true); 48 | KALDI_LOG << "A .* B / |A| = " << R; 49 | R.DivElements(B, 
kNoConj, true); 50 | KALDI_LOG << "A .* B / (|A| .* |B|) = " << R; 51 | } 52 | int main() { 53 | TestMathOpts(); 54 | TestSrpPhat(); 55 | return 0; 56 | } 57 | -------------------------------------------------------------------------------- /test/test-stft.cc: -------------------------------------------------------------------------------- 1 | // test-stft.cc 2 | // wujian@18.2.12 3 | 4 | #include "include/stft.h" 5 | 6 | using namespace kaldi; 7 | 8 | void TestStft() { 9 | bool binary; 10 | Input wave_in("orig.wav", &binary); 11 | WaveData wave_orig; 12 | wave_orig.Read(wave_in.Stream()); 13 | 14 | // configs 15 | ShortTimeFTOptions opts; 16 | opts.frame_length = 1024; 17 | opts.center = true; 18 | opts.normalize_input = false; 19 | 20 | // BaseFloat range = wave_orig.Data().LargestAbsElem(); 21 | ShortTimeFTComputer stft_computer(opts); 22 | 23 | Matrix specs; 24 | stft_computer.ShortTimeFT(wave_orig.Data(), &specs); 25 | 26 | Matrix recon; 27 | stft_computer.InverseShortTimeFT(specs, &recon, -1); 28 | 29 | Output ko("copy.wav", binary, false); 30 | WaveData wave_copy(16000, recon); 31 | wave_copy.Write(ko.Stream()); 32 | // std:: cout << vec << std::endl; 33 | } 34 | 35 | void TestRealfft() { 36 | int32 dim = 16; 37 | 38 | Vector vec(dim); 39 | vec.SetRandn(); 40 | std::cout << vec; 41 | RealFft(&vec, true); 42 | std::cout << vec; 43 | RealFft(&vec, false); 44 | vec.Scale(1.0 / dim); 45 | std::cout << vec; 46 | } 47 | 48 | void TestIstft() { 49 | bool binary; 50 | Input wave_in("orig.wav", &binary); 51 | WaveData wave_orig; 52 | wave_orig.Read(wave_in.Stream()); 53 | 54 | // configs 55 | ShortTimeFTOptions opts; 56 | opts.frame_length = 1024; 57 | opts.frame_shift = 256; 58 | opts.normalize_input = false; 59 | opts.apply_log = true; 60 | opts.apply_pow = true; 61 | 62 | ShortTimeFTComputer stft_computer(opts); 63 | 64 | Matrix stft_orig, specs, arg; 65 | stft_computer.Compute(wave_orig.Data(), &stft_orig, &specs, &arg); 66 | BaseFloat range = 
wave_orig.Data().LargestAbsElem(); 67 | /* 68 | stft_computer.ShortTimeFT(wave_orig.Data(), &stft_orig); 69 | stft_computer.ComputeSpectrum(stft_orig, &specs); 70 | stft_computer.ComputeArg(stft_orig, &arg); 71 | */ 72 | 73 | Matrix stft_recon; 74 | stft_computer.Polar(specs, arg, &stft_recon); 75 | 76 | std::cout << stft_orig.Row(10); 77 | std::cout << stft_recon.Row(10); 78 | 79 | Matrix recon; 80 | stft_computer.InverseShortTimeFT(stft_recon, &recon, range); 81 | 82 | Output ko("copy.wav", binary, false); 83 | WaveData wave_copy(16000, recon); 84 | wave_copy.Write(ko.Stream()); 85 | // std:: cout << vec << std::endl; 86 | } 87 | 88 | int main() { 89 | TestStft(); 90 | TestIstft(); 91 | return 0; 92 | } 93 | -------------------------------------------------------------------------------- /test/test_rir_generator.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # wujian@2018.6.26 3 | 4 | # config same as example_{1..4}.m in https://github.com/ehabets/RIR-Generator 5 | 6 | ./bin/rir-simulate --report --sound-velocity=340 \ 7 | --samp-frequency=16000 --receiver-location=2,1.5,2 \ 8 | --source-location=2,3.5,2 --room-topo=5,4,6 \ 9 | --beta=0.4 --number-samples=4096 rir1.wav 10 | 11 | ./bin/rir-simulate --report --sound-velocity=340 \ 12 | --samp-frequency=16000 --receiver-location=2,1.5,2 \ 13 | --source-location=2,3.5,2 --room-topo=5,4,6 \ 14 | --beta=0.4 --number-samples=2048 --order=2 \ 15 | --microphone-type=omnidirectional \ 16 | --angle=0 --hp-filter=true rir2.wav 17 | 18 | ./bin/rir-simulate --report --sound-velocity=340 \ 19 | --samp-frequency=16000 --receiver-location="2,1.5,2;1,1.5,2" \ 20 | --source-location=2,3.5,2 --room-topo=5,4,6 \ 21 | --beta=0.4 --number-samples=4096 --order=-1 \ 22 | --microphone-type=omnidirectional \ 23 | --angle=0 --hp-filter=true rir3.wav 24 | 25 | ./bin/rir-simulate --report --sound-velocity=340 \ 26 | --samp-frequency=16000 --receiver-location=2,1.5,2 \ 27 | 
--source-location=2,3.5,2 --room-topo=5,4,6 \ 28 | --beta=0.4 --number-samples=4096 --order=-1 \ 29 | --microphone-type=hypercardioid \ 30 | --hp-filter=false --angle=1.57,0 rir4.wav 31 | -------------------------------------------------------------------------------- /utils/filter_scp.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2010-2012 Microsoft Corporation 3 | # Johns Hopkins University (author: Daniel Povey) 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 12 | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED 13 | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, 14 | # MERCHANTABLITY OR NON-INFRINGEMENT. 15 | # See the Apache 2 License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # This script takes a list of utterance-ids or any file whose first field 20 | # of each line is an utterance-id, and filters an scp 21 | # file (or any file whose "n-th" field is an utterance id), printing 22 | # out only those lines whose "n-th" field is in id_list. The index of 23 | # the "n-th" field is 1, by default, but can be changed by using 24 | # the -f switch 25 | 26 | $exclude = 0; 27 | $field = 1; 28 | $shifted = 0; 29 | 30 | do { 31 | $shifted=0; 32 | if ($ARGV[0] eq "--exclude") { 33 | $exclude = 1; 34 | shift @ARGV; 35 | $shifted=1; 36 | } 37 | if ($ARGV[0] eq "-f") { 38 | $field = $ARGV[1]; 39 | shift @ARGV; shift @ARGV; 40 | $shifted=1 41 | } 42 | } while ($shifted); 43 | 44 | if(@ARGV < 1 || @ARGV > 2) { 45 | die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . 
46 | "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . 47 | "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . 48 | "only the lines that were *not* in id_list.\n" . 49 | "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . 50 | "If your older scripts (written before Oct 2014) stopped working and you used the\n" . 51 | "-f option, add 1 to the argument.\n" . 52 | "See also: utils/filter_scp.pl .\n"; 53 | } 54 | 55 | 56 | $idlist = shift @ARGV; 57 | open(F, "<$idlist") || die "Could not open id-list file $idlist"; 58 | while() { 59 | @A = split; 60 | @A>=1 || die "Invalid id-list file line $_"; 61 | $seen{$A[0]} = 1; 62 | } 63 | 64 | if ($field == 1) { # Treat this as special case, since it is common. 65 | while(<>) { 66 | $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; 67 | # $1 is what we filter on. 68 | if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { 69 | print $_; 70 | } 71 | } 72 | } else { 73 | while(<>) { 74 | @A = split; 75 | @A > 0 || die "Invalid scp file line $_"; 76 | @A >= $field || die "Invalid scp file line $_"; 77 | if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { 78 | print $_; 79 | } 80 | } 81 | } 82 | 83 | # tests: 84 | # the following should print "foo 1" 85 | # ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) 86 | # the following should print "bar 2". 
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)

--------------------------------------------------------------------------------
/utils/parse_options.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
#                 Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
#
# The sourcing script may run under "set -u" / "set -e", so variables that
# may be unset are only expanded with a default (e.g. ${1:-}).


###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
    . "$config"  # source the config file.
  fi
done


###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "${help_message:-}" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # An option given as the very last argument has no value: "$2" is unset
      # and "shift 2" below would fail without consuming anything, so this
      # loop would never advance. Fail with a clear message instead.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires an argument" 1>&2
        exit 1;
      fi

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done


# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;


true; # so this script returns exit code 0.
--------------------------------------------------------------------------------