├── AudioSourceSeparation ├── README.md ├── pra_ILRMA.ipynb └── pra_ILRMA.py ├── AudioWatermark ├── README.md ├── cepstrum_method.py ├── echo_hiding_method.py ├── lsb_method.py ├── phase_coding_method.py ├── spread_spectrum_method.py ├── svd_cepstrum_method.py ├── svd_dct_method.py ├── svd_stft_method.py ├── svd_stft_method_offdiag.py └── wavelet_method.py ├── DigitalSignalProcessing ├── README.md ├── dsp_add_whitenoise.ipynb ├── dsp_add_whitenoise.py ├── dsp_convolution.py ├── dsp_fir_denoise.py ├── dsp_hilbert.py ├── dsp_rectangle_anime.py ├── dsp_rectangle_fourier.py ├── dsp_sawtooth_anime.py ├── dsp_sawtooth_fourier.py ├── dsp_sine.html ├── dsp_sine.ipynb ├── dsp_sine.py ├── dsp_sine_addnoise.py ├── dsp_sine_addnoise_plot.py ├── dsp_sine_beat.py ├── dsp_sine_euler.py ├── dsp_sine_plot.py ├── dsp_sine_plot_multi.py ├── dsp_triangle_fourier.py ├── dsp_window_blackman.py ├── dsp_window_hamming.py ├── dsp_window_hann.py ├── dsp_window_triangle.py ├── rectangle_anime.mp4 └── sawtooth_anime.mp4 ├── LICENSE ├── PhaseRetrieval ├── README.md ├── phaseret_pghi.py ├── phaseret_rtisila.py ├── phaseret_rtpghi.py └── phaseret_spsi.py ├── README.md ├── SoundEffect ├── README.md ├── pysox_bandpass_bandreject.py ├── pysox_change_bitdepth.ipynb ├── pysox_change_bitdepth.py ├── pysox_change_samplerate.py ├── pysox_downsample.py ├── pysox_echo.ipynb ├── pysox_echo.py ├── pysox_flanger.py ├── pysox_lowpass-highpass.py ├── pysox_pitchshift.ipynb ├── pysox_pitchshift.py ├── pysox_reverb.ipynb ├── pysox_reverb.py ├── pysox_stereo2mono.py ├── pysox_timestretch.ipynb ├── pysox_timestretch.py ├── pysox_tremolo.ipynb ├── pysox_tremolo.py ├── pysox_upsample.py └── pysox_wav2raw.py ├── SpeakerRecognition ├── README.md ├── config.yaml ├── config_sklearn.yaml ├── download_pretrained_model.py ├── download_voicestats_corpus.py ├── extract_sample.py ├── extract_xvector_voicestats.py ├── spk_recog_mlp.py └── spk_recog_mlp_sklearn.py ├── SpeechAnalysis ├── README.md ├── feat_cepstrum.py ├── 
feat_fo_autocorr.ipynb ├── feat_fo_autocorr.py ├── feat_fo_autocorr_variant.ipynb ├── feat_fo_autocorr_variant.py ├── feat_fo_cepstrum.ipynb ├── feat_fo_cepstrum.py ├── feat_fo_cepstrum_sequence.py ├── feat_fo_dio.ipynb ├── feat_fo_dio.py ├── feat_fo_music.ipynb ├── feat_fo_music.py ├── feat_fo_pyin.ipynb ├── feat_fo_pyin.py ├── feat_fo_yin.ipynb ├── feat_fo_yin.py ├── feat_gla.ipynb ├── feat_gla.py ├── feat_gla_admm.ipynb ├── feat_gla_admm.py ├── feat_librosa_gla.ipynb ├── feat_melspec.ipynb ├── feat_melspec.py ├── feat_mfcc.ipynb ├── feat_mfcc.py ├── feat_stft.py ├── feat_stft_istft.py ├── feat_stft_spec.ipynb └── feat_stft_spec.py ├── SpeechAnalysisSynthesis ├── README.md ├── pysptk_anasyn_lpc.ipynb ├── pysptk_anasyn_lpc.py ├── pysptk_anasyn_lsp.ipynb ├── pysptk_anasyn_lsp.py ├── pysptk_anasyn_mlsa.ipynb ├── pysptk_anasyn_mlsa.py ├── pysptk_anasyn_mlsa_others.ipynb ├── pysptk_anasyn_mlsa_others.py ├── pysptk_anasyn_mlsa_pyworld.py ├── pysptk_anasyn_parcor.ipynb ├── pysptk_anasyn_parcor.py ├── pysptk_anasyn_recog.py ├── pyworld_anasyn.ipynb ├── pyworld_anasyn.py └── pyworld_anasyn_encdec.py ├── SpeechRecognition ├── README.md ├── google_mode_modoki.py ├── recog_speech_rec.py ├── recog_wikipedia.py ├── record_speech.py ├── vosk_asr_recorded.py ├── vosk_asr_streaming.py └── vosk_asr_streaming_vad.py ├── SpeechSynthesis ├── README.md ├── synth_gtts.py ├── synth_gtts_gui.py ├── synth_pyopenjtalk.py ├── synth_pyopenjtalk_gui.py ├── synth_pyttsx.py └── synth_ttslearn_multi_gui.py ├── VoiceConversion ├── README.md └── pysimplegui_realtime_vc.py └── WarmUp ├── README.md ├── ffmpeg_mp3_to_wav.py ├── ffmpeg_wav_to_mp3.py ├── librosa_plot_specgram.ipynb ├── librosa_plot_specgram.py ├── librosa_plot_waveform.ipynb ├── librosa_plot_waveform.py ├── plt_specgram.ipynb ├── plt_specgram.py ├── plt_waveform.ipynb ├── plt_waveform.py ├── plt_waveform_scipy.py ├── plt_whitenoise.ipynb ├── pydub_mp3_to_wav.py ├── pydub_wav_to_mp3.py ├── sounddevice_play_wav.py ├── 
sounddevice_rec_wav.py ├── subprocess_play_wav.py ├── wave_change_bitdepth.py ├── wave_change_framerate.py ├── wave_normalize.py ├── wave_play_wav.ipynb ├── wave_read_write.py ├── wave_read_write_scipy.py ├── wave_stereo_to_mono.py └── wave_write_whitenoise.py /AudioSourceSeparation/README.md: -------------------------------------------------------------------------------- 1 | # 音源分離 2 | ## はじめに 3 | ``` 4 | pip3 install pyroomacoustics 5 | pip3 install nussl 6 | ``` 7 | 8 | - pyroomacoustics https://github.com/LCAV/pyroomacoustics 9 | - nussl https://github.com/nussl/nussl 10 | 11 | ### Pythonスクリプト 12 | - pyroomacoustics 13 | - ILRMAベースの音源分離 [pra_ILRMA.py](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/AudioSourceSeparation/pra_ILRMA.py) 14 | 15 | ### Jupyter notebook 16 | - pyroomacoustics 17 | - ILRMAベースの音源分離 [pra_ILRMA.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/AudioSourceSeparation/pra_ILRMA.ipynb) 18 | 19 | 20 | ### Google Colaboratory 21 | - nussl 22 | - AudioSignal入門 [Introduction_to_AudioSignal.ipynb](https://colab.research.google.com/drive/1ntYryCmSam1El-WWIWRzYS8a9f8Fa8d5?usp=sharing) 23 | - STFT表現 [audio_signal_stft.ipynb](https://colab.research.google.com/drive/1ALGz70yCLTn1y6njR4D9DCr5qNIku_la?usp=sharing) 24 | - 周波数マスキング入門 [masking_audio_signal_timefreq.ipynb](https://colab.research.google.com/drive/1qPyDcUAOwsfDZ_X1x_yn1Zqb2Ef52QUr?usp=sharing) 25 | - ローパス・ハイパスフィルタによる音源分離 [high-lowpass_filters.ipynb](https://colab.research.google.com/drive/1tTqqcBgWFK0wGQeZZjJXUGE9_4ja2GM2?usp=sharing) 26 | - 理想バイナリマスクによる音源分離 [ideal_binary_mask.ipynb](https://colab.research.google.com/drive/1sxQu62bunrIcjslTl01HGmwyPdjTM4i4?usp=sharing) 27 | - 理想 ratio マスク(ソフトマスク)による音源分離 [ideal_mask.ipynb](https://colab.research.google.com/drive/1XYMJqc6X_9vKptt5irrGTi-deLoMGwF8?usp=sharing) 28 | - ウィーナーフィルタによる信号復元(音源分離結果の強調) 
[wiener_filter.ipynb](https://colab.research.google.com/drive/1f6fbPZNAG8iO2bgZFyFOlAGPiwx7CTr9?usp=sharing) 29 | - ロバストPCAによる音源分離(歌声と楽曲の分離)[robust_pca.ipynb](https://colab.research.google.com/drive/1S34MIYs-_OCKEt7YULR2MfJpJ_TaOUVx?usp=sharing) 30 | - 独立成分分析による音源分離 [ica.ipynb](https://colab.research.google.com/drive/1q3Pk5EXMS3GXO0kRkms5mxIzbfw0o3dQ?usp=sharing) 31 | - 2次元フーリエ変換による音源分離(歌声と楽曲の分離)[2-d_fourier.ipynb](https://colab.research.google.com/drive/1G6c8SLP6bpnu_3f_AaAk2nK4FgzoSbC8?usp=sharing) 32 | - REPET法による音源分離(歌声と楽曲の分離)[REPET.ipynb](https://colab.research.google.com/drive/1H4IcYHJSD2F9XBjrCNoGtrMjmrg7Up9W?usp=sharing) 33 | - REPET-SIM法による音源分離(歌声と楽曲の分離)[REPETSIM.ipynb](https://colab.research.google.com/drive/12X9Pvv94vcDIQlv1pUYNqt_HsJCVhiWw?usp=sharing) 34 | - Timbre clusteringによる音源分離 [timber_clustering.ipynb](https://colab.research.google.com/drive/1f8sFW6TJaCvyi7YL9tvg-TgTUnBi2Bu_?usp=sharing) 35 | - 調波打楽器音分離 [hpss.ipynb](https://colab.research.google.com/drive/1UKrPpfTMSmDxEOcX5xiqxUXn-ElvD-vB?usp=sharing) 36 | - 空間クラスタリングによる音源分離 [spatial_clustering.ipynb](https://colab.research.google.com/drive/1gYfOZqvtoGL0W00XA-f6Ro16qNev79Dt?usp=sharing) 37 | - PROJET法による音源分離 [PROJET.ipynb](https://colab.research.google.com/drive/15gs2AFfh3Pj60r_Vn21O8-MmXBXL_07x?usp=sharing) 38 | - DUET法によるブラインド音源分離 [DUET.ipynb](https://colab.research.google.com/drive/15BEzg7TWd4yoiTN5nx-5Xh82Mysczkfh?usp=sharing) 39 | -------------------------------------------------------------------------------- /AudioWatermark/README.md: -------------------------------------------------------------------------------- 1 | # 音の電子透かしとステガノグラフィ 2 | 3 | ## はじめに 4 | 音の電子透かしおよびステガノグラフィ技術をPythonで実装するのが目的。 5 | 6 | ## ファイル一覧 7 | ### Pythonスクリプト 8 | - 最下位ビット置換法 [lsb_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/lsb_method.py) 9 | - 拡散スペクトル法
[spread_spectrum_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/spread_spectrum_method.py) 10 | - ケプストラム法 [cepstrum_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/cepstrum_method.py) 11 | - 位相コーディング法 [phase_coding_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/phase_coding_method.py) 12 | - エコーハイディング法 [echo_hiding_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/echo_hiding_method.py) 13 | - ウェーブレット法 [wavelet_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/wavelet_method.py) 14 | - 特異値分解法(STFTに対する)[svd_stft_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/svd_stft_method.py) 15 | - 特異値分解法(複素ケプストラムに対する)[svd_cepstrum_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/svd_cepstrum_method.py) 16 | - 特異値分解法(DCT係数に対する)[svd_dct_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/svd_dct_method.py) 17 | 18 | ### Google Colaboratory 19 | - 最下位ビット置換法 [lsb_method.ipynb](https://colab.research.google.com/drive/1bz8GQZ-IOQ2S7hJELy2xfujzJiddgqeE?usp=sharing) 20 | - 拡散スペクトル法 [spread_spectrum_method.ipynb](https://colab.research.google.com/drive/1yMvfnFOjs2BRsQGhvnypSPyGm4E7DNNq?usp=sharing) 21 | - ケプストラム法 [cepstrum_method.ipynb](https://colab.research.google.com/drive/1IGQXgBiskWaJjhlam8i7m5-ghthsane0?usp=sharing) 22 | - 位相コーディング法 [phase_coding_method.ipynb](https://colab.research.google.com/drive/1djdRBmzbbFYJIqgC_EbSiKFHfPk2YGa7?usp=sharing) 23 | - エコーハイディング法 [echo_hiding_method.ipynb](https://colab.research.google.com/drive/1NFVCjcVUCG8NNlkzQ6hUelzUtcK9429H?usp=sharing) 24 | - ウェーブレット法 [wavelet_method.ipynb](https://colab.research.google.com/drive/1k8yiN1BzevJI7DjEl58NGDYuW3s4IFnb?usp=sharing) 25 | -
特異値分解法(STFTに対する)[svd_stft_method.ipynb](https://colab.research.google.com/drive/13m1Q_J5UNrTHG-DOMifHiFrYW5LP4wnZ?usp=sharing) 26 | - 特異値分解法(複素ケプストラムに対する)[svd_cepstrum_method.ipynb](https://colab.research.google.com/drive/1hXvO6HqfLm1mKUXK5NDF54NDqEGk7lD0?usp=sharing) 27 | - 特異値分解法(DCT係数に対する)[svd_dct_method.ipynb](https://colab.research.google.com/drive/1Xb0s4Aa9YfCXW8J8R6wYv9n74f7GqYPK?usp=sharing) 28 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_add_whitenoise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 28 | # Commentary: 29 | # - 音声に白色雑音を混ぜる 30 | # - scipyを用いたwav出力 31 | # - matplotlibによるプロット(元音声と雑音入り音声) 32 | 33 | import matplotlib.pyplot as plt 34 | import numpy as np 35 | from scipy.io import wavfile 36 | 37 | IN_WAVE_FILE = "in.wav" # モノラル音声(前提) 38 | OUT_WAVE_FILE = "out_whitenoise.wav" 39 | 40 | # 音声データ読み込み (fsがサンプリング周波数、dataは音声データ) 41 | fs, speech_data = wavfile.read(IN_WAVE_FILE) 42 | 43 | # 音声データの長さ 44 | n_speech = len(speech_data) 45 | 46 | # 雑音だけの区間の長さ 47 | n_noise = 4000 48 | 49 | # 全体の長さ 50 | n_samples = n_noise + n_speech 51 | 52 | # 白色雑音を生成 53 | white_noise = np.random.normal(scale=0.04, size=n_samples) 54 | 55 | # 2バイトのデータとして書き込むためにスケールを調整 56 | white_noise = white_noise * np.iinfo(np.int16).max 57 | 58 | # ゲインを調整 59 | white_noise = 0.5 * white_noise 60 | 61 | # 白色雑音を混ぜる 62 | mixed_signal = white_noise # 最初に雑音を入れる 63 | mixed_signal[n_noise:] += speech_data # 後から音声を足す 64 | 65 | # wavの書き込み (scipyモジュール) 66 | mixed_signal = mixed_signal.astype(np.int16) # 16bit整数に変換 67 | wavfile.write(OUT_WAVE_FILE, fs, mixed_signal) 68 | 69 | # プロット枠を確保 (10がヨコのサイズ、4はタテのサイズ) 70 | fig = plt.figure(figsize=(12, 8)) 71 | axes1 = fig.add_subplot(2, 1, 1) 72 | n_samples = len(speech_data) 73 | time = np.arange(n_samples) / fs 74 | axes1.plot(time, speech_data) # 音声データのプロット 75 | axes1.set_xlabel("Time (sec)") # x軸のラベル 76 | axes1.set_ylabel("Amplitude") # y軸のラベル 77 | axes1.set_title("Original speech") 78 | 79 | axes2 = fig.add_subplot(2, 1, 2) 80 | n_samples = len(mixed_signal) 81 | time = np.arange(n_samples) / fs 82 | axes2.plot(time, mixed_signal) # 音声データのプロット 83 | axes2.set_xlabel("Time (sec)") # x軸のラベル 84 | axes2.set_ylabel("Amplitude") # y軸のラベル 85 | axes2.set_title("Mixed speech (original + white noise)") 86 | 87 | # 画像を画面表示 88 | plt.tight_layout() 89 | plt.show() 90 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_convolution.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020-2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - 畳み込みをスクラッチで実装する 30 | 31 | import matplotlib.pyplot as plt 32 | import numpy as np 33 | 34 | # input signal 35 | x = np.zeros(32, dtype=np.float32) 36 | x[0], x[20] = 2.0, 1.0 37 | 38 | # Impulse response (waves that decay while oscillating) 39 | h = np.exp(- np.arange(16) / 4.0) * np.sin(2.0 * np.pi * np.arange(16) / 15.0) 40 | 41 | # output signal 42 | y = np.zeros(len(h) + len(x) - 1, dtype=np.float32) 43 | hzero = np.hstack([h, np.zeros(len(x) - 1)]) # zero padding 44 | xzero = np.hstack([x, np.zeros(len(h) - 1)]) # zero padding 45 | 46 | # convolution 47 | for n in range(0, len(y)): 48 | for k in range(0, n + 1): 49 | y[n] = y[n] + hzero[k] * xzero[n - k] 50 | 51 | fig = plt.figure(figsize=(18, 4)) 52 | for i, (s, l) in enumerate(zip([x, h, y], ["input", "impulse response", "output"])): 53 | fig.add_subplot("13%d" % (i + 1)) 54 | plt.plot(s, "-o", label=l) 55 | plt.xlim(0, len(y)) 56 | plt.legend() 57 | plt.xlabel("Time index") 58 | plt.ylabel("Magnitude") 59 | plt.grid() 60 | 61 | plt.show() 62 | 63 | # numpy implementation 64 | y_true = np.convolve(h, x, "full") 65 | fig = plt.figure(figsize=(18, 4)) 66 | for i, (s, l) in enumerate(zip([x, h, y_true], 67 | ["input", "impulse response (numpy)", "output"])): 68 | fig.add_subplot("13%d" % (i + 1)) 69 | plt.plot(s, "-o", label=l) 70 | plt.xlim(0, len(y_true)) 71 | plt.legend() 72 | plt.xlabel("Time index") 73 | plt.ylabel("Magnitude") 74 | plt.grid() 75 | 76 | plt.show() 77 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_fir_denoise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - Create a digital sine wave 30 | # - Run the convolution with numpy (np.convolve) 31 | # - Remove white noise with a moving-average filter 32 | 33 | import matplotlib.pyplot as plt 34 | import numpy as np 35 | 36 | n_framerate = 1000 # sampling frequency (Hz) 37 | 38 | freq = 4 # frequency of the sine wave (Hz) 39 | duration = 1 # duration of the signal (sec) 40 | amplitude = 100 # amplitude of the sine wave 41 | 42 | noise_gain = 10 # gain of the noise 43 | 44 | T = 1.0 / n_framerate # sampling period (sec) 45 | 46 | # Filter coefficients: COEF_LEN equal taps that sum to 1 (moving average) 47 | COEF_LEN = 10 48 | coef = np.ones(COEF_LEN) / COEF_LEN 49 | 50 | # Create the sine wave 51 | time = np.arange(0, duration, T) # sample instants covering the duration 52 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time) 53 | 54 | # Create white noise 55 | noise = np.random.randn(len(time)) 56 | noise *= noise_gain 57 | 58 | # Add the noise to the sine wave 59 | sine_wave_noised = sine_wave + noise 60 | 61 | # Apply the moving-average filter; "valid" keeps only fully overlapping positions, so the output is COEF_LEN - 1 samples shorter than the input 62 | sine_wave_convolved = np.convolve(sine_wave_noised, coef, "valid") 63 | signal_len = len(sine_wave_convolved) 64 | 65 | # Compare the noisy signal with the denoised signal (both limited to signal_len samples) 66 | plt.plot(time[:signal_len], sine_wave_noised[:signal_len], label="noised") 67 | plt.plot(time[:signal_len], sine_wave_convolved, label="denoised", linewidth=2) 68 | plt.xlabel("Time (sec)") 69 | plt.ylabel("Amplitude") 70 | plt.ylim(-amplitude - 3.0 * noise_gain, amplitude + 3.0 * noise_gain) 71 | plt.title("Denoising by convolution") 72 | plt.legend() 73 | plt.show() 74 | 75 | # Compare the clean (pre-noise) signal with the denoised signal 76 | plt.plot(time[:signal_len], sine_wave[:signal_len], label="original") 77 | plt.plot(time[:signal_len], sine_wave_convolved, label="denoised", linewidth=2) 78 | plt.xlabel("Time (sec)") 79 | plt.ylabel("Amplitude") 80 | plt.ylim(-amplitude - 3.0 * noise_gain, amplitude + 3.0 * noise_gain) 81 | plt.title("Denoising by convolution") 82 | plt.legend() 83 | plt.show() 84 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_hilbert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - ディジタルな正弦波を作成する 30 | # - 周波数の近い2つの正弦波を重ね合わせて「うなり」を発生させる 31 | # - ヒルベルト変換による包絡線および瞬時位相の抽出 32 | # - 包絡線と瞬時位相から波形の再構成 33 | 34 | import numpy as np 35 | import scipy.signal as signal 36 | import matplotlib.pyplot as plt 37 | 38 | n_framerate = 16000 # 標本化周波数 (Hz) 39 | 40 | freq1 = 6 # 正弦波の周波数 (Hz) 41 | freq2 = 4 # 正弦波の周波数 (Hz) 42 | duration = 2 # 音の継続時間 (sec) 43 | amplitude = 1.0 # 正弦波の振幅 44 | 45 | T = 1.0 / n_framerate # 標本化周期 (sec) 46 | 47 | # 正弦波作成 48 | time = np.arange(0, duration, T) # 継続時間に等しい標本点の作成 49 | sine_wave1 = amplitude * np.sin(2 * np.pi * freq1 * time) 50 | sine_wave2 = amplitude * np.sin(2 * np.pi * freq2 * time) 51 | 52 | # うなり発生 53 | sine_wave = sine_wave1 + sine_wave2 54 | 55 | # ヒルベルト変換 (FFT -> 虚部0 & 実部2倍 -> 逆FFT) 56 | envelop = np.abs(signal.hilbert(sine_wave)) # 包絡 57 | angle = np.unwrap(np.angle(signal.hilbert(sine_wave))) # 瞬時位相 58 | 59 | # 波形と包絡線のプロット 60 | fig = plt.figure(figsize=(10, 6)) 61 | plt.xlabel("Time [s]") 62 | plt.ylabel("Amplitude") 63 | plt.title("Original waveform & envelop") 64 | plt.plot(time, sine_wave, label="original") 65 | plt.plot(time, envelop, label="upper envelop") # 上側の包絡 66 | plt.plot(time[::-1], -envelop, label="lower envelop") # 下側の包絡 67 | plt.ylim(-3.2, 3.2) 68 | plt.legend() 69 | plt.show() 70 | 71 | # 瞬時位相のプロット 72 | fig = plt.figure(figsize=(10, 6)) 73 | plt.xlabel("Time [s]") 74 | plt.ylabel("Phase [rad]") 75 | plt.title("Instantatenous phase") 76 | plt.plot(time, angle) 77 | plt.show() 78 | 79 | # オリジナルの波形と再構成後の波形 80 | reconst = envelop * np.cos(angle) # 再構成 81 | fig = plt.figure(figsize=(10, 6)) 82 | plt.xlabel("Time [s]") 83 | plt.ylabel("Amplitude") 84 | plt.title("Original & reconstructed waveform") 85 | plt.plot(time, sine_wave, label="original", linewidth=3) 86 | plt.plot(time, reconst, label="reconstructed") 87 | plt.ylim(-3.2, 3.2) 88 | plt.legend() 89 | plt.show() 90 | -------------------------------------------------------------------------------- 
/DigitalSignalProcessing/dsp_rectangle_fourier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - Create a rectangular wave by Fourier series approximation 30 | # - Write it to a wav file with scipy 31 | 32 | import numpy as np 33 | import numpy.matlib 34 | from scipy.io import wavfile 35 | 36 | OUT_WAVE_FILE = "out_rectangle.wav" 37 | 38 | sample_rate = 16000 # sampling frequency (Hz) 39 | freq = 500 # frequency of the rectangular wave (Hz) 40 | duration = 1 # duration of the rectangular wave (sec) 41 | amplitude = 8000 # amplitude (gain) 42 | order = 1000 # upper bound (exclusive) on the harmonic order used in the series; NOTE(review): orders * freq goes far above sample_rate / 2 - confirm the aliasing is acceptable 43 | 44 | period = 1.0 / freq # period of the rectangular wave (sec) 45 | 46 | # Number of samples 47 | sample_num = int(np.floor(duration * sample_rate)) 48 | 49 | # Sample instants (sec) 50 | time_axis = np.arange(0, sample_num).T / sample_rate 51 | 52 | # Harmonic orders of the Fourier series (1st, 3rd, 5th, ...: odd harmonics only) 53 | orders = np.arange(1, order, 2) # arguments: start, stop, step 54 | 55 | # Fourier coefficients of the rectangular wave; NOTE(review): duration is used as a scale factor here (no effect while duration = 1) - confirm this is intended 56 | coef = 2 * duration / (np.pi * orders) * np.cos(np.pi * orders) 57 | 58 | # Series approximation: each sample is the coefficient-weighted sum of the harmonic sines 59 | rectwav = np.empty(sample_num) 60 | for n, t in enumerate(time_axis): 61 | rectwav[n] = coef.dot(np.sin(2 * np.pi * orders * t / period)) 62 | 63 | rectwav *= amplitude 64 | 65 | # Write the wav file 66 | rectwav = rectwav.astype(np.int16) # convert to 16-bit integers 67 | wavfile.write(OUT_WAVE_FILE, sample_rate, rectwav) 68 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_sawtooth_fourier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - Create a sawtooth wave by Fourier series approximation 30 | # - Write it to a wav file with scipy 31 | 32 | import numpy as np 33 | import numpy.matlib 34 | from scipy.io import wavfile 35 | 36 | OUT_WAVE_FILE = "out_sawtooh.wav" # NOTE(review): "sawtooh" looks like a typo for "sawtooth"; left unchanged because it is the output file name 37 | 38 | sample_rate = 16000 # sampling frequency (Hz) 39 | freq = 500 # frequency of the sawtooth wave (Hz) 40 | duration = 1 # duration of the sawtooth wave (sec) 41 | amplitude = 8000 # amplitude (gain) 42 | order = 1000 # upper bound (exclusive) on the harmonic order used in the series; NOTE(review): orders * freq goes far above sample_rate / 2 - confirm the aliasing is acceptable 43 | 44 | period = 1.0 / freq # period of the sawtooth wave (sec) 45 | 46 | # Number of samples 47 | sample_num = int(np.floor(duration * sample_rate)) 48 | 49 | # Sample instants (sec) 50 | time_axis = np.arange(0, sample_num).T / sample_rate 51 | 52 | # Harmonic orders of the Fourier series (1st, 2nd, 3rd, ...)
53 | orders = np.arange(1, order) # arguments: start, stop (step defaults to 1) 54 | 55 | # Fourier coefficients of the sawtooth wave; NOTE(review): duration is used as a scale factor here (no effect while duration = 1) - confirm this is intended 56 | coef = -1.0 * duration / (np.pi * orders) * np.cos(np.pi * orders) 57 | 58 | # Series approximation: each sample is the coefficient-weighted sum of the harmonic sines 59 | sawtooth_wav = np.empty(sample_num) 60 | for n, t in enumerate(time_axis): 61 | sawtooth_wav[n] = coef.dot(np.sin(2 * np.pi * orders * t / period)) 62 | 63 | sawtooth_wav *= amplitude 64 | 65 | # Write the wav file 66 | sawtooth_wav = sawtooth_wav.astype(np.int16) # convert to 16-bit integers 67 | wavfile.write(OUT_WAVE_FILE, sample_rate, sawtooth_wav) 68 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_sine.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 28 | # Commentary: 29 | # - Create a digital sine wave 30 | # - Choose the sine frequency so that the wave is audible 31 | # - Write a wav file with the wave module 32 | # - Write a wav file with scipy 33 | 34 | import wave 35 | import numpy as np 36 | from scipy.io import wavfile 37 | 38 | OUT_WAVE_FILE = "out_wave.wav" 39 | OUT_SCIPY_WAVE_FILE = "out_scipy.wav" 40 | 41 | n_channel = 1 # monaural 42 | bitdepth = 2 # quantization: 16 bit (2 bytes) 43 | n_framerate = 16000 # sampling frequency (Hz) 44 | 45 | freq = 1000 # frequency of the sine wave (Hz) 46 | duration = 2 # duration of the sound (sec) 47 | amplitude = 8000 # amplitude of the sine wave 48 | 49 | T = 1.0 / n_framerate # sampling period (sec) 50 | 51 | # Create the sine wave 52 | time = np.arange(0, duration, T) # sample instants covering the duration 53 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time) 54 | 55 | # Number of samples 56 | n_frames = len(sine_wave) 57 | 58 | # Convert to a bytes object (16-bit integer samples) 59 | sound_frames = sine_wave.astype(np.int16).tobytes() 60 | 61 | # Write the wav file (wave module) 62 | with wave.open(OUT_WAVE_FILE, "w") as sound: 63 | sound.setnchannels(n_channel) # number of channels 64 | sound.setsampwidth(bitdepth) # sample width in bytes (not bits!) 65 | sound.setframerate(n_framerate) # sampling frequency (Hz) 66 | sound.setnframes(n_frames) # number of samples per channel 67 | sound.writeframes(sound_frames) # write the audio data 68 | 69 | # Write the wav file (scipy) -> the easy way 70 | sine_wave = sine_wave.astype(np.int16) # convert to 16-bit integers 71 | wavfile.write(OUT_SCIPY_WAVE_FILE, n_framerate, sine_wave) 72 | -------------------------------------------------------------------------------- /DigitalSignalProcessing/dsp_sine_addnoise.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Create a digital sine wave
# - Generate white noise and superimpose it on the sine wave
# - Save the noisy sine wave as a wav file

import numpy as np
from scipy.io import wavfile

OUT_WAVE_FILE = "out_wave_noised.wav"

n_framerate = 16000  # sampling frequency (Hz)

freq = 1000  # sine frequency (Hz)
duration = 1  # length of the sound (sec)
amplitude = 8000  # peak amplitude of the sine wave

noise_gain = 2000  # gain applied to the unit-variance noise

T = 1.0 / n_framerate  # sampling period (sec)

# Sine wave sampled over `duration` seconds.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# White Gaussian noise, one value per sample, scaled by the gain.
noise = np.random.randn(len(time))
noise *= noise_gain

# Superimpose the noise on the sine wave.
sine_wave_noised = sine_wave + noise

# Convert the *noisy* signal to 16-bit integers before writing.
# The previous code converted the clean sine wave and then wrote the float64
# noisy array, so the wav file did not contain 16-bit PCM at all.
# Clipping first prevents integer wrap-around should a noise peak exceed the
# int16 range.
_int16 = np.iinfo(np.int16)
sine_wave_noised = np.clip(sine_wave_noised, _int16.min, _int16.max)
sine_wave_noised = sine_wave_noised.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, n_framerate, sine_wave_noised)
# Commentary:
# - Create a digital sine wave
# - Generate white noise and superimpose it on the sine wave
# - Plot the waveform before and after adding the noise

import numpy as np
import matplotlib.pyplot as plt

n_framerate = 1000  # sampling frequency (Hz)

freq = 4  # sine frequency (Hz)
duration = 1  # length of the signal (sec)
amplitude = 100  # peak amplitude of the sine wave

noise_gain = 10  # gain applied to the unit-variance noise

T = 1.0 / n_framerate  # sampling period (sec)

# Clean sine wave.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# White Gaussian noise, scaled by the gain, one value per sample.
noise = np.random.randn(time.size)
noise *= noise_gain

# Noisy version of the sine wave.
sine_wave_noised = sine_wave + noise

# Draw both curves on a single set of axes; the time axis is rebuilt from
# the sample count, exactly as the plot expects seconds on the x-axis.
fig = plt.figure(figsize=(10, 6))
n_samples = len(sine_wave)
time = np.arange(n_samples) / n_framerate
axes = fig.add_subplot(1, 1, 1)
axes.plot(time, sine_wave_noised, label="noised")
axes.plot(time, sine_wave, label="original", linewidth=3)
axes.set_xlabel("Time (sec)")
axes.set_ylabel("Amplitude")
axes.set_title("Waveform")
axes.legend()
plt.show()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Create digital sine waves
# - Choose the frequencies so that the waves are audible
# - Add two sine waves of nearly equal frequency to produce a "beat"
# - Write the result to a wav file with scipy

import numpy as np
from scipy.io import wavfile

OUT_WAVE_FILE = "out_wave_beat.wav"

n_framerate = 16000  # sampling frequency (Hz)

freq1 = 500  # frequency of the first sine wave (Hz)
freq2 = 504  # frequency of the second sine wave (Hz)
duration = 2  # length of the sound (sec)
amplitude = 8000  # peak amplitude of each sine wave

T = 1.0 / n_framerate  # sampling period (sec)

# Sample instants covering `duration` seconds.
time = np.arange(0, duration, T)


def _sine(frequency):
    """Return the sampled sine wave of the given frequency (Hz)."""
    return amplitude * np.sin(2 * np.pi * frequency * time)


sine_wave1 = _sine(freq1)
sine_wave2 = _sine(freq2)

# The sum of the two tones beats at |freq1 - freq2| Hz; quantize it to
# 16-bit integers and hand it to scipy for writing.
sine_wave = (sine_wave1 + sine_wave2).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, n_framerate, sine_wave)
# Commentary:
# - Build a complex sinusoid with Euler's formula
# - Scatter-plot the real part against the imaginary part
# - Plot the real part and the imaginary part over time
# - Plot the absolute value and the phase over time

import numpy as np
import matplotlib.pyplot as plt

# Not used by this script (nothing here writes a wav file); kept so the
# module still exposes the same names as before.
OUT_WAVE_FILE = "out_wave_beat.wav"

n_framerate = 16000  # sampling frequency (Hz)

freq = 2  # frequency of the sinusoid (Hz)
duration = 1  # length of the signal (sec)
amplitude = 2.0  # amplitude of the sinusoid

T = 1.0 / n_framerate  # sampling period (sec)

# Sample instants covering `duration` seconds.
time = np.arange(0, duration, T)

# Instantaneous phase of the sinusoid.
phase = 2.0 * np.pi * freq * time

# Complex exponential: amplitude * exp(j * phase).
complex_exp = amplitude * np.exp(1j * phase)

# Real part versus imaginary part in the complex plane.
plt.figure(figsize=(6, 6))  # square figure so the circle is not distorted
plt.scatter(complex_exp.real, complex_exp.imag)
plt.xlabel('Real part')
# Fixed: this used to be a second xlabel() call, which overwrote the x label
# and left the y-axis without any label.
plt.ylabel('Imaginary part')
plt.show()

# Real part and imaginary part as functions of time.
plt.figure(figsize=(10, 7))
plt.subplot(2, 1, 1)
plt.plot(time, complex_exp.real)
plt.xlabel("Time (sec)")
plt.ylabel("Real part")
plt.subplot(2, 1, 2)
plt.plot(time, complex_exp.imag)
plt.xlabel("Time (sec)")
plt.ylabel("Imaginary part")
plt.show()

# Absolute value and phase recovered from the complex signal. These two
# assignments intentionally rebind `amplitude` and `phase` to arrays.
amplitude = np.abs(complex_exp)
phase = np.angle(complex_exp)
plt.figure(figsize=(10, 7))
plt.subplot(2, 1, 1)
plt.plot(time, amplitude)
plt.xlabel("Time (sec)")
plt.ylabel("Absolute value")
plt.subplot(2, 1, 2)
plt.plot(time, phase)
plt.xlabel("Time (sec)")
plt.ylabel("Phase")
plt.show()
# Commentary:
# - Create a digital sine wave and plot it

import numpy as np
import matplotlib.pyplot as plt

samplerate = 16000  # sampling frequency (Hz)
freq = 3  # sine frequency (Hz)
duration = 2  # length of the signal (sec)
amplitude = 8000  # peak amplitude of the sine wave

T = 1.0 / samplerate  # sampling period (sec)

# Sample instants covering `duration` seconds, then the sine itself.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# Draw the waveform on a single set of axes.
_, axes = plt.subplots()
axes.plot(time, sine_wave)
axes.set_xlabel("Time (sec)")
axes.set_ylabel("Amplitude")
axes.set_title("Sine Wave")
plt.show()
# Commentary:
# - Build a triangle wave from its Fourier-series approximation
# - Write the result to a wav file with scipy

import matplotlib.pyplot as plt
import numpy as np
import numpy.matlib
from scipy.io import wavfile

OUT_WAVE_FILE = "out_triangle.wav"

sample_rate = 16000  # sampling frequency (Hz)
freq = 500  # fundamental frequency of the triangle wave (Hz)
duration = 1  # length of the sound (sec)
amplitude = 8000  # amplitude (gain)
order = 1000  # upper bound on the harmonic order used in the series

period = 1.0 / freq  # period of the triangle wave (sec)

# Number of samples.
sample_num = int(np.floor(duration * sample_rate))

# Sample instants in seconds.
time_axis = np.arange(0, sample_num) / sample_rate

# Odd harmonics only (1st, 3rd, 5th, ...): arange(start, stop, step).
orders = np.arange(1, order, 2)

# Fourier coefficients of a unit-amplitude triangle wave:
#   (8 / pi^2) * sin(n * pi / 2) / n^2
# The coefficient used to be multiplied by `duration` as well; that only
# worked because duration == 1 and would have rescaled the waveform for any
# other length, so the factor has been dropped.
coef = 1.0 / (orders * orders) * np.sin(orders * np.pi / 2.0)
coef *= 8.0 / (np.pi * np.pi)

# Angular frequency of every harmonic, computed once instead of per sample.
harmonic_omega = 2 * np.pi * orders / period

# Evaluate the truncated series sample by sample (keeps memory use small).
triwav = np.empty(sample_num)
for n, t in enumerate(time_axis):
    triwav[n] = coef.dot(np.sin(harmonic_omega * t))

triwav *= amplitude

# Plot against time so that the x-axis matches its "Time (sec)" label
# (the waveform used to be plotted against the sample index).
plt.plot(time_axis, triwav)
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Waveform")
plt.show()

# Quantize to 16-bit integers and write the wav file.
triwav = triwav.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, sample_rate, triwav)
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Create a digital sine wave
# - Build a Blackman window with scipy's signal module
# - Build a Blackman window directly from its defining formula

import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

n_framerate = 2000  # sampling frequency (Hz)

freq = 20  # sine frequency (Hz)
duration = 1  # length of the signal (sec)
amplitude = 8000  # peak amplitude of the sine wave

T = 1.0 / n_framerate  # sampling period (sec)

# Blackman window from scipy. The window functions live in
# scipy.signal.windows; the old scipy.signal.blackman alias is deprecated
# and no longer available in recent SciPy releases.
window_len = 1025
blackman_window = signal.windows.blackman(window_len)

# Blackman window from the definition, evaluated for all indices at once:
#   w[n] = 0.42 - 0.5 cos(2 pi n / (N - 1)) + 0.08 cos(4 pi n / (N - 1))
n = np.arange(window_len)
blackman_window_scratch = (
    0.42
    - 0.5 * np.cos(2 * np.pi * n / (window_len - 1))
    + 0.08 * np.cos(4 * np.pi * n / (window_len - 1))
)

# Compare the scipy window with the one built from the definition.
plt.plot(blackman_window, label="scipy", linewidth=3)
plt.plot(blackman_window_scratch, label="scratch")
plt.xlabel("Index")
plt.ylabel("Amplitude")
plt.title("Blackman window")
plt.legend()
plt.show()

# Sine wave sampled over `duration` seconds.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# Apply the window to the first `window_len` samples of the sine wave.
windowed = sine_wave[:window_len] * blackman_window

# Original versus windowed sine wave.
plt.plot(time[:window_len], sine_wave[:window_len], label="original")
plt.plot(time[:window_len], windowed, label="windowed")
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Sine Wave")
plt.legend()
plt.show()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Create a digital sine wave
# - Build a Hamming window with scipy's signal module
# - Build a Hamming window directly from its defining formula

import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

n_framerate = 2000  # sampling frequency (Hz)

freq = 20  # sine frequency (Hz)
duration = 1  # length of the signal (sec)
amplitude = 8000  # peak amplitude of the sine wave

T = 1.0 / n_framerate  # sampling period (sec)

# Hamming window from scipy. The window functions live in
# scipy.signal.windows; the old scipy.signal.hamming alias is deprecated
# and no longer available in recent SciPy releases.
WINDOW_LEN = 1025
hamming_window = signal.windows.hamming(WINDOW_LEN)

# Hamming window from the definition, evaluated for all indices at once:
#   w[n] = 0.54 - 0.46 cos(2 pi n / (N - 1))
n = np.arange(WINDOW_LEN)
hamming_window_scratch = 0.54 - 0.46 * np.cos(2 * np.pi * n / (WINDOW_LEN - 1))

# Compare the scipy window with the one built from the definition.
plt.plot(hamming_window, label="scipy", linewidth=3)
plt.plot(hamming_window_scratch, label="scratch")
plt.xlabel("Index")
plt.ylabel("Amplitude")
plt.title("Hamming window")
plt.legend()
plt.show()

# Sine wave sampled over `duration` seconds.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# Apply the window to the first WINDOW_LEN samples of the sine wave.
windowed = sine_wave[:WINDOW_LEN] * hamming_window

# Original versus windowed sine wave.
plt.plot(time[:WINDOW_LEN], sine_wave[:WINDOW_LEN], label="original")
plt.plot(time[:WINDOW_LEN], windowed, label="windowed")
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Sine Wave")
plt.legend()
plt.show()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Create a digital sine wave
# - Build a Hann window with scipy's signal module
# - Build a Hann window directly from its defining formula

import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

n_framerate = 2000  # sampling frequency (Hz)

freq = 20  # sine frequency (Hz)
duration = 1  # length of the signal (sec)
amplitude = 8000  # peak amplitude of the sine wave

T = 1.0 / n_framerate  # sampling period (sec)

# Hann window from scipy. The window functions live in
# scipy.signal.windows; the old scipy.signal.hann alias is deprecated
# and no longer available in recent SciPy releases.
WINDOW_LEN = 1025
hann_window = signal.windows.hann(WINDOW_LEN)

# Hann window from the definition, evaluated for all indices at once:
#   w[n] = 0.5 - 0.5 cos(2 pi n / (N - 1))
n = np.arange(WINDOW_LEN)
hann_window_scratch = 0.5 - 0.5 * np.cos(2 * np.pi * n / (WINDOW_LEN - 1))

# Compare the scipy window with the one built from the definition.
plt.plot(hann_window, label="scipy", linewidth=3)
plt.plot(hann_window_scratch, label="scratch")
plt.xlabel("Index")
plt.ylabel("Amplitude")
plt.title("Hann window")
plt.legend()
plt.show()

# Sine wave sampled over `duration` seconds.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# Apply the window to the first WINDOW_LEN samples of the sine wave.
windowed = sine_wave[:WINDOW_LEN] * hann_window

# Original versus windowed sine wave.
plt.plot(time[:WINDOW_LEN], sine_wave[:WINDOW_LEN], label="original")
plt.plot(time[:WINDOW_LEN], windowed, label="windowed")
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Sine Wave")
plt.legend()
plt.show()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020-2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Create a digital sine wave
# - Build a triangular window with scipy's signal module
# - Build a triangular window directly from its defining formula

import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

n_framerate = 2000  # sampling frequency (Hz)

freq = 20  # sine frequency (Hz)
duration = 1  # length of the signal (sec)
amplitude = 8000  # peak amplitude of the sine wave

T = 1.0 / n_framerate  # sampling period (sec)

# Triangular window from scipy. The window functions live in
# scipy.signal.windows; the old scipy.signal.triang alias is deprecated
# and no longer available in recent SciPy releases.
window_len = 1025
triangle_window = signal.windows.triang(window_len)

# Triangular window from the definition, evaluated for all indices at once:
#   w[n] = 2 n / (N - 1)        for n <  N // 2
#   w[n] = 2 - 2 n / (N - 1)    for n >= N // 2
# This form is exactly zero at both end points, so small differences from
# the scipy curve near the edges are expected.
n = np.arange(window_len)
ramp = 2 * n / (window_len - 1)
triangle_window_scratch = np.where(n < window_len // 2, ramp, 2 - ramp)

# Compare the scipy window with the one built from the definition.
plt.plot(triangle_window, label="scipy", linewidth=3)
plt.plot(triangle_window_scratch, label="scratch")
plt.xlabel("Index")
plt.ylabel("Amplitude")
plt.title("Triangle window")
plt.legend()
plt.show()

# Sine wave sampled over `duration` seconds.
time = np.arange(0, duration, T)
sine_wave = amplitude * np.sin(2 * np.pi * freq * time)

# Apply the window to the first `window_len` samples of the sine wave.
windowed = sine_wave[:window_len] * triangle_window

# Original versus windowed sine wave.
plt.plot(time[:window_len], sine_wave[:window_len], label="original")
plt.plot(time[:window_len], windowed, label="windowed")
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Sine Wave")
plt.legend()
plt.show()
/DigitalSignalProcessing/sawtooth_anime.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tam17aki/speech_process_exercise/9d5e1359b948d66046744cc0c461d43d20e1ec66/DigitalSignalProcessing/sawtooth_anime.mp4 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Akira TAMAMORI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /PhaseRetrieval/README.md: -------------------------------------------------------------------------------- 1 | # 位相復元 2 | 3 | ## はじめに 4 | 5 | ``` 6 | pip3 install numpy 7 | pip3 install soundfile 8 | pip3 install oct2py 9 | pip3 install scipy 10 | ``` 11 | 12 | Oct2Py経由でMATLAB/GNU Octave用ライブラリLTFATとPHASERETを利用し,音声の位相復元を実装する. 13 | 14 | 事前にOctaveのインストールを済ませておく. 15 | 16 | 1. GitHubからltfatの[最新版](https://github.com/ltfat/ltfat)をダウンロードし,適切な場所で解凍する. 17 | 18 | 例えばパスは /home/hoge/ltfat-main とする 19 | 20 | 2. GitHubからphaseretの[最新版](https://github.com/ltfat/phaseret)をダウンロードし, ltfat-main直下に解凍する. 21 | 22 | 例えばパスは /home/hoge/ltfat-main/phaseret-main とする 23 | 24 | 3. ltfat-mainに移動して octave を起動し, 25 | 26 | ``` 27 | octave> ltfatstart; 28 | octave> ltfatmex; 29 | ``` 30 | によって事前にライブラリのコンパイルを済ませておく('octave>' はプロンプト). 31 | 32 | octave上からphaseret-mainに移動して,同様にコンパイルを済ませておく. 33 | 34 | ``` 35 | octave> phaseretstart; 36 | octave> phaseretmex; 37 | ``` 38 | 39 | ## ファイル一覧 40 | ### Pythonスクリプト 41 | - Single Pass Spectrogram Inversion (SPSI) による位相復元 [phaseret_spsi.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_spsi.py) 42 | - Phase Gradient Heap Integration (PGHI) による位相復元 [phaseret_pghi.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_pghi.py) 43 | - Real-Time Phase Gradient Heap Integration (RTPGHI) による位相復元 [phaseret_rtpghi.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_rtpghi.py) 44 | - Real-Time Iterative Spectrogram Inversion with Look Ahead (RTISILA) による位相復元 [phaseret_rtisila.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_rtisila.py) 45 | -------------------------------------------------------------------------------- /PhaseRetrieval/phaseret_pghi.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Demonstration of Phase Gradient Heap Integration (PGHI). 3 | 4 | Copyright (C) 2024 by Akira TAMAMORI 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
import argparse
from pathlib import Path

import numpy as np
import soundfile as sf
from oct2py import octave
from scipy import signal


def main():
    """Read a wav file, discard its phase, and rebuild it with PGHI.

    The magnitude spectrogram is computed with scipy's ShortTimeFFT, the
    phase is estimated by the Octave function ``pghi`` (called through
    oct2py), and the inverse STFT of the result is written to the output
    wav file. All settings come from command-line options.
    """
    # Command-line options, declared as (flag, add_argument keywords).
    cli_options = (
        ("--ltfat_dir", {"type": str, "default": "/work/tamamori/ltfat-main"}),
        ("--win_len", {"type": int, "default": 512}),
        ("--hop_len", {"type": int, "default": 128}),
        ("--fft_len", {"type": int, "default": 512}),
        ("--window", {"type": str, "default": "hann"}),
        ("--in_wavdir", {"type": str, "default": "/home/tamamori"}),
        ("--in_wav", {"type": str, "default": "in.wav"}),
        ("--out_wavdir", {"type": str, "default": "/home/tamamori"}),
        ("--out_wav", {"type": str, "default": "out.wav"}),
    )
    arg_parser = argparse.ArgumentParser()
    for flag, keywords in cli_options:
        arg_parser.add_argument(flag, **keywords)
    opts = arg_parser.parse_args()

    # Make the Octave toolboxes under --ltfat_dir visible, then run their
    # start-up functions (both are called with the argument 0).
    toolbox_paths = octave.genpath(opts.ltfat_dir)
    octave.addpath(toolbox_paths)
    octave.ltfatstart(0)
    octave.phaseretstart(0)

    # Magnitude spectrogram of the input audio.
    waveform, sample_rate = sf.read(Path(opts.in_wavdir, opts.in_wav))
    analysis_window = signal.get_window(opts.window, opts.win_len)
    transform = signal.ShortTimeFFT(
        win=analysis_window,
        hop=opts.hop_len,
        fs=sample_rate,
        mfft=opts.fft_len,
    )
    magnitude = np.abs(transform.stft(waveform))

    # Phase reconstruction with PGHI, followed by the inverse STFT.
    # NOTE(review): win_len is passed as the last argument of both Octave
    # calls; if that argument is meant to be the number of frequency
    # channels, fft_len would be the matching value. The defaults are
    # equal, so this only matters when the two options differ. Confirm.
    gamma = octave.pghi_findgamma(opts.window, opts.hop_len, opts.win_len)
    estimated_spec = octave.pghi(magnitude, gamma, opts.hop_len, opts.win_len)
    resynthesized = transform.istft(estimated_spec)
    sf.write(Path(opts.out_wavdir, opts.out_wav), resynthesized, sample_rate)


if __name__ == "__main__":
    main()
Real-Time Phase Gradient Heap Integration (RTPGHI). 3 | 4 | Copyright (C) 2024 by Akira TAMAMORI 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | """ 24 | 25 | import argparse 26 | from pathlib import Path 27 | 28 | import numpy as np 29 | import soundfile as sf 30 | from oct2py import octave 31 | from scipy import signal 32 | 33 | 34 | def main(): 35 | """Reconstruct phase by using RTPGHI.""" 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--ltfat_dir", type=str, default="/work/tamamori/ltfat-main") 38 | parser.add_argument("--win_len", type=int, default=512) 39 | parser.add_argument("--hop_len", type=int, default=128) 40 | parser.add_argument("--fft_len", type=int, default=512) 41 | parser.add_argument("--window", type=str, default="hann") 42 | parser.add_argument("--pghi_type", choices=["normal", "causal"], default="causal") 43 | parser.add_argument("--in_wavdir", type=str, default="/home/tamamori") 44 | parser.add_argument("--in_wav", type=str, default="in.wav") 45 | parser.add_argument("--out_wavdir", type=str, default="/home/tamamori") 46 | parser.add_argument("--out_wav", type=str, default="out.wav") 47 | args = parser.parse_args() 48 | 49 | # initialization 50 | octave.addpath(octave.genpath(args.ltfat_dir)) 51 | octave.ltfatstart(0) 52 | octave.phaseretstart(0) 53 | 54 | # compute magnitude spectrum 55 | audio, rate = sf.read(Path(args.in_wavdir, args.in_wav)) 56 | stfft = signal.ShortTimeFFT( 57 | win=signal.get_window(args.window, args.win_len), 58 | hop=args.hop_len, 59 | fs=rate, 60 | mfft=args.fft_len, 61 | ) 62 | mag_spec = np.abs(stfft.stft(audio)) 63 | 64 | # reconstruct phase spectrum with RTPGHI 65 | gamma = octave.pghi_findgamma(args.window, args.hop_len, args.win_len) 66 | reconst_spec = octave.rtpghi( 67 | mag_spec, gamma, args.hop_len, args.win_len, args.pghi_type 68 | ) 69 | audio = stfft.istft(reconst_spec) 70 | sf.write(Path(args.out_wavdir, args.out_wav), audio, rate) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /PhaseRetrieval/phaseret_spsi.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Demonstration of Single Pass Spectrogram Inversion (SPSI). 3 | 4 | Copyright (C) 2024 by Akira TAMAMORI 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | """ 24 | 25 | import argparse 26 | from pathlib import Path 27 | 28 | import numpy as np 29 | import soundfile as sf 30 | from oct2py import octave 31 | from scipy import signal 32 | 33 | 34 | def main(): 35 | """Reconstruct phase by using SPSI.""" 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--ltfat_dir", type=str, default="/work/tamamori/ltfat-main") 38 | parser.add_argument("--win_len", type=int, default=512) 39 | parser.add_argument("--hop_len", type=int, default=128) 40 | parser.add_argument("--fft_len", type=int, default=512) 41 | parser.add_argument("--window", type=str, default="hann") 42 | parser.add_argument("--in_wavdir", type=str, default="/home/tamamori") 43 | parser.add_argument("--in_wav", type=str, default="in.wav") 44 | parser.add_argument("--out_wavdir", type=str, default="/home/tamamori") 45 | parser.add_argument("--out_wav", type=str, default="out.wav") 46 | args = parser.parse_args() 47 | 48 | # initialization 49 | octave.addpath(octave.genpath(args.ltfat_dir)) 50 | octave.ltfatstart(0) 51 | octave.phaseretstart(0) 52 | 53 | # compute magnitude spectrum 54 | audio, rate = sf.read(Path(args.in_wavdir, args.in_wav)) 55 | stfft = signal.ShortTimeFFT( 56 | win=signal.get_window(args.window, args.win_len), 57 | hop=args.hop_len, 58 | fs=rate, 59 | mfft=args.fft_len, 60 | ) 61 | mag_spec = np.abs(stfft.stft(audio)) 62 | 63 | # reconstruct phase spectrum with SPSI 64 | reconst_spec = octave.spsi(mag_spec, args.hop_len, args.win_len) 65 | audio = stfft.istft(reconst_spec) 66 | sf.write(Path(args.out_wavdir, args.out_wav), audio, rate) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # speech_process_exercise 2 | 音声情報処理n本ノックを目指して 3 | 4 | ## [第1章 準備運動(音声読み込み・書き込み・波形プロットなど)](https://github.com/tam17aki/speech_process_exercise/tree/master/WarmUp) 
5 | ## [第2章 ディジタル信号処理の基礎](https://github.com/tam17aki/speech_process_exercise/tree/master/DigitalSignalProcessing) 6 | ## [第3章 音声加工とサウンドエフェクト](https://github.com/tam17aki/speech_process_exercise/tree/master/SoundEffect) 7 | ## [第4章 音声の特徴量抽出](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechAnalysis) 8 | ## [第5章 音声の分析合成](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechAnalysisSynthesis) 9 | ## [第6章 音声合成](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechSynthesis) 10 | ## [第7章 音声認識](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechRecognition) 11 | ## 第8章 音声対話システム 12 | ## [第9章 音声変換](https://github.com/tam17aki/speech_process_exercise/tree/master/VoiceConversion) 13 | ## [第10章 話者認識](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeakerRecognition) 14 | ## [第11章 音源分離](https://github.com/tam17aki/speech_process_exercise/tree/master/AudioSourceSeparation) 15 | ## [第12章 音の電子透かし](https://github.com/tam17aki/speech_process_exercise/tree/master/AudioWatermark) 16 | ## [第13章 音の位相復元](https://github.com/tam17aki/speech_process_exercise/tree/master/PhaseRetrieval) 17 | -------------------------------------------------------------------------------- /SoundEffect/pysox_bandpass_bandreject.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - バンドパスフィルタ / バンドリジェクトフィルタをかける 31 | # →特定周波数帯域の通過 (pass) / 遮断 (rejection) 32 | 33 | import sox 34 | 35 | IN_WAVE_FILE = "in.wav" # 入力音声 36 | OUT_WAVE_FILE_PASS = "bandpass.wav" # バンドパスフィルタ適用済み音声 37 | OUT_WAVE_FILE_REJECT = "bandreject.wav" # バンドリジェクトフィルタ適用済み音声 38 | 39 | transformer = sox.Transformer() 40 | 41 | # 遮断周波数は「中心周波数」から-3dB(パワーは0.5倍、振幅は0.707倍)になる周波数 42 | BANDPASS_FREQ = 500 # バンドフィルタの「中心」周波数 (Hz) 43 | BANDREJECT_FREQ = 500 # バンドリジェクトフィルタの「中心」周波数 (Hz) 44 | 45 | # バンドパスフィルタ 46 | transformer.bandpass(frequency=BANDPASS_FREQ) 47 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_PASS) 48 | 49 | # バンドリジェクトフィルタ 50 | transformer.bandreject(frequency=BANDREJECT_FREQ) 51 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_REJECT) 52 | -------------------------------------------------------------------------------- /SoundEffect/pysox_change_bitdepth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoxを用いた音声情報処理シリーズ 30 | # - 量子化ビット数を変更 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "mono.wav" # モノラル音声 (量子化ビット数 16bit) 35 | OUT_WAVE_FILE = "out.wav" # モノラル音声 36 | 37 | BITDEPTH = 8 38 | 39 | # create trasnformer (単一ファイルに対する重ねがけ) 40 | transformer = sox.Transformer() 41 | 42 | # 量子化ビット数を8bitに変更 43 | transformer.convert(bitdepth=BITDEPTH) 44 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 45 | -------------------------------------------------------------------------------- /SoundEffect/pysox_change_samplerate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoxを用いた音声情報処理シリーズ 30 | # - サンプリング周波数を変更 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "mono.wav" # モノラル音声 (サンプリング周波数 16kHz) 35 | OUT_WAVE_FILE = "out.wav" # モノラル音声 36 | 37 | SAMPLERATE = 8000 # サンプリング周波数(Hz) 38 | 39 | # create trasnformer (単一ファイルに対する重ねがけ) 40 | transformer = sox.Transformer() 41 | 42 | # サンプリング周波数を変更 43 | transformer.rate(samplerate=SAMPLERATE) 44 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 45 | -------------------------------------------------------------------------------- /SoundEffect/pysox_downsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - ダウンサンプリング 31 | 32 | 33 | import sox 34 | 35 | IN_WAVE_FILE = "in.wav" # 入力音声 36 | OUT_WAVE_FILE = "downsample.wav" # ダウンサンプリングした音声 37 | 38 | # トランスフォーマーをつくる(単一音声に対する処理) 39 | transformer = sox.Transformer() 40 | 41 | # ダウンサンプリング の パラメタ 42 | FACTOR = 2 # ダウンサンプリング率 (正の整数) 43 | 44 | # transformerにダウンサンプリングを設定する 45 | transformer.downsample(factor=FACTOR) 46 | 47 | # ダウンサンプリングした結果をファイルに保存 48 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 49 | 50 | # ダウンサンプリングした結果をarrayとして取得 51 | downsamples = transformer.build_array(IN_WAVE_FILE) 52 | -------------------------------------------------------------------------------- /SoundEffect/pysox_echo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 
19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - エコーをかける 31 | 32 | 33 | import sox 34 | from scipy.io import wavfile 35 | 36 | IN_WAVE_FILE = "in.wav" # 入力音声 37 | OUT_WAVE_FILE = "echo.wav" # エコー済み音声 38 | 39 | # トランスフォーマーをつくる(単一音声に対する処理) 40 | transformer = sox.Transformer() 41 | 42 | # エコー の パラメタ 43 | n_echos = 2 # エコー回数 44 | delays = [375] # 遅延時間 (ms) 45 | decays = [0.5] # 減衰率 46 | 47 | # エコー回数分、遅延時間と減衰率を与える必要がある 48 | # → エコー回数に等しい長さの「リスト」を 遅延時間と減衰率それぞれで用意する 49 | # → n_echos が 2 なら遅延時間は [375, 750], 減衰率は [0.5, 0.25] 50 | for i in range(1, n_echos): 51 | delays.append(delays[0] * (i + 1)) # 遅延時間は線形的 52 | decays.append(decays[0] ** (i + 1)) # 減衰率は指数的 53 | 54 | # transformerにエコーを設定する 55 | transformer.echo(n_echos=n_echos, delays=delays, decays=decays) 56 | 57 | # エコーをかけた結果をファイルに保存 58 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 59 | 60 | # エコーをかけた結果をarrayとして取得 61 | echos = transformer.build_array(IN_WAVE_FILE) 62 | -------------------------------------------------------------------------------- /SoundEffect/pysox_flanger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - フランジャ(うなり、うねり)をかける 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "in.wav" # 入力音声 35 | OUT_WAVE_FILE = "flanger.wav" # フランジャをかけた音声 36 | 37 | # create trasnformer (単一ファイルに対する処理) 38 | transformer = sox.Transformer() 39 | 40 | # フランジャ の パラメタ 41 | DELAY = 15 # 大もとの遅延時間 (ms) 42 | DEPTH = 3 # DELAY ± DEPTHの遅延をかける (ms) 43 | REGEN = 0 # 出力をフィードバックするときのゲイン量 (-95 to 95) 44 | WIDTH = 75 # ディレイさせた音の振幅をどれだけ減衰させたうえで重ねるか (%) 45 | SPEED = 1.0 # うなりの速さ; 遅延時間の揺れの速さ (Hz) 46 | SHAPE = "sine" # フランジャのスイープ特性; 47 | # sine的に遅延時間が変化 or 三角波("triangle")的に遅延時間が変化 48 | 49 | PHASE = 0 # 多チャネルの音にフランジャをかけるときの位相ずれ率 (%) 50 | # 実際の位相ズレはPHASE×2π[rad] 51 | 52 | # transformerにフランジャを設定する 53 | transformer.flanger( 54 | delay=DELAY, 55 | depth=DEPTH, 56 | regen=REGEN, 57 | width=WIDTH, 58 | speed=SPEED, 59 | shape=SHAPE, 60 | phase=PHASE, 61 | ) 62 | 63 | # フランジャをかけた結果をファイルに保存 64 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 65 | 66 | # フランジャをかけた結果をarrayとして取得 67 | flangers = transformer.build_array(IN_WAVE_FILE) 68 | -------------------------------------------------------------------------------- /SoundEffect/pysox_lowpass-highpass.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - ローパスフィルタ/ハイパスフィルタをかける 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "in.wav" # 入力音声 35 | OUT_WAVE_FILE_LOW = "lowpass.wav" # ローパスフィルタ適用済み音声 36 | OUT_WAVE_FILE_HIGH = "highpass.wav" # ハイパスフィルタ適用済み音声 37 | 38 | transformer = sox.Transformer() 39 | 40 | # 遮断周波数は -3dB(パワーは0.501倍、振幅は0.708倍)になる周波数 41 | LOWPASS_FREQ = 1000 # ローパスフィルタの遮断周波数 (Hz) 42 | HIGHPASS_FREQ = 1000 # ハイパスフィルタの遮断周波数 (Hz) 43 | 44 | # ローパスフィルタ 45 | transformer.lowpass(frequency=LOWPASS_FREQ) 46 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_LOW) 47 | 48 | # ハイパスフィルタ 49 | transformer.highpass(frequency=HIGHPASS_FREQ) 50 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_HIGH) 51 | -------------------------------------------------------------------------------- /SoundEffect/pysox_pitchshift.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - ピッチシフトをかける(再生時間を変えずにピッチを上下させる) 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "in.wav" # 入力音声 35 | OUT_WAVE_FILE_HIGH = "pitch_high.wav" # ピッチシフト済み音声(音が高い) 36 | OUT_WAVE_FILE_LOW = "pitch_low.wav" # ピッチシフト済み音声(音が低い) 37 | 38 | # create trasnformer (単一ファイルに対する処理) 39 | transformer = sox.Transformer() 40 | 41 | # ピッチシフト の パラメタ 42 | # 単位:セミトーン(いわゆる半音 -> 1半音の変化は周波数的には約1.06倍) 43 | # 正値は上げる、負値は下げる 44 | # 実際にはfloat値を指定可能 45 | PITCHSHIFT_HIGH = 3.0 # 3半音上げる 46 | PITCHSHIFT_LOW = -5.0 # 5半音下げる 47 | 48 | # ピッチシフトをかける 49 | transformer.pitch(n_semitones=PITCHSHIFT_HIGH) # 上げる 50 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_HIGH) 51 | 52 | transformer.pitch(n_semitones=PITCHSHIFT_LOW) # 下げる 53 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_LOW) 54 | -------------------------------------------------------------------------------- /SoundEffect/pysox_reverb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - リバーブをかける 31 | 32 | import sox 33 | from scipy.io import wavfile 34 | 35 | IN_WAVE_FILE = "in.wav" # 入力音声 36 | OUT_WAVE_FILE = "reverb.wav" # リバーブ済み音声 37 | 38 | # create trasnformer (単一ファイルに対する処理) 39 | transformer = sox.Transformer() 40 | 41 | # #################### リバーブ の パラメタ #################### 42 | # リバーブの残響音の長さを調整 (0-100 %) 43 | REVERBERANCE = 80 44 | 45 | # 高周波反響音の減衰率 (0-100 %) 0だと反響が長い、100だと反響が短い 46 | # →高周波成分が残響の間でどれだけ「吸収」されるかをシミュレート 47 | HIGH_FREQ_DAMPING = 30 48 | 49 | # 反響する部屋の大きさ (0-100 %) 大きいとホール、小さいと風呂場とか 50 | ROOM_SCALE = 20 51 | 52 | STEREO_DEPTH = 100 53 | 54 | # 反響が始まるまでの時間 (up to 500 ms) 大きいと遅れて残響→壁の反射を表現 55 | PRE_DELAY = 100 56 | 57 | # ウェットゲイン (dB) 付け加えた反響音そのものの大きさ 58 | WET_GAIN = 0 59 | 60 | # Trueはウェット成分のみ出力 61 | WET_ONLY = False 62 | 63 | # ############################################################## 64 | 65 | # transformerにリバーブを設定する 66 | transformer.reverb( 67 | reverberance=REVERBERANCE, 68 | high_freq_damping=HIGH_FREQ_DAMPING, 69 | room_scale=ROOM_SCALE, 70 | stereo_depth=STEREO_DEPTH, 71 | pre_delay=PRE_DELAY, 72 | wet_gain=WET_GAIN, 73 | wet_only=WET_ONLY, 74 | ) 75 | 76 | # リバーブをかけた結果をファイルに保存 77 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 78 | 79 | # リバーブをかけた結果をarrayとして取得 80 | reverb = transformer.build_array(IN_WAVE_FILE) 81 | -------------------------------------------------------------------------------- /SoundEffect/pysox_stereo2mono.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoxを用いた音声情報処理シリーズ 30 | # - stereo から mono に変換 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "stereo.wav" # ステレオ音声 35 | OUT_WAVE_FILE = "out.wav" # モノラル音声 36 | 37 | # create trasnformer (単一ファイルに対する重ねがけ) 38 | transformer = sox.Transformer() 39 | 40 | # ステレオをモノラルに 41 | transformer.convert(n_channels=1) 42 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 43 | -------------------------------------------------------------------------------- /SoundEffect/pysox_timestretch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - タイムストレッチをかける(ピッチを変えずにテンポを変える) 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "in.wav" # 入力音声 35 | OUT_WAVE_FILE = "tempo.wav" # タイムストレッチ済み音声 36 | 37 | # create trasnformer (単一ファイルに対する処理) 38 | transformer = sox.Transformer() 39 | 40 | # タイムストレッチ の パラメタ 41 | FACTOR = 1.2 # 早くする (1.0より大きい) / 遅くする (1.0より小さい) 倍率 42 | 43 | # タイムストレッチをかける 44 | transformer.tempo(factor=FACTOR) 45 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 46 | -------------------------------------------------------------------------------- /SoundEffect/pysox_tremolo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - トレモロをかける (周期的な振幅の上下動) 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "in.wav" # 入力音声 35 | OUT_WAVE_FILE = "tremolo.wav" # トレモロ済み音声 36 | 37 | # create transformer (単一ファイルに対する処理) 38 | transformer = sox.Transformer() 39 | 40 | # トレモロ の パラメタ 41 | # トレモロの速度 (Hz) → 振幅の上下動の頻度 42 | SPEED = 10 43 | 44 | # トレモロの深さ (%) → 振幅の上下動の深さ(当該振幅を基準にした比) 45 | DEPTH = 50 46 | 47 | # トレモロをかける 48 | transformer.tremolo(speed=SPEED, depth=DEPTH) 49 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 50 | -------------------------------------------------------------------------------- /SoundEffect/pysox_upsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the "Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - アップサンプリング 31 | 32 | 33 | import sox 34 | 35 | IN_WAVE_FILE = "in.wav" # 入力音声 36 | OUT_WAVE_FILE = "upsample.wav" # アップサンプリングした音声 37 | 38 | # トランスフォーマーをつくる(単一音声に対する処理) 39 | transformer = sox.Transformer() 40 | 41 | # アップサンプリング の パラメタ 42 | FACTOR = 2 # アップサンプリング率 (正の整数) 43 | 44 | # transformerにアップサンプリングを設定する 45 | transformer.upsample(factor=FACTOR) 46 | 47 | # アップサンプリングした結果をファイルに保存 48 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE) 49 | 50 | # アップサンプリングした結果をarrayとして取得 51 | upsamples = transformer.build_array(IN_WAVE_FILE) 52 | -------------------------------------------------------------------------------- /SoundEffect/pysox_wav2raw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 
19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - PySoXを用いた音声情報処理シリーズ 30 | # - wav から raw に変換 31 | 32 | import sox 33 | 34 | IN_WAVE_FILE = "in.wav" # モノラル音声 35 | OUT_RAW_FILE = "out.raw" # ヘッダファイルを抜いたもの 36 | 37 | # create transformer (単一ファイルに対する重ねがけ) 38 | transformer = sox.Transformer() 39 | 40 | # wav to raw -> ファイル名を指定するだけ! 41 | transformer.build(IN_WAVE_FILE, OUT_RAW_FILE) 42 | -------------------------------------------------------------------------------- /SpeakerRecognition/README.md: -------------------------------------------------------------------------------- 1 | # 話者認識 2 | 3 | ## はじめに 4 | ``` 5 | python3 -m pip install librosa 6 | python3 -m pip install hydra-core 7 | python3 -m pip install progressbar2 8 | python3 -m pip install torch 9 | python3 -m pip install torchaudio 10 | python3 -m pip install xvector-jtubespeech 11 | ``` 12 | ## 使用データ 13 | - [in.wav](https://drive.google.com/file/d/1lsN-is31x_snFBTNGR05pQwX9RhzC8sb/view?usp=sharing) 14 | - [声優統計コーパス](https://voice-statistics.github.io/) 15 | 16 | ## ファイル一覧 17 | - xvectorの抽出 via xvector-jtubespeech 18 | - 抽出のお試し [extract_sample.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/extract_sample.py) 19 | - 声優統計コーパス 20 | - コーパスのダウンロード [download_voicestats_corpus.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/download_voicestats_corpus.py) 21 | - 事前学習済モデルのダウンロード 
[download_pretrained_model.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/download_pretrained_model.py) 22 | - xvectorを抽出して保存 [extract_xvector_voicestats.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/extract_xvector_voicestats.py) 23 | 24 | - 話者認識モデルを動かす 25 | - 声優統計コーパスから抽出済のxvectorを用いる 26 | - サポートベクトルマシン 27 | - フィードフォワードニューラルネット (PyTorch) [spk_recog_mlp.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/spk_recog_mlp.py) 28 | - フィードフォワードニューラルネット (scikit-learn) [spk_recog_mlp_sklearn.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/spk_recog_mlp_sklearn.py) 29 | -------------------------------------------------------------------------------- /SpeakerRecognition/config.yaml: -------------------------------------------------------------------------------- 1 | xvector: 2 | root_dir: "/home/tamamori/work/n-hon-knock/SpeakerRecognition/" 3 | data_dir: "voice-statistics/data/" 4 | feat_dir: "feats/" 5 | model_dir: "model/" 6 | corpus_url: "https://github.com/voice-statistics/voice-statistics.github.com/raw/master/assets/data/" 7 | repo_url: "https://github.com/sarulab-speech/xvector_jtubespeech/archive/refs/heads/master.zip" 8 | repo_name: "xvector_jtubespeech-master" 9 | n_jobs: 10 | 11 | actor: 12 | - "tsuchiya" 13 | - "fujitou" 14 | - "uemura" 15 | 16 | emotion: 17 | - "angry" 18 | - "happy" 19 | - "normal" 20 | 21 | feature: 22 | sample_rate: 16000 23 | num_ceps: 24 24 | num_melbins: 24 25 | 26 | pretrained: 27 | repo_name: "xvector_jtubespeech-master" 28 | file_name: "xvector.pth" 29 | 30 | model: 31 | x_dim: 512 32 | h_dim: 512 33 | n_layers: 3 34 | activation: "relu" # for scikit-learn 35 | layer_sizes: [512, 512, 512] # for scikit-learn 36 | 37 | training: 38 | seed: 0 39 | n_splits: 5 # クロスバリデーションの分割数 40 | n_epoch: 50 41 | n_batch: 16 42 | learning_rate: 0.0001 # for scikit-learn 43 | model_file: 
"model.pytorch" 44 | optim: 45 | optimizer: # 最適化アルゴリズム 46 | name: Adam 47 | params: # 最適化アルゴリズムに応じて項目を追加したり減らしたりする 48 | lr: 1e-4 # 学習率 49 | betas: [0.9, 0.98] 50 | eps: 1e-08 51 | weight_decay: 0 52 | lr_scheduler: # 学習率調整アルゴリズム 53 | name: MultiStepLR 54 | params: # 学習率調整アルゴリズムに応じて項目を追加したり減らしたりする 55 | milestones: 56 | - 50 57 | gamma: 0.6 58 | use_scheduler: False # 学習率スケジューリングを使うか否か 59 | -------------------------------------------------------------------------------- /SpeakerRecognition/config_sklearn.yaml: -------------------------------------------------------------------------------- 1 | xvector: 2 | root_dir: "/home/tamamori/work/n-hon-knock/SpeakerRecognition/" 3 | data_dir: "voice-statistics/data/" 4 | feat_dir: "feats/" 5 | model_dir: "model/" 6 | corpus_url: "https://github.com/voice-statistics/voice-statistics.github.com/raw/master/assets/data/" 7 | repo_url: "https://github.com/sarulab-speech/xvector_jtubespeech/archive/refs/heads/master.zip" 8 | repo_name: "xvector_jtubespeech-master" 9 | n_jobs: 10 | 11 | actor: 12 | - "tsuchiya" 13 | - "fujitou" 14 | - "uemura" 15 | 16 | emotion: 17 | - "angry" 18 | - "happy" 19 | - "normal" 20 | 21 | feature: 22 | sample_rate: 16000 23 | num_ceps: 24 24 | num_melbins: 24 25 | 26 | pretrained: 27 | repo_name: "xvector_jtubespeech-master" 28 | file_name: "xvector.pth" 29 | 30 | model: 31 | x_dim: 512 32 | h_dim: 512 33 | n_layers: 3 34 | activation: "relu" # for scikit-learn 35 | layer_sizes: [512, 512, 512] # for scikit-learn 36 | 37 | 38 | training: 39 | seed: 0 40 | n_splits: 5 # クロスバリデーションの分割数 41 | n_epoch: 50 42 | n_batch: 16 43 | learning_rate: 0.0001 # for scikit-learn 44 | -------------------------------------------------------------------------------- /SpeakerRecognition/download_pretrained_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2023 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # subprocessモジュールを介したwgetによるxvector-jtubespeechの事前学習済モデルのダウンロード 30 | 31 | import os 32 | import subprocess 33 | 34 | from hydra import compose, initialize 35 | 36 | 37 | def get_pretrained_model(cfg): 38 | """Download pretrained model.""" 39 | repo_url = cfg.xvector.repo_url 40 | data_dir = os.path.join(cfg.xvector.root_dir, cfg.xvector.data_dir) 41 | os.makedirs(data_dir, exist_ok=True) 42 | model_dir = os.path.join(cfg.xvector.root_dir, cfg.xvector.model_dir) 43 | os.makedirs(model_dir, exist_ok=True) 44 | 45 | subprocess.run( 46 | "echo -n Downloading pretrained model ...", 47 | text=True, 48 | shell=True, 49 | check=True, 50 | ) 51 | 52 | # download pretrained model from github repo 53 | command = "wget " + "-P " + "/tmp/" + " " + repo_url 54 | subprocess.run(command, text=True, shell=True, capture_output=True, check=True) 55 | command = "cd " + "/tmp/" + "; " + "unzip " + "master.zip" 56 | subprocess.run(command, text=True, shell=True, capture_output=True, check=True) 57 | command = ( 58 | "cp " 59 | + os.path.join("/tmp/", cfg.pretrained.repo_name, cfg.pretrained.file_name) 60 | + " " 61 | + os.path.join(model_dir, cfg.pretrained.file_name) 62 | ) 63 | subprocess.run(command, text=True, shell=True, capture_output=True, check=True) 64 | 65 | # clean up 66 | command = "rm " + "/tmp/master.zip" 67 | subprocess.run(command, text=True, shell=True, capture_output=True, check=True) 68 | command = "rm -rf " + os.path.join("/tmp/", cfg.pretrained.repo_name) 69 | subprocess.run(command, text=True, shell=True, capture_output=True, check=True) 70 | print(" done.") 71 | 72 | 73 | if __name__ == "__main__": 74 | with initialize(version_base=None, config_path="."): 75 | config = compose(config_name="config") 76 | get_pretrained_model(config) 77 | -------------------------------------------------------------------------------- /SpeakerRecognition/download_voicestats_corpus.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2023 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # subprocessモジュールを介したwgetによる声優統計コーパスダウンロード 30 | 31 | import os 32 | import subprocess 33 | 34 | from hydra import compose, initialize 35 | 36 | 37 | def get_corpus(cfg): 38 | """Download voice-statistics corpus.""" 39 | corpus_url = cfg.xvector.corpus_url 40 | data_dir = os.path.join(cfg.xvector.root_dir, cfg.xvector.data_dir) 41 | os.makedirs(data_dir, exist_ok=True) 42 | 43 | subprocess.run( 44 | "echo -n Downloading voice statistics corpus ...", 45 | text=True, 46 | shell=True, 47 | check=True, 48 | ) 49 | for actor in cfg.actor: # "tsuchiya", "fujitou", "uemura" 50 | for emotion in cfg.emotion: # "angry", "happy", "normal" 51 | command = "wget " + "-P " + "/tmp/" + " " + corpus_url 52 | tar_file = actor + "_" + emotion + ".tar.gz" 53 | command = command + tar_file 54 | subprocess.run( 55 | command, text=True, shell=True, capture_output=True, check=True 56 | ) 57 | command = "cd " + data_dir + "; " + "tar -xzvf " + "/tmp/" + tar_file 58 | subprocess.run( 59 | command, text=True, shell=True, capture_output=True, check=True 60 | ) 61 | print(" done.") 62 | 63 | 64 | if __name__ == "__main__": 65 | with initialize(version_base=None, config_path="."): 66 | config = compose(config_name="config") 67 | get_corpus(config) 68 | -------------------------------------------------------------------------------- /SpeakerRecognition/extract_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Sample script for extraction of x-vector from an audio (monaural wav). 
3 | 4 | Copyright (C) 2022 sarulab-speech 5 | Copyright (C) 2023 by Akira TAMAMORI 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | """ 25 | 26 | import numpy as np 27 | import torch 28 | from scipy.io import wavfile 29 | from torchaudio.compliance import kaldi 30 | from xvector_jtubespeech import XVector 31 | 32 | 33 | def extract_xvector(model, wav): 34 | """Extract x-vector.""" 35 | # extract mfcc 36 | wav = torch.from_numpy(wav.astype(np.float32)).unsqueeze(0) 37 | mfcc = kaldi.mfcc(wav, num_ceps=24, num_mel_bins=24) # [1, T, 24] 38 | mfcc = mfcc.unsqueeze(0) 39 | 40 | # extract xvector 41 | xvector = model.vectorize(mfcc) # (1, 512) 42 | xvector = xvector.to("cpu").detach().numpy().copy()[0] 43 | return xvector 44 | 45 | 46 | def main(): 47 | """Perform extraction demo.""" 48 | _, wav = wavfile.read("in.wav") # 16kHz mono 49 | model = XVector("xvector.pth") # pretrained model 50 | xvector = extract_xvector(model, wav) 51 | print(xvector.shape) # (512, ) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /SpeechAnalysis/feat_cepstrum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2021 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 
19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - ケプストラム法によりスペクトル包絡を抽出する 30 | # - パワーが最大となる音声フレームを対象に推定 31 | 32 | import matplotlib.pyplot as plt 33 | import numpy as np 34 | import scipy 35 | from scipy.io import wavfile 36 | 37 | import librosa 38 | 39 | IN_WAVE_FILE = "in.wav" # 分析対象の音声 40 | 41 | FRAME_LENGTH = 1024 # フレーム長 (FFTサイズ) 42 | HOP_LENGTH = 80 # フレームのシフト長 43 | FFT_LENGTH = FRAME_LENGTH 44 | 45 | MAX_Fo = 200 # 分析における基本周波数の最大値 (Hz) 46 | MIN_Fo = 60 # 分析における基本周波数の最小値 (Hz) 47 | 48 | # 音声のロード 49 | fs, data = wavfile.read(IN_WAVE_FILE) 50 | data = data.astype(np.float64) 51 | 52 | # ケプストラムの最大次数、最小次数 53 | max_cep_order = int(np.floor(fs / MIN_Fo)) 54 | min_cep_order = int(np.floor(fs / MAX_Fo)) 55 | 56 | # フレーム化 57 | frames = librosa.util.frame(data, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH).T 58 | 59 | # パワーが最大のフレーム位置を取得 60 | max_ind = np.argmax(np.sum(frames * frames, axis=1)) 61 | 62 | # パワーが最大となるフレームを取り出す 63 | pow_max_frame = frames[max_ind, :] 64 | 65 | # 窓掛け(ブラックマン窓) 66 | window = scipy.signal.blackman(FFT_LENGTH) 67 | windowed_frame = pow_max_frame * window 68 | 69 | # ケプストラムの計算 (FFT → 絶対値2乗 → 対数 → 逆FFT) 70 | fft_spec = scipy.fft.rfft(windowed_frame) 71 | log_power = np.log(np.abs(fft_spec) ** 2) 72 | cepstrum = scipy.fft.irfft(log_power).real 73 | # real partを取るのはなぜ?→「対称性」を保証するため 74 | 75 | # ケプストラム; 0次(直流)成分は外してプロット 76 | plt.title("Cepstrum w/o DC") 77 | n_samples = len(cepstrum) 78 | quef = np.arange(FFT_LENGTH // 2 + 1) / fs 79 | quef *= 1000 # to msec 
80 | plt.xlim([0, np.max(quef)]) 81 | plt.plot(quef, cepstrum[: len(quef)]) 82 | plt.xlabel("Quefrency (msec)") 83 | plt.ylabel("Cepstrum") 84 | plt.show() 85 | 86 | lifter = 30 # リフタ次数 87 | cepstrum[lifter : FFT_LENGTH - lifter + 1] = 0 # 高次ケプストラムを0にする 88 | envelop = scipy.fft.rfft(cepstrum).real # fftによりスペクトル包絡にする 89 | 90 | # 対数パワースペクトル + スペクトル包絡 91 | plt.title("Log power spectrum + spectral envelop") 92 | plt.xlim([0, len(log_power)]) 93 | plt.plot(log_power, label="log power") 94 | plt.plot(envelop, label="envelop") 95 | plt.xlabel("Frequency (Hz)") 96 | plt.ylabel("Log power (dB)") 97 | plt.show() 98 | -------------------------------------------------------------------------------- /SpeechAnalysis/feat_fo_cepstrum_sequence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # - ケプストラム法により基本周波数の「系列」を推定する 30 | # - 簡易的な有声音の判定と無声音の判定 31 | 32 | import matplotlib.pyplot as plt 33 | import numpy as np 34 | import scipy 35 | from scipy.io import wavfile 36 | import librosa 37 | 38 | IN_WAVE_FILE = "in.wav" # 分析対象の音声 39 | 40 | FRAME_LENGTH = 1024 # フレーム長 (FFTサイズ) 41 | HOP_LENGTH = 80 # フレームのシフト長 42 | FFT_LENGTH = FRAME_LENGTH 43 | 44 | MAX_Fo = 200 # 分析における基本周波数の最大値 (Hz) 45 | MIN_Fo = 60 # 分析における基本周波数の最小値 (Hz) 46 | 47 | THRESHOLD_dB = -30 # 無声判定のしきい値 in dB 48 | 49 | # 音声のロード 50 | fs, data = wavfile.read(IN_WAVE_FILE) 51 | data = data.astype(np.float64) 52 | 53 | # ケプストラムの最大次数、最小次数 54 | max_cep_order = int(np.floor(fs / MIN_Fo)) 55 | min_cep_order = int(np.floor(fs / MAX_Fo)) 56 | 57 | # フレーム化 58 | frames = librosa.util.frame(data, frame_length=FRAME_LENGTH, 59 | hop_length=HOP_LENGTH).T 60 | 61 | # 各フレームで計算したパワーをもとに有声音のフレームを決定(泥臭い) 62 | powers = np.sum(frames * frames, axis=1) 63 | voiced = np.where(10 * np.log(powers / np.max(powers)) > THRESHOLD_dB) 64 | 65 | # 窓掛け 66 | window = scipy.signal.blackman(FFT_LENGTH) 67 | windowed_frame = frames[voiced] * window 68 | 69 | # ケプストラムの計算 (FFT → 絶対値 → 対数 → 逆FFT) 70 | fft_spec = scipy.fft.rfft(windowed_frame) 71 | log_amp_spec = np.log(np.abs(fft_spec)) 72 | cepstrum = scipy.fft.irfft(log_amp_spec) 73 | 74 | # ピーク位置の検出 75 | peak_index = np.argmax(cepstrum[:, min_cep_order: max_cep_order], axis=1) 76 | max_quef = peak_index + min_cep_order 77 | 78 | # ケフレンシから変換して基本周波数の推定 79 | fo = fs / max_quef 80 | 81 | # 基本周波数の系列:無声音のフレームでは 0 Hzとするため 一様に0で初期化 82 | fo_seq = np.zeros(frames.shape[0]) 83 | 84 | # 有声音のフレームに 推定された基本周波数を格納する 85 | fo_seq[voiced] = fo 86 | 87 | # 基本周波数の系列を表示 88 | fig = 
plt.figure(figsize=(12, 4)) 89 | plt.plot(fo_seq) 90 | plt.xlabel("Frame number") 91 | plt.ylabel("Frequency (Hz)") 92 | plt.title("Estimation of fundamental frequency via cepstrum method") 93 | plt.tight_layout() 94 | plt.xlim(0, len(fo_seq) - 1) 95 | 96 | plt.show() 97 | -------------------------------------------------------------------------------- /SpeechAnalysis/feat_fo_dio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # MIT License 4 | 5 | # Copyright (C) 2020 by Akira TAMAMORI 6 | 7 | # Permission is hereby granted, free of charge, to any person 8 | # obtaining a copy of this software and associated documentation files 9 | # (the Software"), to deal in the Software without restriction, 10 | # including without limitation the rights to use, copy, modify, merge, 11 | # publish, distribute, sublicense, and/or sell copies of the Software, 12 | # and to permit persons to whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 22 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
25 | 26 | # Commentary: 27 | # DIO による基本周波数推定 28 | 29 | import matplotlib.pyplot as plt 30 | import numpy as np 31 | import pyworld 32 | from scipy.io import wavfile 33 | 34 | IN_WAVE_FILE = "in.wav" 35 | FRAME_LENGTH = 1024 # フレーム長 (FFTサイズ) 36 | HOP_LENGTH = 80 # フレームのシフト長 37 | 38 | # 音声のロード 39 | fs, data = wavfile.read(IN_WAVE_FILE) 40 | data = data.astype(np.float64) 41 | 42 | # DIO に基づく基本周波数推定 43 | fo, _ = pyworld.dio(data, fs) 44 | 45 | # 波形表示 46 | fig = plt.figure(figsize=(12, 6)) 47 | n_samples = len(data) 48 | time = np.arange(n_samples) / fs 49 | axes = fig.add_subplot(2, 1, 1) 50 | axes.plot(time, data) 51 | axes.set_xlabel("Time (sec)") 52 | axes.set_ylabel("Amplitude") 53 | axes.set_title("Waveform") 54 | axes.set_xlim(0, np.max(time)) 55 | 56 | axes = fig.add_subplot(2, 1, 2) 57 | axes.plot(fo) 58 | axes.set_xlabel("Frame number") 59 | axes.set_ylabel("Frequency (Hz)") 60 | axes.set_title("Estimation of fundamental frequency via pYIN method") 61 | axes.set_xlim(0, len(fo) - 1) 62 | 63 | plt.tight_layout() 64 | plt.show() 65 | -------------------------------------------------------------------------------- /SpeechAnalysis/feat_fo_music.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # - 音声セグメントからMUSIC法により基本周波数を推定する 30 | 31 | import matplotlib.pyplot as plt 32 | import numpy as np 33 | import scipy 34 | from scipy.io import wavfile 35 | import librosa 36 | 37 | IN_WAVE_FILE = "voice_a.wav" # 「あ」の音声 38 | 39 | FRAME_LENGTH = 1024 # フレーム長 (FFTサイズ) 40 | HOP_LENGTH = 80 # フレームのシフト長 41 | 42 | CUTOFF = 4000 # 遮断周波数 (Hz) 43 | 44 | 45 | # 音声のロード 46 | fs, data = wavfile.read(IN_WAVE_FILE) 47 | data = data.astype(np.float64) 48 | 49 | # フレーム化 50 | frames = librosa.util.frame(data, frame_length=FRAME_LENGTH, 51 | hop_length=HOP_LENGTH).T 52 | 53 | # 周波数軸 54 | freq_axis = np.linspace(0, fs, frames.shape[0]) 55 | 56 | # MUSIC法のノイズ成分を高域の周波数成分と見なす 57 | ORDER = np.min(np.where(freq_axis > CUTOFF)) 58 | 59 | # 標本共分散行列の計算 60 | cov_frames = np.cov(frames, bias=True) 61 | 62 | # 固有値と固有ベクトルを計算 63 | # →注意: np.linalg.eig は固有値の並び順を保証しない(固有ベクトル(縦)は固有値と同じ順に対応して並ぶ) 64 | eigval, eigvec = np.linalg.eig(cov_frames) 65 | 66 | # ノイズ成分の固有ベクトル 67 | noise_eigvec = eigvec[:, 2 * ORDER + 1:] 68 | 69 | # パワースペクトルをノイズ成分の固有ベクトルから計算 70 | power_noise_eigvec = np.abs(np.fft.fft(noise_eigvec)) 71 | power_noise_eigvec = power_noise_eigvec ** 2 72 | 73 | # MUSIC法の疑似スペクトルを計算 74 | music_pseudo_spec = 1.0 / np.sum(power_noise_eigvec, axis=1) 75 | 76 | # 基本周波数の推定 77 | # →ピーク位置の最小値を与える周波数 78 | fo = freq_axis[np.min(scipy.signal.argrelmax(music_pseudo_spec))] 79 | print(f"Estimatied fundamental frequency = {fo:.2f} Hz") 80 | 81 | # 波形表示 82 | fig = plt.figure(figsize=(10, 6)) 83 | n_samples = len(data) 84 | time = np.arange(n_samples) / fs 85 | plt.plot(time, data) 86 | plt.xlabel("Time (sec)") 87 | plt.ylabel("Amplitude") 88 | plt.title("Waveform (/a/)") 89 | plt.show() 90 | 91 | # MUSIC法による疑似スペクトルの計算結果 92 | fig = plt.figure(figsize=(10, 6)) 93 | plt.plot(freq_axis, 20 * np.log10(music_pseudo_spec)) 94 | plt.xlim(0, fs/2) 95 | plt.xlabel("Frequency (Hz)") 96 | plt.ylabel("Power [dB]") 97 | plt.title( 98 | f"Pseudospectrum via MUSIC method\nFundamental Frequency = {fo:.2f} Hz") 99 | plt.show() 
100 | -------------------------------------------------------------------------------- /SpeechAnalysis/feat_fo_pyin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # MIT License 4 | 5 | # Copyright (C) 2020 by Akira TAMAMORI 6 | 7 | # Permission is hereby granted, free of charge, to any person 8 | # obtaining a copy of this software and associated documentation files 9 | # (the Software"), to deal in the Software without restriction, 10 | # including without limitation the rights to use, copy, modify, merge, 11 | # publish, distribute, sublicense, and/or sell copies of the Software, 12 | # and to permit persons to whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 22 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Fundamental frequency (fo) estimation with pYIN

import librosa
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"

FRAME_LENGTH = 1024  # frame length (FFT size)
HOP_LENGTH = 80  # frame shift

MAX_Fo = 200  # upper bound of the fo search range (Hz)
MIN_Fo = 60  # lower bound of the fo search range (Hz)

# Load the audio as float64
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Estimate fo; only the first of the three values returned by pyin is used.
# fill_na=0.0 is forwarded to librosa unchanged.
pyin_options = {
    "fmin": MIN_Fo,
    "fmax": MAX_Fo,
    "sr": fs,
    "frame_length": FRAME_LENGTH,
    "hop_length": HOP_LENGTH,
    "fill_na": 0.0,
}
fo = librosa.pyin(data, **pyin_options)[0]

# Two stacked panels: waveform on top, fo contour below
fig, (wave_ax, fo_ax) = plt.subplots(2, 1, figsize=(12, 6))

time_axis = np.arange(len(data)) / fs
wave_ax.plot(time_axis, data)
wave_ax.set(
    xlabel="Time (sec)",
    ylabel="Amplitude",
    title="Waveform",
    xlim=(0, np.max(time_axis)),
)

fo_ax.plot(fo)
fo_ax.set(
    xlabel="Frame number",
    ylabel="Frequency (Hz)",
    title="Estimation of fundamental frequency via pYIN method",
    xlim=(0, len(fo) - 1),
    ylim=(0, MAX_Fo),
)

plt.tight_layout()
plt.show()

# End of feat_fo_pyin.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysis/feat_fo_yin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # MIT License 4 | 5 | # Copyright (C) 2020 by Akira TAMAMORI 6 | 7 | # Permission is hereby granted, free of charge, to any person 8 | # obtaining a copy of this software and associated documentation files 9 | # (the Software"), to deal in the Software without restriction, 10 | # including without limitation the rights to use, copy, modify, merge, 11 | # publish, distribute, sublicense, and/or sell copies of the Software, 12 | # and to permit persons to
# whom the Software is furnished to do so, 13 | # subject to the following conditions: 14 | 15 | # The above copyright notice and this permission notice shall be 16 | # included in all copies or substantial portions of the Software. 17 | 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 22 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 24 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Fundamental frequency (fo) estimation with YIN
#
# De Cheveigné, Alain, and Hideki Kawahara,
# “YIN, a fundamental frequency estimator for speech and music,”
# The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.
# https://asa.scitation.org/doi/10.1121/1.1458024

import librosa
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"

FRAME_LENGTH = 1024  # frame length (FFT size)
HOP_LENGTH = 80  # frame shift

MAX_Fo = 200  # upper bound of the fo search range (Hz)
MIN_Fo = 60  # lower bound of the fo search range (Hz)

# Load the audio as float64
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Estimate the fo contour with the YIN algorithm
yin_options = {
    "fmin": MIN_Fo,
    "fmax": MAX_Fo,
    "sr": fs,
    "frame_length": FRAME_LENGTH,
    "hop_length": HOP_LENGTH,
    "trough_threshold": 0.1,
}
fo = librosa.yin(data, **yin_options)

# Two stacked panels: waveform on top, fo contour below
fig, (wave_ax, fo_ax) = plt.subplots(2, 1, figsize=(12, 6))

time_axis = np.arange(len(data)) / fs
wave_ax.plot(time_axis, data)
wave_ax.set(
    xlabel="Time (sec)",
    ylabel="Amplitude",
    title="Waveform",
    xlim=(0, np.max(time_axis)),
)

fo_ax.plot(fo)
fo_ax.set(
    xlabel="Frame number",
    ylabel="Frequency (Hz)",
    title="Estimation of fundamental frequency via YIN method",
    xlim=(0, len(fo) - 1),
    ylim=(0, MAX_Fo),
)

plt.tight_layout()
plt.show()

# End of feat_fo_yin.py; the header of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysis/feat_gla.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Commentary:
# - Recover the phase of a magnitude spectrogram with the Griffin-Lim
#   algorithm and resynthesise the waveform.

import numpy as np
from scipy.io import wavfile
import librosa

IN_WAVE_FILE = "in.wav"  # monaural input audio
OUT_WAVE_FILE = "out_gla.wav"  # reconstructed audio

FRAME_LENGTH = 1024  # frame length (FFT size)
HOP_LENGTH = 80  # frame shift

ITERATION = 200  # maximum number of Griffin-Lim phase-estimation iterations

# Load the audio as float64
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Magnitude spectrogram: the only information phase retrieval starts from
amp_spec = np.abs(librosa.core.stft(data, n_fft=FRAME_LENGTH,
                                    hop_length=HOP_LENGTH,
                                    win_length=FRAME_LENGTH))

# Griffin-Lim phase estimation.
# The first of the ITERATION steps is the random initialisation; doing it
# before the loop keeps phase_spec defined even when ITERATION is 0 or 1.
phase_spec = np.random.rand(*amp_spec.shape)
for _ in range(ITERATION - 1):
    # Combine the known magnitude with the current phase estimate
    recovered_spec = amp_spec * np.exp(1j * phase_spec)

    # Back to the time domain with the inverse STFT
    recovered = librosa.core.istft(recovered_spec, hop_length=HOP_LENGTH,
                                   win_length=FRAME_LENGTH)

    # Re-analyse the resynthesised signal
    complex_spec = librosa.core.stft(recovered, n_fft=FRAME_LENGTH,
                                     hop_length=HOP_LENGTH,
                                     win_length=FRAME_LENGTH)

    # Keep only the phase of the re-analysed spectrogram
    phase_spec = np.angle(complex_spec)

# Final resynthesis with the estimated phase
recovered_spec = amp_spec * np.exp(1j * phase_spec)
recovered = librosa.core.istft(recovered_spec, hop_length=HOP_LENGTH,
                               win_length=FRAME_LENGTH)

# Convert to 16-bit PCM.  Round and saturate first: a bare astype(int16)
# truncates and wraps around on samples outside the int16 range.
int16_info = np.iinfo(np.int16)
recovered = np.clip(np.rint(recovered), int16_info.min, int16_info.max)
recovered = recovered.astype(np.int16)

# Save the reconstructed audio as a wav file
wavfile.write(OUT_WAVE_FILE, fs, recovered)

# End of feat_gla.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysis/feat_melspec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Extract and visualise a mel spectrogram with librosa.
# The waveform is read with scipy.io.wavfile.

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)
OUT_WAVE_FILE = "out_istft.wav"

FRAME_LENGTH = 1024  # frame length
HOP_LENGTH = 80  # frame shift
N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # overlap width
N_MELS = 128  # number of mel filter-bank channels

# Read the audio (fs: sampling frequency, data: samples)
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Extract the mel spectrogram.
# NOTE(review): FRAME_LENGTH is not passed here, so librosa's default n_fft
# is used - confirm whether n_fft=FRAME_LENGTH was intended.
mel_spec = librosa.feature.melspectrogram(
    y=data, sr=fs, n_mels=N_MELS, hop_length=HOP_LENGTH
)

# Convert power to decibels, relative to the maximum
mel_spec_dB = librosa.power_to_db(mel_spec, ref=np.max)

# Show the mel spectrogram.  The rows are mel bands, so the frequency axis
# must be drawn with y_axis="mel"; "hz" would label them as linear bins.
fig = plt.figure(figsize=(10, 4))
librosa.display.specshow(
    mel_spec_dB, x_axis="time", y_axis="mel", hop_length=HOP_LENGTH, sr=fs
)
plt.colorbar(format="%+2.0f dB")
plt.tight_layout()
plt.show()

# End of feat_melspec.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysis/feat_mfcc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Extract and visualise MFCCs with librosa.
# The waveform is read with scipy.io.wavfile.

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

FRAME_LENGTH = 1024  # frame length
HOP_LENGTH = 80  # frame shift
N_MELS = 128  # number of mel filter-bank channels
N_MFCC = 20  # number of MFCC coefficients

# Read the audio (fs: sampling frequency, data: samples)
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# MFCC extraction, variant 1: directly from the waveform.
# N_MFCC is now actually passed, so editing the constant takes effect.
mfcc = librosa.feature.mfcc(
    y=data, sr=fs, n_mfcc=N_MFCC, n_mels=N_MELS, hop_length=HOP_LENGTH
)

# Check the array shape
print("MFCC arrayの形状: ", mfcc.shape)

# Show the MFCCs
fig = plt.figure(figsize=(10, 4))
librosa.display.specshow(mfcc, x_axis="time", hop_length=HOP_LENGTH, sr=fs)
plt.colorbar()
plt.tight_layout()
plt.show()

# Extract the mel spectrogram
mel_spec = librosa.feature.melspectrogram(
    y=data, sr=fs, n_mels=N_MELS, hop_length=HOP_LENGTH
)

# Convert power to decibels, relative to the maximum
mel_spec_dB = librosa.power_to_db(mel_spec, ref=np.max)

# MFCC extraction, variant 2: from the precomputed log-mel spectrogram
mfcc = librosa.feature.mfcc(
    S=mel_spec_dB, sr=fs, n_mfcc=N_MFCC, n_mels=N_MELS, hop_length=HOP_LENGTH
)

# Show the MFCCs obtained from the log-mel spectrogram
fig = plt.figure(figsize=(10, 4))
librosa.display.specshow(mfcc, x_axis="time", hop_length=HOP_LENGTH, sr=fs)
plt.colorbar()
plt.tight_layout()
plt.show()

# End of feat_mfcc.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysis/feat_stft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | # Copyright (C) 2020 Masahito Togami 9 | 10 | # Permission is hereby granted, free of charge, to any person 11 | # obtaining a copy of this software and associated documentation files 12 | # (the Software"), to deal in the Software without restriction, 13 | # including without limitation the rights to use, copy, modify, merge, 14 | # publish, distribute, sublicense, and/or sell copies of the Software, 15 | # and to permit persons to whom the Software is furnished to do so, 16 | # subject to the following conditions: 17 | 18 | # The above copyright notice and this permission notice shall be 19 | # included in all copies or substantial portions of the Software. 20 | 21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 22 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 23 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 24 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 25 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 26 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 27 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Short-time Fourier transform with scipy.
# The waveform is read with scipy.io.wavfile.

import numpy as np
import scipy.signal as sp
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

FRAME_LENGTH = 512  # frame length
HOP_LENGTH = 256  # frame shift
N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # overlap width

# Read the audio (fs: sampling frequency, data: samples)
fs, data = wavfile.read(IN_WAVE_FILE)

# Run the short-time Fourier transform
stft_options = {
    "fs": fs,
    "window": "hann",
    "nperseg": FRAME_LENGTH,
    "noverlap": N_OVERLAP,
}
f, t, stft_data = sp.stft(data, **stft_options)

# Report, in order: the shape of the STFT output, the frequency axis and
# the time axis
for label, value in (
    ("短時間フーリエ変換後のshape: ", stft_data.shape),
    ("周波数軸 [Hz]: ", f),
    ("時間軸[sec]: ", t),
):
    print(label, value)

# End of feat_stft.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysis/feat_stft_istft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Short-time Fourier transform followed by its inverse.
# The waveform is read with scipy.io.wavfile.

import numpy as np
import scipy.signal as sp
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)
OUT_WAVE_FILE = "out_istft.wav"

FRAME_LENGTH = 512  # frame length
HOP_LENGTH = 256  # frame shift
N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # overlap width

# Read the audio (fs: sampling frequency, data: samples)
fs, data = wavfile.read(IN_WAVE_FILE)

# Forward STFT: obtain the sequence of Fourier spectra
_, _, stft_data = sp.stft(
    data, fs=fs, window="hann", nperseg=FRAME_LENGTH, noverlap=N_OVERLAP
)

# Inverse STFT: back to a waveform
_, data_inv = sp.istft(
    stft_data, fs=fs, window="hann", nperseg=FRAME_LENGTH, noverlap=N_OVERLAP
)

# Write the audio as 16-bit PCM.  Round to the nearest integer and saturate:
# a bare astype(int16) truncates toward zero (losing an LSB on values such
# as 32766.9999) and wraps around on samples outside the int16 range.
int16_info = np.iinfo(np.int16)
data_inv = np.clip(np.rint(data_inv), int16_info.min, int16_info.max)
data_inv = data_inv.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, data_inv)

# End of feat_stft_istft.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysisSynthesis/README.md: -------------------------------------------------------------------------------- 1 | # 音声の分析合成 2 | 3 | ## はじめに 4 | ``` 5 | pip3 install pysptk 6 | pip3 install pyworld 7 | ``` 8 | 9 | ## ファイル一覧 10 | ### Pythonスクリプト 11 | - 線形予測分析による分析再合成 [pysptk_anasyn_lpc.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lpc.py) 12 | - PARCOR分析による分析再合成 [pysptk_anasyn_parcor.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_parcor.py) 13 | - 線スペクトル対による分析再合成 [pysptk_anasyn_lsp.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lsp.py)
14 | - メルケプストラム分析による再合成 [pysptk_anasyn_mlsa.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa.py) 15 | - メルケプストラム分析による再合成 (パラメタを変えていろいろな声を合成) 16 | [pysptk_anasyn_mlsa_others.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa_others.py) 17 | - WORLDによる再合成 [pyworld_anasyn.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pyworld_anasyn.py) 18 | 19 | ### Jupyter notebook 20 | - 線形予測分析による分析再合成 [pysptk_anasyn_lpc.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lpc.ipynb) 21 | - PARCOR分析による分析再合成 [pysptk_anasyn_parcor.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_parcor.ipynb) 22 | - 線スペクトル対による分析再合成 [pysptk_anasyn_lsp.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lsp.ipynb) 23 | - メルケプストラム分析による再合成 [pysptk_anasyn_mlsa.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa.ipynb) 24 | - メルケプストラム分析による再合成 (パラメタを変えていろいろな声を合成) 25 | [pysptk_anasyn_mlsa_others.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa_others.ipynb) 26 | - WORLDによる再合成 [pyworld_anasyn.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pyworld_anasyn.ipynb) 27 | 28 | ### Google Colaboratory 29 | - 線形予測分析による分析再合成 [pysptk_anasyn_lpc.ipynb](https://colab.research.google.com/drive/1nUHBRWUk4vQOCakDXC8T-BVvbZZ9jWXJ?usp=sharing) 30 | - PARCOR分析による分析再合成 [pysptk_anasyn_parcor.ipynb](https://colab.research.google.com/drive/1EFMi2VQfJ_kUwJKn367B-JZeOSbNSSaz?usp=sharing) 31 | - 線スペクトル対による分析再合成 
[pysptk_anasyn_lsp.ipynb](https://colab.research.google.com/drive/1BxAMGzLgguA5HivfHuGmeyXIBD8uRWdN?usp=sharing) 32 | - メルケプストラム分析による再合成 [pysptk_anasyn_mlsa.ipynb](https://colab.research.google.com/drive/1TZml_LdOAqDBY3UEGtw_x5UPL8ok44P1?usp=sharing) 33 | - メルケプストラム分析による再合成 (パラメタを変えていろいろな声を合成) 34 | [pysptk_anasyn_mlsa_others.ipynb](https://colab.research.google.com/drive/13QK6S_vQdwgU7bX8pXdJErFjnNHnqeQy?usp=sharing) 35 | - WORLDによる再合成 [pyworld_anasyn.ipynb](https://colab.research.google.com/drive/1yeIWMuQNqX2RNti0hRmHxSoAjrlrIjRU?usp=sharing) 36 | 37 | ### PySimpleGUIによるGUIアプリ 38 | - 音声録音および分析合成(波形表示・スペクトログラム表示も可能)[pysptk_anasyn_recog.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_recog.py) 39 | -------------------------------------------------------------------------------- /SpeechAnalysisSynthesis/pysptk_anasyn_lpc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# - Speech analysis/resynthesis with PySPTK (all-pole filter driven by LPC
#   coefficients)

from pysptk.synthesis import AllPoleDF, Synthesizer
from scipy.io import wavfile
import librosa
import numpy as np
import pysptk

FRAME_LENGTH = 1024
HOP_LENGTH = 80
MIN_F0 = 60
MAX_F0 = 240
ORDER = 20

IN_WAVE_FILE = "in.wav"  # input audio
OUT_WAVE_FILE = "out.wav"  # analysed and resynthesised audio

# Read the audio as float64
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Frame the signal and apply a Blackman window to every frame
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Linear prediction analysis: extract the LPC coefficients
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])  # loggain for AllPoleDF

# Build the all-pole synthesis filter
synthesizer = Synthesizer(AllPoleDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation to synthesise speech
y = synthesizer.synthesis(source_excitation, lpc)

# Write the audio as 16-bit PCM.  Round and saturate first: a bare
# astype(int16) truncates and wraps around on out-of-range samples.
int16_info = np.iinfo(np.int16)
y = np.clip(np.rint(y), int16_info.min, int16_info.max).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# End of pysptk_anasyn_lpc.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysisSynthesis/pysptk_anasyn_lsp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Speech analysis/resynthesis with PySPTK (line spectral pairs)

from pysptk.synthesis import LSPDF, Synthesizer
from scipy.io import wavfile
import librosa
import numpy as np
import pysptk

FRAME_LENGTH = 1024
HOP_LENGTH = 80
MIN_F0 = 60
MAX_F0 = 240
ORDER = 20

IN_WAVE_FILE = "in.wav"  # input audio
OUT_WAVE_FILE = "out.wav"  # analysed and resynthesised audio

# Read the audio as float64
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Frame the signal and apply a Blackman window to every frame
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Linear prediction analysis: extract the LPC coefficients
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])  # store the gain term as its logarithm

# Convert the LPC coefficients to line spectral pairs
lsp = pysptk.lpc2lsp(lpc, otype=0, fs=fs)

# Build the LSP synthesis filter
synthesizer = Synthesizer(LSPDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation to synthesise speech
y = synthesizer.synthesis(source_excitation, lsp)

# Write the audio as 16-bit PCM.  Round and saturate first: a bare
# astype(int16) truncates and wraps around on out-of-range samples.
int16_info = np.iinfo(np.int16)
y = np.clip(np.rint(y), int16_info.min, int16_info.max).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# End of pysptk_anasyn_lsp.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysisSynthesis/pysptk_anasyn_mlsa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Speech analysis/resynthesis with PySPTK (MLSA filter)

from pysptk.synthesis import MLSADF, Synthesizer
from scipy.io import wavfile
import librosa
import numpy as np
import pysptk

FRAME_LENGTH = 1024
HOP_LENGTH = 80
MIN_F0 = 60
MAX_F0 = 240
ORDER = 25
ALPHA = 0.41

IN_WAVE_FILE = "in.wav"  # input audio
OUT_WAVE_FILE = "out.wav"  # analysed and resynthesised audio

# Read the audio as float64
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Frame the signal and apply a Blackman window to every frame
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (spectral-envelope extraction): mel-cepstrum
# coefficients per frame
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert the mel-cepstrum to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Build the MLSA synthesis filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# Drive the filter with the excitation to synthesise speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# Write the audio as 16-bit PCM.  Round and saturate first: a bare
# astype(int16) truncates and wraps around on out-of-range samples.
int16_info = np.iinfo(np.int16)
y = np.clip(np.rint(y), int16_info.min, int16_info.max).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# End of pysptk_anasyn_mlsa.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysisSynthesis/pysptk_anasyn_mlsa_pyworld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Speech analysis/resynthesis with PySPTK (MLSA filter)
# - The mel-cepstrum is computed from the spectral envelope extracted by WORLD

from pysptk.synthesis import MLSADF, Synthesizer
from scipy.io import wavfile
import numpy as np
import pysptk
import pyworld

FRAME_LENGTH = 1024
HOP_LENGTH = 80
MIN_F0 = 60
MAX_F0 = 240
ORDER = 25
ALPHA = 0.41

IN_WAVE_FILE = "in.wav"  # input audio
OUT_WAVE_FILE = "out.wav"  # analysed and resynthesised audio

# Read the audio as float64
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# WORLD analysis (fo, spectral envelope, aperiodicity); only the envelope is
# used.  The frame period is given in milliseconds and is derived from
# HOP_LENGTH so the envelope frames line up with the excitation and the
# synthesiser, which both advance HOP_LENGTH samples per frame.  WORLD's
# default of 5 ms matches HOP_LENGTH = 80 only when fs is 16 kHz.
frame_period_ms = HOP_LENGTH / fs * 1000.0
_, sp, _ = pyworld.wav2world(x, fs, frame_period=frame_period_ms)

# Mel-cepstrum coefficients from the WORLD spectral envelope
mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Convert the mel-cepstrum to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mcep, ALPHA)

# Build the MLSA synthesis filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# Drive the MLSA filter with the excitation to synthesise speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# Write the audio as 16-bit PCM.  Round and saturate first: a bare
# astype(int16) truncates and wraps around on out-of-range samples.
int16_info = np.iinfo(np.int16)
y = np.clip(np.rint(y), int16_info.min, int16_info.max).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# End of pysptk_anasyn_mlsa_pyworld.py; the start of the next dump entry follows verbatim.
# -------------------------------------------------------------------------------- /SpeechAnalysisSynthesis/pysptk_anasyn_parcor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !!
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the "Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Speech analysis / re-synthesis with PySPTK (via PARCOR coefficients).

from pysptk.synthesis import AllPoleLatticeDF, Synthesizer
from scipy.io import wavfile
import librosa
import numpy as np
import pysptk

FRAME_LENGTH = 1024  # analysis window length in samples
HOP_LENGTH = 80  # hop size in samples
MIN_F0 = 60
MAX_F0 = 240
ORDER = 20  # LPC order

IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "out.wav"  # analysis / re-synthesized speech

# Load the input speech.
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Slice the signal into overlapping frames (one frame per row after the
# transpose) and apply a Blackman window to each frame.
frames = librosa.util.frame(
    x, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH
).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)

# Pitch extraction.
pitch = pysptk.swipe(
    x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch"
)

# Generate the excitation (glottal source) signal.
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Linear prediction analysis: extract the LPC coefficients per frame.
lpc = pysptk.lpc(frames, ORDER)
# The first column is replaced by its logarithm.
# NOTE(review): presumably this is the gain term, which the synthesis filter
# expects in the log domain -- confirm against the pysptk documentation.
lpc[:, 0] = np.log(lpc[:, 0])

# Convert LPC coefficients to PARCOR coefficients.
parcor = pysptk.lpc2par(lpc)

# Build the all-pole lattice filter.
synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation to synthesize speech.
y = synthesizer.synthesis(source_excitation, parcor)

# Write the result.  Saturate before the cast: a bare astype(int16) wraps
# samples that overshoot the 16-bit range, which is heard as loud clicks.
int16_range = np.iinfo(np.int16)
y = np.clip(y, int16_range.min, int16_range.max).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the "Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# - Speech analysis / re-synthesis with PyWORLD.

from scipy.io import wavfile
import numpy as np
import pyworld

IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "out.wav"  # analysis / re-synthesized speech

# Load the input speech and convert the samples to float64 for WORLD.
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Analysis: fundamental frequency, spectral envelope, aperiodicity.
f0, sp, ap = pyworld.wav2world(x, fs)

# Re-synthesis from the three WORLD parameters.
y = pyworld.synthesize(f0, sp, ap, fs)

# Saturate before the cast: a bare astype(int16) wraps samples that overshoot
# the 16-bit range, which is heard as loud clicks.
int16_range = np.iinfo(np.int16)
y = np.clip(y, int16_range.min, int16_range.max).astype(np.int16)

# Save as a wav file.
wavfile.write(OUT_WAVE_FILE, fs, y)
# Commentary:
# - Speech analysis / re-synthesis with PyWORLD.
# - The spectral envelope and the aperiodicity are encoded and decoded
#   before re-synthesis.

from scipy.io import wavfile
import numpy as np
import pyworld

IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "out.wav"  # analysis / re-synthesized speech

SP_DIM = 50  # dimensionality of the compressed spectral envelope

# Load the input speech and convert the samples to float64 for WORLD.
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Analysis: fundamental frequency, spectral envelope, aperiodicity.
f0, sp, ap = pyworld.wav2world(x, fs)
# The decoders need the FFT size to restore the full-resolution parameters.
fft_size = pyworld.get_cheaptrick_fft_size(fs)

# Encode / decode the spectral envelope.
# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
code_sp = pyworld.code_spectral_envelope(sp, fs, SP_DIM)
decode_sp = pyworld.decode_spectral_envelope(code_sp, fs, fft_size)

# Encode / decode the aperiodicity.
code_ap = pyworld.code_aperiodicity(ap, fs)
decode_ap = pyworld.decode_aperiodicity(code_ap, fs, fft_size)

# Re-synthesis from F0 and the decoded parameters.
y = pyworld.synthesize(f0, decode_sp, decode_ap, fs)

# Saturate before the cast: a bare astype(int16) wraps samples that overshoot
# the 16-bit range, which is heard as loud clicks.
int16_range = np.iinfo(np.int16)
y = np.clip(y, int16_range.min, int16_range.max).astype(np.int16)

# Write the result.
wavfile.write(OUT_WAVE_FILE, fs, y)
収録済み音声(wav)に対する音声認識 with VOSK ([vosk_asr_recorded.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/vosk_asr_recorded.py)) 17 | - マイク音声入力によるストリーミング音声認識 with VOSK ([vosk_asr_streaming.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/vosk_asr_streaming.py)) 18 | - マイク音声入力によるVADつきストリーミング音声認識 with VOSK ([vosk_asr_streaming_vad.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/vosk_asr_streaming_vad.py)) 19 | 20 | ### PySimpleGUIによるGUIアプリ 21 | - 指定秒数だけ音声を録音し、音声認識をかける with SpeechRecognition ([recog_speech_rec.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/recog_speech_rec.py)) 22 | - Google Homeもどき ([google_mode_modoki.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/google_mode_modoki.py)) 23 | - 音声認識結果を使ったWikipedia検索&読み上げ ([recog_wikipedia.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/recog_wikipedia.py)) 24 | 25 | ### Google Colaboratory 26 | - Juliusの日本語ディクテーションキット ([Link](https://colab.research.google.com/drive/1pdp9lmzzslLzN95iu69siTkTxMk-hzXf?usp=sharing)) 27 | - SpeechRecognition ライブラリのデモンストレーション ([Link](https://colab.research.google.com/drive/1w96tb5SxCPWqnNXaVlFQpaMPzJ24w0F3?usp=sharing)) 28 | - ESPnet2 事前学習済モデルを用いた音声認識デモンストレーション 29 | - LaboroTVSpeechコーパス ([Link](https://colab.research.google.com/drive/1xJ96-7JSSPBNJ-bAwysESDcaGvnbblAR?usp=sharing)) 30 | - VOSK ライブラリを用いた音声認識デモンストレーション ([Link](https://colab.research.google.com/drive/1Dvhw4H2hT3WxDniX2M8w7q1pae5qgXYy?usp=sharing)) 31 | -------------------------------------------------------------------------------- /SpeechRecognition/google_mode_modoki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the "Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Sample implementing a Google Home look-alike (wake-word triggered recognition).

import PySimpleGUI as sg
import speech_recognition as sr

# Microphone setup: calibrate the recognizer against the ambient noise once.
rec = sr.Recognizer()
mic = sr.Microphone()
with mic as source:
    rec.adjust_for_ambient_noise(source)

TIMEOUT = 1000  # window read timeout (milliseconds)
WAKE_WORD = "Ok Google"  # wake word

# Font
FONT = ("Hiragino Maru Gothic ProN", 20)

# Layout definition
LAYOUT = [
    [sg.Text("お好きなタイミングで話しかけてください", size=(35, 1))],
    [sg.Text("認識結果: ", size=(40, 1), key="-RECOG_TEXT-")],
    [sg.Button("終了", key="-QUIT-")],
]

# Create the window
WINDOW = sg.Window("Google Home sample", LAYOUT, font=FONT)

while True:
    # When no GUI event arrives within TIMEOUT the read returns the timeout
    # key, which is used as the trigger for one listen/recognize cycle.
    event, values = WINDOW.read(timeout=TIMEOUT, timeout_key="-RECOG_TRIGGER-")

    if event in (sg.WIN_CLOSED, "-QUIT-"):
        break

    # Compare for equality: `event in "<str>"` is a substring test and would
    # also fire for any event key that happens to be part of the trigger name.
    if event == "-RECOG_TRIGGER-":
        with mic as source:
            audio = rec.listen(source)

        try:
            text = rec.recognize_google(audio, language="ja-JP")
        except sr.UnknownValueError:
            # The audio could not be transcribed.
            WINDOW["-RECOG_TEXT-"].Update("認識に失敗しました")
        except sr.RequestError:
            # The recognition service could not be reached; keep the GUI
            # alive instead of letting the exception end the event loop.
            WINDOW["-RECOG_TEXT-"].Update("音声認識サービスに接続できませんでした")
        else:
            if WAKE_WORD in text:
                # Strip the wake word so the remainder can be used by later
                # processing stages.
                text = text.replace(WAKE_WORD, "")
                WINDOW["-RECOG_TEXT-"].Update("認識結果: " + text)
            else:
                # No wake word: clear the displayed result.
                WINDOW["-RECOG_TEXT-"].Update("認識結果: ")

WINDOW.close()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the "Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Record speech for a fixed number of seconds, then run speech recognition.

import PySimpleGUI as sg
import sounddevice as sd  # recording / playback library
import soundfile as sf  # audio file reading / writing library
import speech_recognition as sr

# Microphone setup: calibrate the recognizer against the ambient noise once.
r = sr.Recognizer()
m = sr.Microphone()
with m as source:
    r.adjust_for_ambient_noise(source)

# Font
FONT = ("Hiragino Maru Gothic ProN", 20)

# Placeholder for recorded audio.  recog() keeps its recording in a local
# variable, so this module-level name is never rebound.
AUDIO = None

SAMPLE_RATE = 16000  # sampling frequency (Hz)
N_CHANNEL = 1  # number of channels: 1 = mono, 2 = stereo
DURATION = 5  # recording length (seconds)
BUFFER = 0.1  # not referenced by this script
OUTPUT_FILE = "/tmp/record.wav"  # wav file the recording is written to


# Layout definition
LAYOUT = [
    [
        sg.Text(
            "「認識」ボタンを押して" + str(DURATION) + "秒間話しかけてください",
            size=(35, 1),
            key="txt",
        ),
    ],
    [sg.Text(size=(40, 1), key="-RECOG_TEXT-")],
    [
        sg.Button("認識", key="recog"),
        sg.Button("終了", key="quit"),
    ],
]

# Create the window
WINDOW = sg.Window("Speech-To-Text sample", LAYOUT, font=FONT)


def recog():
    """Record DURATION seconds of audio, transcribe it and show the result.

    The recording is written to OUTPUT_FILE, read back through
    ``speech_recognition`` and sent to the Google recognizer.  The outcome
    (transcript or an error message) is displayed in the ``-RECOG_TEXT-``
    element of WINDOW.  Returns nothing.
    """
    # Record for the configured number of seconds; sd.wait() blocks until
    # the recording has finished.
    recording = sd.rec(
        int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=N_CHANNEL
    )
    sd.wait()

    # Go through a file so the audio can be loaded with sr.AudioFile.
    sf.write(
        file=OUTPUT_FILE,
        data=recording,
        samplerate=SAMPLE_RATE,
        format="WAV",
        subtype="PCM_16",
    )

    with sr.AudioFile(OUTPUT_FILE) as source:
        audio = r.listen(source)  # fetch the audio

    try:
        text = r.recognize_google(audio, language="ja-JP")
    except sr.UnknownValueError:
        # The audio could not be transcribed.
        WINDOW["-RECOG_TEXT-"].Update("認識に失敗しました")
    except sr.RequestError:
        # The recognition service could not be reached; report it in the
        # window instead of letting the exception end the GUI.
        WINDOW["-RECOG_TEXT-"].Update("音声認識サービスに接続できませんでした")
    else:
        WINDOW["-RECOG_TEXT-"].Update("認識結果: " + text)


# Event loop
while True:

    # Read the next event
    event, values = WINDOW.read()

    if event == sg.WINDOW_CLOSED or event == "quit":
        break

    if event == "recog":
        recog()

# Close the window and exit
WINDOW.close()
class RecordingConfig(NamedTuple):
    """Configuration for recording."""

    sample_rate: int = 16000  # Hz
    duration: float = 3.0  # sec (annotation now matches the float default)
    n_channels: int = 1  # 1: mono


def record_wav(out_wavfile: str, config: RecordingConfig):
    """Record audio and save it as a wav file.

    Args:
        out_wavfile (str): name of the output wav file
        config (RecordingConfig): recording settings
    """
    sample_rate = config.sample_rate
    duration = config.duration
    n_channels = config.n_channels

    # Record for the requested number of seconds; sd.wait() blocks until
    # the recording has finished.
    audio = sd.rec(
        int(duration * sample_rate), samplerate=sample_rate, channels=n_channels
    )
    sd.wait()

    # Write the recording to disk as 16-bit PCM.
    sf.write(
        file=out_wavfile,
        data=audio,
        samplerate=sample_rate,
        format="WAV",
        subtype="PCM_16",
    )


def main(duration: float = 3.0, wav_file: str = "out.wav"):
    """Record audio from the default input device.

    Args:
        duration (float): recording length in seconds
        wav_file (str): path of the output wav file
    """
    # Take the sampling frequency from the default input device.
    input_device_info = sd.query_devices(kind="input")
    sample_rate = int(input_device_info["default_samplerate"])

    # Record for the requested number of seconds.
    record_config = RecordingConfig(sample_rate, duration)
    print("<録音開始>")
    record_wav(wav_file, record_config)
    # This script only records, so announce the end of the recording
    # (the message previously said "recognition finished").
    print("<録音終了>")


if __name__ == "__main__":
    main()
def get_asr_result(recognizer, stream, chunk_size):
    """Feed a wav stream to the recognizer and return the final transcript.

    Args:
        recognizer (KaldiRecognizer): speech recognition module
        stream (Wave_read): input stream the wav data is read from
        chunk_size (int): number of frames read from the stream at a time

    Returns:
        recog_text (str): recognition result with all whitespace removed
    """
    while True:
        data = stream.readframes(chunk_size)
        if not data:  # an empty read marks the end of the stream
            break
        recognizer.AcceptWaveform(data)

    recog_result = json.loads(recognizer.FinalResult())
    # The "text" field is whitespace-separated; join the pieces back into
    # one contiguous string.
    return "".join(recog_result["text"].split())


def main(chunk_size=4000, wav_file="in.wav"):
    """Run the speech recognition demonstration on a recorded wav file.

    Args:
        chunk_size (int): unit in which audio data is consumed (samples)
        wav_file (str): path to the wav file
    """
    SetLogLevel(-1)  # suppress the log output VOSK prints at start-up

    # The context manager closes the file on every path, including the
    # sys.exit() below; the handle was previously never closed.
    with wave.open(wav_file, "rb") as wf:
        if (
            wf.getnchannels() != 1
            or wf.getsampwidth() != 2
            or wf.getcomptype() != "NONE"
        ):
            print("Audio file must be WAV format mono PCM.")
            sys.exit(1)

        model = Model("model")
        recognizer = KaldiRecognizer(model, wf.getframerate())

        recog_text = get_asr_result(recognizer, wf, chunk_size)

    print(f"認識結果: {recog_text}")


if __name__ == "__main__":
    main()
([synth_ttslearn_multi_gui.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_ttslearn_multi_gui.py)) 25 | -------------------------------------------------------------------------------- /SpeechSynthesis/synth_gtts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """音声情報処理 n本ノック.""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
class TextToSpeech:
    """Text-to-speech helper backed by gTTS.

    Synthesized audio is stored as an mp3 file and played back from there.
    """

    def __init__(self, lang: str = "ja", out_file: str = "/tmp/tts.mp3"):
        """Remember the synthesis language and the mp3 output path."""
        self.lang, self.out_file = lang, out_file

    def generate(self, text):
        """Synthesize *text* and save the audio to ``self.out_file`` (mp3)."""
        gTTS(text, lang=self.lang).save(self.out_file)

    def play(self):
        """Load the saved mp3 and play it back."""
        # `play` below is the module-level pydub function, not this method.
        play(AudioSegment.from_mp3(self.out_file))


def main(text: str = "こんにちは"):
    """Synthesize *text* and play the result."""
    speaker = TextToSpeech()
    speaker.generate(text)
    speaker.play()


if __name__ == "__main__":
    main("こんにちは")
# Commentary:
# Sample script for text-to-speech with gTTS, with a PySimpleGUI front end.

import subprocess

import PySimpleGUI as sg
from gtts import gTTS

# Temporary file the synthesized audio is saved to
OUT_MP3 = "/tmp/tts.mp3"

# Language of the synthesis engine
LANG = "ja"

# Font
FONT = ("Hiragino Maru Gothic ProN", 24)

# Layout definition
LAYOUT = [
    [
        sg.InputText("音声合成のサンプルです", size=(35, 1), key="txt"),
    ],
    [
        sg.Button("合成", key="synth"),
        sg.Button("終了", key="quit"),
    ],
]

# Create the window
WINDOW = sg.Window("TTS-sample", LAYOUT, font=FONT)

# Event loop
while True:

    # Read the next event
    event, values = WINDOW.read()

    if event == sg.WINDOW_CLOSED or event == "quit":
        break

    if event == "synth":  # synthesize the text that was entered
        text = values["txt"]

        # Speech synthesis (text data -> audio data)
        tts = gTTS(text, lang=LANG)

        # Save the audio as an mp3 file
        tts.save(OUT_MP3)

        # Playback through the external `afplay` command.  Pass the command
        # as an argument list rather than a concatenated shell string, so the
        # path is never interpreted by a shell.
        subprocess.run(["afplay", OUT_MP3], check=False)

# Close the window and exit
WINDOW.close()
class TextToSpeech:
    """Class for Text-to-Speech."""

    def __init__(self, run_marine=False):
        """Initialize the class.

        Args:
            run_marine (bool): enable the MARINE model to improve Japanese
                accent estimation.
        """
        self.audio = None  # waveform produced by generate()
        self.sr = None  # sampling rate returned alongside the waveform
        self.run_marine = run_marine

    def generate(self, text):
        """Perform text-to-speech and keep the waveform and sampling rate."""
        self.audio, self.sr = pyopenjtalk.tts(text, run_marine=self.run_marine)

    @staticmethod
    def _to_int16(audio):
        """Peak-normalize *audio* and convert it to int16 samples.

        The peak is scaled to ``int16 max / 2 - 1``.  Silent or empty input
        is converted without normalization, because dividing by a zero peak
        would fill the buffer with NaN.
        """
        audio = np.asarray(audio, dtype=np.float64)
        peak = np.abs(audio).max() if audio.size else 0.0
        if peak > 0:
            audio = audio / peak
        audio = audio * (np.iinfo(np.int16).max / 2 - 1)
        return audio.astype(np.int16)

    def play(self):
        """Play synthesized speech."""
        audio = self._to_int16(self.audio)
        if audio.size == 0:
            return  # nothing to play
        sd.play(audio, self.sr)
        # Playback is asynchronous, so wait for the duration of the audio.
        sd.sleep(int(1000 * len(audio) / self.sr))


def main(text: str = "こんにちは", run_marine: bool = False):
    """Synthesize *text* and play it, optionally with MARINE enabled."""
    tts = TextToSpeech(run_marine)
    tts.generate(text)
    tts.play()


if __name__ == "__main__":
    print("MARINEによるアクセント推定 ON")
    main("いつでも話しかけてくださいね。", True)

    print("MARINEによるアクセント推定 OFF")
    main("いつでも話しかけてくださいね。", False)
# Commentary:
# Sample script for text-to-speech with PyOpenJTalk, with a PySimpleGUI front end.

import numpy as np
import pyopenjtalk
import PySimpleGUI as sg
import sounddevice as sd

OUT_WAV = "/tmp/tts.wav"  # not referenced by this script

FONT = ("Arial", 30)
LAYOUT = [
    [
        sg.InputText(default_text="音声合成のサンプルです。", size=(40, 3), key="text"),
        sg.Button("合成", key="synth"),
    ]
]

WINDOW = sg.Window("TTS-sample", LAYOUT, font=FONT)

while True:
    event, values = WINDOW.read()

    if event is None:  # the window was closed
        break

    # Synthesize the text that was entered.
    if event == "synth":
        text = values["text"]

        # Speech synthesis (text data -> audio data)
        audio, sr = pyopenjtalk.tts(text)
        audio = np.asarray(audio, dtype=np.float64)
        if audio.size == 0:
            continue  # nothing to play

        # Amplitude normalization.  Skip the division for silent audio:
        # dividing by a zero peak would fill the buffer with NaN.
        peak = np.abs(audio).max()
        if peak > 0:
            audio = audio / peak
        audio = audio * (np.iinfo(np.int16).max / 2 - 1)
        audio = audio.astype(np.int16)

        # Playback
        sd.play(audio, sr)

        # Playback is asynchronous, so sleep explicitly for its duration.
        sd.sleep(int(1000 * len(audio) / sr))

WINDOW.close()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
27 | 28 | # Commentary: 29 | # pyttsx3によるテキスト音声合成のサンプルスクリプト 30 | 31 | import pyttsx3 32 | 33 | 34 | class TextToSpeech: 35 | """Class for Text-to-Speech.""" 36 | 37 | def __init__(self): 38 | """Initialize the class.""" 39 | self.engine = pyttsx3.init() 40 | 41 | def generate(self, text): 42 | """Queues a command to speak an utterance.""" 43 | self.engine.say(text) 44 | 45 | def play(self): 46 | """Play synthesized speech.""" 47 | self.engine.runAndWait() 48 | 49 | 50 | def main(text: str = "こんにちは"): 51 | """main module.""" 52 | tts = TextToSpeech() 53 | tts.generate(text) 54 | tts.play() 55 | 56 | 57 | if __name__ == "__main__": 58 | main("こんにちは") 59 | -------------------------------------------------------------------------------- /VoiceConversion/README.md: -------------------------------------------------------------------------------- 1 | # 音声変換 2 | 3 | ## はじめに 4 | ``` 5 | pip3 install pysimplegui 6 | pip3 install pyaudio 7 | pip3 install pyworld 8 | pip3 install numpy 9 | pip3 install scipy 10 | ``` 11 | 12 | ## PySimpleGUIによるGUIアプリ 13 | - PyAudioで音声を取り込み、PyWorldで分析再合成するリアルタイム音声変換 ([pysimplegui_realtime_vc.py](https://github.com/tam17aki/speech_process_exercise/blob/master/VoiceConversion/pysimplegui_realtime_vc.py)) 14 | - 動作例

pysimpleguiとpyaudioとpyworldでボイスチェンジャーができた pic.twitter.com/5V8A6I9ZX4

— mat (@ballforest) January 10, 2022
#!/usr/bin/env python3

""" 音声情報処理 n本ノック !! """

# MIT License

# Copyright (C) 2022 by Akira TAMAMORI

# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Export an mp3 file to wav with ffmpeg-python

import ffmpeg

IN_MP3_FILE = "in.mp3"
OUT_WAVE_FILE = "out.wav"

# Describe the conversion (mp3 input -> wav output), then execute it.
stream = ffmpeg.output(ffmpeg.input(IN_MP3_FILE), OUT_WAVE_FILE, format="wav")
stream.run()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2022 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # ffmpeg-pythonによりwavをmp3へエクスポート 30 | 31 | import wave 32 | 33 | import ffmpeg 34 | 35 | IN_WAVE_FILE = "in.wav" 36 | OUT_MP3_FILE = "out.mp3" 37 | 38 | with wave.open(IN_WAVE_FILE, "r") as sound: 39 | n_channel = sound.getnchannels() # チャネル数 (mono:1, stereo:2) 40 | channel_layout = "mono" if n_channel == 1 else "stereo" 41 | 42 | stream = ffmpeg.input(IN_WAVE_FILE, channel_layout=channel_layout) 43 | stream = ffmpeg.output(stream, OUT_MP3_FILE, format="mp3") 44 | stream.run() 45 | -------------------------------------------------------------------------------- /WarmUp/librosa_plot_specgram.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Spectrogram plot using librosa (somewhat heavy processing)

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

FRAME_LENGTH = 1024  # frame length
HOP_LENGTH = 80  # frame shift length

# Load the audio (fs is the sampling frequency, data the samples) and
# convert the samples to float64 for librosa.
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Short-time Fourier transform -> amplitude spectrum -> decibel scale,
# with the maximum amplitude as the 0 dB reference.
data_stft = librosa.stft(data, hop_length=HOP_LENGTH, n_fft=FRAME_LENGTH)
data_ampspec = np.abs(data_stft)
data_ampspec_dB = librosa.amplitude_to_db(data_ampspec, ref=np.max)

# Reserve the figure (width 10, height 4) and draw the spectrogram.
plt.figure(figsize=(10, 4))
specshow_options = {
    "x_axis": "time",
    "y_axis": "linear",
    "hop_length": HOP_LENGTH,
    "sr": fs,
}
librosa.display.specshow(data_ampspec_dB, **specshow_options)

# Axis labels and title.
plt.xlabel("Time (sec)")
plt.ylabel("Hz")
plt.title("Spectrogram")

# Reduce the margins, then show the figure on screen (required).
plt.tight_layout()
plt.show()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# Commentary:
# Waveform plot using librosa (somewhat heavy processing)
# The waveform is loaded with the wavfile module of scipy.io

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

# Load the audio (fs is the sampling frequency, data the samples) and
# convert the samples to float64 for librosa.
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Reserve the figure (width 10, height 4).
plt.figure(figsize=(10, 4))

# Draw the waveform.  librosa.display.waveplot no longer exists in recent
# librosa releases, where waveshow is its replacement; prefer waveshow when
# the installed version provides it and fall back to waveplot otherwise so
# that older installations keep working.
plot_waveform = getattr(librosa.display, "waveshow", None)
if plot_waveform is None:
    plot_waveform = librosa.display.waveplot
plot_waveform(data, sr=fs)

# Axis labels and title.
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Waveform")

# Reduce the margins, then show the figure on screen (required).
plt.tight_layout()
plt.show()
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Spectrogram plot using matplotlib
# The waveform is loaded with the wavfile module of scipy.io

import matplotlib.pyplot as plt
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

FRAME_LENGTH = 1024  # frame length
HOP_LENGTH = 80  # frame shift length
FFT_LENGTH = FRAME_LENGTH  # FFT size
N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # overlap width

# Load the audio (fs is the sampling frequency, data the samples).
fs, data = wavfile.read(IN_WAVE_FILE)

# Reserve the figure.
plt.figure(figsize=(10, 4))

# Plot the spectrogram; the analysis settings are gathered in one place.
specgram_options = {
    "NFFT": FFT_LENGTH,
    "noverlap": N_OVERLAP,
    "Fs": fs,
    "cmap": "jet",
}
plt.specgram(data, **specgram_options)

# Axis labels and title.
plt.xlabel("Time (sec)")
plt.ylabel("Frequency (Hz)")
plt.title("Spectrogram")

# Show the figure on screen.
plt.show()
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # matplotlib を用いた波形プロット 30 | # 波形読み込みはwaveモジュール 31 | 32 | import wave 33 | 34 | import matplotlib.pyplot as plt 35 | import numpy as np 36 | 37 | IN_WAVE_FILE = "in.wav" # モノラル音声(前提) 38 | 39 | # wavの読み込み 40 | with wave.open(IN_WAVE_FILE, "r") as sound: 41 | params = sound.getparams() 42 | n_channel = sound.getnchannels() # チャネル数 (mono:1, stereo:2) 43 | bitdepth = sound.getsampwidth() # 量子化ビット数 (byte!) 
44 | sample_freq = sound.getframerate() # サンプリング周波数 45 | n_frames = sound.getnframes() # チャネルあたりのサンプル数 46 | n_samples = n_channel * n_frames # 総サンプル数 47 | data = sound.readframes(n_frames) # 音声データ (bytesオブジェクト) 48 | 49 | 50 | # 2バイト(16bit)の整数値系列に変換 51 | data = np.frombuffer(data, dtype=np.int16) 52 | 53 | # 時間軸を設定 54 | n_samples = len(data) 55 | time = np.arange(n_samples) / sample_freq 56 | 57 | # 音声データのプロット 58 | plt.plot(time, data) 59 | 60 | # x軸のラベル 61 | plt.xlabel("Time (sec)") 62 | 63 | # y軸のラベル 64 | plt.ylabel("Amplitude") 65 | 66 | # 画像のタイトル 67 | plt.title("Waveform") 68 | 69 | # 画像を画面表示 70 | plt.show() 71 | -------------------------------------------------------------------------------- /WarmUp/plt_waveform_scipy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Waveform plot using matplotlib

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

# Load the audio: fs is the sampling frequency, data the samples.
fs, data = wavfile.read(IN_WAVE_FILE)

# Time stamp (in seconds) of every sample.
n_samples = len(data)
sample_index = np.arange(n_samples)
time = sample_index / fs

# Draw the waveform, label it and show it on screen.
plt.plot(time, data)
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")
plt.title("Waveform")
plt.show()
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Export an mp3 file to wav with pydub

import pydub

IN_MP3_FILE = "in.mp3"
OUT_WAV_FILE = "out.wav"

# Decode the mp3 into an AudioSegment and write it back out as wav.
pydub.AudioSegment.from_mp3(IN_MP3_FILE).export(OUT_WAV_FILE, format="wav")
# Commentary:
# Export a wav file to mp3 with pydub

import pydub

IN_WAVE_FILE = "in.wav"
OUT_MP3_FILE = "out.mp3"

# Load the wav into an AudioSegment and write it back out as mp3.
pydub.AudioSegment.from_wav(IN_WAVE_FILE).export(OUT_MP3_FILE, format="mp3")
# Commentary:
# The waveform is loaded with the wavfile module of scipy.io
# wav playback with the sounddevice module

import sounddevice as sd
from scipy.io import wavfile

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)

# Load the audio (fs is the sampling frequency, data the samples).
fs, data = wavfile.read(IN_WAVE_FILE)

# Play the samples and block until playback has finished
# (blocking=True stands in for a separate sd.wait() call).
sd.play(data, samplerate=fs, blocking=True)
# Commentary:
# Recording with the sounddevice module
# The waveform is written out with the wave module

import wave

import numpy as np

OUT_WAVE_FILE = "out.wav"

fs = 16000  # sampling frequency (Hz)
duration = 3  # recording time (sec)
n_channels = 1  # monaural

n_frames = int(fs * duration)  # total number of samples


def float_to_int16(data):
    """Peak-normalize floating-point samples and quantize them to int16.

    The samples are scaled so that the largest *absolute* value maps to the
    int16 maximum.  Scaling by the largest positive value instead would push
    a dominant negative peak outside the int16 range and corrupt it on
    conversion.  An all-zero (or empty) signal is returned as zeros instead
    of triggering a division by zero.

    Args:
        data: array-like of floating-point samples.

    Returns:
        numpy.ndarray of dtype int16 with the same shape as ``data``.
    """
    samples = np.asarray(data, dtype=np.float64)
    peak = np.abs(samples).max() if samples.size else 0.0
    if peak > 0:
        samples = samples / peak
    return (samples * np.iinfo(np.int16).max).astype(np.int16)


def main():
    """Record ``duration`` seconds of audio and save it to OUT_WAVE_FILE."""
    # Imported here so that float_to_int16 stays usable on machines without
    # an audio backend.
    import sounddevice as sd

    # Record the audio and wait until the recording has finished.
    data = sd.rec(frames=n_frames, samplerate=fs, channels=n_channels)
    sd.wait()

    # Normalize the amplitude and convert the floats to 2-byte integers.
    data = float_to_int16(data)

    # Write the wav file.
    with wave.open(OUT_WAVE_FILE, mode="wb") as sound:
        sound.setnchannels(n_channels)  # monaural
        sound.setsampwidth(2)  # sample width (in bytes)
        sound.setframerate(fs)
        sound.writeframes(data.tobytes())


if __name__ == "__main__":
    main()
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# wav playback with the subprocess module

import subprocess

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)
PLAY_COMMAND = "afplay"  # audio playback command

# The command is passed as an argument list rather than a concatenated
# shell string, so a file name containing spaces or shell metacharacters
# is handed to the player unchanged and no shell is involved.
play_args = [PLAY_COMMAND, IN_WAVE_FILE]

# Playback (synchronous): run() returns once the player has exited.
subprocess.run(play_args)

# Playback (asynchronous): Popen() starts the player without waiting for
# it; communicate() then waits for it to exit.
proc = subprocess.Popen(play_args)
proc.communicate()
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Create a wav file with a changed bit depth (16 bit -> 8 bit)

import wave

import numpy as np

IN_WAVE_FILE = "in.wav"  # 16-bit monaural audio (assumed)
OUT_WAVE_FILE = "out.wav"


def pcm16_to_pcm8(samples):
    """Convert signed 16-bit PCM samples to unsigned 8-bit PCM samples.

    8-bit samples in a wav file are stored unsigned, with 128 as the zero
    level, whereas 16-bit samples are signed.  Each sample is therefore
    rescaled by 1/256 (rounded rather than bit-truncated), shifted by 128
    and clipped to 0..255.  Writing signed int8 values instead would make
    every negative sample wrap around to a large positive level.

    Args:
        samples: array-like of int16 sample values.

    Returns:
        numpy.ndarray of dtype uint8 with the same shape as ``samples``.
    """
    scaled = np.round(np.asarray(samples, dtype=np.float64) / 256.0) + 128
    return np.clip(scaled, 0, 255).astype(np.uint8)


def print_header(label, filename, n_channel, bitdepth, n_framerate, n_samples):
    """Print the header fields of a wav file (``bitdepth`` is in bytes)."""
    print(f"{label}: {filename}")
    print(f" ・チャネル数: {n_channel}")
    print(f" ・量子化ビット数: {bitdepth * 8}")
    print(f" ・サンプリング周波数: {n_framerate}")
    print(f" ・サンプル数: {n_samples}")


def main():
    """Read IN_WAVE_FILE (16 bit) and write it to OUT_WAVE_FILE as 8 bit.

    Raises:
        ValueError: if the input file does not hold 16-bit samples.
    """
    # Read the wav file.
    with wave.open(IN_WAVE_FILE, "r") as sound:
        n_channel = sound.getnchannels()  # number of channels (mono: 1, stereo: 2)
        bitdepth = sound.getsampwidth()  # sample width (in bytes!)
        n_framerate = sound.getframerate()  # sampling frequency
        n_frames = sound.getnframes()  # number of samples per channel
        n_samples = n_channel * n_frames  # total number of samples
        sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

    # Show the header information of the input.
    print_header(
        "入力ファイル名", IN_WAVE_FILE, n_channel, bitdepth, n_framerate, n_samples
    )

    # The conversion below interprets the frames as int16; any other sample
    # width would be silently misread.
    if bitdepth != 2:
        raise ValueError(
            f"{IN_WAVE_FILE}: expected 16-bit samples, got {bitdepth * 8}-bit"
        )

    # Change the bit depth (16 bit -> 8 bit).
    x = np.frombuffer(sound_frames, dtype=np.int16)
    sound_frames = pcm16_to_pcm8(x).tobytes()

    # Update the header information.
    bitdepth = 1  # 2 bytes -> 1 byte

    # Write the wav file.
    with wave.open(OUT_WAVE_FILE, "w") as sound:
        sound.setnchannels(n_channel)  # number of channels (mono: 1, stereo: 2)
        sound.setsampwidth(bitdepth)  # sample width (in bytes!)
        sound.setframerate(n_framerate)  # sampling frequency
        sound.setnframes(n_frames)  # number of samples per channel
        sound.writeframes(sound_frames)  # write the audio data

    # Show the header information of the output.
    print_header(
        "出力ファイル名", OUT_WAVE_FILE, n_channel, bitdepth, n_framerate, n_samples
    )


if __name__ == "__main__":
    main()
# Commentary:
# Create a wav file with a changed sampling frequency

import wave

IN_WAVE_FILE = "in.wav"  # monaural audio (assumed)
OUT_WAVE_FILE = "out.wav"

SAMPLE_FREQ = 8000  # sampling frequency after the change

# Read the wav file: the first four header fields are the channel count
# (mono: 1, stereo: 2), the sample width (in bytes!), the sampling frequency
# and the number of samples per channel.
with wave.open(IN_WAVE_FILE, "r") as sound:
    params = sound.getparams()
    n_channel, bitdepth, sample_freq, n_frames = params[:4]
    n_samples = n_channel * n_frames  # total number of samples
    sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

# Show the header information.
header_lines = (
    f"チャネル数: {n_channel}",
    f"量子化ビット数: {bitdepth * 8}",
    f"サンプリング周波数: {sample_freq}",
    f"サンプル数: {n_samples}",
)
for header_line in header_lines:
    print(header_line)

# Write the wav file: same frames and header, except for the new
# sampling frequency.
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setnchannels(n_channel)
    sound.setsampwidth(bitdepth)
    sound.setframerate(SAMPLE_FREQ)
    sound.setnframes(n_frames)
    sound.writeframes(sound_frames)
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # 振幅を正規化 30 | 31 | import wave 32 | import numpy as np 33 | 34 | IN_WAVE_FILE = "in.wav" # 16bit モノラル音声(前提) 35 | OUT_WAVE_FILE = "out.wav" 36 | 37 | # wavの読み込み 38 | with wave.open(IN_WAVE_FILE, "r") as sound: 39 | params = sound.getparams() 40 | n_channel = sound.getnchannels() # チャネル数 (mono:1, stereo:2) 41 | bitdepth = sound.getsampwidth() # 量子化ビット数 (byte!) 
42 | n_framerate = sound.getframerate() # サンプリング周波数 43 | n_frames = sound.getnframes() # チャネルあたりのサンプル数 44 | n_samples = n_channel * n_frames # 総サンプル数 45 | sound_frames = sound.readframes(n_frames) # 音声データ (bytesオブジェクト) 46 | 47 | # ヘッダ情報の表示 48 | print(f"入力ファイル名: {IN_WAVE_FILE}") 49 | print(f" ・チャネル数: {n_channel}") 50 | print(f" ・量子化ビット数: {bitdepth * 8}") 51 | print(f" ・サンプリング周波数: {n_framerate}") 52 | print(f" ・サンプル数: {n_samples}") 53 | 54 | # 振幅の正規化 55 | x = np.frombuffer(sound_frames, dtype=np.int16) 56 | x = (x / np.max(x)) * (2 ** (bitdepth * 8 - 1) - 1) 57 | x = x.astype(np.int16) 58 | sound_frames = x.tobytes() 59 | 60 | # wavの書き込み 61 | with wave.open(OUT_WAVE_FILE, "w") as sound: 62 | sound.setnchannels(n_channel) # チャネル数 (mono:1, stereo:2) 63 | sound.setsampwidth(bitdepth) # 量子化ビット数 (byte!) 64 | sound.setframerate(n_framerate) # 標本化周波数の変更 65 | sound.setnframes(n_frames) # チャネルあたりのサンプル数 66 | sound.writeframes(sound_frames) # 音声データの書き込み 67 | 68 | print(f"出力ファイル名: {OUT_WAVE_FILE}") 69 | print(f" ・チャネル数: {n_channel}") 70 | print(f" ・量子化ビット数: {bitdepth * 8}") 71 | print(f" ・サンプリング周波数: {n_framerate}") 72 | print(f" ・サンプル数: {n_samples}") 73 | -------------------------------------------------------------------------------- /WarmUp/wave_read_write.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! 
""" 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | 28 | # Commentary: 29 | # waveモジュールを用いた音声入出力 (コピー作成) 30 | 31 | import wave 32 | 33 | IN_WAVE_FILE = "in.wav" # モノラル音声(前提) 34 | OUT_WAVE_FILE = "out.wav" 35 | 36 | # wavの読み込み 37 | with wave.open(IN_WAVE_FILE, "r") as sound: 38 | params = sound.getparams() 39 | n_channel = sound.getnchannels() # チャネル数 (mono:1, stereo:2) 40 | bitdepth = sound.getsampwidth() # 量子化ビット数 (byte!) 
41 | sample_freq = sound.getframerate() # サンプリング周波数 42 | n_frames = sound.getnframes() # チャネルあたりのサンプル数 43 | n_samples = n_channel * n_frames # 総サンプル数 44 | sound_frames = sound.readframes(n_frames) # 音声データ (bytesオブジェクト) 45 | 46 | # ヘッダ情報の表示 47 | print(f"チャネル数: {n_channel}") 48 | print(f"量子化ビット数: {bitdepth * 8}") 49 | print(f"サンプリング周波数: {sample_freq}") 50 | print(f"サンプル数: {n_samples}") 51 | 52 | # wavの書き込み 53 | with wave.open(OUT_WAVE_FILE, "w") as sound: 54 | sound.setparams(params) # ヘッダ情報の書き込み 55 | sound.writeframes(sound_frames) # 音声データの書き込み 56 | -------------------------------------------------------------------------------- /WarmUp/wave_read_write_scipy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 音声情報処理 n本ノック !! """ 4 | 5 | # MIT License 6 | 7 | # Copyright (C) 2020 by Akira TAMAMORI 8 | 9 | # Permission is hereby granted, free of charge, to any person 10 | # obtaining a copy of this software and associated documentation files 11 | # (the Software"), to deal in the Software without restriction, 12 | # including without limitation the rights to use, copy, modify, merge, 13 | # publish, distribute, sublicense, and/or sell copies of the Software, 14 | # and to permit persons to whom the Software is furnished to do so, 15 | # subject to the following conditions: 16 | 17 | # The above copyright notice and this permission notice shall be 18 | # included in all copies or substantial portions of the Software. 19 | 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 | # NONINFRINGEMENT. 
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Audio input/output with the scipy module (makes a copy).

from scipy.io import wavfile
import numpy as np

IN_WAVE_FILE = "in.wav"  # input file (assumed to be mono audio)
OUT_WAVE_FILE = "out.wav"


def copy_wave(in_file=IN_WAVE_FILE, out_file=OUT_WAVE_FILE):
    """Copy a WAV file through ``scipy.io.wavfile``, keeping its sample type.

    The samples pass through a float64 working array and are cast back to
    the dtype that ``wavfile.read`` returned. Casting to int16
    unconditionally would corrupt any input that is not 16-bit PCM.

    Args:
        in_file: Source WAV; anything ``wavfile.read`` accepts.
        out_file: Destination WAV; anything ``wavfile.write`` accepts.
    """
    # Read the audio.
    fs, x = wavfile.read(in_file)
    source_dtype = x.dtype  # remembered so the output keeps the same format
    x = x.astype(np.float64)

    # Write the audio in the sample type it was read in.
    x = x.astype(source_dtype)
    wavfile.write(out_file, fs, x)


if __name__ == "__main__":
    copy_wave()

# --------------------------------------------------------------------------------
# /WarmUp/wave_stereo_to_mono.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

"""Speech information processing: n-knock drills!!"""

# MIT License

# Copyright (C) 2020 by Akira TAMAMORI

# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commentary:
# Convert stereo audio to mono.

import wave

import numpy as np

IN_WAVE_FILE = "in.wav"  # input file (assumed to be stereo audio)
OUT_WAVE_FILE = "out.wav"


def stereo_to_mono(channels):
    """Average interleaved left/right samples into one channel.

    Args:
        channels: 1-D array-like of interleaved samples; even indices are
            taken as the left channel and odd indices as the right.

    Returns:
        ``np.int16`` array with one value per left/right pair; fractional
        averages are truncated toward zero by the integer cast.
    """
    channels = np.asarray(channels)
    # Average in float32 so that the sum of two int16 values cannot overflow.
    l_channel = channels[0::2].astype(np.float32)  # left channel
    r_channel = channels[1::2].astype(np.float32)  # right channel
    return ((l_channel + r_channel) / 2).astype(np.int16)


def wave_stereo_to_mono(in_file=IN_WAVE_FILE, out_file=OUT_WAVE_FILE):
    """Read a 16-bit stereo WAV file and write its mono downmix.

    Args:
        in_file: Source WAV; anything ``wave.open`` accepts for reading.
        out_file: Destination WAV; anything ``wave.open`` accepts for writing.

    Raises:
        ValueError: If the input is not 2-channel with 2-byte samples,
            because the frames are decoded as interleaved int16 pairs.
    """
    # Read the header fields and the raw frames.
    with wave.open(in_file, "r") as sound:
        n_channel = sound.getnchannels()  # number of channels (mono:1, stereo:2)
        bitdepth = sound.getsampwidth()  # sample width, in BYTES
        n_framerate = sound.getframerate()  # sampling frequency
        n_frames = sound.getnframes()  # samples per channel
        sound_frames = sound.readframes(n_frames)  # audio data (bytes)
    n_samples = n_channel * n_frames  # total number of samples

    # Show the header information of the input.
    print(f"入力ファイル名: {in_file}")
    print(f" ・チャネル数: {n_channel}")
    print(f" ・量子化ビット数: {bitdepth * 8}")
    print(f" ・サンプリング周波数: {n_framerate}")
    print(f" ・サンプル数: {n_samples}")

    if n_channel != 2 or bitdepth != 2:
        raise ValueError(
            "only 16-bit stereo input is supported, got "
            f"{n_channel} channel(s) at {bitdepth * 8}-bit"
        )

    # Stereo -> mono (mean of the left and right channels), back to bytes.
    channels = np.frombuffer(sound_frames, dtype=np.int16)
    sound_frames = stereo_to_mono(channels).tobytes()

    # The output has one channel, so its total sample count must be
    # recomputed; otherwise the summary below reports the stereo total.
    n_channel = 1  # mono
    n_samples = n_channel * n_frames

    # Write the mono frames.
    with wave.open(out_file, "w") as sound:
        sound.setnchannels(n_channel)
        sound.setsampwidth(bitdepth)
        sound.setframerate(n_framerate)  # sampling frequency is unchanged
        sound.setnframes(n_frames)
        sound.writeframes(sound_frames)

    print(f"出力ファイル名: {out_file}")
    print(f" ・チャネル数: {n_channel}")
    print(f" ・量子化ビット数: {bitdepth * 8}")
    print(f" ・サンプリング周波数: {n_framerate}")
    print(f" ・サンプル数: {n_samples}")


if __name__ == "__main__":
    wave_stereo_to_mono()

# --------------------------------------------------------------------------------
# /WarmUp/wave_write_whitenoise.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

"""Speech information processing: n-knock drills!!"""

# MIT License

# Copyright (C) 2020 by Akira TAMAMORI

# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Commentary:
# Write white noise to a WAV file.

import wave

import numpy as np

OUT_WAVE_FILE = "out_whitenoise.wav"

# Number of white-noise samples to generate
n_samples = 40000

# Sampling frequency
sample_freq = 16000


def generate_whitenoise(num_samples=n_samples, scale=0.1):
    """Draw Gaussian white noise and quantize it to 16-bit integers.

    Args:
        num_samples: Number of samples to draw.
        scale: Standard deviation of the noise before it is multiplied by
            the int16 maximum.

    Returns:
        ``np.int16`` array of length ``num_samples``.
    """
    data = np.random.normal(scale=scale, size=num_samples)

    # Stretch to the int16 value range.
    limits = np.iinfo(np.int16)
    data = data * limits.max

    # A Gaussian is unbounded, so clip before the cast; an out-of-range
    # float would otherwise not survive the conversion to int16.
    data = np.clip(data, limits.min, limits.max)

    # Convert to 2-byte (16-bit) integers.
    return data.astype(np.int16)


def write_whitenoise(out_file=OUT_WAVE_FILE, num_samples=n_samples,
                     framerate=sample_freq):
    """Generate white noise and write it as a 16-bit mono WAV file.

    Args:
        out_file: Destination WAV; anything ``wave.open`` accepts for writing.
        num_samples: Number of noise samples to write.
        framerate: Sampling frequency stored in the header.
    """
    data = generate_whitenoise(num_samples)

    with wave.open(out_file, "w") as sound:
        sound.setnchannels(1)  # mono
        sound.setsampwidth(2)  # sample width (2 bytes = 16 bit)
        sound.setframerate(framerate)
        sound.writeframes(data.tobytes())


if __name__ == "__main__":
    write_whitenoise()

# --------------------------------------------------------------------------------