├── AudioSourceSeparation
    ├── README.md
    ├── pra_ILRMA.ipynb
    └── pra_ILRMA.py
├── AudioWatermark
    ├── README.md
    ├── cepstrum_method.py
    ├── echo_hiding_method.py
    ├── lsb_method.py
    ├── phase_coding_method.py
    ├── spread_spectrum_method.py
    ├── svd_cepstrum_method.py
    ├── svd_dct_method.py
    ├── svd_stft_method.py
    ├── svd_stft_method_offdiag.py
    └── wavelet_method.py
├── DigitalSignalProcessing
    ├── README.md
    ├── dsp_add_whitenoise.ipynb
    ├── dsp_add_whitenoise.py
    ├── dsp_convolution.py
    ├── dsp_fir_denoise.py
    ├── dsp_hilbert.py
    ├── dsp_rectangle_anime.py
    ├── dsp_rectangle_fourier.py
    ├── dsp_sawtooth_anime.py
    ├── dsp_sawtooth_fourier.py
    ├── dsp_sine.html
    ├── dsp_sine.ipynb
    ├── dsp_sine.py
    ├── dsp_sine_addnoise.py
    ├── dsp_sine_addnoise_plot.py
    ├── dsp_sine_beat.py
    ├── dsp_sine_euler.py
    ├── dsp_sine_plot.py
    ├── dsp_sine_plot_multi.py
    ├── dsp_triangle_fourier.py
    ├── dsp_window_blackman.py
    ├── dsp_window_hamming.py
    ├── dsp_window_hann.py
    ├── dsp_window_triangle.py
    ├── rectangle_anime.mp4
    └── sawtooth_anime.mp4
├── LICENSE
├── PhaseRetrieval
    ├── README.md
    ├── phaseret_pghi.py
    ├── phaseret_rtisila.py
    ├── phaseret_rtpghi.py
    └── phaseret_spsi.py
├── README.md
├── SoundEffect
    ├── README.md
    ├── pysox_bandpass_bandreject.py
    ├── pysox_change_bitdepth.ipynb
    ├── pysox_change_bitdepth.py
    ├── pysox_change_samplerate.py
    ├── pysox_downsample.py
    ├── pysox_echo.ipynb
    ├── pysox_echo.py
    ├── pysox_flanger.py
    ├── pysox_lowpass-highpass.py
    ├── pysox_pitchshift.ipynb
    ├── pysox_pitchshift.py
    ├── pysox_reverb.ipynb
    ├── pysox_reverb.py
    ├── pysox_stereo2mono.py
    ├── pysox_timestretch.ipynb
    ├── pysox_timestretch.py
    ├── pysox_tremolo.ipynb
    ├── pysox_tremolo.py
    ├── pysox_upsample.py
    └── pysox_wav2raw.py
├── SpeakerRecognition
    ├── README.md
    ├── config.yaml
    ├── config_sklearn.yaml
    ├── download_pretrained_model.py
    ├── download_voicestats_corpus.py
    ├── extract_sample.py
    ├── extract_xvector_voicestats.py
    ├── spk_recog_mlp.py
    └── spk_recog_mlp_sklearn.py
├── SpeechAnalysis
    ├── README.md
    ├── feat_cepstrum.py
    ├── feat_fo_autocorr.ipynb
    ├── feat_fo_autocorr.py
    ├── feat_fo_autocorr_variant.ipynb
    ├── feat_fo_autocorr_variant.py
    ├── feat_fo_cepstrum.ipynb
    ├── feat_fo_cepstrum.py
    ├── feat_fo_cepstrum_sequence.py
    ├── feat_fo_dio.ipynb
    ├── feat_fo_dio.py
    ├── feat_fo_music.ipynb
    ├── feat_fo_music.py
    ├── feat_fo_pyin.ipynb
    ├── feat_fo_pyin.py
    ├── feat_fo_yin.ipynb
    ├── feat_fo_yin.py
    ├── feat_gla.ipynb
    ├── feat_gla.py
    ├── feat_gla_admm.ipynb
    ├── feat_gla_admm.py
    ├── feat_librosa_gla.ipynb
    ├── feat_melspec.ipynb
    ├── feat_melspec.py
    ├── feat_mfcc.ipynb
    ├── feat_mfcc.py
    ├── feat_stft.py
    ├── feat_stft_istft.py
    ├── feat_stft_spec.ipynb
    └── feat_stft_spec.py
├── SpeechAnalysisSynthesis
    ├── README.md
    ├── pysptk_anasyn_lpc.ipynb
    ├── pysptk_anasyn_lpc.py
    ├── pysptk_anasyn_lsp.ipynb
    ├── pysptk_anasyn_lsp.py
    ├── pysptk_anasyn_mlsa.ipynb
    ├── pysptk_anasyn_mlsa.py
    ├── pysptk_anasyn_mlsa_others.ipynb
    ├── pysptk_anasyn_mlsa_others.py
    ├── pysptk_anasyn_mlsa_pyworld.py
    ├── pysptk_anasyn_parcor.ipynb
    ├── pysptk_anasyn_parcor.py
    ├── pysptk_anasyn_recog.py
    ├── pyworld_anasyn.ipynb
    ├── pyworld_anasyn.py
    └── pyworld_anasyn_encdec.py
├── SpeechRecognition
    ├── README.md
    ├── google_mode_modoki.py
    ├── recog_speech_rec.py
    ├── recog_wikipedia.py
    ├── record_speech.py
    ├── vosk_asr_recorded.py
    ├── vosk_asr_streaming.py
    └── vosk_asr_streaming_vad.py
├── SpeechSynthesis
    ├── README.md
    ├── synth_gtts.py
    ├── synth_gtts_gui.py
    ├── synth_pyopenjtalk.py
    ├── synth_pyopenjtalk_gui.py
    ├── synth_pyttsx.py
    └── synth_ttslearn_multi_gui.py
├── VoiceConversion
    ├── README.md
    └── pysimplegui_realtime_vc.py
└── WarmUp
    ├── README.md
    ├── ffmpeg_mp3_to_wav.py
    ├── ffmpeg_wav_to_mp3.py
    ├── librosa_plot_specgram.ipynb
    ├── librosa_plot_specgram.py
    ├── librosa_plot_waveform.ipynb
    ├── librosa_plot_waveform.py
    ├── plt_specgram.ipynb
    ├── plt_specgram.py
    ├── plt_waveform.ipynb
    ├── plt_waveform.py
    ├── plt_waveform_scipy.py
    ├── plt_whitenoise.ipynb
    ├── pydub_mp3_to_wav.py
    ├── pydub_wav_to_mp3.py
    ├── sounddevice_play_wav.py
    ├── sounddevice_rec_wav.py
    ├── subprocess_play_wav.py
    ├── wave_change_bitdepth.py
    ├── wave_change_framerate.py
    ├── wave_normalize.py
    ├── wave_play_wav.ipynb
    ├── wave_read_write.py
    ├── wave_read_write_scipy.py
    ├── wave_stereo_to_mono.py
    └── wave_write_whitenoise.py


/AudioSourceSeparation/README.md:
--------------------------------------------------------------------------------
 1 | # 音源分離
 2 | ## はじめに
 3 | ```
 4 | pip3 install pyroomacoustics
 5 | pip3 install nussl
 6 | ```
 7 | 
 8 | - pyroomacoustics https://github.com/LCAV/pyroomacoustics
 9 | - nussl https://github.com/nussl/nussl
10 | 
11 | ### Pythonスクリプト
12 | - pyroomacoustics
13 |   - ILRMAベースの音源分離 [pra_ILRMA.py](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/AudioSourceSeparation/pra_ILRMA.py)
14 | 
15 | ### Jupyter notebook
16 | - pyroomacoustics
17 |   - ILRMAベースの音源分離 [pra_ILRMA.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/AudioSourceSeparation/pra_ILRMA.ipynb)
18 | 
19 | 
20 | ### Google Colaboratory
21 | - nussl
22 |   - AudioSignal入門 [Introduction_to_AudioSignal.ipynb](https://colab.research.google.com/drive/1ntYryCmSam1El-WWIWRzYS8a9f8Fa8d5?usp=sharing)
23 |   - STFT表現 [audio_signal_stft.ipynb](https://colab.research.google.com/drive/1ALGz70yCLTn1y6njR4D9DCr5qNIku_la?usp=sharing)
24 |   - 周波数マスキング入門 [masking_audio_signal_timefreq.ipynb](https://colab.research.google.com/drive/1qPyDcUAOwsfDZ_X1x_yn1Zqb2Ef52QUr?usp=sharing)
25 |   - ローパス・ハイパスフィルタによる音源分離 [high-lowpass_filters.ipynb](https://colab.research.google.com/drive/1tTqqcBgWFK0wGQeZZjJXUGE9_4ja2GM2?usp=sharing)
26 |   - 理想バイナリマスクによる音源分離 [ideal_binary_mask.ipynb](https://colab.research.google.com/drive/1sxQu62bunrIcjslTl01HGmwyPdjTM4i4?usp=sharing)
27 |   - 理想 ratio マスク（ソフトマスク）による音源分離 [ideal_mask.ipynb](https://colab.research.google.com/drive/1XYMJqc6X_9vKptt5irrGTi-deLoMGwF8?usp=sharing)
28 |   - ウィーナーフィルタによる信号復元（音源分離結果の強調） [wiener_filter.ipynb](https://colab.research.google.com/drive/1f6fbPZNAG8iO2bgZFyFOlAGPiwx7CTr9?usp=sharing)
29 |   - ロバストPCAによる音源分離（歌声と楽曲の分離）[robust_pca.ipynb](https://colab.research.google.com/drive/1S34MIYs-_OCKEt7YULR2MfJpJ_TaOUVx?usp=sharing)
30 |   - 独立成分分析による音源分離 [ica.ipynb](https://colab.research.google.com/drive/1q3Pk5EXMS3GXO0kRkms5mxIzbfw0o3dQ?usp=sharing)
31 |   - 2次元フーリエ変換による音源分離（歌声と楽曲の分離）[2-d_fourier.ipynb](https://colab.research.google.com/drive/1G6c8SLP6bpnu_3f_AaAk2nK4FgzoSbC8?usp=sharing)
32 |   - REPET法による音源分離（歌声と楽曲の分離）[REPET.ipynb](https://colab.research.google.com/drive/1H4IcYHJSD2F9XBjrCNoGtrMjmrg7Up9W?usp=sharing)
33 |   - REPET-SIM法による音源分離（歌声と楽曲の分離）[REPETSIM.ipynb](https://colab.research.google.com/drive/12X9Pvv94vcDIQlv1pUYNqt_HsJCVhiWw?usp=sharing)
34 |   - Timbre clusteringによる音源分離 [timber_clustering.ipynb](https://colab.research.google.com/drive/1f8sFW6TJaCvyi7YL9tvg-TgTUnBi2Bu_?usp=sharing)
35 |   - 調波打楽器音分離 [hpss.ipynb](https://colab.research.google.com/drive/1UKrPpfTMSmDxEOcX5xiqxUXn-ElvD-vB?usp=sharing)
36 |   - 空間クラスタリングによる音源分離 [spatial_clustering.ipynb](https://colab.research.google.com/drive/1gYfOZqvtoGL0W00XA-f6Ro16qNev79Dt?usp=sharing)
37 |   - PROJET法による音源分離 [PROJET.ipynb](https://colab.research.google.com/drive/15gs2AFfh3Pj60r_Vn21O8-MmXBXL_07x?usp=sharing)
38 |   - DUET法によるブラインド音源分離 [DUET.ipynb](https://colab.research.google.com/drive/15BEzg7TWd4yoiTN5nx-5Xh82Mysczkfh?usp=sharing)
39 | 


--------------------------------------------------------------------------------
/AudioWatermark/README.md:
--------------------------------------------------------------------------------
 1 | # 音の電子透かしとステガノグラフィ
 2 | 
 3 | ## はじめに
 4 | 音の電子透かしおよびステガノグラフィ技術をPythonで実装するのが目的。
 5 | 
 6 | ## ファイル一覧
 7 | ### Pythonスクリプト
 8 | - 最下位ビット置換法 [lsb_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/lsb_method.py)
 9 | - 拡散スペクトル法 [spread_spectrum_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/spread_spectrum_method.py)
10 | - ケプストラム法 [cepstrum_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/cepstrum_method.py)
11 | - 位相コーディング法 [phase_coding_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/phase_coding_method.py)
12 | - エコーハイディング法 [echo_hiding_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/echo_hiding_method.py)
13 | - ウェーブレット法 [wavelet_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/wavelet_method.py)
14 | - 特異値分解法（STFTに対する）[svd_stft_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/svd_stft_method.py)
15 | - 特異値分解法（複素ケプストラムに対する）[svd_cepstrum_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/svd_cepstrum_method.py)
16 | - 特異値分解法（DCT係数に対する）[svd_dct_method.py](https://github.com/tam17aki/speech_process_exercise/blob/master/AudioWatermark/svd_dct_method.py)
17 | 
18 | ### Google Colaboratory
19 | - 最下位ビット置換法 [lsb_method.ipynb](https://colab.research.google.com/drive/1bz8GQZ-IOQ2S7hJELy2xfujzJiddgqeE?usp=sharing)
20 | - 拡散スペクトル法 [spread_spectrum_method.ipynb](https://colab.research.google.com/drive/1yMvfnFOjs2BRsQGhvnypSPyGm4E7DNNq?usp=sharing)
21 | - ケプストラム法 [cepstrum_method.ipynb](https://colab.research.google.com/drive/1IGQXgBiskWaJjhlam8i7m5-ghthsane0?usp=sharing)
22 | - 位相コーディング法 [phase_coding_method.ipynb](https://colab.research.google.com/drive/1djdRBmzbbFYJIqgC_EbSiKFHfPk2YGa7?usp=sharing)
23 | - エコーハイディング法 [echo_hiding_method.ipynb](https://colab.research.google.com/drive/1NFVCjcVUCG8NNlkzQ6hUelzUtcK9429H?usp=sharing)
24 | - ウェーブレット法 [wavelet_method.ipynb](https://colab.research.google.com/drive/1k8yiN1BzevJI7DjEl58NGDYuW3s4IFnb?usp=sharing)
25 | - 特異値分解法（STFTに対する）[svd_stft_method.ipynb](https://colab.research.google.com/drive/13m1Q_J5UNrTHG-DOMifHiFrYW5LP4wnZ?usp=sharing)
26 | - 特異値分解法（複素ケプストラムに対する）[svd_cepstrum_method.ipynb](https://colab.research.google.com/drive/1hXvO6HqfLm1mKUXK5NDF54NDqEGk7lD0?usp=sharing)
27 | - 特異値分解法（DCT係数に対する）[svd_dct_method.ipynb](https://colab.research.google.com/drive/1Xb0s4Aa9YfCXW8J8R6wYv9n74f7GqYPK?usp=sharing)
28 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_add_whitenoise.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - 音声に白色雑音を混ぜる
30 | # - scipyを用いたwav出力
31 | # - matplotlibによるプロット（元音声と雑音入り音声）
32 | 
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | from scipy.io import wavfile
36 | 
37 | IN_WAVE_FILE = "in.wav"  # モノラル音声（前提）
38 | OUT_WAVE_FILE = "out_whitenoise.wav"
39 | 
40 | # 音声データ読み込み (fsがサンプリング周波数、dataは音声データ)
41 | fs, speech_data = wavfile.read(IN_WAVE_FILE)
42 | 
43 | # 音声データの長さ
44 | n_speech = len(speech_data)
45 | 
46 | # 雑音だけの区間の長さ
47 | n_noise = 4000
48 | 
49 | # 全体の長さ
50 | n_samples = n_noise + n_speech
51 | 
52 | # 白色雑音を生成
53 | white_noise = np.random.normal(scale=0.04, size=n_samples)
54 | 
55 | # 2バイトのデータとして書き込むためにスケールを調整
56 | white_noise = white_noise * np.iinfo(np.int16).max
57 | 
58 | # ゲインを調整
59 | white_noise = 0.5 * white_noise
60 | 
61 | # 白色雑音を混ぜる
62 | mixed_signal = white_noise  # 最初に雑音を入れる
63 | mixed_signal[n_noise:] += speech_data  # 後から音声を足す
64 | 
65 | # wavの書き込み (scipyモジュール)
66 | mixed_signal = mixed_signal.astype(np.int16)  # 16bit整数に変換
67 | wavfile.write(OUT_WAVE_FILE, fs, mixed_signal)
68 | 
69 | # プロット枠を確保 (10がヨコのサイズ、4はタテのサイズ)
70 | fig = plt.figure(figsize=(12, 8))
71 | axes1 = fig.add_subplot(2, 1, 1)
72 | n_samples = len(speech_data)
73 | time = np.arange(n_samples) / fs
74 | axes1.plot(time, speech_data)  # 音声データのプロット
75 | axes1.set_xlabel("Time (sec)")  # x軸のラベル
76 | axes1.set_ylabel("Amplitude")  # y軸のラベル
77 | axes1.set_title("Original speech")
78 | 
79 | axes2 = fig.add_subplot(2, 1, 2)
80 | n_samples = len(mixed_signal)
81 | time = np.arange(n_samples) / fs
82 | axes2.plot(time, mixed_signal)  # 音声データのプロット
83 | axes2.set_xlabel("Time (sec)")  # x軸のラベル
84 | axes2.set_ylabel("Amplitude")  # y軸のラベル
85 | axes2.set_title("Mixed speech (original + white noise)")
86 | 
87 | # 画像を画面表示
88 | plt.tight_layout()
89 | plt.show()
90 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_convolution.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020-2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - 畳み込みをスクラッチで実装する
30 | 
31 | import matplotlib.pyplot as plt
32 | import numpy as np
33 | 
34 | # input signal
35 | x = np.zeros(32, dtype=np.float32)
36 | x[0], x[20] = 2.0, 1.0
37 | 
38 | # Impulse response (waves that decay while oscillating)
39 | h = np.exp(- np.arange(16) / 4.0) * np.sin(2.0 * np.pi * np.arange(16) / 15.0)
40 | 
41 | # output signal
42 | y = np.zeros(len(h) + len(x) - 1, dtype=np.float32)
43 | hzero = np.hstack([h, np.zeros(len(x) - 1)])  # zero padding
44 | xzero = np.hstack([x, np.zeros(len(h) - 1)])  # zero padding
45 | 
46 | # convolution
47 | for n in range(0, len(y)):
48 |     for k in range(0, n + 1):
49 |         y[n] = y[n] + hzero[k] * xzero[n - k]
50 | 
51 | fig = plt.figure(figsize=(18, 4))
52 | for i, (s, l) in enumerate(zip([x, h, y], ["input", "impulse response", "output"])):
53 |     fig.add_subplot("13%d" % (i + 1))
54 |     plt.plot(s, "-o", label=l)
55 |     plt.xlim(0, len(y))
56 |     plt.legend()
57 |     plt.xlabel("Time index")
58 |     plt.ylabel("Magnitude")
59 |     plt.grid()
60 | 
61 | plt.show()
62 | 
63 | # numpy implementation
64 | y_true = np.convolve(h, x, "full")
65 | fig = plt.figure(figsize=(18, 4))
66 | for i, (s, l) in enumerate(zip([x, h, y_true],
67 |                                ["input", "impulse response (numpy)", "output"])):
68 |     fig.add_subplot("13%d" % (i + 1))
69 |     plt.plot(s, "-o", label=l)
70 |     plt.xlim(0, len(y_true))
71 |     plt.legend()
72 |     plt.xlabel("Time index")
73 |     plt.ylabel("Magnitude")
74 |     plt.grid()
75 | 
76 | plt.show()
77 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_fir_denoise.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - scipyのsignalモジュールで畳み込みを実行
31 | # - 移動平均フィルタによりホワイトノイズの除去
32 | 
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | 
36 | n_framerate = 1000            # 標本化周波数 (Hz)
37 | 
38 | freq = 4                      # 正弦波の周波数 (Hz)
39 | duration = 1                  # 音の継続時間 (sec)
40 | amplitude = 100               # 正弦波の振幅
41 | 
42 | noise_gain = 10               # 雑音のゲイン
43 | 
44 | T = 1.0 / n_framerate         # 標本化周期 (sec)
45 | 
46 | # 係数作成
47 | COEF_LEN = 10
48 | coef = np.ones(COEF_LEN) / COEF_LEN
49 | 
50 | # 正弦波作成
51 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
52 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
53 | 
54 | # ホワイトノイズ作成
55 | noise = np.random.randn(len(time))
56 | noise *= noise_gain
57 | 
58 | # ノイズの重畳
59 | sine_wave_noised = sine_wave + noise
60 | 
61 | # 正弦波に窓をかける
62 | sine_wave_convolved = np.convolve(sine_wave_noised, coef, "valid")
63 | signal_len = len(sine_wave_convolved)
64 | 
65 | # ノイズ重畳後とノイズ除去後の比較
66 | plt.plot(time[:signal_len], sine_wave_noised[:signal_len], label="noised")
67 | plt.plot(time[:signal_len], sine_wave_convolved, label="denoised", linewidth=2)
68 | plt.xlabel("Time (sec)")
69 | plt.ylabel("Amplitude")
70 | plt.ylim(-amplitude - 3.0 * noise_gain, amplitude + 3.0 * noise_gain)
71 | plt.title("Denoising by convolution")
72 | plt.legend()
73 | plt.show()
74 | 
75 | # ノイズ重畳前とノイズ除去後の比較
76 | plt.plot(time[:signal_len], sine_wave[:signal_len], label="original")
77 | plt.plot(time[:signal_len], sine_wave_convolved, label="denoised", linewidth=2)
78 | plt.xlabel("Time (sec)")
79 | plt.ylabel("Amplitude")
80 | plt.ylim(-amplitude - 3.0 * noise_gain, amplitude + 3.0 * noise_gain)
81 | plt.title("Denoising by convolution")
82 | plt.legend()
83 | plt.show()
84 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_hilbert.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - 周波数の近い２つの正弦波を重ね合わせて「うなり」を発生させる
31 | # - ヒルベルト変換による包絡線および瞬時位相の抽出
32 | # - 包絡線と瞬時位相から波形の再構成
33 | 
34 | import numpy as np
35 | import scipy.signal as signal
36 | import matplotlib.pyplot as plt
37 | 
38 | n_framerate = 16000             # 標本化周波数 (Hz)
39 | 
40 | freq1 = 6                       # 正弦波の周波数 (Hz)
41 | freq2 = 4                       # 正弦波の周波数 (Hz)
42 | duration = 2                    # 音の継続時間 (sec)
43 | amplitude = 1.0                 # 正弦波の振幅
44 | 
45 | T = 1.0 / n_framerate           # 標本化周期 (sec)
46 | 
47 | # 正弦波作成
48 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
49 | sine_wave1 = amplitude * np.sin(2 * np.pi * freq1 * time)
50 | sine_wave2 = amplitude * np.sin(2 * np.pi * freq2 * time)
51 | 
52 | # うなり発生
53 | sine_wave = sine_wave1 + sine_wave2
54 | 
55 | # ヒルベルト変換 (FFT -> 虚部0 & 実部2倍 -> 逆FFT)
56 | envelop = np.abs(signal.hilbert(sine_wave))  # 包絡
57 | angle = np.unwrap(np.angle(signal.hilbert(sine_wave)))  # 瞬時位相
58 | 
59 | # 波形と包絡線のプロット
60 | fig = plt.figure(figsize=(10, 6))
61 | plt.xlabel("Time [s]")
62 | plt.ylabel("Amplitude")
63 | plt.title("Original waveform & envelop")
64 | plt.plot(time, sine_wave, label="original")
65 | plt.plot(time, envelop, label="upper envelop")         # 上側の包絡
66 | plt.plot(time[::-1], -envelop, label="lower envelop")  # 下側の包絡
67 | plt.ylim(-3.2, 3.2)
68 | plt.legend()
69 | plt.show()
70 | 
71 | # 瞬時位相のプロット
72 | fig = plt.figure(figsize=(10, 6))
73 | plt.xlabel("Time [s]")
74 | plt.ylabel("Phase [rad]")
75 | plt.title("Instantatenous phase")
76 | plt.plot(time, angle)
77 | plt.show()
78 | 
79 | # オリジナルの波形と再構成後の波形
80 | reconst = envelop * np.cos(angle)  # 再構成
81 | fig = plt.figure(figsize=(10, 6))
82 | plt.xlabel("Time [s]")
83 | plt.ylabel("Amplitude")
84 | plt.title("Original & reconstructed waveform")
85 | plt.plot(time, sine_wave, label="original", linewidth=3)
86 | plt.plot(time, reconst, label="reconstructed")
87 | plt.ylim(-3.2, 3.2)
88 | plt.legend()
89 | plt.show()
90 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_rectangle_fourier.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - 矩形波をフーリエ級数近似により作成する
30 | # - scipyを用いたwav出力
31 | 
32 | import numpy as np
33 | import numpy.matlib
34 | from scipy.io import wavfile
35 | 
36 | OUT_WAVE_FILE = "out_rectangle.wav"
37 | 
38 | sample_rate = 16000             # 標本化周波数 (Hz)
39 | freq = 500                      # 矩形波の周波数 (Hz)
40 | duration = 1                    # 矩形波の継続時間 (sec)
41 | amplitude = 8000                # 振幅 (ゲイン)
42 | order = 1000                    # 級数近似における倍音次数の上限値
43 | 
44 | period = 1.0 / freq             # 矩形波の周期 (sec)
45 | 
46 | # 標本点の数
47 | sample_num = int(np.floor(duration * sample_rate))
48 | 
49 | # 標本点
50 | time_axis = np.arange(0, sample_num).T / sample_rate
51 | 
52 | # フーリエ級数の倍音の次数 (1倍音, 3倍音, 5倍音,...)
53 | orders = np.arange(1, order, 2)  # 引数 start, stop, step
54 | 
55 | # 矩形波のフーリエ係数
56 | coef = 2 * duration / (np.pi * orders) * np.cos(np.pi * orders)
57 | 
58 | # 矩形波の級数近似
59 | rectwav = np.empty(sample_num)
60 | for n, t in enumerate(time_axis):
61 |     rectwav[n] = coef.dot(np.sin(2 * np.pi * orders * t / period))
62 | 
63 | rectwav *= amplitude
64 | 
65 | # wavの書き込み
66 | rectwav = rectwav.astype(np.int16)  # 16bit整数に変換
67 | wavfile.write(OUT_WAVE_FILE, sample_rate, rectwav)
68 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sawtooth_fourier.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ノコギリ波をフーリエ級数近似により作成する
30 | # - scipyを用いたwav出力
31 | 
32 | import numpy as np
33 | import numpy.matlib
34 | from scipy.io import wavfile
35 | 
36 | OUT_WAVE_FILE = "out_sawtooh.wav"
37 | 
38 | sample_rate = 16000             # 標本化周波数 (Hz)
39 | freq = 500                      # ノコギリ波の周波数 (Hz)
40 | duration = 1                    # ノコギリ波の継続時間 (sec)
41 | amplitude = 8000                # 振幅 (ゲイン)
42 | order = 1000                    # 級数近似における倍音次数の上限値
43 | 
44 | period = 1.0 / freq             # ノコギリ波の周期 (sec)
45 | 
46 | # 標本点の数
47 | sample_num = int(np.floor(duration * sample_rate))
48 | 
49 | # 標本点
50 | time_axis = np.arange(0, sample_num).T / sample_rate
51 | 
52 | # フーリエ級数の倍音の次数 (1倍音, 2倍音, 3倍音,...)
53 | orders = np.arange(1, order)  # 引数 start, stop, step
54 | 
55 | # ノコギリ波のフーリエ係数
56 | coef = -1.0 * duration / (np.pi * orders) * np.cos(np.pi * orders)
57 | 
58 | # ノコギリ波の級数近似
59 | sawtooth_wav = np.empty(sample_num)
60 | for n, t in enumerate(time_axis):
61 |     sawtooth_wav[n] = coef.dot(np.sin(2 * np.pi * orders * t / period))
62 | 
63 | sawtooth_wav *= amplitude
64 | 
65 | # wavの書き込み
66 | sawtooth_wav = sawtooth_wav.astype(np.int16)  # 16bit整数に変換
67 | wavfile.write(OUT_WAVE_FILE, sample_rate, sawtooth_wav)
68 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sine.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - 正弦波の周波数を指定して「聞くことのできる」波を作る
31 | # - waveモジュールを用いたwav出力
32 | # - scipyを用いたwav出力
33 | 
34 | import wave
35 | import numpy as np
36 | from scipy.io import wavfile
37 | 
38 | OUT_WAVE_FILE = "out_wave.wav"
39 | OUT_SCIPY_WAVE_FILE = "out_scipy.wav"
40 | 
41 | n_channel = 1                   # モノラル
42 | bitdepth = 2                    # 量子化ビット数 16 bit (2 byte)
43 | n_framerate = 16000             # 標本化周波数 (Hz)
44 | 
45 | freq = 1000                     # 正弦波の周波数 (Hz)
46 | duration = 2                    # 音の継続時間 (sec)
47 | amplitude = 8000                # 正弦波の振幅
48 | 
49 | T = 1.0 / n_framerate           # 標本化周期 (sec)
50 | 
51 | # 正弦波作成
52 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
53 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
54 | 
55 | # サンプル数
56 | n_frames = len(sine_wave)
57 | 
58 | # bytesオブジェクトへの変換
59 | sound_frames = sine_wave.astype(np.int16).tobytes()
60 | 
61 | # wavの書き込み (waveモジュール)
62 | with wave.open(OUT_WAVE_FILE, "w") as sound:
63 |     sound.setnchannels(n_channel)    # チャネル数
64 |     sound.setsampwidth(bitdepth)     # 量子化ビット数 (byte!)
65 |     sound.setframerate(n_framerate)  # 標本化周波数 (Hz)
66 |     sound.setnframes(n_frames)       # チャネルあたりのサンプル数
67 |     sound.writeframes(sound_frames)  # 音声データの書き込み
68 | 
69 | # wavの書き込み (scipyモジュール) -> お手軽！
70 | sine_wave = sine_wave.astype(np.int16)  # 16bit整数に変換
71 | wavfile.write(OUT_SCIPY_WAVE_FILE, n_framerate, sine_wave)
72 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sine_addnoise.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - ホワイトノイズを生成し、正弦波に重畳
31 | # - 重畳後の正弦波を音声としてwavに保存
32 | 
33 | import numpy as np
34 | from scipy.io import wavfile
35 | 
36 | OUT_WAVE_FILE = "out_wave_noised.wav"
37 | 
38 | n_framerate = 16000             # 標本化周波数 (Hz)
39 | 
40 | freq = 1000                    # 正弦波の周波数 (Hz)
41 | duration = 1                   # 音の継続時間 (sec)
42 | amplitude = 8000               # 正弦波の振幅
43 | 
44 | noise_gain = 2000              # 雑音のゲイン
45 | 
46 | T = 1.0 / n_framerate          # 標本化周期 (sec)
47 | 
48 | # 正弦波作成
49 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
50 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
51 | 
52 | # ホワイトノイズ作成
53 | noise = np.random.randn(len(time))
54 | 
55 | # ノイズのゲイン調整
56 | noise *= noise_gain
57 | 
58 | # ノイズの重畳
59 | sine_wave_noised = sine_wave + noise
60 | 
61 | # wavの書き込み (scipyモジュール) -> お手軽！
62 | sine_wave = sine_wave.astype(np.int16)  # 16bit整数に変換
63 | wavfile.write(OUT_WAVE_FILE, n_framerate, sine_wave_noised)
64 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sine_addnoise_plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - ホワイトノイズを生成し、正弦波に重畳
31 | # - 重畳前と重畳後の波形をプロット
32 | 
33 | import numpy as np
34 | import matplotlib.pyplot as plt
35 | 
36 | n_framerate = 1000             # 標本化周波数 (Hz)
37 | 
38 | freq = 4                       # 正弦波の周波数 (Hz)
39 | duration = 1                   # 音の継続時間 (sec)
40 | amplitude = 100                # 正弦波の振幅
41 | 
42 | noise_gain = 10                # 雑音のゲイン
43 | 
44 | T = 1.0 / n_framerate          # 標本化周期 (sec)
45 | 
46 | # 正弦波作成
47 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
48 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
49 | 
50 | # ホワイトノイズ作成
51 | noise = np.random.randn(len(time))
52 | 
53 | # ノイズのゲイン調整
54 | noise *= noise_gain
55 | 
56 | # ノイズの重畳
57 | sine_wave_noised = sine_wave + noise
58 | 
59 | # 波形表示
60 | fig = plt.figure(figsize=(10, 6))
61 | n_samples = len(sine_wave)
62 | time = np.arange(n_samples) / n_framerate
63 | plt.plot(time, sine_wave_noised, label="noised")
64 | plt.plot(time, sine_wave, label="original", linewidth=3)
65 | plt.xlabel("Time (sec)")
66 | plt.ylabel("Amplitude")
67 | plt.title("Waveform")
68 | plt.legend()
69 | plt.show()
70 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sine_beat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - 正弦波の周波数を指定して「聞くことのできる」波を作る
31 | # - 周波数の近い２つの正弦波を重ね合わせて「うなり」を発生させる
32 | # - scipyを用いたwav出力
33 | 
34 | import numpy as np
35 | from scipy.io import wavfile
36 | 
37 | OUT_WAVE_FILE = "out_wave_beat.wav"
38 | 
39 | n_framerate = 16000             # 標本化周波数 (Hz)
40 | 
41 | freq1 = 500                     # 正弦波の周波数 (Hz)
42 | freq2 = 504                     # 正弦波の周波数 (Hz)
43 | duration = 2                    # 音の継続時間 (sec)
44 | amplitude = 8000                # 正弦波の振幅
45 | 
46 | T = 1.0 / n_framerate           # 標本化周期 (sec)
47 | 
48 | # 正弦波作成
49 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
50 | sine_wave1 = amplitude * np.sin(2 * np.pi * freq1 * time)
51 | sine_wave2 = amplitude * np.sin(2 * np.pi * freq2 * time)
52 | 
53 | # うなり発生
54 | sine_wave = sine_wave1 + sine_wave2
55 | 
56 | # wavの書き込み (scipyモジュール)
57 | sine_wave = sine_wave.astype(np.int16)  # 16bit整数に変換
58 | wavfile.write(OUT_WAVE_FILE, n_framerate, sine_wave)
59 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sine_euler.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - オイラーの公式により複素正弦波を作成する
30 | # - 複素数の実部と虚部を取り出して２次元プロット
31 | # - 複素数の実部と虚部を取り出してそれぞれプロット
32 | # - 複素数の絶対値と位相を取り出してそれぞれプロット
33 | 
34 | import numpy as np
35 | import matplotlib.pyplot as plt
36 | 
37 | OUT_WAVE_FILE = "out_wave_beat.wav"
38 | 
39 | n_framerate = 16000             # 標本化周波数 (Hz)
40 | 
41 | freq = 2                        # 正弦波の周波数 (Hz)
42 | duration = 1                    # 音の継続時間 (sec)
43 | amplitude = 2.0                 # 正弦波の振幅
44 | 
45 | T = 1.0 / n_framerate           # 標本化周期 (sec)
46 | 
47 | # 継続時間に等しい標本点の作成
48 | time = np.arange(0, duration, T)
49 | 
50 | # 位相
51 | phase = 2.0 * np.pi * freq * time
52 | 
53 | # 複素指数関数
54 | complex_exp = amplitude * np.exp(1j * phase)
55 | 
56 | # 実部と虚部を取り出して 2次元プロット
57 | plt.figure(figsize=(6, 6))  # figureの縦横の大きさ
58 | plt.scatter(complex_exp.real, complex_exp.imag)
59 | plt.xlabel('Real part')
60 | plt.xlabel('Imaginary part')
61 | plt.show()
62 | 
63 | # 実部と虚部を取り出して それぞれプロット
64 | plt.figure(figsize=(10, 7))
65 | plt.subplot(2, 1, 1)
66 | plt.plot(time, complex_exp.real)
67 | plt.xlabel("Time (sec)")
68 | plt.ylabel("Real part")
69 | plt.subplot(2, 1, 2)
70 | plt.plot(time, complex_exp.imag)
71 | plt.xlabel("Time (sec)")
72 | plt.ylabel("Imaginary part")
73 | plt.show()
74 | 
75 | # 絶対値と位相を計算して それぞれプロット
76 | amplitude = np.abs(complex_exp)
77 | phase = np.angle(complex_exp)
78 | plt.figure(figsize=(10, 7))
79 | plt.subplot(2, 1, 1)
80 | plt.plot(time, amplitude)
81 | plt.xlabel("Time (sec)")
82 | plt.ylabel("Absolute value")
83 | plt.subplot(2, 1, 2)
84 | plt.plot(time, phase)
85 | plt.xlabel("Time (sec)")
86 | plt.ylabel("Phase")
87 | plt.show()
88 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_sine_plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成してプロットする
30 | 
31 | import numpy as np
32 | import matplotlib.pyplot as plt
33 | 
34 | samplerate = 16000
35 | freq = 3                        # 正弦波の周波数 (Hz)
36 | duration = 2                    # 音の継続時間 (sec)
37 | amplitude = 8000                # 正弦波の振幅
38 | 
39 | T = 1.0 / samplerate            # 標本化周期 (sec)
40 | 
41 | # 正弦波作成
42 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
43 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
44 | 
45 | # 正弦波のプロット
46 | plt.plot(time, sine_wave)
47 | plt.xlabel("Time (sec)")
48 | plt.ylabel("Amplitude")
49 | plt.title("Sine Wave")
50 | plt.show()
51 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_triangle_fourier.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - 三角波をフーリエ級数近似により作成する
30 | # - scipyを用いたwav出力
31 | 
32 | import matplotlib.pyplot as plt
33 | import numpy as np
34 | import numpy.matlib
35 | from scipy.io import wavfile
36 | 
37 | OUT_WAVE_FILE = "out_triangle.wav"
38 | 
39 | sample_rate = 16000             # 標本化周波数 (Hz)
40 | freq = 500                      # 三角波の周波数 (Hz)
41 | duration = 1                    # 三角波の継続時間 (sec)
42 | amplitude = 8000                # 振幅 (ゲイン)
43 | order = 1000                    # 級数近似における倍音次数の上限値
44 | 
45 | period = 1.0 / freq             # 三角波の周期 (sec)
46 | 
47 | # 標本点の数
48 | sample_num = int(np.floor(duration * sample_rate))
49 | 
50 | # 標本点
51 | time_axis = np.arange(0, sample_num).T / sample_rate
52 | 
53 | # フーリエ級数の倍音の次数 (1倍音, 3倍音, 5倍音,...)
54 | orders = np.arange(1, order, 2)  # 引数 start, stop, step
55 | 
56 | # 三角波のフーリエ係数
57 | coef = 1.0 / (orders * orders) * np.sin(orders * np.pi / 2.0)
58 | coef *= 8.0 * duration / (np.pi * np.pi)
59 | 
60 | # 三角波の級数近似
61 | triwav = np.empty(sample_num)
62 | for n, t in enumerate(time_axis):
63 |     triwav[n] = coef.dot(np.sin(2 * np.pi * orders * t / period))
64 | 
65 | triwav *= amplitude
66 | 
67 | plt.plot(triwav)
68 | plt.xlabel("Time (sec)")
69 | plt.ylabel("Amplitude")
70 | plt.title("Waveform")
71 | plt.show()
72 | 
73 | # wavの書き込み
74 | triwav = triwav.astype(np.int16)  # 16bit整数に変換
75 | wavfile.write(OUT_WAVE_FILE, sample_rate, triwav)
76 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_window_blackman.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - scipyのsignalモジュールでBlackman窓を作る
31 | # - 定義式に従ってBlackman窓を作る
32 | 
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | from scipy import signal
36 | 
37 | n_framerate = 2000  # 標本化周波数 (Hz)
38 | 
39 | freq = 20  # 正弦波の周波数 (Hz)
40 | duration = 1  # 音の継続時間 (sec)
41 | amplitude = 8000  # 正弦波の振幅
42 | 
43 | T = 1.0 / n_framerate  # 標本化周期 (sec)
44 | 
45 | # Blackman窓の作成
46 | window_len = 1025
47 | blackman_window = signal.blackman(window_len)
48 | blackman_window_scratch = np.empty(window_len)
49 | for n in range(window_len):
50 |     blackman_window_scratch[n] = (
51 |         0.42
52 |         - 0.5 * np.cos(2 * np.pi * n / (window_len - 1))
53 |         + 0.08 * np.cos(4 * np.pi * n / (window_len - 1))
54 |     )
55 | 
56 | # scipyから作った窓関数と、定義式から作った窓関数をプロットして比較する
57 | plt.plot(blackman_window, label="scipy", linewidth=3)
58 | plt.plot(blackman_window_scratch, label="scratch")
59 | plt.xlabel("Index")
60 | plt.ylabel("Amplitude")
61 | plt.title("Blackman window")
62 | plt.legend()
63 | plt.show()
64 | 
65 | # 正弦波作成
66 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
67 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
68 | 
69 | # 正弦波に窓をかける
70 | windowed = sine_wave[:window_len] * blackman_window
71 | 
72 | # 正弦波のプロット
73 | plt.plot(time[:window_len], sine_wave[:window_len], label="original")
74 | plt.plot(time[:window_len], windowed, label="windowed")
75 | plt.xlabel("Time (sec)")
76 | plt.ylabel("Amplitude")
77 | plt.title("Sine Wave")
78 | plt.legend()
79 | plt.show()
80 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_window_hamming.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - scipyのsignalモジュールでHamming窓を作る
31 | # - 定義式に従ってHamming窓を作る
32 | 
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | from scipy import signal
36 | 
37 | n_framerate = 2000  # 標本化周波数 (Hz)
38 | 
39 | freq = 20  # 正弦波の周波数 (Hz)
40 | duration = 1  # 音の継続時間 (sec)
41 | amplitude = 8000  # 正弦波の振幅
42 | 
43 | T = 1.0 / n_framerate  # 標本化周期 (sec)
44 | 
45 | # Hann窓の作成
46 | WINDOW_LEN = 1025
47 | hamming_window = signal.hamming(WINDOW_LEN)
48 | hamming_window_scratch = np.empty(WINDOW_LEN)
49 | for n in range(WINDOW_LEN):
50 |     hamming_window_scratch[n] = 0.54 - 0.46 * np.cos(2 * np.pi * n / (WINDOW_LEN - 1))
51 | 
52 | # scipyから作った窓関数と、定義式から作った窓関数をプロットして比較する
53 | plt.plot(hamming_window, label="scipy", linewidth=3)
54 | plt.plot(hamming_window_scratch, label="scratch")
55 | plt.xlabel("Index")
56 | plt.ylabel("Amplitude")
57 | plt.title("Hamming window")
58 | plt.legend()
59 | plt.show()
60 | 
61 | # 正弦波作成
62 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
63 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
64 | 
65 | # 正弦波に窓をかける
66 | windowed = sine_wave[:WINDOW_LEN] * hamming_window
67 | 
68 | # 正弦波のプロット
69 | plt.plot(time[:WINDOW_LEN], sine_wave[:WINDOW_LEN], label="original")
70 | plt.plot(time[:WINDOW_LEN], windowed, label="windowed")
71 | plt.xlabel("Time (sec)")
72 | plt.ylabel("Amplitude")
73 | plt.title("Sine Wave")
74 | plt.legend()
75 | plt.show()
76 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_window_hann.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - scipyのsignalモジュールでHann窓を作る
31 | # - 定義式に従ってHann窓を作る
32 | 
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | from scipy import signal
36 | 
37 | n_framerate = 2000  # 標本化周波数 (Hz)
38 | 
39 | freq = 20  # 正弦波の周波数 (Hz)
40 | duration = 1  # 音の継続時間 (sec)
41 | amplitude = 8000  # 正弦波の振幅
42 | 
43 | T = 1.0 / n_framerate  # 標本化周期 (sec)
44 | 
45 | # Hann窓の作成
46 | WINDOW_LEN = 1025
47 | hann_window = signal.hann(WINDOW_LEN)
48 | hann_window_scratch = np.empty(WINDOW_LEN)
49 | for n in range(WINDOW_LEN):
50 |     hann_window_scratch[n] = 0.5 - 0.5 * np.cos(2 * np.pi * n / (WINDOW_LEN - 1))
51 | 
52 | # scipyから作った窓関数と、定義式から作った窓関数をプロットして比較する
53 | plt.plot(hann_window, label="scipy", linewidth=3)
54 | plt.plot(hann_window_scratch, label="scratch")
55 | plt.xlabel("Index")
56 | plt.ylabel("Amplitude")
57 | plt.title("Hann window")
58 | plt.legend()
59 | plt.show()
60 | 
61 | # 正弦波作成
62 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
63 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
64 | 
65 | # 正弦波に窓をかける
66 | windowed = sine_wave[:WINDOW_LEN] * hann_window
67 | 
68 | # 正弦波のプロット
69 | plt.plot(time[:WINDOW_LEN], sine_wave[:WINDOW_LEN], label="original")
70 | plt.plot(time[:WINDOW_LEN], windowed, label="windowed")
71 | plt.xlabel("Time (sec)")
72 | plt.ylabel("Amplitude")
73 | plt.title("Sine Wave")
74 | plt.legend()
75 | plt.show()
76 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/dsp_window_triangle.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020-2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ディジタルな正弦波を作成する
30 | # - scipyのsignalモジュールで三角窓を作る
31 | # - 定義式に従って三角窓を作る
32 | 
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | from scipy import signal
36 | 
37 | n_framerate = 2000  # 標本化周波数 (Hz)
38 | 
39 | freq = 20  # 正弦波の周波数 (Hz)
40 | duration = 1  # 音の継続時間 (sec)
41 | amplitude = 8000  # 正弦波の振幅
42 | 
43 | T = 1.0 / n_framerate  # 標本化周期 (sec)
44 | 
45 | # 三角窓の作成
46 | window_len = 1025
47 | triangle_window = signal.triang(window_len)
48 | triangle_window_scratch = np.empty(window_len)
49 | for n in range(window_len // 2):
50 |     triangle_window_scratch[n] = 2 * n / (window_len - 1)
51 | for n in range(window_len // 2, window_len):
52 |     triangle_window_scratch[n] = 2 - 2 * n / (window_len - 1)
53 | 
54 | # scipyから作った窓関数と、定義式から作った窓関数をプロットして比較する
55 | plt.plot(triangle_window, label="scipy", linewidth=3)
56 | plt.plot(triangle_window_scratch, label="scratch")
57 | plt.xlabel("Index")
58 | plt.ylabel("Amplitude")
59 | plt.title("Triangle window")
60 | plt.legend()
61 | plt.show()
62 | 
63 | # 正弦波作成
64 | time = np.arange(0, duration, T)  # 継続時間に等しい標本点の作成
65 | sine_wave = amplitude * np.sin(2 * np.pi * freq * time)
66 | 
67 | # 正弦波に窓をかける
68 | windowed = sine_wave[:window_len] * triangle_window
69 | 
70 | # 正弦波のプロット
71 | plt.plot(time[:window_len], sine_wave[:window_len], label="original")
72 | plt.plot(time[:window_len], windowed, label="windowed")
73 | plt.xlabel("Time (sec)")
74 | plt.ylabel("Amplitude")
75 | plt.title("Sine Wave")
76 | plt.legend()
77 | plt.show()
78 | 


--------------------------------------------------------------------------------
/DigitalSignalProcessing/rectangle_anime.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tam17aki/speech_process_exercise/9d5e1359b948d66046744cc0c461d43d20e1ec66/DigitalSignalProcessing/rectangle_anime.mp4


--------------------------------------------------------------------------------
/DigitalSignalProcessing/sawtooth_anime.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tam17aki/speech_process_exercise/9d5e1359b948d66046744cc0c461d43d20e1ec66/DigitalSignalProcessing/sawtooth_anime.mp4


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Akira TAMAMORI
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/PhaseRetrieval/README.md:
--------------------------------------------------------------------------------
 1 | # 位相復元
 2 | 
 3 | ## はじめに
 4 | 
 5 | ```
 6 | pip3 install numpy
 7 | pip3 install soundfile
 8 | pip3 install oct2py
 9 | pip3 install scipy
10 | ```
11 | 
12 | Oct2Py経由でMATLAB/GNU Octave用ライブラリLTFATとPHASERETを利用し，音声の位相復元を実装する．
13 | 
14 | 事前にOctaveのインストールを済ませておく．
15 | 
16 | 1. GitHubからltfatの[最新版](https://github.com/ltfat/ltfat)をダウンロードし，適切な場所で解凍する．
17 | 
18 |    例えばパスは /home/hoge/ltfat-main とする
19 |    
20 | 2. GitHubからphaseretの[最新版](https://github.com/ltfat/phaseret)をダウンロードし， ltfat-main直下に解凍する．
21 | 
22 |    例えばパスは /home/hoge/ltfat-main/phaseret-main とする
23 |    
24 | 3. ltfat-mainに移動して octave を起動し，
25 | 
26 |    ```
27 |    octave> ltfatstart;
28 |    octave> ltfatmex;
29 |    ```
30 |    によって事前にライブラリのコンパイルを済ませておく（'octave>' はプロンプト）．
31 |    
32 |    octave上からphaseret-mainに移動して，同様にコンパイルを済ませておく．
33 | 
34 |    ```
35 |    octave> phaseretstart;
36 |    octave> phaseretmex;
37 |    ```
38 | 
39 | ## ファイル一覧
40 | ### Pythonスクリプト
41 | - Single Pass Spectrogram Inversion (SPSI) による位相復元 [phaseret_spsi.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_spsi.py)
42 | - Phase Gradient Heap Integration (PGHI) による位相復元 [phaseret_pghi.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_pghi.py)
43 | - Real-Time Phase Gradient Heap Integration (RTPGHI) による位相復元 [phaseret_rtpghi.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_rtpghi.py)
44 | - Real-Time Iterative Spectrogram Inversion with Look Ahead (RTISILA) による位相復元 [phaseret_rtisila.py](https://github.com/tam17aki/speech_process_exercise/blob/master/PhaseRetrieval/phaseret_rtisila.py)
45 | 


--------------------------------------------------------------------------------
/PhaseRetrieval/phaseret_pghi.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Demonstration of Phase Gradient Heap Integration (PGHI).
 3 | 
 4 | Copyright (C) 2024 by Akira TAMAMORI
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | """
24 | 
25 | import argparse
26 | from pathlib import Path
27 | 
28 | import numpy as np
29 | import soundfile as sf
30 | from oct2py import octave
31 | from scipy import signal
32 | 
33 | 
34 | def main():
35 |     """Reconstruct phase by using PGHI."""
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument("--ltfat_dir", type=str, default="/work/tamamori/ltfat-main")
38 |     parser.add_argument("--win_len", type=int, default=512)
39 |     parser.add_argument("--hop_len", type=int, default=128)
40 |     parser.add_argument("--fft_len", type=int, default=512)
41 |     parser.add_argument("--window", type=str, default="hann")
42 |     parser.add_argument("--in_wavdir", type=str, default="/home/tamamori")
43 |     parser.add_argument("--in_wav", type=str, default="in.wav")
44 |     parser.add_argument("--out_wavdir", type=str, default="/home/tamamori")
45 |     parser.add_argument("--out_wav", type=str, default="out.wav")
46 |     args = parser.parse_args()
47 | 
48 |     # initialization
49 |     octave.addpath(octave.genpath(args.ltfat_dir))
50 |     octave.ltfatstart(0)
51 |     octave.phaseretstart(0)
52 | 
53 |     # compute magnitude spectrum
54 |     audio, rate = sf.read(Path(args.in_wavdir, args.in_wav))
55 |     stfft = signal.ShortTimeFFT(
56 |         win=signal.get_window(args.window, args.win_len),
57 |         hop=args.hop_len,
58 |         fs=rate,
59 |         mfft=args.fft_len,
60 |     )
61 |     mag_spec = np.abs(stfft.stft(audio))
62 | 
63 |     # reconstruct phase spectrum with PGHI
64 |     gamma = octave.pghi_findgamma(args.window, args.hop_len, args.win_len)
65 |     reconst_spec = octave.pghi(mag_spec, gamma, args.hop_len, args.win_len)
66 |     audio = stfft.istft(reconst_spec)
67 |     sf.write(Path(args.out_wavdir, args.out_wav), audio, rate)
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     main()
72 | 


--------------------------------------------------------------------------------
/PhaseRetrieval/phaseret_rtpghi.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Demonstration of Real-Time Phase Gradient Heap Integration (RTPGHI).
 3 | 
 4 | Copyright (C) 2024 by Akira TAMAMORI
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | """
24 | 
25 | import argparse
26 | from pathlib import Path
27 | 
28 | import numpy as np
29 | import soundfile as sf
30 | from oct2py import octave
31 | from scipy import signal
32 | 
33 | 
34 | def main():
35 |     """Reconstruct phase by using RTPGHI."""
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument("--ltfat_dir", type=str, default="/work/tamamori/ltfat-main")
38 |     parser.add_argument("--win_len", type=int, default=512)
39 |     parser.add_argument("--hop_len", type=int, default=128)
40 |     parser.add_argument("--fft_len", type=int, default=512)
41 |     parser.add_argument("--window", type=str, default="hann")
42 |     parser.add_argument("--pghi_type", choices=["normal", "causal"], default="causal")
43 |     parser.add_argument("--in_wavdir", type=str, default="/home/tamamori")
44 |     parser.add_argument("--in_wav", type=str, default="in.wav")
45 |     parser.add_argument("--out_wavdir", type=str, default="/home/tamamori")
46 |     parser.add_argument("--out_wav", type=str, default="out.wav")
47 |     args = parser.parse_args()
48 | 
49 |     # initialization
50 |     octave.addpath(octave.genpath(args.ltfat_dir))
51 |     octave.ltfatstart(0)
52 |     octave.phaseretstart(0)
53 | 
54 |     # compute magnitude spectrum
55 |     audio, rate = sf.read(Path(args.in_wavdir, args.in_wav))
56 |     stfft = signal.ShortTimeFFT(
57 |         win=signal.get_window(args.window, args.win_len),
58 |         hop=args.hop_len,
59 |         fs=rate,
60 |         mfft=args.fft_len,
61 |     )
62 |     mag_spec = np.abs(stfft.stft(audio))
63 | 
64 |     # reconstruct phase spectrum with RTPGHI
65 |     gamma = octave.pghi_findgamma(args.window, args.hop_len, args.win_len)
66 |     reconst_spec = octave.rtpghi(
67 |         mag_spec, gamma, args.hop_len, args.win_len, args.pghi_type
68 |     )
69 |     audio = stfft.istft(reconst_spec)
70 |     sf.write(Path(args.out_wavdir, args.out_wav), audio, rate)
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     main()
75 | 


--------------------------------------------------------------------------------
/PhaseRetrieval/phaseret_spsi.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """Demonstration of Single Pass Spectrogram Inversion (SPSI).
 3 | 
 4 | Copyright (C) 2024 by Akira TAMAMORI
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | """
24 | 
25 | import argparse
26 | from pathlib import Path
27 | 
28 | import numpy as np
29 | import soundfile as sf
30 | from oct2py import octave
31 | from scipy import signal
32 | 
33 | 
34 | def main():
35 |     """Reconstruct phase by using SPSI."""
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument("--ltfat_dir", type=str, default="/work/tamamori/ltfat-main")
38 |     parser.add_argument("--win_len", type=int, default=512)
39 |     parser.add_argument("--hop_len", type=int, default=128)
40 |     parser.add_argument("--fft_len", type=int, default=512)
41 |     parser.add_argument("--window", type=str, default="hann")
42 |     parser.add_argument("--in_wavdir", type=str, default="/home/tamamori")
43 |     parser.add_argument("--in_wav", type=str, default="in.wav")
44 |     parser.add_argument("--out_wavdir", type=str, default="/home/tamamori")
45 |     parser.add_argument("--out_wav", type=str, default="out.wav")
46 |     args = parser.parse_args()
47 | 
48 |     # initialization
49 |     octave.addpath(octave.genpath(args.ltfat_dir))
50 |     octave.ltfatstart(0)
51 |     octave.phaseretstart(0)
52 | 
53 |     # compute magnitude spectrum
54 |     audio, rate = sf.read(Path(args.in_wavdir, args.in_wav))
55 |     stfft = signal.ShortTimeFFT(
56 |         win=signal.get_window(args.window, args.win_len),
57 |         hop=args.hop_len,
58 |         fs=rate,
59 |         mfft=args.fft_len,
60 |     )
61 |     mag_spec = np.abs(stfft.stft(audio))
62 | 
63 |     # reconstruct phase spectrum with SPSI
64 |     reconst_spec = octave.spsi(mag_spec, args.hop_len, args.win_len)
65 |     audio = stfft.istft(reconst_spec)
66 |     sf.write(Path(args.out_wavdir, args.out_wav), audio, rate)
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     main()
71 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # speech_process_exercise
 2 | 音声情報処理n本ノックを目指して
 3 | 
 4 | ## [第1章 準備運動（音声読み込み・書き込み・波形プロットなど）](https://github.com/tam17aki/speech_process_exercise/tree/master/WarmUp)
 5 | ## [第2章 ディジタル信号処理の基礎](https://github.com/tam17aki/speech_process_exercise/tree/master/DigitalSignalProcessing)
 6 | ## [第3章 音声加工とサウンドエフェクト](https://github.com/tam17aki/speech_process_exercise/tree/master/SoundEffect)
 7 | ## [第4章 音声の特徴量抽出](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechAnalysis)
 8 | ## [第5章 音声の分析合成](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechAnalysisSynthesis)
 9 | ## [第6章 音声合成](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechSynthesis)
10 | ## [第7章 音声認識](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeechRecognition)
11 | ## 第8章 音声対話システム
12 | ## [第9章 音声変換](https://github.com/tam17aki/speech_process_exercise/tree/master/VoiceConversion)
13 | ## [第10章 話者認識](https://github.com/tam17aki/speech_process_exercise/tree/master/SpeakerRecognition)
14 | ## [第11章 音源分離](https://github.com/tam17aki/speech_process_exercise/tree/master/AudioSourceSeparation)
15 | ## [第12章 音の電子透かし](https://github.com/tam17aki/speech_process_exercise/tree/master/AudioWatermark)
16 | ## [第13章 音の位相復元](https://github.com/tam17aki/speech_process_exercise/tree/master/PhaseRetrieval)
17 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_bandpass_bandreject.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - バンドパスフィルタ / バンドリジェクトフィルタをかける
31 | #   →特定周波数帯域の通過 (pass) / 遮断 (rejection)
32 | 
33 | import sox
34 | 
35 | IN_WAVE_FILE = "in.wav"       # 入力音声
36 | OUT_WAVE_FILE_PASS = "bandpass.wav"    # バンドパスフィルタ適用済み音声
37 | OUT_WAVE_FILE_REJECT = "bandreject.wav"  # バンドリジェクトフィルタ適用済み音声
38 | 
39 | transformer = sox.Transformer()
40 | 
41 | # 遮断周波数は「中心周波数」から-3dB（パワーは0.5倍、振幅は0.707倍）になる周波数
42 | BANDPASS_FREQ = 500    # バンドフィルタの「中心」周波数 (Hz)
43 | BANDREJECT_FREQ = 500  # バンドリジェクトフィルタの「中心」周波数 (Hz)
44 | 
45 | # バンドパスフィルタ
46 | transformer.bandpass(frequency=BANDPASS_FREQ)
47 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_PASS)
48 | 
49 | # バンドリジェクトフィルタ
50 | transformer.bandreject(frequency=BANDREJECT_FREQ)
51 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_REJECT)
52 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_change_bitdepth.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoxを用いた音声情報処理シリーズ
30 | # - 量子化ビット数を変更
31 | 
32 | import sox
33 | 
34 | IN_WAVE_FILE = "mono.wav"          # モノラル音声 (量子化ビット数 16bit)
35 | OUT_WAVE_FILE = "out.wav"          # モノラル音声
36 | 
37 | BITDEPTH = 8
38 | 
39 | # create trasnformer (単一ファイルに対する重ねがけ)
40 | transformer = sox.Transformer()
41 | 
42 | # 量子化ビット数を8bitに変更
43 | transformer.convert(bitdepth=BITDEPTH)
44 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE)
45 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_change_samplerate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoxを用いた音声情報処理シリーズ
30 | # - サンプリング周波数を変更
31 | 
32 | import sox
33 | 
34 | IN_WAVE_FILE = "mono.wav"          # モノラル音声 (サンプリング周波数 16kHz)
35 | OUT_WAVE_FILE = "out.wav"          # モノラル音声
36 | 
37 | SAMPLERATE = 8000                  # サンプリング周波数（Hz）
38 | 
39 | # create trasnformer (単一ファイルに対する重ねがけ)
40 | transformer = sox.Transformer()
41 | 
42 | # サンプリング周波数を変更
43 | transformer.rate(samplerate=SAMPLERATE)
44 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE)
45 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_downsample.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - ダウンサンプリング
31 | 
32 | 
33 | import sox
34 | 
35 | IN_WAVE_FILE = "in.wav"  # 入力音声
36 | OUT_WAVE_FILE = "downsample.wav"  # ダウンサンプリングした音声
37 | 
38 | # トランスフォーマーをつくる（単一音声に対する処理）
39 | transformer = sox.Transformer()
40 | 
41 | # ダウンサンプリング の パラメタ
42 | FACTOR = 2  # ダウンサンプリング率 (正の整数)
43 | 
44 | # transformerにダウンサンプリングを設定する
45 | transformer.downsample(factor=FACTOR)
46 | 
47 | # ダウンサンプリングした結果をファイルに保存
48 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE)
49 | 
50 | # ダウンサンプリングした結果をarrayとして取得
51 | downsamples = transformer.build_array(IN_WAVE_FILE)
52 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_echo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - エコーをかける
31 | 
32 | 
33 | import sox
34 | from scipy.io import wavfile
35 | 
36 | IN_WAVE_FILE = "in.wav"  # 入力音声
37 | OUT_WAVE_FILE = "echo.wav"  # エコー済み音声
38 | 
39 | # トランスフォーマーをつくる（単一音声に対する処理）
40 | transformer = sox.Transformer()
41 | 
42 | # エコー の パラメタ
43 | n_echos = 2  # エコー回数
44 | delays = [375]  # 遅延時間 (ms)
45 | decays = [0.5]  # 減衰率
46 | 
47 | # エコー回数分、遅延時間と減衰率を与える必要がある
48 | # → エコー回数に等しい長さの「リスト」を 遅延時間と減衰率それぞれで用意する
49 | # → n_echos が 2 なら遅延時間は [375, 750], 減衰率は [0.5, 0.25]
50 | for i in range(1, n_echos):
51 |     delays.append(delays[0] * (i + 1))  # 遅延時間は線形的
52 |     decays.append(decays[0] ** (i + 1))  # 減衰率は指数的
53 | 
54 | # transformerにエコーを設定する
55 | transformer.echo(n_echos=n_echos, delays=delays, decays=decays)
56 | 
57 | # エコーをかけた結果をファイルに保存
58 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE)
59 | 
60 | # エコーをかけた結果をarrayとして取得
61 | echos = transformer.build_array(IN_WAVE_FILE)
62 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_flanger.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - フランジャ（うなり、うねり）をかける
31 | 
32 | import sox
33 | 
34 | IN_WAVE_FILE = "in.wav"  # 入力音声
35 | OUT_WAVE_FILE = "flanger.wav"  # フランジャをかけた音声
36 | 
37 | # create trasnformer (単一ファイルに対する処理)
38 | transformer = sox.Transformer()
39 | 
40 | # フランジャ の パラメタ
41 | DELAY = 15  # 大もとの遅延時間 (ms)
42 | DEPTH = 3  # DELAY ± DEPTHの遅延をかける (ms)
43 | REGEN = 0  # 出力をフィードバックするときのゲイン量 (-95 to 95)
44 | WIDTH = 75  # ディレイさせた音の振幅をどれだけ減衰させたうえで重ねるか (%)
45 | SPEED = 1.0  # うなりの速さ; 遅延時間の揺れの速さ (Hz)
46 | SHAPE = "sine"  # フランジャのスイープ特性;
47 | # sine的に遅延時間が変化 or 三角波("triangle")的に遅延時間が変化
48 | 
49 | PHASE = 0  # 多チャネルの音にフランジャをかけるときの位相ずれ率 (%)
50 | # 実際の位相ズレはPHASE×2π[rad]
51 | 
52 | # transformerにフランジャを設定する
53 | transformer.flanger(
54 |     delay=DELAY,
55 |     depth=DEPTH,
56 |     regen=REGEN,
57 |     width=WIDTH,
58 |     speed=SPEED,
59 |     shape=SHAPE,
60 |     phase=PHASE,
61 | )
62 | 
63 | # フランジャをかけた結果をファイルに保存
64 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE)
65 | 
66 | # フランジャをかけた結果をarrayとして取得
67 | flangers = transformer.build_array(IN_WAVE_FILE)
68 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_lowpass-highpass.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - ローパスフィルタ/ハイパスフィルタをかける
31 | 
32 | import sox
33 | 
34 | IN_WAVE_FILE = "in.wav"       # 入力音声
35 | OUT_WAVE_FILE_LOW = "lowpass.wav"    # ローパスフィルタ適用済み音声
36 | OUT_WAVE_FILE_HIGH = "highpass.wav"  # ハイパスフィルタ適用済み音声
37 | 
38 | transformer = sox.Transformer()
39 | 
40 | # 遮断周波数は -3dB（パワーは0.501倍、振幅は0.708倍）になる周波数
41 | LOWPASS_FREQ = 1000  # ローパスフィルタの遮断周波数 (Hz)
42 | HIGHPASS_FREQ = 1000  # ハイパスフィルタの遮断周波数 (Hz)
43 | 
44 | # ローパスフィルタ
45 | transformer.lowpass(frequency=LOWPASS_FREQ)
46 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_LOW)
47 | 
48 | # ハイパスフィルタ
49 | transformer.highpass(frequency=HIGHPASS_FREQ)
50 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_HIGH)
51 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_pitchshift.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - ピッチシフトをかける（再生時間を変えずにピッチを上下させる）
31 | 
32 | import sox
33 | 
34 | IN_WAVE_FILE = "in.wav"         # 入力音声
35 | OUT_WAVE_FILE_HIGH = "pitch_high.wav"  # ピッチシフト済み音声（音が高い）
36 | OUT_WAVE_FILE_LOW = "pitch_low.wav"    # ピッチシフト済み音声（音が低い）
37 | 
38 | # create trasnformer (単一ファイルに対する処理)
39 | transformer = sox.Transformer()
40 | 
41 | # ピッチシフト の パラメタ
42 | # 単位：セミトーン（いわゆる半音 -> 1半音の変化は周波数的には約1.06倍）
43 | # 正値は上げる、負値は下げる
44 | # 実際にはfloat値を指定可能
45 | PITCHSHIFT_HIGH = 3.0  # 3半音上げる
46 | PITCHSHIFT_LOW = -5.0  # 5半音下げる
47 | 
48 | # ピッチシフトをかける
49 | transformer.pitch(n_semitones=PITCHSHIFT_HIGH)  # 上げる
50 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_HIGH)
51 | 
52 | transformer.pitch(n_semitones=PITCHSHIFT_LOW)   # 下げる
53 | transformer.build(IN_WAVE_FILE, OUT_WAVE_FILE_LOW)
54 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_reverb.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - リバーブをかける
31 | 
32 | import sox
33 | from scipy.io import wavfile
34 | 
IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "reverb.wav"  # speech with reverb applied

# ------------------------- reverb parameters -------------------------
# Length of the reverberation tail (0-100 %)
REVERBERANCE = 80

# Damping of the high-frequency reflections (0-100 %):
# 0 gives a long reverberation, 100 a short one.  This simulates how much
# of the high-frequency content is "absorbed" while the sound reverberates.
HIGH_FREQ_DAMPING = 30

# Size of the reverberating room (0-100 %): large values sound like a hall,
# small values like a bathroom.
ROOM_SCALE = 20

# Value forwarded to the ``stereo_depth`` argument of the reverb effect
STEREO_DEPTH = 100

# Time before the reverberation starts (up to 500 ms); a large value delays
# the tail, which models reflections coming back from the walls.
PRE_DELAY = 100

# Wet gain (dB): level of the added reverberation itself
WET_GAIN = 0

# When True, only the wet component is output
WET_ONLY = False
# ----------------------------------------------------------------------

# Gather the keyword arguments of the reverb effect in one place
reverb_settings = {
    "reverberance": REVERBERANCE,
    "high_freq_damping": HIGH_FREQ_DAMPING,
    "room_scale": ROOM_SCALE,
    "stereo_depth": STEREO_DEPTH,
    "pre_delay": PRE_DELAY,
    "wet_gain": WET_GAIN,
    "wet_only": WET_ONLY,
}

# Transformer: effect chain applied to a single file
transformer = sox.Transformer()

# Register the reverb effect on the transformer
transformer.reverb(**reverb_settings)

# Write the reverberated result to a file
transformer.build(input_filepath=IN_WAVE_FILE, output_filepath=OUT_WAVE_FILE)

# Obtain the reverberated result as an array
reverb = transformer.build_array(input_filepath=IN_WAVE_FILE)
81 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_stereo2mono.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoxを用いた音声情報処理シリーズ
30 | # - stereo から mono に変換
31 | 
32 | import sox
33 | 
IN_WAVE_FILE = "stereo.wav"  # stereo input
OUT_WAVE_FILE = "out.wav"  # mono output

# Transformer: effect chain applied to a single file
transformer = sox.Transformer()

# Ask for a single output channel (stereo -> mono), then run the conversion
transformer.convert(n_channels=1)
transformer.build(input_filepath=IN_WAVE_FILE, output_filepath=OUT_WAVE_FILE)
43 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_timestretch.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - タイムストレッチをかける（ピッチを変えずにテンポを変える）
31 | 
32 | import sox
33 | 
IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "tempo.wav"  # time-stretched speech

# Time-stretch parameter: tempo ratio.
# Greater than 1.0 speeds the audio up, less than 1.0 slows it down.
FACTOR = 1.2

# Transformer: effect chain applied to a single file
transformer = sox.Transformer()

# Register the tempo change (time stretch) and write the result
transformer.tempo(factor=FACTOR)
transformer.build(input_filepath=IN_WAVE_FILE, output_filepath=OUT_WAVE_FILE)
46 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_tremolo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - トレモロをかける （周期的な振幅の上下動）
31 | 
32 | import sox
33 | 
IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "tremolo.wav"  # speech with tremolo applied

# ---- tremolo parameters ----
# Tremolo speed (Hz): how often the amplitude swings up and down
SPEED = 10

# Tremolo depth (%): how deep the amplitude swing is, relative to the
# amplitude of the signal
DEPTH = 50

tremolo_settings = {"speed": SPEED, "depth": DEPTH}

# Transformer: effect chain applied to a single file
transformer = sox.Transformer()

# Register the tremolo effect and write the result
transformer.tremolo(**tremolo_settings)
transformer.build(input_filepath=IN_WAVE_FILE, output_filepath=OUT_WAVE_FILE)
50 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_upsample.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoXを用いた音声情報処理シリーズ
30 | # - アップサンプリング
31 | 
32 | 
33 | import sox
34 | 
IN_WAVE_FILE = "in.wav"  # input speech
OUT_WAVE_FILE = "upsample.wav"  # upsampled speech

# Upsampling parameter: upsampling factor (positive integer)
FACTOR = 2

# Transformer: effect chain applied to a single audio file
transformer = sox.Transformer()

# Register the upsampling effect on the transformer
transformer.upsample(factor=FACTOR)

# Write the upsampled result to a file
transformer.build(input_filepath=IN_WAVE_FILE, output_filepath=OUT_WAVE_FILE)

# Obtain the upsampled result as an array
upsamples = transformer.build_array(input_filepath=IN_WAVE_FILE)
52 | 


--------------------------------------------------------------------------------
/SoundEffect/pysox_wav2raw.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySoxを用いた音声情報処理シリーズ
30 | # - wav から raw に変換
31 | 
32 | import sox
33 | 
IN_WAVE_FILE = "in.wav"  # mono speech
OUT_RAW_FILE = "out.raw"  # same audio with the header removed

# Transformer with no effects registered: build() only converts the file
transformer = sox.Transformer()

# wav -> raw: naming the output file is all that is needed
transformer.build(input_filepath=IN_WAVE_FILE, output_filepath=OUT_RAW_FILE)
42 | 


--------------------------------------------------------------------------------
/SpeakerRecognition/README.md:
--------------------------------------------------------------------------------
 1 | # 話者認識
 2 | 
 3 | ## はじめに
 4 | ```
 5 | python3 -m pip install librosa
 6 | python3 -m pip install hydra-core
 7 | python3 -m pip install progressbar2
 8 | python3 -m pip install torch
 9 | python3 -m pip install torchaudio
10 | python3 -m pip install xvector-jtubespeech
11 | ```
12 | ## 使用データ
13 | - [in.wav](https://drive.google.com/file/d/1lsN-is31x_snFBTNGR05pQwX9RhzC8sb/view?usp=sharing)
14 | - [声優統計コーパス](https://voice-statistics.github.io/)
15 | 
16 | ## ファイル一覧
17 | - xvectorの抽出 via xvector-jtubespeech
18 |   - 抽出のお試し [extract_sample.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/extract_sample.py)
19 |   - 声優統計コーパス
20 |     - コーパスのダウンロード [download_voicestats_corpus.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/download_voicestats_corpus.py)
21 |     - 事前学習済モデルのダウンロード [download_pretrained_model.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/download_pretrained_model.py)
22 |     - xvectorを抽出して保存 [extract_xvector_voicestats.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/extract_xvector_voicestats.py)
23 |    
24 | - 話者認識モデルを動かす
25 |   - 声優統計コーパスから抽出済のxvectorを用いる
26 |     - サポートベクトルマシン
27 |     - フィードフォワードニューラルネット (PyTorch) [spk_recog_mlp.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/spk_recog_mlp.py)
28 |     - フィードフォワードニューラルネット (scikit-learn) [spk_recog_mlp_sklearn.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeakerRecognition/spk_recog_mlp_sklearn.py)
29 | 


--------------------------------------------------------------------------------
/SpeakerRecognition/config.yaml:
--------------------------------------------------------------------------------
 1 | xvector:
 2 |   root_dir: "/home/tamamori/work/n-hon-knock/SpeakerRecognition/"
 3 |   data_dir: "voice-statistics/data/"
 4 |   feat_dir: "feats/"
 5 |   model_dir: "model/"
 6 |   corpus_url: "https://github.com/voice-statistics/voice-statistics.github.com/raw/master/assets/data/"
 7 |   repo_url: "https://github.com/sarulab-speech/xvector_jtubespeech/archive/refs/heads/master.zip"
 8 |   repo_name: "xvector_jtubespeech-master"
 9 |   n_jobs:
10 | 
11 | actor:
12 |   - "tsuchiya"
13 |   - "fujitou"
14 |   - "uemura"
15 | 
16 | emotion:
17 |   - "angry"
18 |   - "happy"
19 |   - "normal"
20 | 
21 | feature:
22 |   sample_rate: 16000
23 |   num_ceps: 24
24 |   num_melbins: 24
25 | 
26 | pretrained:
27 |   repo_name: "xvector_jtubespeech-master"
28 |   file_name: "xvector.pth"
29 | 
30 | model:
31 |   x_dim: 512
32 |   h_dim: 512
33 |   n_layers: 3
34 |   activation: "relu" # for scikit-learn
35 |   layer_sizes: [512, 512, 512] # for scikit-learn
36 | 
37 | training:
38 |   seed: 0
39 |   n_splits: 5 # クロスバリデーションの分割数
40 |   n_epoch: 50
41 |   n_batch: 16
42 |   learning_rate: 0.0001 # for scikit-learn
43 |   model_file: "model.pytorch"
44 |   optim:
45 |     optimizer:  # 最適化アルゴリズム
46 |       name: Adam
47 |       params:  # 最適化アルゴリズムに応じて項目を追加したり減らしたりする
48 |         lr: 1e-4  # 学習率
49 |         betas: [0.9, 0.98]
50 |         eps: 1e-08
51 |         weight_decay: 0
52 |     lr_scheduler:  # 学習率調整アルゴリズム
53 |       name: MultiStepLR
54 |       params:  # 学習率調整アルゴリズムに応じて項目を追加したり減らしたりする
55 |         milestones:
56 |           - 50
57 |         gamma: 0.6
58 |   use_scheduler: False  # 学習率スケジューリングを使うか否か
59 | 


--------------------------------------------------------------------------------
/SpeakerRecognition/config_sklearn.yaml:
--------------------------------------------------------------------------------
 1 | xvector:
 2 |   root_dir: "/home/tamamori/work/n-hon-knock/SpeakerRecognition/"
 3 |   data_dir: "voice-statistics/data/"
 4 |   feat_dir: "feats/"
 5 |   model_dir: "model/"
 6 |   corpus_url: "https://github.com/voice-statistics/voice-statistics.github.com/raw/master/assets/data/"
 7 |   repo_url: "https://github.com/sarulab-speech/xvector_jtubespeech/archive/refs/heads/master.zip"
 8 |   repo_name: "xvector_jtubespeech-master"
 9 |   n_jobs:
10 | 
11 | actor:
12 |   - "tsuchiya"
13 |   - "fujitou"
14 |   - "uemura"
15 | 
16 | emotion:
17 |   - "angry"
18 |   - "happy"
19 |   - "normal"
20 | 
21 | feature:
22 |   sample_rate: 16000
23 |   num_ceps: 24
24 |   num_melbins: 24
25 | 
26 | pretrained:
27 |   repo_name: "xvector_jtubespeech-master"
28 |   file_name: "xvector.pth"
29 | 
30 | model:
31 |   x_dim: 512
32 |   h_dim: 512
33 |   n_layers: 3
34 |   activation: "relu" # for scikit-learn
35 |   layer_sizes: [512, 512, 512] # for scikit-learn
36 | 
37 | 
38 | training:
39 |   seed: 0
40 |   n_splits: 5 # クロスバリデーションの分割数
41 |   n_epoch: 50
42 |   n_batch: 16
43 |   learning_rate: 0.0001 # for scikit-learn
44 | 


--------------------------------------------------------------------------------
/SpeakerRecognition/download_pretrained_model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2023 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
# subprocessモジュールを介したwgetによるxvector-jtubespeechの事前学習済モデルのダウンロード
30 | 
31 | import os
32 | import subprocess
33 | 
34 | from hydra import compose, initialize
35 | 
36 | 
def get_pretrained_model(cfg):
    """Download the pretrained x-vector model into the model directory.

    The repository archive at ``cfg.xvector.repo_url`` is fetched with
    ``wget`` into a private temporary directory and unpacked there with
    ``unzip``.  ``cfg.pretrained.file_name`` is then copied from the unpacked
    ``cfg.pretrained.repo_name`` directory into the directory formed by
    ``cfg.xvector.root_dir`` and ``cfg.xvector.model_dir``.

    Args:
        cfg: configuration object exposing the ``xvector`` and ``pretrained``
            sections as attributes (the ``__main__`` guard passes the
            composed hydra config).

    Raises:
        subprocess.CalledProcessError: ``wget`` or ``unzip`` exited with a
            non-zero status.
        FileNotFoundError: ``wget``/``unzip`` is not installed, or the model
            file is missing from the unpacked archive.
    """
    # Imported locally so the function carries its own dependencies.
    import shutil
    import tempfile

    repo_url = cfg.xvector.repo_url
    # The data directory is not used below; it is still created because
    # earlier versions of this function created it as a side effect.
    data_dir = os.path.join(cfg.xvector.root_dir, cfg.xvector.data_dir)
    os.makedirs(data_dir, exist_ok=True)
    model_dir = os.path.join(cfg.xvector.root_dir, cfg.xvector.model_dir)
    os.makedirs(model_dir, exist_ok=True)

    # Progress message without a trailing newline; " done." completes it.
    # print() replaces `echo -n`, which some /bin/sh implementations echo
    # literally.
    print("Downloading pretrained model ...", end="", flush=True)

    # A fresh temporary directory avoids clashing with a stale
    # /tmp/master.zip from an earlier run and is removed even when a step
    # fails.
    with tempfile.TemporaryDirectory() as work_dir:
        archive = os.path.join(work_dir, "master.zip")

        # Download the repository archive.  Argument lists (no shell) keep
        # URLs and paths from being re-interpreted by a shell.
        subprocess.run(
            ["wget", "-O", archive, repo_url],
            text=True,
            capture_output=True,
            check=True,
        )
        subprocess.run(
            ["unzip", archive, "-d", work_dir],
            text=True,
            capture_output=True,
            check=True,
        )

        # Copy the model file out of the unpacked repository.
        shutil.copy(
            os.path.join(work_dir, cfg.pretrained.repo_name, cfg.pretrained.file_name),
            os.path.join(model_dir, cfg.pretrained.file_name),
        )
    print(" done.")


if __name__ == "__main__":
    # Compose "config" from the directory of this script.
    with initialize(version_base=None, config_path="."):
        config = compose(config_name="config")
    get_pretrained_model(config)
77 | 


--------------------------------------------------------------------------------
/SpeakerRecognition/download_voicestats_corpus.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2023 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
# subprocessモジュールを介したwgetによる声優統計コーパスダウンロード
30 | 
31 | import os
32 | import subprocess
33 | 
34 | from hydra import compose, initialize
35 | 
36 | 
def get_corpus(cfg):
    """Download the voice-statistics corpus and unpack it into the data directory.

    For every combination of ``cfg.actor`` and ``cfg.emotion`` the archive
    ``<actor>_<emotion>.tar.gz`` is fetched with ``wget`` from
    ``cfg.xvector.corpus_url`` and extracted with ``tar`` into the directory
    formed by ``cfg.xvector.root_dir`` and ``cfg.xvector.data_dir``.

    Args:
        cfg: configuration object exposing ``xvector``, ``actor`` and
            ``emotion`` as attributes (the ``__main__`` guard passes the
            composed hydra config).

    Raises:
        subprocess.CalledProcessError: ``wget`` or ``tar`` exited with a
            non-zero status.
        FileNotFoundError: ``wget`` or ``tar`` is not installed.
    """
    # Imported locally so the function carries its own dependencies.
    import tempfile

    corpus_url = cfg.xvector.corpus_url
    data_dir = os.path.join(cfg.xvector.root_dir, cfg.xvector.data_dir)
    os.makedirs(data_dir, exist_ok=True)

    # Progress message without a trailing newline; " done." completes it.
    # print() replaces `echo -n`, which some /bin/sh implementations echo
    # literally.
    print("Downloading voice statistics corpus ...", end="", flush=True)

    # Archives go to a private temporary directory: nothing is left behind
    # in /tmp, and a re-run cannot pick up a stale tarball from before.
    with tempfile.TemporaryDirectory() as work_dir:
        for actor in cfg.actor:  # "tsuchiya", "fujitou", "uemura"
            for emotion in cfg.emotion:  # "angry", "happy", "normal"
                tar_file = actor + "_" + emotion + ".tar.gz"
                archive = os.path.join(work_dir, tar_file)

                # Argument lists (no shell) keep URLs and paths from being
                # re-interpreted by a shell.
                subprocess.run(
                    ["wget", "-O", archive, corpus_url + tar_file],
                    text=True,
                    capture_output=True,
                    check=True,
                )
                # -C makes tar extract inside the data directory.
                subprocess.run(
                    ["tar", "-xzf", archive, "-C", data_dir],
                    text=True,
                    capture_output=True,
                    check=True,
                )
    print(" done.")


if __name__ == "__main__":
    # Compose "config" from the directory of this script.
    with initialize(version_base=None, config_path="."):
        config = compose(config_name="config")
    get_corpus(config)
68 | 


--------------------------------------------------------------------------------
/SpeakerRecognition/extract_sample.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
"""Sample script for extraction of x-vector from an audio file (monaural wav).
 3 | 
 4 | Copyright (C) 2022 sarulab-speech
 5 | Copyright (C) 2023 by Akira TAMAMORI
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | """
25 | 
26 | import numpy as np
27 | import torch
28 | from scipy.io import wavfile
29 | from torchaudio.compliance import kaldi
30 | from xvector_jtubespeech import XVector
31 | 
32 | 
def extract_xvector(model, wav):
    """Compute the x-vector of a single waveform.

    Args:
        model: object providing ``vectorize``; ``main`` passes an ``XVector``.
        wav: NumPy array of audio samples (``main`` reads it from a wav file).

    Returns:
        The first row of the ``model.vectorize`` output as a NumPy array.
    """
    # MFCC front end: cast to float32 and add a leading axis before the
    # Kaldi-compatible MFCC (24 cepstral coefficients, 24 mel bins).
    samples = wav.astype(np.float32)
    waveform = torch.from_numpy(samples).unsqueeze(0)
    features = kaldi.mfcc(waveform, num_ceps=24, num_mel_bins=24)
    batch = features.unsqueeze(0)  # [1, T, 24] according to the original note

    # x-vector extraction; the original notes the output shape as (1, 512)
    embedding = model.vectorize(batch)
    embedding = embedding.to("cpu").detach()
    return embedding.numpy().copy()[0]
44 | 
45 | 
def main():
    """Run the extraction demo: read in.wav, extract its x-vector, print the shape."""
    # The sampling rate returned by wavfile.read is not needed here;
    # the input is expected to be 16 kHz mono.
    _, samples = wavfile.read("in.wav")
    extractor = XVector("xvector.pth")  # pretrained model
    embedding = extract_xvector(extractor, samples)
    print(embedding.shape)  # (512, )


if __name__ == "__main__":
    main()
56 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_cepstrum.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2021 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ケプストラム法によりスペクトル包絡を抽出する
30 | # - パワーが最大となる音声フレームを対象に推定
31 | 
32 | import matplotlib.pyplot as plt
33 | import numpy as np
34 | import scipy
35 | from scipy.io import wavfile
36 | 
37 | import librosa
38 | 
IN_WAVE_FILE = "in.wav"  # speech to analyse

FRAME_LENGTH = 1024  # frame length (FFT size)
HOP_LENGTH = 80  # frame shift
FFT_LENGTH = FRAME_LENGTH

MAX_Fo = 200  # highest fundamental frequency considered in the analysis (Hz)
MIN_Fo = 60  # lowest fundamental frequency considered in the analysis (Hz)

# Load the speech
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Highest and lowest cepstral order of the Fo range
# (not used further in this script)
max_cep_order = int(np.floor(fs / MIN_Fo))
min_cep_order = int(np.floor(fs / MAX_Fo))

# Split into frames; after the transpose each row is one frame
frames = librosa.util.frame(data, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH).T

# Index of the frame with the largest power
max_ind = np.argmax(np.sum(frames * frames, axis=1))

# Take out that frame
pow_max_frame = frames[max_ind, :]

# Windowing (Blackman window).  The window functions live in
# scipy.signal.windows; the old scipy.signal.blackman alias has been removed
# from recent SciPy releases.
window = scipy.signal.windows.blackman(FFT_LENGTH)
windowed_frame = pow_max_frame * window

# Cepstrum: FFT -> squared magnitude -> log -> inverse FFT.
# irfft already returns a real sequence, so no .real is needed.
fft_spec = scipy.fft.rfft(windowed_frame)
log_power = np.log(np.abs(fft_spec) ** 2)
cepstrum = scipy.fft.irfft(log_power)

# Cepstrum plot; the 0th (DC) coefficient is skipped, as the title says
plt.title("Cepstrum w/o DC")
quef = np.arange(FFT_LENGTH // 2 + 1) / fs
quef *= 1000  # to msec
plt.xlim([0, np.max(quef)])
plt.plot(quef[1:], cepstrum[1 : len(quef)])
plt.xlabel("Quefrency (msec)")
plt.ylabel("Cepstrum")
plt.show()

lifter = 30  # lifter order
# Zero the high-order cepstrum; the slice is symmetric so that orders
# 1..lifter-1 and their mirrored counterparts at the end are both kept.
cepstrum[lifter : FFT_LENGTH - lifter + 1] = 0
envelop = scipy.fft.rfft(cepstrum).real  # FFT turns it into the spectral envelope

# Log power spectrum + spectral envelope.
# The x-axis is the bin centre frequency in Hz and the natural-log values are
# rescaled to dB so that both axis labels match what is drawn.
freq = np.arange(len(log_power)) * fs / FFT_LENGTH
to_db = 10.0 / np.log(10.0)  # 10 * log10(x) == to_db * ln(x)
plt.title("Log power spectrum + spectral envelop")
plt.xlim([0, freq[-1]])
plt.plot(freq, to_db * log_power, label="log power")
plt.plot(freq, to_db * envelop, label="envelop")
plt.xlabel("Frequency (Hz)")
plt.ylabel("Log power (dB)")
plt.legend()  # the curve labels are only shown when a legend is drawn
plt.show()
98 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_fo_cepstrum_sequence.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - ケプストラム法により基本周波数の「系列」を推定する
30 | # - 簡易的な有声音の判定と無声音の判定
31 | 
32 | import matplotlib.pyplot as plt
33 | import numpy as np
34 | import scipy
35 | from scipy.io import wavfile
36 | import librosa
37 | 
IN_WAVE_FILE = "in.wav"  # speech to analyse

FRAME_LENGTH = 1024  # frame length (FFT size)
HOP_LENGTH = 80  # frame shift
FFT_LENGTH = FRAME_LENGTH

MAX_Fo = 200  # highest fundamental frequency considered in the analysis (Hz)
MIN_Fo = 60  # lowest fundamental frequency considered in the analysis (Hz)

THRESHOLD_dB = -30  # unvoiced decision threshold in dB

# Load the speech
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Highest and lowest cepstral order searched for the pitch peak
max_cep_order = int(np.floor(fs / MIN_Fo))
min_cep_order = int(np.floor(fs / MAX_Fo))

# Split into frames; after the transpose each row is one frame
frames = librosa.util.frame(data, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).T

# Crude voiced/unvoiced decision from the frame power relative to the
# loudest frame.  Decibels are 10 * log10(ratio); the natural log used
# previously made -30 behave like roughly -13 dB.
powers = np.sum(frames * frames, axis=1)
with np.errstate(divide="ignore"):  # all-zero frames give -inf -> unvoiced
    relative_power_db = 10 * np.log10(powers / np.max(powers))
voiced = np.where(relative_power_db > THRESHOLD_dB)

# Windowing.  The window functions live in scipy.signal.windows; the old
# scipy.signal.blackman alias has been removed from recent SciPy releases.
window = scipy.signal.windows.blackman(FFT_LENGTH)
windowed_frame = frames[voiced] * window

# Cepstrum: FFT -> magnitude -> log -> inverse FFT (one row per voiced frame)
fft_spec = scipy.fft.rfft(windowed_frame)
log_amp_spec = np.log(np.abs(fft_spec))
cepstrum = scipy.fft.irfft(log_amp_spec)

# Peak position inside the searched quefrency range
peak_index = np.argmax(cepstrum[:, min_cep_order: max_cep_order], axis=1)
max_quef = peak_index + min_cep_order

# Convert the peak quefrency (in samples) to a fundamental frequency
fo = fs / max_quef

# Fo sequence: initialised to 0 everywhere so unvoiced frames read 0 Hz
fo_seq = np.zeros(frames.shape[0])

# Store the estimates at the voiced frames
fo_seq[voiced] = fo

# Show the fundamental frequency sequence
fig = plt.figure(figsize=(12, 4))
plt.plot(fo_seq)
plt.xlabel("Frame number")
plt.ylabel("Frequency (Hz)")
plt.title("Estimation of fundamental frequency via cepstrum method")
plt.tight_layout()
plt.xlim(0, len(fo_seq) - 1)

plt.show()
97 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_fo_dio.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # MIT License
 4 | 
 5 | # Copyright (C) 2020 by Akira TAMAMORI
 6 | 
 7 | # Permission is hereby granted, free of charge, to any person
 8 | # obtaining a copy of this software and associated documentation files
 9 | # (the Software"), to deal in the Software without restriction,
10 | # including without limitation the rights to use, copy, modify, merge,
11 | # publish, distribute, sublicense, and/or sell copies of the Software,
12 | # and to permit persons to whom the Software is furnished to do so,
13 | # subject to the following conditions:
14 | 
15 | # The above copyright notice and this permission notice shall be
16 | # included in all copies or substantial portions of the Software.
17 | 
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 | 
26 | # Commentary:
27 | # DIO による基本周波数推定
28 | 
29 | import matplotlib.pyplot as plt
30 | import numpy as np
31 | import pyworld
32 | from scipy.io import wavfile
33 | 
34 | IN_WAVE_FILE = "in.wav"
35 | FRAME_LENGTH = 1024  # フレーム長 (FFTサイズ)
36 | HOP_LENGTH = 80  # フレームのシフト長
37 | 
38 | # 音声のロード
39 | fs, data = wavfile.read(IN_WAVE_FILE)
40 | data = data.astype(np.float64)
41 | 
42 | # DIO に基づく基本周波数推定
43 | fo, _ = pyworld.dio(data, fs)
44 | 
45 | # 波形表示
46 | fig = plt.figure(figsize=(12, 6))
47 | n_samples = len(data)
48 | time = np.arange(n_samples) / fs
49 | axes = fig.add_subplot(2, 1, 1)
50 | axes.plot(time, data)
51 | axes.set_xlabel("Time (sec)")
52 | axes.set_ylabel("Amplitude")
53 | axes.set_title("Waveform")
54 | axes.set_xlim(0, np.max(time))
55 | 
56 | axes = fig.add_subplot(2, 1, 2)
57 | axes.plot(fo)
58 | axes.set_xlabel("Frame number")
59 | axes.set_ylabel("Frequency (Hz)")
60 | axes.set_title("Estimation of fundamental frequency via pYIN method")
61 | axes.set_xlim(0, len(fo) - 1)
62 | 
63 | plt.tight_layout()
64 | plt.show()
65 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_fo_music.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """ 音声情報処理 n本ノック !! """
  4 | 
  5 | # MIT License
  6 | 
  7 | # Copyright (C) 2020 by Akira TAMAMORI
  8 | 
  9 | # Permission is hereby granted, free of charge, to any person
 10 | # obtaining a copy of this software and associated documentation files
 11 | # (the Software"), to deal in the Software without restriction,
 12 | # including without limitation the rights to use, copy, modify, merge,
 13 | # publish, distribute, sublicense, and/or sell copies of the Software,
 14 | # and to permit persons to whom the Software is furnished to do so,
 15 | # subject to the following conditions:
 16 | 
 17 | # The above copyright notice and this permission notice shall be
 18 | # included in all copies or substantial portions of the Software.
 19 | 
 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 27 | 
 28 | # Commentary:
 29 | # - 音声セグメントからMUSIC法により基本周波数を推定する
 30 | 
 31 | import matplotlib.pyplot as plt
 32 | import numpy as np
 33 | import scipy
 34 | from scipy.io import wavfile
 35 | import librosa
 36 | 
 37 | IN_WAVE_FILE = "voice_a.wav"    # 「あ」の音声
 38 | 
 39 | FRAME_LENGTH = 1024             # フレーム長 (FFTサイズ)
 40 | HOP_LENGTH = 80                 # フレームのシフト長
 41 | 
 42 | CUTOFF = 4000                   # 遮断周波数 (Hz)
 43 | 
 44 | 
 45 | # 音声のロード
 46 | fs, data = wavfile.read(IN_WAVE_FILE)
 47 | data = data.astype(np.float64)
 48 | 
 49 | # フレーム化
 50 | frames = librosa.util.frame(data, frame_length=FRAME_LENGTH,
 51 |                             hop_length=HOP_LENGTH).T
 52 | 
 53 | # 周波数軸
 54 | freq_axis = np.linspace(0, fs, frames.shape[0])
 55 | 
 56 | # MUSIC法のノイズ成分を高域の周波数成分と見なす
 57 | ORDER = np.min(np.where(freq_axis > CUTOFF))
 58 | 
 59 | # 標本共分散行列の計算
 60 | cov_frames = np.cov(frames, bias=True)
 61 | 
 62 | # 固有値と固有ベクトルを計算
 63 | # →固有値は大きい順に並び、固有ベクトル（縦）もそれに対応して並ぶ
 64 | eigval, eigvec = np.linalg.eig(cov_frames)
 65 | 
 66 | # ノイズ成分の固有ベクトル
 67 | noise_eigvec = eigvec[:, 2 * ORDER + 1:]
 68 | 
 69 | # パワースペクトルをノイズ成分の固有ベクトルから計算
 70 | power_noise_eigvec = np.abs(np.fft.fft(noise_eigvec))
 71 | power_noise_eigvec = power_noise_eigvec ** 2
 72 | 
 73 | # MUSIC法の疑似スペクトルを計算
 74 | music_pseudo_spec = 1.0 / np.sum(power_noise_eigvec, axis=1)
 75 | 
 76 | # 基本周波数の推定
 77 | # →ピーク位置の最小値を与える周波数
 78 | fo = freq_axis[np.min(scipy.signal.argrelmax(music_pseudo_spec))]
 79 | print(f"Estimatied fundamental frequency = {fo:.2f} Hz")
 80 | 
 81 | # 波形表示
 82 | fig = plt.figure(figsize=(10, 6))
 83 | n_samples = len(data)
 84 | time = np.arange(n_samples) / fs
 85 | plt.plot(time, data)
 86 | plt.xlabel("Time (sec)")
 87 | plt.ylabel("Amplitude")
 88 | plt.title("Waveform (/a/)")
 89 | plt.show()
 90 | 
 91 | # MUSIC法による疑似スペクトルの計算結果
 92 | fig = plt.figure(figsize=(10, 6))
 93 | plt.plot(freq_axis, 20 * np.log10(music_pseudo_spec))
 94 | plt.xlim(0, fs/2)
 95 | plt.xlabel("Frequency (Hz)")
 96 | plt.ylabel("Power [dB]")
 97 | plt.title(
 98 |     f"Pseudospectrum via MUSIC method\nFundamental Frequency = {fo:.2f} Hz")
 99 | plt.show()
100 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_fo_pyin.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # MIT License
 4 | 
 5 | # Copyright (C) 2020 by Akira TAMAMORI
 6 | 
 7 | # Permission is hereby granted, free of charge, to any person
 8 | # obtaining a copy of this software and associated documentation files
 9 | # (the Software"), to deal in the Software without restriction,
10 | # including without limitation the rights to use, copy, modify, merge,
11 | # publish, distribute, sublicense, and/or sell copies of the Software,
12 | # and to permit persons to whom the Software is furnished to do so,
13 | # subject to the following conditions:
14 | 
15 | # The above copyright notice and this permission notice shall be
16 | # included in all copies or substantial portions of the Software.
17 | 
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 | 
26 | # Commentary:
27 | # pYIN による基本周波数推定
28 | 
29 | import librosa
30 | import matplotlib.pyplot as plt
31 | import numpy as np
32 | from scipy.io import wavfile
33 | 
34 | IN_WAVE_FILE = "in.wav"
35 | 
36 | FRAME_LENGTH = 1024  # フレーム長 (FFTサイズ)
37 | HOP_LENGTH = 80  # フレームのシフト長
38 | 
39 | MAX_Fo = 200  # 分析における基本周波数の最大値 (Hz)
40 | MIN_Fo = 60  # 分析における基本周波数の最小値 (Hz)
41 | 
42 | # 音声のロード
43 | fs, data = wavfile.read(IN_WAVE_FILE)
44 | data = data.astype(np.float64)
45 | 
46 | # 基本周波数の推定
47 | fo, _, _ = librosa.pyin(
48 |     data,
49 |     fmin=MIN_Fo,
50 |     fmax=MAX_Fo,
51 |     sr=fs,
52 |     frame_length=FRAME_LENGTH,
53 |     hop_length=HOP_LENGTH,
54 |     fill_na=0.0,
55 | )
56 | 
57 | # 波形表示
58 | fig = plt.figure(figsize=(12, 6))
59 | n_samples = len(data)
60 | time = np.arange(n_samples) / fs
61 | axes = fig.add_subplot(2, 1, 1)
62 | axes.plot(time, data)
63 | axes.set_xlabel("Time (sec)")
64 | axes.set_ylabel("Amplitude")
65 | axes.set_title("Waveform")
66 | axes.set_xlim(0, np.max(time))
67 | 
68 | axes = fig.add_subplot(2, 1, 2)
69 | axes.plot(fo)
70 | axes.set_xlabel("Frame number")
71 | axes.set_ylabel("Frequency (Hz)")
72 | axes.set_title("Estimation of fundamental frequency via pYIN method")
73 | axes.set_xlim(0, len(fo) - 1)
74 | axes.set_ylim(0, MAX_Fo)
75 | 
76 | plt.tight_layout()
77 | plt.show()
78 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_fo_yin.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # MIT License
 4 | 
 5 | # Copyright (C) 2020 by Akira TAMAMORI
 6 | 
 7 | # Permission is hereby granted, free of charge, to any person
 8 | # obtaining a copy of this software and associated documentation files
 9 | # (the Software"), to deal in the Software without restriction,
10 | # including without limitation the rights to use, copy, modify, merge,
11 | # publish, distribute, sublicense, and/or sell copies of the Software,
12 | # and to permit persons to whom the Software is furnished to do so,
13 | # subject to the following conditions:
14 | 
15 | # The above copyright notice and this permission notice shall be
16 | # included in all copies or substantial portions of the Software.
17 | 
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 | 
26 | # Commentary:
27 | # YIN による基本周波数推定
28 | #
29 | # De Cheveigné, Alain, and Hideki Kawahara,
30 | # “YIN, a fundamental frequency estimator for speech and music,”
31 | # The Journal of the Acoustical Society of America 111.4 (2002): 1917-1930.
32 | # https://asa.scitation.org/doi/10.1121/1.1458024
33 | 
34 | import librosa
35 | import matplotlib.pyplot as plt
36 | import numpy as np
37 | from scipy.io import wavfile
38 | 
39 | IN_WAVE_FILE = "in.wav"
40 | 
41 | FRAME_LENGTH = 1024  # フレーム長 (FFTサイズ)
42 | HOP_LENGTH = 80  # フレームのシフト長
43 | 
44 | MAX_Fo = 200  # 分析における基本周波数の最大値 (Hz)
45 | MIN_Fo = 60  # 分析における基本周波数の最小値 (Hz)
46 | 
47 | # 音声のロード
48 | fs, data = wavfile.read(IN_WAVE_FILE)
49 | data = data.astype(np.float64)
50 | 
51 | # 基本周波数の推定 (YINアルゴリズム)
52 | fo = librosa.yin(
53 |     data,
54 |     fmin=MIN_Fo,
55 |     fmax=MAX_Fo,
56 |     sr=fs,
57 |     frame_length=FRAME_LENGTH,
58 |     hop_length=HOP_LENGTH,
59 |     trough_threshold=0.1,
60 | )
61 | 
62 | # 波形と基本周波数系列を表示
63 | fig = plt.figure(figsize=(12, 6))
64 | n_samples = len(data)
65 | time = np.arange(n_samples) / fs
66 | axes = fig.add_subplot(2, 1, 1)
67 | axes.plot(time, data)
68 | axes.set_xlabel("Time (sec)")
69 | axes.set_ylabel("Amplitude")
70 | axes.set_title("Waveform")
71 | axes.set_xlim(0, np.max(time))
72 | 
73 | axes = fig.add_subplot(2, 1, 2)
74 | axes.plot(fo)
75 | axes.set_xlabel("Frame number")
76 | axes.set_ylabel("Frequency (Hz)")
77 | axes.set_title("Estimation of fundamental frequency via YIN method")
78 | axes.set_xlim(0, len(fo) - 1)
79 | axes.set_ylim(0, MAX_Fo)
80 | 
81 | plt.tight_layout()
82 | plt.show()
83 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_gla.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - Griffin-Lim法により位相を復元する
30 | 
31 | import numpy as np
32 | from scipy.io import wavfile
33 | import librosa
34 | 
35 | IN_WAVE_FILE = "in.wav"  # モノラル音声
36 | OUT_WAVE_FILE = "out_gla.wav"  # 復元音声
37 | 
38 | FRAME_LENGTH = 1024             # フレーム長 (FFTサイズ)
39 | HOP_LENGTH = 80                 # フレームのシフト長
40 | 
41 | ITERATION = 200                 # Griffin-Lim法における位相推定の最大繰り返し数
42 | 
43 | # 音声のロード
44 | fs, data = wavfile.read(IN_WAVE_FILE)
45 | data = data.astype(np.float64)
46 | 
47 | # 振幅スペクトル（位相復元なので手に入るのはこれのみ）
48 | amp_spec = np.abs(librosa.core.stft(data, n_fft=FRAME_LENGTH,
49 |                                     hop_length=HOP_LENGTH,
50 |                                     win_length=FRAME_LENGTH))
51 | 
52 | # Griffin-Lim法に基づく位相スペクトルの推定
53 | for i in range(ITERATION):
54 |     if i == 0:
55 |         # 初回は乱数で初期化
56 |         phase_spec = np.random.rand(*amp_spec.shape)
57 |     else:
58 |         # 振幅スペクトルと推定された位相スペクトルから複素スペクトログラムを復元
59 |         recovered_spec = amp_spec * np.exp(1j * phase_spec)
60 | 
61 |         # 短時間フーリエ逆変換で音声を復元
62 |         recovered = librosa.core.istft(recovered_spec, hop_length=HOP_LENGTH,
63 |                                        win_length=FRAME_LENGTH)
64 | 
65 |         # 復元音声から複素スペクトログラムを再計算
66 |         complex_spec = librosa.core.stft(recovered, n_fft=FRAME_LENGTH,
67 |                                          hop_length=HOP_LENGTH,
68 |                                          win_length=FRAME_LENGTH)
69 | 
70 |         # 初回以降は計算済みの複素スペクトログラムから位相スペクトルを推定
71 |         phase_spec = np.angle(complex_spec)
72 | 
73 | # 音声を復元
74 | recovered_spec = amp_spec * np.exp(1j * phase_spec)
75 | recovered = librosa.core.istft(recovered_spec, hop_length=HOP_LENGTH,
76 |                                win_length=FRAME_LENGTH)
77 | recovered = recovered.astype(np.int16)
78 | 
79 | # 復元された音声をwavファイルとして保存
80 | wavfile.write(OUT_WAVE_FILE, fs, recovered)
81 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_melspec.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # メルスペクトログラムの抽出と可視化 by librosa
30 | # 波形読み込みはscipy.ioのwavfileモジュール
31 | 
32 | import librosa
33 | import librosa.display
34 | import matplotlib.pyplot as plt
35 | import numpy as np
36 | from scipy.io import wavfile
37 | 
38 | IN_WAVE_FILE = "in.wav"  # モノラル音声（前提）
39 | OUT_WAVE_FILE = "out_istft.wav"
40 | 
41 | FRAME_LENGTH = 1024  # フレーム長
42 | HOP_LENGTH = 80  # フレームのシフト長
43 | N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # オーバーラップ幅
44 | N_MELS = 128  # メルフィルタバンクの数
45 | 
46 | # 音声データ読み込み (fsがサンプリング周波数、dataは音声データ)
47 | fs, data = wavfile.read(IN_WAVE_FILE)
48 | data = data.astype(np.float64)
49 | 
50 | # メルスペクトログラムの抽出
51 | mel_spec = librosa.feature.melspectrogram(
52 |     y=data, sr=fs, n_mels=N_MELS, hop_length=HOP_LENGTH
53 | )
54 | 
55 | # デシベルスケールにする
56 | mel_spec_dB = librosa.power_to_db(mel_spec, ref=np.max)
57 | 
58 | # メルスペクトログラムの表示
59 | fig = plt.figure(figsize=(10, 4))
60 | librosa.display.specshow(
61 |     mel_spec_dB, x_axis="time", y_axis="hz", hop_length=HOP_LENGTH, sr=fs
62 | )
63 | plt.colorbar(format="%+2.0f dB")
64 | plt.tight_layout()
65 | plt.show()
66 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_mfcc.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # MFCCの抽出と可視化 by librosa
30 | # 波形読み込みはscipy.ioのwavfileモジュール
31 | 
32 | import librosa
33 | import librosa.display
34 | import matplotlib.pyplot as plt
35 | import numpy as np
36 | from scipy.io import wavfile
37 | 
38 | IN_WAVE_FILE = "in.wav"  # モノラル音声（前提）
39 | 
40 | FRAME_LENGTH = 1024  # フレーム長
41 | HOP_LENGTH = 80  # フレームのシフト長
42 | N_MELS = 128  # メルフィルタバンクの数
43 | N_MFCC = 20  # MFCCの次数
44 | 
45 | # 音声データ読み込み (fsがサンプリング周波数、dataは音声データ)
46 | fs, data = wavfile.read(IN_WAVE_FILE)
47 | data = data.astype(np.float64)
48 | 
49 | # MFCCの抽出 (音声から抽出)
50 | mfcc = librosa.feature.mfcc(y=data, sr=fs, n_mels=N_MELS, hop_length=HOP_LENGTH)
51 | 
52 | # 形状の確認
53 | print("MFCC arrayの形状: ", mfcc.shape)
54 | 
55 | # MFCCの表示
56 | fig = plt.figure(figsize=(10, 4))
57 | librosa.display.specshow(mfcc, x_axis="time", hop_length=HOP_LENGTH, sr=fs)
58 | plt.colorbar()
59 | plt.tight_layout()
60 | plt.show()
61 | 
62 | # メルスペクトログラムの抽出
63 | mel_spec = librosa.feature.melspectrogram(
64 |     y=data, sr=fs, n_mels=N_MELS, hop_length=HOP_LENGTH
65 | )
66 | 
67 | # デシベルスケールにする
68 | mel_spec_dB = librosa.power_to_db(mel_spec, ref=np.max)
69 | 
70 | # MFCCの抽出
71 | mfcc = librosa.feature.mfcc(S=mel_spec_dB, sr=fs, n_mels=N_MELS, hop_length=HOP_LENGTH)
72 | 
73 | # メルスペクトログラムの表示
74 | fig = plt.figure(figsize=(10, 4))
75 | librosa.display.specshow(mfcc, x_axis="time", hop_length=HOP_LENGTH, sr=fs)
76 | plt.colorbar()
77 | plt.tight_layout()
78 | plt.show()
79 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_stft.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | # Copyright (C) 2020 Masahito Togami
 9 | 
10 | # Permission is hereby granted, free of charge, to any person
11 | # obtaining a copy of this software and associated documentation files
12 | # (the Software"), to deal in the Software without restriction,
13 | # including without limitation the rights to use, copy, modify, merge,
14 | # publish, distribute, sublicense, and/or sell copies of the Software,
15 | # and to permit persons to whom the Software is furnished to do so,
16 | # subject to the following conditions:
17 | 
18 | # The above copyright notice and this permission notice shall be
19 | # included in all copies or substantial portions of the Software.
20 | 
21 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 | 
29 | # Commentary:
30 | # scipyの短時間フーリエ変換
31 | # 波形読み込みはscipy.ioのwavfileモジュール
32 | 
33 | import numpy as np
34 | import scipy.signal as sp
35 | from scipy.io import wavfile
36 | 
37 | IN_WAVE_FILE = "in.wav"  # モノラル音声（前提）
38 | 
39 | FRAME_LENGTH = 512  # フレーム長
40 | HOP_LENGTH = 256  # フレームのシフト長
41 | N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # オーバーラップ幅
42 | 
43 | # 音声データ読み込み (fsがサンプリング周波数、dataは音声データ)
44 | fs, data = wavfile.read(IN_WAVE_FILE)
45 | 
46 | # 短時間フーリエ変換を行う
47 | f, t, stft_data = sp.stft(
48 |     data, fs=fs, window="hann", nperseg=FRAME_LENGTH, noverlap=N_OVERLAP
49 | )
50 | 
51 | # 短時間フーリエ変換後のデータ形式を確認
52 | print("短時間フーリエ変換後のshape: ", np.shape(stft_data))
53 | 
54 | # 周波数軸の情報
55 | print("周波数軸 [Hz]: ", f)
56 | 
57 | # 時間軸の情報
58 | print("時間軸[sec]: ", t)
59 | 


--------------------------------------------------------------------------------
/SpeechAnalysis/feat_stft_istft.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # 短時間フーリエ変換と逆変換
30 | # 波形読み込みはscipy.ioのwavfileモジュール
31 | 
32 | import numpy as np
33 | import scipy.signal as sp
34 | from scipy.io import wavfile
35 | 
36 | IN_WAVE_FILE = "in.wav"  # モノラル音声（前提）
37 | OUT_WAVE_FILE = "out_istft.wav"
38 | 
39 | FRAME_LENGTH = 512  # フレーム長
40 | HOP_LENGTH = 256  # フレームのシフト長
41 | N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # オーバーラップ幅
42 | 
43 | # 音声データ読み込み (fsがサンプリング周波数、dataは音声データ)
44 | fs, data = wavfile.read(IN_WAVE_FILE)
45 | 
46 | # 短時間フーリエ変換によりフーリエスペクトル系列を得る
47 | _, _, stft_data = sp.stft(
48 |     data, fs=fs, window="hann", nperseg=FRAME_LENGTH, noverlap=N_OVERLAP
49 | )
50 | 
51 | # 短時間フーリエ逆変換により音声に戻す
52 | _, data_inv = sp.istft(
53 |     stft_data, fs=fs, window="hann", nperseg=FRAME_LENGTH, noverlap=N_OVERLAP
54 | )
55 | 
56 | # 音声の書き込み
57 | data_inv = data_inv.astype(np.int16)  # 2byte (16 bit)の整数値に変換
58 | wavfile.write(OUT_WAVE_FILE, fs, data_inv)
59 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/README.md:
--------------------------------------------------------------------------------
 1 | # 音声の分析合成
 2 | 
 3 | ## はじめに
 4 | ```
 5 | pip3 install pysptk
 6 | pip3 install pyworld
 7 | ```
 8 | 
 9 | ## ファイル一覧
10 | ### Pythonスクリプト
11 | - 線形予測分析による分析再合成 [pysptk_anasyn_lpc.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lpc.py)
12 | - PARCOR分析による分析再合成 [pysptk_anasyn_parcor.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_parcor.py)
13 | - 線スペクトル対による分析再合成 [pysptk_anasyn_lsp.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lsp.py)
14 | - メルケプストラム分析による再合成 [pysptk_anasyn_mlsa.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa.py)
15 | - メルケプストラム分析による再合成 (パラメタを変えていろいろな声を合成)
16 | [pysptk_anasyn_mlsa_others.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa_others.py)
17 | - WORLDによる再合成 [pyworld_anasyn.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pyworld_anasyn.py)
18 | 
19 | ### Jupyter notebook
20 | - 線形予測分析による分析再合成 [pysptk_anasyn_lpc.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lpc.ipynb)
21 | - PARCOR分析による分析再合成 [pysptk_anasyn_parcor.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_parcor.ipynb)
22 | - 線スペクトル対による分析再合成 [pysptk_anasyn_lsp.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_lsp.ipynb)
23 | - メルケプストラム分析による再合成 [pysptk_anasyn_mlsa.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa.ipynb)
24 | - メルケプストラム分析による再合成 (パラメタを変えていろいろな声を合成)
25 | [pysptk_anasyn_mlsa_others.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa_others.ipynb)
26 | - WORLDによる再合成 [pyworld_anasyn.ipynb](https://nbviewer.jupyter.org/github/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pyworld_anasyn.ipynb)
27 | 
28 | ### Google Colaboratory
29 | - 線形予測分析による分析再合成 [pysptk_anasyn_lpc.ipynb](https://colab.research.google.com/drive/1nUHBRWUk4vQOCakDXC8T-BVvbZZ9jWXJ?usp=sharing)
30 | - PARCOR分析による分析再合成 [pysptk_anasyn_parcor.ipynb](https://colab.research.google.com/drive/1EFMi2VQfJ_kUwJKn367B-JZeOSbNSSaz?usp=sharing)
31 | - 線スペクトル対による分析再合成 [pysptk_anasyn_lsp.ipynb](https://colab.research.google.com/drive/1BxAMGzLgguA5HivfHuGmeyXIBD8uRWdN?usp=sharing)
32 | - メルケプストラム分析による再合成 [pysptk_anasyn_mlsa.ipynb](https://colab.research.google.com/drive/1TZml_LdOAqDBY3UEGtw_x5UPL8ok44P1?usp=sharing)
33 | - メルケプストラム分析による再合成 (パラメタを変えていろいろな声を合成)
34 | [pysptk_anasyn_mlsa_others.ipynb](https://colab.research.google.com/drive/13QK6S_vQdwgU7bX8pXdJErFjnNHnqeQy?usp=sharing)
35 | - WORLDによる再合成 [pyworld_anasyn.ipynb](https://colab.research.google.com/drive/1yeIWMuQNqX2RNti0hRmHxSoAjrlrIjRU?usp=sharing)
36 | 
37 | ### PySimpleGUIによるGUIアプリ
38 | - 音声録音および分析合成（波形表示・スペクトログラム表示も可能）[pysptk_anasyn_recog.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechAnalysisSynthesis/pysptk_anasyn_recog.py)
39 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pysptk_anasyn_lpc.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySPTKによる音声の分析再合成 (LPC係数による全極フィルタ)
30 | 
31 | from pysptk.synthesis import AllPoleDF, Synthesizer
32 | from scipy.io import wavfile
33 | import librosa
34 | import numpy as np
35 | import pysptk
36 | 
37 | FRAME_LENGTH = 1024
38 | HOP_LENGTH = 80
39 | MIN_F0 = 60
40 | MAX_F0 = 240
41 | ORDER = 20
42 | 
43 | IN_WAVE_FILE = "in.wav"       # 入力音声
44 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
45 | 
46 | # 音声の読み込み
47 | fs, x = wavfile.read(IN_WAVE_FILE)
48 | x = x.astype(np.float64)
49 | 
50 | # 音声の切り出しと窓掛け
51 | frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
52 |                             hop_length=HOP_LENGTH).astype(np.float64).T
53 | frames *= pysptk.blackman(FRAME_LENGTH)  # 窓掛け（ブラックマン窓）
54 | 
55 | # ピッチ抽出
56 | pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
57 |                      min=MIN_F0, max=MAX_F0, otype="pitch")
58 | 
59 | # 励振源信号(声帯音源)の生成
60 | source_excitation = pysptk.excite(pitch, HOP_LENGTH)
61 | 
62 | # 線形予測分析による線形予測係数の抽出
63 | lpc = pysptk.lpc(frames, ORDER)
64 | lpc[:, 0] = np.log(lpc[:, 0])  # loggain for AllPoleDF
65 | 
66 | # 全極フィルタの作成
67 | synthesizer = Synthesizer(AllPoleDF(order=ORDER), HOP_LENGTH)
68 | 
69 | # 励振源信号でフィルタを駆動して音声を合成
70 | y = synthesizer.synthesis(source_excitation, lpc)
71 | 
72 | # 音声の書き込み
73 | y = y.astype(np.int16)
74 | wavfile.write(OUT_WAVE_FILE, fs, y)
75 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pysptk_anasyn_lsp.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySPTKによる音声の分析再合成 (線スペクトル対)
30 | 
31 | from pysptk.synthesis import LSPDF, Synthesizer
32 | from scipy.io import wavfile
33 | import librosa
34 | import numpy as np
35 | import pysptk
36 | 
37 | FRAME_LENGTH = 1024
38 | HOP_LENGTH = 80
39 | MIN_F0 = 60
40 | MAX_F0 = 240
41 | ORDER = 20
42 | 
43 | IN_WAVE_FILE = "in.wav"       # 入力音声
44 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
45 | 
46 | # 音声の読み込み
47 | fs, x = wavfile.read(IN_WAVE_FILE)
48 | x = x.astype(np.float64)
49 | 
50 | # 音声の切り出しと窓掛け
51 | frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
52 |                             hop_length=HOP_LENGTH).astype(np.float64).T
53 | frames *= pysptk.blackman(FRAME_LENGTH)  # 窓掛け（ブラックマン窓）
54 | 
55 | # ピッチ抽出
56 | pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
57 |                      min=MIN_F0, max=MAX_F0, otype="pitch")
58 | 
59 | # 励振源信号(声帯音源)の生成
60 | source_excitation = pysptk.excite(pitch, HOP_LENGTH)
61 | 
62 | # 線形予測分析による線形予測符号化(LPC)係数の抽出
63 | lpc = pysptk.lpc(frames, ORDER)
64 | lpc[:, 0] = np.log(lpc[:, 0])
65 | 
66 | # LPC係数を線スペクトル対に変換
67 | lsp = pysptk.lpc2lsp(lpc, otype=0, fs=fs)
68 | 
69 | # 全極フィルタの作成
70 | synthesizer = Synthesizer(LSPDF(order=ORDER), HOP_LENGTH)
71 | 
72 | # 励振源信号でフィルタを駆動して音声を合成
73 | y = synthesizer.synthesis(source_excitation, lsp)
74 | 
75 | # 音声の書き込み
76 | y = y.astype(np.int16)
77 | wavfile.write(OUT_WAVE_FILE, fs, y)
78 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySPTKによる音声の分析再合成 (MLSAフィルタ)
30 | 
31 | from pysptk.synthesis import MLSADF, Synthesizer
32 | from scipy.io import wavfile
33 | import librosa
34 | import numpy as np
35 | import pysptk
36 | 
37 | FRAME_LENGTH = 1024
38 | HOP_LENGTH = 80
39 | MIN_F0 = 60
40 | MAX_F0 = 240
41 | ORDER = 25
42 | ALPHA = 0.41
43 | 
44 | IN_WAVE_FILE = "in.wav"       # 入力音声
45 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
46 | 
47 | # 音声の読み込み
48 | fs, x = wavfile.read(IN_WAVE_FILE)
49 | x = x.astype(np.float64)
50 | 
51 | # 音声の切り出しと窓掛け
52 | frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
53 |                             hop_length=HOP_LENGTH).astype(np.float64).T
54 | frames *= pysptk.blackman(FRAME_LENGTH)  # 窓掛け（ブラックマン窓）
55 | 
56 | # ピッチ抽出
57 | pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
58 |                      min=MIN_F0, max=MAX_F0, otype="pitch")
59 | 
60 | # 励振源信号(声帯音源)の生成
61 | source_excitation = pysptk.excite(pitch, HOP_LENGTH)
62 | 
63 | # メルケプストラム分析（＝スペクトル包絡の抽出）
64 | mc = pysptk.mcep(frames, ORDER, ALPHA)  # メルケプストラム係数の抽出
65 | 
66 | # メルケプストラム係数からMLSAディジタルフィルタ係数に変換
67 | mlsa_coef = pysptk.mc2b(mc, ALPHA)
68 | 
69 | # MLSAフィルタの作成
70 | synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)
71 | 
72 | # 励振源信号でフィルタを駆動して音声を合成
73 | y = synthesizer.synthesis(source_excitation, mlsa_coef)
74 | 
75 | # 音声の書き込み
76 | y = y.astype(np.int16)
77 | wavfile.write(OUT_WAVE_FILE, fs, y)
78 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pysptk_anasyn_mlsa_pyworld.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySPTKによる音声の分析再合成 (MLSAフィルタ)
30 | # - ただしメルケプストラム係数をWORLDから抽出したスペクトル包絡から計算
31 | 
32 | from pysptk.synthesis import MLSADF, Synthesizer
33 | from scipy.io import wavfile
34 | import numpy as np
35 | import pysptk
36 | import pyworld
37 | 
38 | FRAME_LENGTH = 1024
39 | HOP_LENGTH = 80
40 | MIN_F0 = 60
41 | MAX_F0 = 240
42 | ORDER = 25
43 | ALPHA = 0.41
44 | 
45 | IN_WAVE_FILE = "in.wav"       # 入力音声
46 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
47 | 
48 | # 音声の読み込み
49 | fs, x = wavfile.read(IN_WAVE_FILE)
50 | x = x.astype(np.float64)
51 | 
52 | # 音声の分析 (基本周波数、スペクトル包絡、非周期性指標)
53 | _, sp, _ = pyworld.wav2world(x, fs)
54 | 
55 | # メルケプストラム係数の抽出 from WORLDのスペクトル包絡
56 | mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA)
57 | 
58 | # ピッチ抽出
59 | pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
60 |                      min=MIN_F0, max=MAX_F0, otype="pitch")
61 | 
62 | # 励振源信号(声帯音源)の生成
63 | source_excitation = pysptk.excite(pitch, HOP_LENGTH)
64 | 
65 | # メルケプストラム係数からMLSAディジタルフィルタ係数に変換
66 | mlsa_coef = pysptk.mc2b(mcep, ALPHA)
67 | 
68 | # MLSAフィルタの作成
69 | synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)
70 | 
71 | # 励振源信号でMLSAフィルタを駆動して音声を合成
72 | y = synthesizer.synthesis(source_excitation, mlsa_coef)
73 | 
74 | # 音声の書き込み
75 | y = y.astype(np.int16)
76 | wavfile.write(OUT_WAVE_FILE, fs, y)
77 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pysptk_anasyn_parcor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PySPTKによる音声の分析再合成 (PARCOR係数を経由)
30 | 
31 | from pysptk.synthesis import AllPoleLatticeDF, Synthesizer
32 | from scipy.io import wavfile
33 | import librosa
34 | import numpy as np
35 | import pysptk
36 | 
37 | FRAME_LENGTH = 1024
38 | HOP_LENGTH = 80
39 | MIN_F0 = 60
40 | MAX_F0 = 240
41 | ORDER = 20
42 | 
43 | IN_WAVE_FILE = "in.wav"       # 入力音声
44 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
45 | 
46 | # 音声の読み込み
47 | fs, x = wavfile.read(IN_WAVE_FILE)
48 | x = x.astype(np.float64)
49 | 
50 | # 音声の切り出しと窓掛け
51 | frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
52 |                             hop_length=HOP_LENGTH).astype(np.float64).T
53 | frames *= pysptk.blackman(FRAME_LENGTH)  # 窓掛け（ブラックマン窓）
54 | 
55 | # ピッチ抽出
56 | pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
57 |                      min=MIN_F0, max=MAX_F0, otype="pitch")
58 | 
59 | # 励振源信号(声帯音源)の生成
60 | source_excitation = pysptk.excite(pitch, HOP_LENGTH)
61 | 
62 | # 線形予測分析による線形予測符号化(LPC)係数の抽出
63 | lpc = pysptk.lpc(frames, ORDER)
64 | lpc[:, 0] = np.log(lpc[:, 0])
65 | 
66 | # LPC係数をPARCOR係数に変換
67 | parcor = pysptk.lpc2par(lpc)
68 | 
69 | # 全極フィルタの作成
70 | synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)
71 | 
72 | # 励振源信号でフィルタを駆動して音声を合成
73 | y = synthesizer.synthesis(source_excitation, parcor)
74 | 
75 | # 音声の書き込み
76 | y = y.astype(np.int16)
77 | wavfile.write(OUT_WAVE_FILE, fs, y)
78 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pyworld_anasyn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PyWORLDによる音声の分析再合成
30 | 
31 | from scipy.io import wavfile
32 | import numpy as np
33 | import pyworld
34 | 
35 | IN_WAVE_FILE = "in.wav"       # 入力音声
36 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
37 | 
38 | # 音声の読み込み
39 | fs, x = wavfile.read(IN_WAVE_FILE)
40 | x = x.astype(np.float64)
41 | 
42 | # 音声の分析 (基本周波数、スペクトル包絡、非周期性指標)
43 | f0, sp, ap = pyworld.wav2world(x, fs)
44 | 
45 | # 音声の再合成
46 | y = pyworld.synthesize(f0, sp, ap, fs)
47 | y = y.astype(np.int16)
48 | 
49 | # wavファイルに保存
50 | wavfile.write(OUT_WAVE_FILE, fs, y)
51 | 


--------------------------------------------------------------------------------
/SpeechAnalysisSynthesis/pyworld_anasyn_encdec.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # - PyWORLDによる音声の分析再合成
30 | # - ただしスペクトル包絡と非周期性指標をエンコード/デコード
31 | 
32 | from scipy.io import wavfile
33 | import numpy as np
34 | import pyworld
35 | 
36 | IN_WAVE_FILE = "in.wav"       # 入力音声
37 | OUT_WAVE_FILE = "out.wav"     # 分析再合成した音声
38 | 
39 | SP_DIM = 50                     # スペクトル包絡の圧縮後の次元
40 | 
41 | # 音声の読み込み
42 | fs, x = wavfile.read(IN_WAVE_FILE)
43 | x = x.astype(np.float64)
44 | 
45 | # 音声の分析 (基本周波数、スペクトル包絡、非周期性指標)
46 | f0, sp, ap = pyworld.wav2world(x, fs)
47 | fft_size = pyworld.get_cheaptrick_fft_size(fs)
48 | 
49 | # スペクトル包絡をエンコード / デコード
50 | # https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
51 | code_sp = pyworld.code_spectral_envelope(sp, fs, SP_DIM)
52 | decode_sp = pyworld.decode_spectral_envelope(code_sp, fs, fft_size)
53 | 
54 | # 非周期性指標をエンコード / デコード
55 | code_ap = pyworld.code_aperiodicity(ap, fs)
56 | decode_ap = pyworld.decode_aperiodicity(code_ap, fs, fft_size)
57 | 
58 | # 音声の再合成
59 | y = pyworld.synthesize(f0, decode_sp, decode_ap, fs)
60 | y = y.astype(np.int16)
61 | 
62 | # 音声の書き込み
63 | wavfile.write(OUT_WAVE_FILE, fs, y)
64 | 


--------------------------------------------------------------------------------
/SpeechRecognition/README.md:
--------------------------------------------------------------------------------
 1 | # 音声認識
 2 | 
 3 | ## はじめに
 4 | ```
 5 | pip3 install pysimplegui
 6 | pip3 install sounddevice
 7 | pip3 install soundfile
 8 | pip3 install SpeechRecognition
 9 | pip3 install gtts
10 | pip3 install wikipedia
11 | pip3 install vosk
12 | ```
13 | 
14 | ## ファイル一覧
15 | - 指定秒数だけ音声（wav）を録音 with soundfile & sounddevice ([record_speech.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/record_speech.py))
16 | - 収録済み音声（wav）に対する音声認識 with VOSK ([vosk_asr_recorded.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/vosk_asr_recorded.py))
17 | - マイク音声入力によるストリーミング音声認識 with VOSK ([vosk_asr_streaming.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/vosk_asr_streaming.py))
18 | - マイク音声入力によるVADつきストリーミング音声認識 with VOSK ([vosk_asr_streaming_vad.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/vosk_asr_streaming_vad.py))
19 | 
20 | ### PySimpleGUIによるGUIアプリ
21 | - 指定秒数だけ音声を録音し、音声認識をかける with SpeechRecognition ([recog_speech_rec.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/recog_speech_rec.py))
22 | - Google Homeもどき ([google_mode_modoki.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/google_mode_modoki.py))
23 | - 音声認識結果を使ったWikipedia検索＆読み上げ ([recog_wikipedia.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechRecognition/recog_wikipedia.py))
24 | 
25 | ### Google Colaboratory
26 | - Juliusの日本語ディクテーションキット ([Link](https://colab.research.google.com/drive/1pdp9lmzzslLzN95iu69siTkTxMk-hzXf?usp=sharing))
27 | - SpeechRecognition ライブラリのデモンストレーション ([Link](https://colab.research.google.com/drive/1w96tb5SxCPWqnNXaVlFQpaMPzJ24w0F3?usp=sharing)) 
28 | - ESPnet2　事前学習済モデルを用いた音声認識デモンストレーション
29 |   - LaboroTVSpeechコーパス ([Link](https://colab.research.google.com/drive/1xJ96-7JSSPBNJ-bAwysESDcaGvnbblAR?usp=sharing))
30 | - VOSK ライブラリを用いた音声認識デモンストレーション ([Link](https://colab.research.google.com/drive/1Dvhw4H2hT3WxDniX2M8w7q1pae5qgXYy?usp=sharing))
31 | 


--------------------------------------------------------------------------------
/SpeechRecognition/google_mode_modoki.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # Google Homeもどきを実現するサンプル
30 | 
31 | import PySimpleGUI as sg
32 | import speech_recognition as sr
33 | 
34 | # マイク設定
35 | rec = sr.Recognizer()
36 | mic = sr.Microphone()
37 | with mic as source:
38 |     rec.adjust_for_ambient_noise(source)
39 | 
40 | TIMEOUT = 1000  # タイムアウト時間（単位：ミリ秒）
41 | WAKE_WORD = "Ok Google"  # ウェイクワード
42 | 
43 | # フォント指定
44 | FONT = ("Hiragino Maru Gothic ProN", 20)
45 | 
46 | # レイアウト定義
47 | LAYOUT = [
48 |     [sg.Text("お好きなタイミングで話しかけてください", size=(35, 1))],
49 |     [sg.Text("認識結果: ", size=(40, 1), key="-RECOG_TEXT-")],
50 |     [sg.Button("終了", key="-QUIT-")],
51 | ]
52 | 
53 | # ウィンドウ生成
54 | WINDOW = sg.Window("Google Home sample", LAYOUT, font=FONT)
55 | 
56 | while True:
57 |     event, values = WINDOW.read(timeout=TIMEOUT, timeout_key="-RECOG_TRIGGER-")
58 | 
59 |     if event in (sg.WIN_CLOSED, "-QUIT-"):
60 |         break
61 | 
62 |     elif event in "-RECOG_TRIGGER-":
63 | 
64 |         with mic as source:
65 |             audio = rec.listen(source)
66 |             try:  # ウェイクワードの認識
67 |                 text = rec.recognize_google(audio, language="ja-JP")
68 |                 if WAKE_WORD in text:  # 認識結果にウェイクワードが含まれるならば
69 | 
70 |                     # 認識結果文字列のWAKE_WORDを空文字列で置き換える
71 |                     # →後段の処理に利用可能
72 |                     text = text.replace(WAKE_WORD, "")
73 | 
74 |                     WINDOW["-RECOG_TEXT-"].Update("認識結果: " + text)
75 |                 else:
76 |                     # 認識結果をクリア
77 |                     WINDOW["-RECOG_TEXT-"].Update("認識結果: ")
78 | 
79 |             except sr.UnknownValueError:
80 |                 WINDOW["-RECOG_TEXT-"].Update("認識に失敗しました")
81 | 
82 | WINDOW.close()
83 | 


--------------------------------------------------------------------------------
/SpeechRecognition/recog_speech_rec.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """ 音声情報処理 n本ノック !! """
  4 | 
  5 | # MIT License
  6 | 
  7 | # Copyright (C) 2022 by Akira TAMAMORI
  8 | 
  9 | # Permission is hereby granted, free of charge, to any person
 10 | # obtaining a copy of this software and associated documentation files
 11 | # (the "Software"), to deal in the Software without restriction,
 12 | # including without limitation the rights to use, copy, modify, merge,
 13 | # publish, distribute, sublicense, and/or sell copies of the Software,
 14 | # and to permit persons to whom the Software is furnished to do so,
 15 | # subject to the following conditions:
 16 | 
 17 | # The above copyright notice and this permission notice shall be
 18 | # included in all copies or substantial portions of the Software.
 19 | 
 20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 27 | 
 28 | # Commentary:
 29 | # 指定秒数だけ音声を録音し、音声認識
 30 | 
 31 | import PySimpleGUI as sg
 32 | import sounddevice as sd  # 録音・再生系のライブラリ
 33 | import soundfile as sf  # 読み込み・書き出しのライブラリ
 34 | import speech_recognition as sr
 35 | 
 36 | # Microphone setup
 37 | r = sr.Recognizer()
 38 | m = sr.Microphone()
 39 | with m as source:
 40 |     r.adjust_for_ambient_noise(source)
 41 | 
 42 | # Font
 43 | FONT = ("Hiragino Maru Gothic ProN", 20)
 44 | 
 45 | # Temporary holder for audio data
 46 | AUDIO = None
 47 | 
 48 | SAMPLE_RATE = 16000  # sampling frequency
 49 | N_CHANNEL = 1  # number of channels: 1 for mono, 2 for stereo
 50 | DURATION = 5  # recording length in seconds
 51 | BUFFER = 0.1
 52 | OUTPUT_FILE = "/tmp/record.wav"  # name of the output audio file
 53 | 
 54 | 
 55 | # Layout definition
 56 | LAYOUT = [
 57 |     [
 58 |         sg.Text("「認識」ボタンを押して" + str(DURATION) + "秒間話しかけてください", size=(35, 1), key="txt"),
 59 |     ],
 60 |     [sg.Text(size=(40, 1), key="-RECOG_TEXT-")],
 61 |     [
 62 |         sg.Button("認識", key="recog"),
 63 |         sg.Button("終了", key="quit"),
 64 |     ],
 65 | ]
 66 | 
 67 | # Create the window
 68 | WINDOW = sg.Window("Speech-To-Text sample", LAYOUT, font=FONT)
 69 | 
 70 | 
 71 | def recog():
 72 |     """リッスンする関数"""
 73 | 
 74 |     # 音声録音を指定秒数実行
 75 |     AUDIO = sd.rec(
 76 |         int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=N_CHANNEL
 77 |     )
 78 |     sd.wait()
 79 | 
 80 |     # 一旦ファイルに書き込む
 81 |     sf.write(
 82 |         file=OUTPUT_FILE,
 83 |         data=AUDIO,
 84 |         samplerate=SAMPLE_RATE,
 85 |         format="WAV",
 86 |         subtype="PCM_16",
 87 |     )
 88 | 
 89 |     with sr.AudioFile(OUTPUT_FILE) as source:
 90 |         audio = r.listen(source)  # 音声取得
 91 |         try:
 92 |             text = r.recognize_google(audio, language="ja-JP")
 93 |             WINDOW["-RECOG_TEXT-"].Update("認識結果: " + text)
 94 |         except sr.UnknownValueError:
 95 |             WINDOW["-RECOG_TEXT-"].Update("認識に失敗しました")
 96 | 
 97 | 
 98 | # Event loop
 99 | while True:
100 | 
101 |     # Read the next event
102 |     event, values = WINDOW.read()
103 | 
104 |     if event == sg.WINDOW_CLOSED or event == "quit":
105 |         break
106 | 
107 |     elif event == "recog":
108 |         recog()
109 | 
110 | # Close the window and exit
111 | WINDOW.close()
112 | 


--------------------------------------------------------------------------------
/SpeechRecognition/record_speech.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # 指定秒数だけ音声を録音
30 | 
31 | from typing import NamedTuple
32 | import sounddevice as sd  # 録音・再生系のライブラリ
33 | import soundfile as sf  # 読み込み・書き出しのライブラリ
34 | 
35 | 
36 | class RecordingConfig(NamedTuple):
37 |     """Configuration for recording."""
38 | 
39 |     sample_rate: float = 16000  # Hz
40 |     duration: float = 3.0  # sec
41 |     n_channels: int = 1  # 1: mono
42 | 
43 | 
44 | def record_wav(out_wavfile: str, config: RecordingConfig) -> None:
45 |     """Record audio and write it to a wav file.
46 | 
47 |     Args:
48 |         out_wavfile (str): name of the output wav file
49 |         config (RecordingConfig): recording settings
50 |     """
51 |     sample_rate = config.sample_rate
52 |     duration = config.duration
53 |     n_channels = config.n_channels
54 | 
55 |     # Record audio for the specified number of seconds
56 |     audio = sd.rec(
57 |         int(duration * sample_rate), samplerate=sample_rate, channels=n_channels
58 |     )
59 |     sd.wait()
60 | 
61 |     # Write to the file
62 |     sf.write(
63 |         file=out_wavfile,
64 |         data=audio,
65 |         samplerate=sample_rate,
66 |         format="WAV",
67 |         subtype="PCM_16",
68 |     )
69 | 
70 | 
71 | def main(duration: int = 3.0, wav_file: str = "out.wav"):
72 |     """音声を録音する.
73 | 
74 |     Args:
75 |         duration (int): 録音秒数
76 |         wav_file (str): 出力wavファイルへのパス
77 |     """
78 |     # 入力デバイス情報に基づき、サンプリング周波数の情報を取得
79 |     input_device_info = sd.query_devices(kind="input")
80 |     sample_rate = int(input_device_info["default_samplerate"])
81 | 
82 |     # 指定秒数だけ音声を録音
83 |     record_config = RecordingConfig(sample_rate, duration)
84 |     print("＜録音開始＞")
85 |     record_wav(wav_file, record_config)
86 |     print("＜認識終了＞")
87 | 
88 | 
89 | if __name__ == "__main__":
90 |     main()
91 | 


--------------------------------------------------------------------------------
/SpeechRecognition/vosk_asr_recorded.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """収録済み音声ファイルに対する音声認識 via VOSK.
 3 | 
 4 | Copyright (C) 2022 by Akira TAMAMORI
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | SOFTWARE.
21 | """
22 | import wave
23 | import sys
24 | import json
25 | 
26 | from vosk import Model, KaldiRecognizer, SetLogLevel
27 | 
28 | 
29 | def get_asr_result(recognizer, stream, chunk_size):
30 |     """Run the speech recognition API and return the final result.
31 | 
32 |     Args:
33 |        recognizer (KaldiRecognizer): speech recognition module
34 |        stream (Wave_read): input stream for reading the wav
35 |        chunk_size (int): size read from the wav at a time
36 | 
37 |     Returns:
38 |        recog_text (str): speech recognition result
39 |     """
40 |     while True:
41 |         data = stream.readframes(chunk_size)
42 |         if len(data) == 0:
43 |             break
44 |         recognizer.AcceptWaveform(data)
45 | 
46 |     recog_result = json.loads(recognizer.FinalResult())
47 |     recog_text = recog_result["text"].split()
48 |     recog_text = "".join(recog_text)
49 |     return recog_text
50 | 
51 | 
52 | def main(chunk_size=4000, wav_file="in.wav"):
53 |     """収録済み音声に対して音声認識デモンストレーションを実行.
54 | 
55 |     Args:
56 |        chunk_size (int): 音声データを受け取る単位（サンプル数）
57 |        wav_file (str): wavファイルへのパス
58 |     """
59 |     SetLogLevel(-1)  # VOSK起動時のログ表示を抑制
60 | 
61 |     wf = wave.open(wav_file, "rb")
62 |     if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
63 |         print("Audio file must be WAV format mono PCM.")
64 |         sys.exit(1)
65 | 
66 |     model = Model("model")
67 |     recognizer = KaldiRecognizer(model, wf.getframerate())
68 | 
69 |     recog_text = get_asr_result(recognizer, wf, chunk_size)
70 |     print(f"認識結果: {recog_text}")
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     main()
75 | 


--------------------------------------------------------------------------------
/SpeechSynthesis/README.md:
--------------------------------------------------------------------------------
 1 | # 音声合成
 2 | 
 3 | ## はじめに
 4 | ```
 5 | pip3 install pysimplegui
 6 | pip3 install sounddevice
 7 | pip3 install soundfile
 8 | pip3 install pyopenjtalk[marine]
 9 | pip3 install gtts
10 | pip3 install ttslearn
11 | pip3 install torch
12 | pip3 install pyttsx3
13 | ```
14 | 
15 | ## ファイル一覧
16 | ### Pythonスクリプト
17 | - gTTSによるテキスト読み上げ ([synth_gtts.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_gtts.py))
18 | - PyOpenJTalkによるテキスト読み上げ ([synth_pyopenjtalk.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_pyopenjtalk.py))
19 | - pyttsx3によるテキスト読み上げ ([synth_pyttsx.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_pyttsx.py))
20 | 
21 | ### PySimpleGUIによるGUIアプリ
22 | - gTTSによるテキスト読み上げ ([synth_gtts_gui.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_gtts_gui.py))
23 | - PyOpenJTalkによるテキスト読み上げ ([synth_pyopenjtalk_gui.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_pyopenjtalk_gui.py))
24 | - ttslearnによる複数話者テキスト読み上げ ([synth_ttslearn_multi_gui.py](https://github.com/tam17aki/speech_process_exercise/blob/master/SpeechSynthesis/synth_ttslearn_multi_gui.py))
25 | 


--------------------------------------------------------------------------------
/SpeechSynthesis/synth_gtts.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """音声情報処理 n本ノック."""
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # gTTSによるテキスト音声合成のサンプルスクリプト
30 | 
31 | from gtts import gTTS
32 | from pydub import AudioSegment
33 | from pydub.playback import play
34 | 
35 | 
class TextToSpeech:
    """Text-to-speech helper: gTTS for synthesis, pydub for playback."""

    def __init__(self, lang: str = "ja", out_file: str = "/tmp/tts.mp3"):
        """Remember the synthesis language and the mp3 output path."""
        self.out_file = out_file
        self.lang = lang

    def generate(self, text):
        """Synthesize ``text`` and write the result to ``self.out_file``."""
        synthesizer = gTTS(text, lang=self.lang)
        synthesizer.save(self.out_file)  # the audio is stored as mp3

    def play(self):
        """Load the mp3 at ``self.out_file`` and play it."""
        # Inside the method, ``play`` resolves to the module-level pydub
        # function, not to this method.
        play(AudioSegment.from_mp3(self.out_file))
53 | 
54 | 
def main(text: str = "こんにちは"):
    """Synthesize ``text`` with the default TextToSpeech settings and play it."""
    speaker = TextToSpeech()
    speaker.generate(text)
    speaker.play()


if __name__ == "__main__":
    main("こんにちは")
64 | 


--------------------------------------------------------------------------------
/SpeechSynthesis/synth_gtts_gui.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # gTTSによるテキスト音声合成のサンプルスクリプト with PySimpleGUI
30 | 
31 | import subprocess
32 | 
33 | import PySimpleGUI as sg
34 | from gtts import gTTS
35 | 
36 | # 一時用保存ファイル
# Temporary file the synthesized speech is written to
OUT_MP3 = "/tmp/tts.mp3"

# Language passed to the synthesis engine
LANG = "ja"

# Font used for the window
FONT = ("Hiragino Maru Gothic ProN", 24)

# Layout: one row with the text input, one row with the two buttons
LAYOUT = [
    [
        sg.InputText("音声合成のサンプルです", size=(35, 1), key="txt"),
    ],
    [
        sg.Button("合成", key="synth"),
        sg.Button("終了", key="quit"),
    ],
]

# Create the window
WINDOW = sg.Window("TTS-sample", LAYOUT, font=FONT)

# Event loop. try/finally makes sure the window is closed even when
# synthesis or playback raises.
try:
    while True:
        # Wait for the next GUI event
        event, values = WINDOW.read()

        if event in (sg.WINDOW_CLOSED, "quit"):
            break

        if event != "synth":
            continue

        # Synthesize the text currently in the input box
        text = values["txt"]
        if not text.strip():
            # Nothing to say; gTTS cannot synthesize blank text (it raises
            # instead), so the click is ignored.
            continue

        # Text -> speech
        tts = gTTS(text, lang=LANG)

        # Save as mp3
        tts.save(OUT_MP3)

        # Play back with afplay. The command is passed as an argument list,
        # so no shell is involved and the path is never re-parsed.
        subprocess.run(["afplay", OUT_MP3], check=False)
finally:
    # Close the window and finish
    WINDOW.close()
82 | 


--------------------------------------------------------------------------------
/SpeechSynthesis/synth_pyopenjtalk.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # pyopenjtalkによるテキスト音声合成のサンプルスクリプト
30 | 
31 | import numpy as np
32 | import pyopenjtalk
33 | import sounddevice as sd
34 | 
35 | 
class TextToSpeech:
    """Text-to-speech helper: pyopenjtalk for synthesis, sounddevice for playback."""

    def __init__(self, run_marine=False):
        """Initialize the class.

        Args:
            run_marine (bool): enable the MARINE model to improve Japanese
                accent estimation.
        """
        self.audio = None  # waveform stored by generate()
        self.sr = None  # sampling rate stored by generate()
        self.run_marine = run_marine

    def generate(self, text):
        """Synthesize ``text`` and keep the waveform and sampling rate."""
        self.audio, self.sr = pyopenjtalk.tts(text, run_marine=self.run_marine)

    @staticmethod
    def _to_int16(audio):
        """Peak-normalize ``audio`` and convert it to int16 samples.

        The peak is mapped to ``int16 max / 2 - 1``, the same scale the
        playback code has always used. An empty or all-zero signal is
        returned as zeros instead of dividing by zero.
        """
        audio = np.asarray(audio, dtype=np.float64)
        peak = np.abs(audio).max() if audio.size else 0.0
        if peak > 0:
            audio = audio / peak
        audio = audio * (np.iinfo(np.int16).max / 2 - 1)
        return audio.astype(np.int16)

    def play(self):
        """Play synthesized speech.

        ``generate()`` must have been called first so that ``self.audio`` and
        ``self.sr`` are set.
        """
        audio = self._to_int16(self.audio)
        sd.play(audio, self.sr)
        # Playback returns immediately, so block for the clip duration (ms).
        sd.sleep(int(1000 * len(audio) / self.sr))
59 | 
60 | 
def main(text: str = "こんにちは", run_marine: bool = False):
    """Synthesize ``text`` with pyopenjtalk and play it.

    ``run_marine`` is forwarded to TextToSpeech and switches MARINE accent
    estimation on or off.
    """
    speaker = TextToSpeech(run_marine)
    speaker.generate(text)
    speaker.play()


if __name__ == "__main__":
    # Speak the same sentence twice: first with MARINE, then without.
    print("MARINEによるアクセント推定 ON")
    main("いつでも話しかけてくださいね。", True)

    print("MARINEによるアクセント推定 OFF")
    main("いつでも話しかけてくださいね。", False)
74 | 


--------------------------------------------------------------------------------
/SpeechSynthesis/synth_pyopenjtalk_gui.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # PyOpenJTalkによるテキスト音声合成のサンプルスクリプト with PySimpleGUI
30 | 
31 | import numpy as np
32 | import pyopenjtalk
33 | import PySimpleGUI as sg
34 | import sounddevice as sd
35 | 
OUT_WAV = "/tmp/tts.wav"  # NOTE(review): not used anywhere in this script

FONT = ("Arial", 30)

# Layout: a single row with the text input and the synthesis button
LAYOUT = [
    [
        sg.InputText(default_text="音声合成のサンプルです。", size=(40, 3), key="text"),
        sg.Button("合成", key="synth"),
    ]
]

WINDOW = sg.Window("TTS-sample", LAYOUT, font=FONT)

# Event loop. try/finally makes sure the window is closed even when
# synthesis or playback raises.
try:
    while True:
        event, values = WINDOW.read()

        # A None event means the window was closed
        if event is None:
            break

        if event != "synth":
            continue

        # Synthesize the text currently in the input box
        text = values["text"]
        if not text.strip():
            # Nothing to synthesize
            continue

        # Text -> speech
        audio, sr = pyopenjtalk.tts(text)

        # Peak-normalize the amplitude. Empty or all-zero audio is skipped:
        # max() of an empty array raises and a zero peak would divide by zero.
        peak = np.abs(audio).max() if audio.size else 0.0
        if peak == 0:
            continue
        audio = audio / peak
        audio = audio * (np.iinfo(np.int16).max / 2 - 1)
        audio = audio.astype(np.int16)

        # Playback
        sd.play(audio, sr)

        # Playback is asynchronous, so sleep explicitly for the clip duration
        sd.sleep(int(1000 * len(audio) / sr))
finally:
    WINDOW.close()
73 | 


--------------------------------------------------------------------------------
/SpeechSynthesis/synth_pyttsx.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # pyttsx3によるテキスト音声合成のサンプルスクリプト
30 | 
31 | import pyttsx3
32 | 
33 | 
class TextToSpeech:
    """Text-to-speech helper wrapping a pyttsx3 engine."""

    def __init__(self):
        """Create the pyttsx3 engine and keep it on the instance."""
        engine = pyttsx3.init()
        self.engine = engine

    def generate(self, text):
        """Queue ``text`` on the engine; nothing is spoken until play()."""
        engine = self.engine
        engine.say(text)

    def play(self):
        """Run the engine until the queued utterances have been processed."""
        engine = self.engine
        engine.runAndWait()
48 | 
49 | 
def main(text: str = "こんにちは"):
    """Queue ``text`` on a new TextToSpeech instance and speak it."""
    speaker = TextToSpeech()
    speaker.generate(text)
    speaker.play()


if __name__ == "__main__":
    main("こんにちは")
59 | 


--------------------------------------------------------------------------------
/VoiceConversion/README.md:
--------------------------------------------------------------------------------
 1 | # 音声変換
 2 | 
 3 | ## はじめに
 4 | ```
 5 | pip3 install pysimplegui
 6 | pip3 install pyaudio
 7 | pip3 install pyworld
 8 | pip3 install numpy
 9 | pip3 install scipy
10 | ```
11 | 
12 | ## PySimpleGUIによるGUIアプリ
13 | - PyAudioで音声を取り込み、PyWorldで分析再合成するリアルタイム音声変換 ([pysimplegui_realtime_vc.py](https://github.com/tam17aki/speech_process_exercise/blob/master/VoiceConversion/pysimplegui_realtime_vc.py))
14 |   - 動作例 <blockquote class="twitter-tweet"><p lang="ja" dir="ltr">pysimpleguiとpyaudioとpyworldでボイスチェンジャーができた <a href="https://t.co/5V8A6I9ZX4">pic.twitter.com/5V8A6I9ZX4</a></p>&mdash; mat (@ballforest) <a href="https://twitter.com/ballforest/status/1480427188407402500?ref_src=twsrc%5Etfw">January 10, 2022</a></blockquote>
15 | 


--------------------------------------------------------------------------------
/WarmUp/ffmpeg_mp3_to_wav.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # ffmpeg-pythonによりmp3をwavへエクスポート
30 | 
31 | import ffmpeg
32 | 
IN_MP3_FILE = "in.mp3"  # input mp3 path
OUT_WAVE_FILE = "out.wav"  # output wav path

# Build the conversion (mp3 input -> wav output) and run it.
source = ffmpeg.input(IN_MP3_FILE)
ffmpeg.output(source, OUT_WAVE_FILE, format="wav").run()
39 | 


--------------------------------------------------------------------------------
/WarmUp/ffmpeg_wav_to_mp3.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # ffmpeg-pythonによりwavをmp3へエクスポート
30 | 
31 | import wave
32 | 
33 | import ffmpeg
34 | 
IN_WAVE_FILE = "in.wav"  # input wav path
OUT_MP3_FILE = "out.mp3"  # output mp3 path

# Read the channel count from the wav header (mono: 1, stereo: 2).
with wave.open(IN_WAVE_FILE, "r") as sound:
    n_channel = sound.getnchannels()

# One channel is labelled "mono"; any other count is labelled "stereo".
if n_channel == 1:
    channel_layout = "mono"
else:
    channel_layout = "stereo"

# Build the conversion (wav input -> mp3 output) and run it.
source = ffmpeg.input(IN_WAVE_FILE, channel_layout=channel_layout)
ffmpeg.output(source, OUT_MP3_FILE, format="mp3").run()
45 | 


--------------------------------------------------------------------------------
/WarmUp/librosa_plot_specgram.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # librosa を用いたスペクトログラムのプロット (やや処理が重たい)
30 | 
31 | import librosa
32 | import librosa.display
33 | import matplotlib.pyplot as plt
34 | import numpy as np
35 | from scipy.io import wavfile
36 | 
IN_WAVE_FILE = "in.wav"  # monaural audio is assumed

FRAME_LENGTH = 1024  # frame length (used as the FFT size)
HOP_LENGTH = 80  # frame shift

# Load the audio (fs: sampling frequency, data: samples) and convert to float
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Amplitude spectrum: magnitude of the short-time Fourier transform
data_ampspec = np.abs(
    librosa.stft(data, hop_length=HOP_LENGTH, n_fft=FRAME_LENGTH)
)

# Amplitude spectrum on a decibel scale, relative to its maximum
data_ampspec_dB = librosa.amplitude_to_db(data_ampspec, ref=np.max)

# Figure of width 10 and height 4
plt.figure(figsize=(10, 4))

# Draw the spectrogram
librosa.display.specshow(
    data_ampspec_dB,
    sr=fs,
    hop_length=HOP_LENGTH,
    x_axis="time",
    y_axis="linear",
)

# Title and axis labels
plt.title("Spectrogram")
plt.xlabel("Time (sec)")
plt.ylabel("Hz")

# Reduce the margins
plt.tight_layout()

# Show the figure on screen (required)
plt.show()
77 | 


--------------------------------------------------------------------------------
/WarmUp/librosa_plot_waveform.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # librosa を用いた波形プロット (やや処理が重たい)
30 | # 波形読み込みはscipy.ioのwavfileモジュール
31 | 
32 | import librosa
33 | import librosa.display
34 | import matplotlib.pyplot as plt
35 | import numpy as np
36 | from scipy.io import wavfile
37 | 
IN_WAVE_FILE = "in.wav"  # monaural audio is assumed

# Load the audio (fs: sampling frequency, data: samples) and convert to float
fs, data = wavfile.read(IN_WAVE_FILE)
data = data.astype(np.float64)

# Figure of width 10 and height 4
plt.figure(figsize=(10, 4))

# Draw the waveform. librosa.display.waveplot was removed in librosa 0.10 in
# favour of waveshow, so prefer waveshow and fall back to waveplot only on
# older releases that do not provide it.
_plot_wave = getattr(librosa.display, "waveshow", None)
if _plot_wave is None:
    _plot_wave = librosa.display.waveplot
_plot_wave(data, sr=fs)

# x-axis label
plt.xlabel("Time (sec)")

# y-axis label
plt.ylabel("Amplitude")

# Figure title
plt.title("Waveform")

# Reduce the margins
plt.tight_layout()

# Show the figure on screen (required)
plt.show()
64 | 


--------------------------------------------------------------------------------
/WarmUp/plt_specgram.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # matplotlib を用いたスペクトログラムのプロット
30 | # 波形読み込みはscipy.ioのwavfileモジュール
31 | 
32 | import matplotlib.pyplot as plt
33 | from scipy.io import wavfile
34 | 
IN_WAVE_FILE = "in.wav"  # monaural audio is assumed

FRAME_LENGTH = 1024  # frame length
HOP_LENGTH = 80  # frame shift
FFT_LENGTH = FRAME_LENGTH  # FFT size
N_OVERLAP = FRAME_LENGTH - HOP_LENGTH  # overlap between consecutive frames

# Load the audio (fs: sampling frequency, data: samples)
fs, data = wavfile.read(IN_WAVE_FILE)

# Allocate the figure
plt.figure(figsize=(10, 4))

# Draw the spectrogram
plt.specgram(
    data,
    Fs=fs,
    NFFT=FFT_LENGTH,
    noverlap=N_OVERLAP,
    cmap="jet",
)

# Title and axis labels
plt.title("Spectrogram")
plt.xlabel("Time (sec)")
plt.ylabel("Frequency (Hz)")

# Show the figure on screen
plt.show()
62 | 


--------------------------------------------------------------------------------
/WarmUp/plt_waveform.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # matplotlib を用いた波形プロット
30 | # 波形読み込みはwaveモジュール
31 | 
32 | import wave
33 | 
34 | import matplotlib.pyplot as plt
35 | import numpy as np
36 | 
IN_WAVE_FILE = "in.wav"  # monaural audio is the expected input

# Read the wav header and the raw sample bytes
with wave.open(IN_WAVE_FILE, "r") as sound:
    n_channel = sound.getnchannels()  # number of channels (mono: 1, stereo: 2)
    bitdepth = sound.getsampwidth()  # sample width in BYTES, not bits
    sample_freq = sound.getframerate()  # sampling frequency
    n_frames = sound.getnframes()  # samples per channel
    data = sound.readframes(n_frames)  # audio data (bytes object)

# The bytes are decoded as int16 below, so any other sample width would be
# plotted as garbage; fail loudly instead.
if bitdepth != 2:
    raise ValueError(
        f"{IN_WAVE_FILE}: expected 16-bit samples, got {8 * bitdepth}-bit"
    )

# Convert to a sequence of 2-byte (16-bit) integers
data = np.frombuffer(data, dtype=np.int16)

# Multi-channel frames are interleaved; give each channel its own column so
# the time axis counts frames rather than interleaved samples.
if n_channel > 1:
    data = data.reshape(-1, n_channel)

# Time axis in seconds (one entry per frame)
n_samples = len(data)
time = np.arange(n_samples) / sample_freq

# Plot the audio data
plt.plot(time, data)

# x-axis label
plt.xlabel("Time (sec)")

# y-axis label
plt.ylabel("Amplitude")

# Figure title
plt.title("Waveform")

# Show the figure on screen
plt.show()
71 | 


--------------------------------------------------------------------------------
/WarmUp/plt_waveform_scipy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # matplotlib を用いた波形プロット
30 | 
31 | from scipy.io import wavfile
32 | import numpy as np
33 | import matplotlib.pyplot as plt
34 | 
IN_WAVE_FILE = "in.wav"  # monaural audio is assumed

# Load the audio (fs: sampling frequency, data: samples)
fs, data = wavfile.read(IN_WAVE_FILE)

# Time axis in seconds, one entry per sample
n_samples = len(data)
time = np.arange(n_samples) / fs

# Draw the waveform, then add the title and axis labels
plt.plot(time, data)
plt.title("Waveform")
plt.xlabel("Time (sec)")
plt.ylabel("Amplitude")

# Show the figure on screen
plt.show()
46 | 


--------------------------------------------------------------------------------
/WarmUp/pydub_mp3_to_wav.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the "Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # pydubによりmp3をwavへエクスポート
30 | 
31 | import pydub
32 | 
IN_MP3_FILE = "in.mp3"    # source MP3 file
OUT_WAV_FILE = "out.wav"  # destination WAV file

# Decode the MP3 into an AudioSegment, then export that segment as WAV.
audio = pydub.AudioSegment.from_mp3(IN_MP3_FILE)
audio.export(
    OUT_WAV_FILE,
    format="wav",
)
38 | 


--------------------------------------------------------------------------------
/WarmUp/pydub_wav_to_mp3.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2022 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # pydubによりwavをmp3へエクスポート
30 | 
31 | import pydub
32 | 
IN_WAVE_FILE = "in.wav"   # source WAV file
OUT_MP3_FILE = "out.mp3"  # destination MP3 file

# Load the WAV into an AudioSegment, then export that segment as MP3.
audio = pydub.AudioSegment.from_wav(IN_WAVE_FILE)
audio.export(
    OUT_MP3_FILE,
    format="mp3",
)
38 | 


--------------------------------------------------------------------------------
/WarmUp/sounddevice_play_wav.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # 波形読み込みはscipy.ioのwavfileモジュール
30 | # sounddeviceモジュールによるwav再生
31 | 
32 | import sounddevice as sd
33 | from scipy.io import wavfile
34 | 
IN_WAVE_FILE = "in.wav"  # mono audio is assumed

# Read the file: `fs` is the sampling frequency, `data` holds the samples.
fs, data = wavfile.read(IN_WAVE_FILE)

# Start playback at the file's own sampling rate, then block until it ends.
sd.play(data, samplerate=fs)
sd.wait()
43 | 


--------------------------------------------------------------------------------
/WarmUp/sounddevice_rec_wav.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # sounddeviceモジュールによる録音
30 | # 波形書き込みはwaveモジュール
31 | 
32 | import wave
33 | 
34 | import numpy as np
35 | import sounddevice as sd
36 | 
OUT_WAVE_FILE = "out.wav"

fs = 16000  # sampling frequency (Hz)
duration = 3  # recording length (sec)
n_channels = 1  # mono

n_frames = int(fs * duration)  # total number of frames to record

# Record from the default input device and block until recording finishes.
data = sd.rec(frames=n_frames, samplerate=fs, channels=n_channels)
sd.wait()

# Peak-normalize to the int16 range.
# The peak is taken over absolute values so that a dominant negative
# excursion cannot scale past the int16 limits and wrap around on the cast.
# A completely silent recording (peak == 0) is left as zeros instead of
# being divided by zero.
peak = np.abs(data).max()
if peak > 0:
    data = data / peak * np.iinfo(np.int16).max

# Convert the floating-point samples to 2-byte integers.
data = data.astype(np.int16)

# Write the WAV file.
with wave.open(OUT_WAVE_FILE, mode="wb") as sound:
    sound.setnchannels(n_channels)  # mono
    sound.setsampwidth(2)  # sample width in bytes (2 bytes = 16 bit)
    sound.setframerate(fs)
    sound.writeframes(data.tobytes())
61 | 


--------------------------------------------------------------------------------
/WarmUp/subprocess_play_wav.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # subprocessモジュールによるwav再生
30 | 
31 | import subprocess
32 | 
IN_WAVE_FILE = "in.wav"  # mono audio is assumed
PLAY_COMMAND = "afplay"  # command-line audio player

# Shell command line shared by both playback variants below.
command_line = f"{PLAY_COMMAND} {IN_WAVE_FILE}"

# Playback, synchronous: run() returns only after the player exits.
subprocess.run(command_line, shell=True)

# Playback, asynchronous: Popen() returns immediately; communicate() then
# waits for the player process to finish.
proc = subprocess.Popen(command_line, shell=True)
proc.communicate()
42 | 


--------------------------------------------------------------------------------
/WarmUp/wave_change_bitdepth.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # 量子化ビット数を変更したwavファイルの作成
30 | 
31 | import wave
32 | import numpy as np
33 | 
IN_WAVE_FILE = "in.wav"  # 16-bit mono audio is assumed
OUT_WAVE_FILE = "out.wav"

# Read the header fields and the raw PCM payload.
with wave.open(IN_WAVE_FILE, "r") as sound:
    n_channel = sound.getnchannels()    # channel count (mono: 1, stereo: 2)
    bitdepth = sound.getsampwidth()     # sample width in BYTES, not bits
    n_framerate = sound.getframerate()  # sampling frequency
    n_frames = sound.getnframes()       # samples per channel
    n_samples = n_channel * n_frames    # total number of samples
    sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

# Show the input header.
print(f"入力ファイル名: {IN_WAVE_FILE}")
print(f"  ・チャネル数: {n_channel}")
print(f"  ・量子化ビット数: {bitdepth * 8}")
print(f"  ・サンプリング周波数: {n_framerate}")
print(f"  ・サンプル数: {n_samples}")

# Requantize 16 bit -> 8 bit while preserving the relative volume.
# The samples are rescaled from the int16 full scale (2 ** 15) to the 8-bit
# amplitude (2 ** 7 - 1) by a fixed ratio, so no division by the signal peak
# is needed and an all-zero input cannot cause a division by zero.
x = np.frombuffer(sound_frames, dtype=np.int16).astype(np.float64)
x = x / 2 ** 15 * (2 ** 7 - 1)
# 8-bit PCM in a WAV file is unsigned with the zero level at 128, so shift
# the signed values by that offset before casting; the result lies in
# [1, 255] and always fits in uint8.
x = np.round(x) + 2 ** 7
x = x.astype(np.uint8)
sound_frames = x.tobytes()

# Update the header value: 2 bytes -> 1 byte per sample.
bitdepth = 1

# Write the WAV file.
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setnchannels(n_channel)    # channel count (mono: 1, stereo: 2)
    sound.setsampwidth(bitdepth)     # sample width in BYTES
    sound.setframerate(n_framerate)  # sampling frequency (unchanged)
    sound.setnframes(n_frames)       # samples per channel
    sound.writeframes(sound_frames)  # audio data

# Show the output header.
print(f"出力ファイル名: {OUT_WAVE_FILE}")
print(f"  ・チャネル数: {n_channel}")
print(f"  ・量子化ビット数: {bitdepth * 8}")
print(f"  ・サンプリング周波数: {n_framerate}")
print(f"  ・サンプル数: {n_samples}")
78 | 


--------------------------------------------------------------------------------
/WarmUp/wave_change_framerate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # サンプリング周波数を変更したwavファイルの作成
30 | 
31 | import wave
32 | 
IN_WAVE_FILE = "in.wav"  # mono audio is assumed
OUT_WAVE_FILE = "out.wav"

SAMPLE_FREQ = 8000  # sampling frequency to store in the output header

# Read the input: the first four header fields are
# (channels, sample width in bytes, frame rate, frame count).
with wave.open(IN_WAVE_FILE, "r") as sound:
    params = sound.getparams()
    n_channel, bitdepth, sample_freq, n_frames = params[:4]
    n_samples = n_channel * n_frames           # total number of samples
    sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

# Show the input header.
header_lines = (
    f"チャネル数: {n_channel}",
    f"量子化ビット数: {bitdepth * 8}",
    f"サンプリング周波数: {sample_freq}",
    f"サンプル数: {n_samples}",
)
for header_line in header_lines:
    print(header_line)

# Write the same PCM payload back; only the frame rate in the header is
# replaced with SAMPLE_FREQ (the samples themselves are not resampled).
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setnchannels(n_channel)
    sound.setsampwidth(bitdepth)
    sound.setframerate(SAMPLE_FREQ)
    sound.setnframes(n_frames)
    sound.writeframes(sound_frames)
61 | 


--------------------------------------------------------------------------------
/WarmUp/wave_normalize.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # 振幅を正規化
30 | 
31 | import wave
32 | import numpy as np
33 | 
IN_WAVE_FILE = "in.wav"   # 16-bit mono audio is assumed
OUT_WAVE_FILE = "out.wav"

# Read the header fields and the raw PCM payload.
with wave.open(IN_WAVE_FILE, "r") as sound:
    n_channel = sound.getnchannels()    # channel count (mono: 1, stereo: 2)
    bitdepth = sound.getsampwidth()     # sample width in BYTES, not bits
    n_framerate = sound.getframerate()  # sampling frequency
    n_frames = sound.getnframes()       # samples per channel
    n_samples = n_channel * n_frames    # total number of samples
    sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

# Show the input header.
print(f"入力ファイル名: {IN_WAVE_FILE}")
print(f"  ・チャネル数: {n_channel}")
print(f"  ・量子化ビット数: {bitdepth * 8}")
print(f"  ・サンプリング周波数: {n_framerate}")
print(f"  ・サンプル数: {n_samples}")

# Peak-normalize the amplitude.
# Work in float64 so that abs(-32768) is representable, and take the peak
# over absolute values: scaling by the signed maximum would push a dominant
# negative excursion past the int16 range and wrap it around on the cast.
x = np.frombuffer(sound_frames, dtype=np.int16).astype(np.float64)
peak = np.max(np.abs(x)) if x.size else 0.0
if peak > 0:
    # Scale so that the largest magnitude hits the positive full scale.
    x = x / peak * (2 ** (bitdepth * 8 - 1) - 1)
# A silent (or empty) input is written back unchanged instead of being
# divided by zero.
x = x.astype(np.int16)
sound_frames = x.tobytes()

# Write the WAV file.
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setnchannels(n_channel)    # channel count (mono: 1, stereo: 2)
    sound.setsampwidth(bitdepth)     # sample width in BYTES
    sound.setframerate(n_framerate)  # sampling frequency (unchanged)
    sound.setnframes(n_frames)       # samples per channel
    sound.writeframes(sound_frames)  # audio data

# Show the output header.
print(f"出力ファイル名: {OUT_WAVE_FILE}")
print(f"  ・チャネル数: {n_channel}")
print(f"  ・量子化ビット数: {bitdepth * 8}")
print(f"  ・サンプリング周波数: {n_framerate}")
print(f"  ・サンプル数: {n_samples}")
73 | 


--------------------------------------------------------------------------------
/WarmUp/wave_read_write.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # waveモジュールを用いた音声入出力 (コピー作成)
30 | 
31 | import wave
32 | 
IN_WAVE_FILE = "in.wav"  # mono audio is assumed
OUT_WAVE_FILE = "out.wav"

# Read the input: the first four header fields are
# (channels, sample width in bytes, frame rate, frame count).
with wave.open(IN_WAVE_FILE, "r") as sound:
    params = sound.getparams()
    n_channel, bitdepth, sample_freq, n_frames = params[:4]
    n_samples = n_channel * n_frames           # total number of samples
    sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

# Show the input header.
header_lines = (
    f"チャネル数: {n_channel}",
    f"量子化ビット数: {bitdepth * 8}",
    f"サンプリング周波数: {sample_freq}",
    f"サンプル数: {n_samples}",
)
for header_line in header_lines:
    print(header_line)

# Write a copy: reuse the input header as-is, then store the same payload.
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setparams(params)
    sound.writeframes(sound_frames)
56 | 


--------------------------------------------------------------------------------
/WarmUp/wave_read_write_scipy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # scipyモジュールを用いた音声入出力 (コピー作成)
30 | 
31 | from scipy.io import wavfile
32 | import numpy as np
33 | 
IN_WAVE_FILE = "in.wav"  # mono audio is assumed
OUT_WAVE_FILE = "out.wav"

# Read the audio and promote the samples to float64, the form in which
# processing would normally be done.
fs, x = wavfile.read(IN_WAVE_FILE)
x = np.asarray(x, dtype=np.float64)

# Cast back to 16-bit integers and write the copy at the original rate.
x = x.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, rate=fs, data=x)
44 | 


--------------------------------------------------------------------------------
/WarmUp/wave_stereo_to_mono.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # ステレオからモノラルへと変更
30 | 
31 | import wave
32 | import numpy as np
33 | 
IN_WAVE_FILE = "in.wav"   # stereo audio is assumed
OUT_WAVE_FILE = "out.wav"

# Read the header fields and the raw PCM payload.
with wave.open(IN_WAVE_FILE, "r") as sound:
    n_channel = sound.getnchannels()    # channel count (mono: 1, stereo: 2)
    bitdepth = sound.getsampwidth()     # sample width in BYTES, not bits
    n_framerate = sound.getframerate()  # sampling frequency
    n_frames = sound.getnframes()       # samples per channel
    n_samples = n_channel * n_frames    # total number of samples
    sound_frames = sound.readframes(n_frames)  # audio data (bytes object)

# Show the input header.
print(f"入力ファイル名: {IN_WAVE_FILE}")
print(f"  ・チャネル数: {n_channel}")
print(f"  ・量子化ビット数: {bitdepth * 8}")
print(f"  ・サンプリング周波数: {n_framerate}")
print(f"  ・サンプル数: {n_samples}")

# Stereo -> mono by averaging the left and right channels.
# The samples are interleaved (L, R, L, R, ...), so even indices are the
# left channel and odd indices the right one. Averaging is done in float32
# so the sum of two int16 values cannot overflow.
channels = np.frombuffer(sound_frames, dtype=np.int16)
l_channel = channels[0::2].astype(np.float32)  # left channel
r_channel = channels[1::2].astype(np.float32)  # right channel
mono_channel = (l_channel + r_channel) / 2
mono_channel = mono_channel.astype(np.int16)

# Convert back to a bytes object.
sound_frames = mono_channel.tobytes()

# Update the header values for the mono output. The total sample count must
# be recomputed as well; otherwise the summary below would still report the
# stereo total (twice the number of samples actually written).
n_channel = 1  # mono
n_samples = n_channel * n_frames

# Write the WAV file.
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setnchannels(n_channel)    # channel count (mono: 1, stereo: 2)
    sound.setsampwidth(bitdepth)     # sample width in BYTES
    sound.setframerate(n_framerate)  # sampling frequency (unchanged)
    sound.setnframes(n_frames)       # samples per channel
    sound.writeframes(sound_frames)  # audio data

# Show the output header.
print(f"出力ファイル名: {OUT_WAVE_FILE}")
print(f"  ・チャネル数: {n_channel}")
print(f"  ・量子化ビット数: {bitdepth * 8}")
print(f"  ・サンプリング周波数: {n_framerate}")
print(f"  ・サンプル数: {n_samples}")
80 | 


--------------------------------------------------------------------------------
/WarmUp/wave_write_whitenoise.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """ 音声情報処理 n本ノック !! """
 4 | 
 5 | # MIT License
 6 | 
 7 | # Copyright (C) 2020 by Akira TAMAMORI
 8 | 
 9 | # Permission is hereby granted, free of charge, to any person
10 | # obtaining a copy of this software and associated documentation files
11 | # (the Software"), to deal in the Software without restriction,
12 | # including without limitation the rights to use, copy, modify, merge,
13 | # publish, distribute, sublicense, and/or sell copies of the Software,
14 | # and to permit persons to whom the Software is furnished to do so,
15 | # subject to the following conditions:
16 | 
17 | # The above copyright notice and this permission notice shall be
18 | # included in all copies or substantial portions of the Software.
19 | 
20 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 | # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 | # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 | 
28 | # Commentary:
29 | # 白色雑音をwavとして書き込む
30 | 
31 | import wave
32 | 
33 | import numpy as np
34 | 
OUT_WAVE_FILE = "out_whitenoise.wav"

n_samples = 40000    # number of white-noise samples to generate
sample_freq = 16000  # sampling frequency

# Draw Gaussian white noise (standard deviation 0.1), stretch it to the
# int16 value range, and cast it to 2-byte (16-bit) integers.
int16_max = np.iinfo(np.int16).max
data = np.random.normal(scale=0.1, size=n_samples)
data = (data * int16_max).astype(np.int16)

# Write the noise as a mono, 16-bit WAV file.
with wave.open(OUT_WAVE_FILE, "w") as sound:
    sound.setnchannels(1)            # mono
    sound.setsampwidth(2)            # sample width: 2 bytes = 16 bit
    sound.setframerate(sample_freq)  # sampling frequency
    sound.writeframes(data.tobytes())  # audio data
58 | 


--------------------------------------------------------------------------------