├── README.md
├── trans_to_wav.py
├── test.py
├── trian_crnn.py
├── get_feature.py
└── detect_gui.py
/README.md:
--------------------------------------------------------------------------------
# CRNN-Based Speech Emotion Recognition

### Download the original eNTERFACE database
Link: [https://pan.baidu.com/s/1AXb31ov3kJhg5_Bo4C-ElA?pwd=5kxk](https://pan.baidu.com/s/1AXb31ov3kJhg5_Bo4C-ElA?pwd=5kxk)
Extraction code: 5kxk

## Requirements
Python 3 is recommended.

## Dataset layout
Arrange the dataset as follows:
- The dataset root contains one folder per subject.
- Each subject folder contains six emotion subfolders, named after the emotion labels.
- Each emotion subfolder contains several sentence subfolders, each holding one audio file.

Update the path parameters in each .py script to match your own paths.

## Workflow
1. Run `trans_to_wav.py` to convert the audio files to WAV format (skip this step if they already are).
2. Run `get_feature.py` to extract the features.
3. Run `trian_crnn.py` to train the model.
4. Run `test.py` to evaluate the model.

## GUI
The project includes a GUI, `detect_gui.py`, which integrates the full pipeline for convenient use.

## Usage
1. Download and unpack the project files.
2. Set your working directory to the project root.
3. Run the Python scripts in the order given in the workflow above.
4. Run `detect_gui.py` to load a model, load the audio to test, preprocess it, extract features, and recognize the emotion from the GUI.

## Contact
If you have any questions, you can reach me at:
- Email: w1372988970@gmail.com

![star-history-2025627 (2)](https://github.com/user-attachments/assets/abf72bca-c068-4839-a220-c3c9cea6e789)
--------------------------------------------------------------------------------
/trans_to_wav.py:
--------------------------------------------------------------------------------
import os
import glob
import moviepy.editor as mp
from scipy.io import wavfile

# Dataset root
root_path = 'E:/代码接单/rcnn语音情感识别/project2_database/enterface database'
subject_folders = glob.glob(os.path.join(root_path, 'subject *'))
emotions = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']

# Iterate over all subject folders
for subject_folder in subject_folders:

    # Iterate over all emotion folders
    for emotion in emotions:
        emotion_folder = os.path.join(subject_folder, emotion)

        # Iterate over all sentence folders
        for sentence_index in range(1, 6):
            sentence_folder = os.path.join(emotion_folder, f'sentence {sentence_index}')

            # Skip missing sentence folders
            if os.path.isdir(sentence_folder):
                avi_files = glob.glob(os.path.join(sentence_folder, '*.avi'))

                # Skip folders without an AVI file
                if avi_files:
                    avi_file_path = avi_files[0]

                    # Extract the audio track from the video
                    video = mp.VideoFileClip(avi_file_path)
                    audio = video.audio
                    audio_data = audio.to_soundarray()  # float samples in [-1, 1]
                    audio_fs = audio.fps

                    # Save the audio as a WAV file next to the video
                    wav_file_path = os.path.join(sentence_folder,
                                                 f'{os.path.splitext(os.path.basename(avi_file_path))[0]}.wav')
                    wavfile.write(wav_file_path, audio_fs, audio_data)
                    video.close()  # release the file handle
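# Optional sanity check (an addition to the original script, not part of the
# pipeline): count the WAV files produced above so a partially converted
# dataset is easy to spot. The glob pattern assumes the folder layout
# described in the README.
wav_count = len(glob.glob(os.path.join(root_path, 'subject *', '*', 'sentence *', '*.wav')))
print(f'WAV files found under the dataset root: {wav_count}')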
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical

# Emotion labels
emotions = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']

# Load the feature matrices and label vectors
pitch_features_labels = np.load('pitch_features_labels.npy')
timbre_features_labels = np.load('timbre_features_labels.npy')
loudness_features_labels = np.load('loudness_features_labels.npy')
duration_features_labels = np.load('duration_features_labels.npy')

# Split features from labels (the last column of each matrix is the label)
pitch_features, pitch_labels = pitch_features_labels[:, :-1], pitch_features_labels[:, -1]
timbre_features, timbre_labels = timbre_features_labels[:, :-1], timbre_features_labels[:, -1]
loudness_features, loudness_labels = loudness_features_labels[:, :-1], loudness_features_labels[:, -1]
duration_features, duration_labels = duration_features_labels[:, :-1], duration_features_labels[:, -1]

# Stack the feature groups side by side
stacked_features = np.hstack((pitch_features, timbre_features, loudness_features, duration_features))

# Reshape into the (samples, features, channels) form the CRNN expects
n_features = stacked_features.shape[1]
all_data = stacked_features.reshape(-1, n_features, 1)

# One-hot encode the labels (all four label vectors are identical, so pitch_labels is used)
num_classes = len(emotions)
all_labels = to_categorical(pitch_labels, num_classes)

# Load the trained model
model_path = 'C:/Users/13729/PycharmProjects/mood/models/emotion_recognition_crnn_epoch067.h5'
trained_model = load_model(model_path)

# Evaluate on the full dataset. Note that this includes the frames the model
# was trained on, so the reported accuracy is optimistic.
_, accuracy = trained_model.evaluate(all_data, all_labels, batch_size=32)
print(f"Accuracy on the full dataset: {accuracy * 100:.2f}%")
--------------------------------------------------------------------------------
/trian_crnn.py:
--------------------------------------------------------------------------------
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

# Emotion labels
emotions = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']

# Load the feature matrices and label vectors
pitch_features_labels = np.load('pitch_features_labels.npy')
timbre_features_labels = np.load('timbre_features_labels.npy')
loudness_features_labels = np.load('loudness_features_labels.npy')
duration_features_labels = np.load('duration_features_labels.npy')

# Split features from labels (the last column of each matrix is the label)
pitch_features, pitch_labels = pitch_features_labels[:, :-1], pitch_features_labels[:, -1]
timbre_features, timbre_labels = timbre_features_labels[:, :-1], timbre_features_labels[:, -1]
loudness_features, loudness_labels = loudness_features_labels[:, :-1], loudness_features_labels[:, -1]
duration_features, duration_labels = duration_features_labels[:, :-1], duration_features_labels[:, -1]

# Stack the feature groups side by side
stacked_features = np.hstack((pitch_features, timbre_features, loudness_features, duration_features))

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(stacked_features, pitch_labels, test_size=0.2, random_state=42)

# Reshape into the (samples, features, channels) form the CRNN expects
n_features = stacked_features.shape[1]
X_train = X_train.reshape(-1, n_features, 1)
X_test = X_test.reshape(-1, n_features, 1)

# One-hot encode the labels
num_classes = len(emotions)
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Build the CRNN model
model = Sequential()

# Convolutional layer
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(n_features, 1)))

# Recurrent layers
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))

# Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['accuracy'])
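# Expected tensor flow through the network, assuming the default 13 MFCC
# coefficients (16 stacked features per frame: 1 pitch + 13 MFCC + 1 RMS +
# 1 duration):
#   (batch, 16, 1) -> Conv1D(64, k=3)      -> (batch, 14, 64)
#                  -> LSTM(128, sequences) -> (batch, 14, 128)
#                  -> LSTM(128)            -> (batch, 128)
#                  -> Dense(128), Dense(6) -> (batch, 6)
model.summary()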
# Optionally resume from a previously saved model
# model_path = 'emotion_recognition_crnn_epoch030.h5'
# trained_model = load_model(model_path)

# Checkpoint callback that saves the model after every epoch
checkpoint = ModelCheckpoint('emotion_recognition_crnn_epoch{epoch:03d}.h5', save_freq='epoch')

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[checkpoint])

# To resume training from epoch 30 instead:
# initial_epoch = 30
# trained_model.fit(X_train, y_train, epochs=100, initial_epoch=initial_epoch, batch_size=32, validation_data=(X_test, y_test), callbacks=[checkpoint])

# Save the final model
model.save('emotion_recognition_crnn.h5')
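# Optional sanity check (an addition to the original script): reload the saved
# model and confirm it reproduces the final test accuracy on the same split.
reloaded = load_model('emotion_recognition_crnn.h5')
_, test_acc = reloaded.evaluate(X_test, y_test, batch_size=32, verbose=0)
print(f'Reloaded model test accuracy: {test_acc:.4f}')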
--------------------------------------------------------------------------------
/get_feature.py:
--------------------------------------------------------------------------------
import os
import numpy as np
import scipy.io.wavfile as wavfile
import librosa
from python_speech_features import mfcc
from scipy.signal.windows import hamming

# Dataset root
root_path = r'E:\代码接单\rcnn语音情感识别\project2_database\enterface database'
subject_folders = [folder for folder in os.listdir(root_path) if folder.startswith('subject')]
emotions = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']

# Preprocessing parameters
target_sample_rate = 16000  # resampling target (Hz)
frame_length = 0.025        # frame length (s)
frame_overlap = 0.01        # frame hop: consecutive frames start this far apart (s)

# Feature matrices and label vector, filled incrementally
pitch_features = None
timbre_features = None
loudness_features = None
duration_features = None
labels = None

# Iterate over all subject folders
for subject_index in range(len(subject_folders)):
    subject_folder = os.path.join(root_path, subject_folders[subject_index])

    # Iterate over all emotion folders
    for emotion_index in range(len(emotions)):
        emotion_folder = os.path.join(subject_folder, emotions[emotion_index])

        # Iterate over all sentence folders
        for sentence_index in range(1, 6):
            sentence_folder = os.path.join(emotion_folder, f'sentence {sentence_index}')

            # Skip missing sentence folders
            if os.path.isdir(sentence_folder):
                wav_files = [file for file in os.listdir(sentence_folder) if file.endswith('.wav')]

                # Skip folders without a WAV file
                if wav_files:
                    wav_file_path = os.path.join(sentence_folder, wav_files[0])

                    # Read the audio file
                    audio_fs, audio_data = wavfile.read(wav_file_path)

                    # Down-mix to mono first (librosa expects mono or
                    # channel-first input), then convert to float
                    if audio_data.ndim > 1:
                        audio_data = np.mean(audio_data, axis=1)
                    audio_data = audio_data.astype(np.float32)

                    # Resample if needed
                    if audio_fs != target_sample_rate:
                        audio_data = librosa.resample(audio_data, orig_sr=audio_fs,
                                                      target_sr=target_sample_rate)
                        audio_fs = target_sample_rate

                    # Timbre features (MFCC)
                    mfccs = mfcc(audio_data, audio_fs)

                    # Pitch features (utterance mean of YIN f0, repeated per frame)
                    pitch_values = librosa.yin(audio_data, fmin=librosa.note_to_hz('C2'),
                                               fmax=librosa.note_to_hz('C7'), sr=audio_fs)
                    pitch_values = np.mean(pitch_values) * np.ones((mfccs.shape[0], 1))

                    # Loudness features (Hamming-windowed RMS per frame)
                    frame_length_samples = int(round(frame_length * audio_fs))
                    frame_overlap_samples = int(round(frame_overlap * audio_fs))
                    rms_window = hamming(frame_length_samples, sym=False)
                    frame_starts = np.arange(0, len(audio_data) - frame_length_samples + 1, frame_overlap_samples)
                    rms_values = np.zeros((len(frame_starts), 1))
                    for i in range(len(frame_starts)):
                        frame = audio_data[frame_starts[i]:frame_starts[i] + frame_length_samples] * rms_window
                        rms_values[i] = np.sqrt(np.mean(frame ** 2))
                    rms_values = rms_values[:mfccs.shape[0], :]

                    # Duration feature (utterance length in seconds)
                    duration_value = len(audio_data) / audio_fs

                    # Append the features to the feature matrices
                    if pitch_features is None:
                        pitch_features = pitch_values
                    else:
                        pitch_features = np.vstack((pitch_features, pitch_values))

                    if timbre_features is None:
                        timbre_features = mfccs
                    else:
                        timbre_features = np.vstack((timbre_features, mfccs))

                    if loudness_features is None:
                        loudness_features = rms_values
                    else:
                        loudness_features = np.vstack((loudness_features, rms_values))

                    if duration_features is None:
                        duration_features = np.full((mfccs.shape[0], 1), duration_value)
                    else:
                        duration_features = np.vstack((duration_features, np.full((mfccs.shape[0], 1), duration_value)))

                    # Append the emotion label, one row per frame
                    emotion_label = emotion_index
                    if labels is None:
                        labels = np.full((len(pitch_values), 1), emotion_label)
                    else:
                        labels = np.vstack((labels, np.full((len(pitch_values), 1), emotion_label)))

# Truncate all feature matrices and the label vector to a common length
min_length = min([len(pitch_features), len(timbre_features), len(loudness_features),
                  len(duration_features), len(labels)])
pitch_features = pitch_features[:min_length, :]
timbre_features = timbre_features[:min_length, :]
loudness_features = loudness_features[:min_length, :]
duration_features = duration_features[:min_length, :]
labels = labels[:min_length]

# Save each feature matrix with the labels appended as the last column
np.save('pitch_features_labels.npy', np.hstack((pitch_features, labels)))
np.save('timbre_features_labels.npy', np.hstack((timbre_features, labels)))
np.save('loudness_features_labels.npy', np.hstack((loudness_features, labels)))
np.save('duration_features_labels.npy', np.hstack((duration_features, labels)))
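# Optional consistency report (an addition to the original script): every
# matrix written above should have the same number of rows (min_length).
print('pitch:', pitch_features.shape, 'timbre:', timbre_features.shape,
      'loudness:', loudness_features.shape, 'duration:', duration_features.shape,
      'labels:', labels.shape)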
--------------------------------------------------------------------------------
/detect_gui.py:
--------------------------------------------------------------------------------
import tkinter as tk
from tkinter import filedialog, messagebox
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import numpy as np
import scipy.io.wavfile as wavfile
import librosa
from python_speech_features import mfcc
from scipy.signal.windows import hamming
from tensorflow.keras.models import load_model as keras_load_model

# Use a CJK-capable font so any Chinese text (e.g. dataset paths) renders in plots
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

def load_saved_model(model_path):
    """Load a trained Keras model from disk."""
    global model
    model = keras_load_model(model_path)

def load_wav_file():
    """Ask the user for a WAV file and plot its waveform."""
    global wav_file
    wav_file = filedialog.askopenfilename(filetypes=[('WAV files', '*.wav')])
    if not wav_file:  # the dialog was cancelled
        return
    rate, data = wavfile.read(wav_file)
    plot_waveform(rate, data)

def plot_waveform(rate, data):
    """Embed the raw waveform of the selected file in the main window."""
    fig, ax = plt.subplots()
    ax.plot(data)
    ax.set_title('Raw waveform of the input audio')
    ax.set_xlabel('Time')
    ax.set_ylabel('Amplitude')
    waveform_plot = FigureCanvasTkAgg(fig, window)
    waveform_plot.get_tk_widget().grid(row=2, column=0)

def extract_features():
    """Read the selected WAV file and extract its feature matrix."""
    global wav_file, extracted_features
    rate, data = wavfile.read(wav_file)
    extracted_features = extract_features_from_audio(data, rate)

def extract_features_from_audio(audio_data, audio_fs):
    """Extract pitch, MFCC, RMS and duration features, mirroring get_feature.py."""
    # Preprocessing parameters
    target_sample_rate = 16000  # resampling target (Hz)
    frame_length = 0.025        # frame length (s)
    frame_overlap = 0.01        # frame hop (s)

    # Down-mix to mono first (librosa expects mono input), then convert to float
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)
    audio_data = audio_data.astype(np.float32)

    # Resample if needed
    if audio_fs != target_sample_rate:
        audio_data = librosa.resample(audio_data, orig_sr=audio_fs,
                                      target_sr=target_sample_rate)
        audio_fs = target_sample_rate

    # Timbre features (MFCC)
    mfccs = mfcc(audio_data, audio_fs)

    # Pitch features (utterance mean of YIN f0, repeated per frame)
    pitch_values = librosa.yin(audio_data, fmin=librosa.note_to_hz('C2'),
                               fmax=librosa.note_to_hz('C7'), sr=audio_fs)
    pitch_values = np.mean(pitch_values) * np.ones((mfccs.shape[0], 1))

    # Loudness features (Hamming-windowed RMS per frame)
    frame_length_samples = int(round(frame_length * audio_fs))
    frame_overlap_samples = int(round(frame_overlap * audio_fs))
    rms_window = hamming(frame_length_samples, sym=False)
    frame_starts = np.arange(0, len(audio_data) - frame_length_samples + 1, frame_overlap_samples)
    rms_values = np.zeros((len(frame_starts), 1))
    for i in range(len(frame_starts)):
        frame = audio_data[frame_starts[i]:frame_starts[i] + frame_length_samples] * rms_window
        rms_values[i] = np.sqrt(np.mean(frame ** 2))
    rms_values = rms_values[:mfccs.shape[0], :]

    # Trim all per-frame features to a common length
    min_len = min(mfccs.shape[0], rms_values.shape[0], pitch_values.shape[0])
    mfccs = mfccs[:min_len, :]
    rms_values = rms_values[:min_len, :]
    pitch_values = pitch_values[:min_len, :]

    # Duration feature (utterance length in seconds)
    duration_value = len(audio_data) / audio_fs

    # Stack: pitch | MFCC | RMS | duration, one row per frame
    stacked_features = np.hstack((pitch_values, mfccs, rms_values,
                                  np.full((mfccs.shape[0], 1), duration_value)))

    return stacked_features

def plot_features():
    """Plot the four feature groups of the current utterance."""
    global extracted_features

    # Lengths of each feature track
    pitch_length = len(extracted_features[:, 0])
    mfcc_length = extracted_features.shape[1] - 3  # total columns minus pitch, RMS, duration
    rms_length = len(extracted_features[:, -2])
    duration_length = len(extracted_features[:, -1])

    # One figure with four stacked subplots
    fig, axes = plt.subplots(4, 1, figsize=(12, 16))

    # Pitch
    axes[0].plot(np.arange(pitch_length), extracted_features[:, 0], color='blue', label='Pitch')
    axes[0].set_title('Pitch feature')
    axes[0].set_xlabel('Time')
    axes[0].set_ylabel('Frequency')
    axes[0].legend()

    # Timbre (MFCC)
    img = axes[1].imshow(extracted_features[:, 1:1 + mfcc_length].T, origin='lower', aspect='auto', cmap='viridis')
    axes[1].set_title('Timbre feature (MFCC)')
    axes[1].set_xlabel('Time')
    axes[1].set_ylabel('MFCC coefficient')
    fig.colorbar(img, ax=axes[1], label='MFCC Value')

    # Loudness (RMS)
    axes[2].plot(np.arange(rms_length), extracted_features[:, -2], color='green', label='RMS')
    axes[2].set_title('Loudness feature')
    axes[2].set_xlabel('Time')
    axes[2].set_ylabel('RMS amplitude')
    axes[2].legend()

    # Duration
    axes[3].bar(np.arange(duration_length), extracted_features[:, -1], color='red', label='Duration')
    axes[3].set_title('Duration feature')
    axes[3].set_xlabel('Time')
    axes[3].set_ylabel('Duration')
    axes[3].legend()

    # Adjust the layout and show the figure
    plt.tight_layout()
    plt.show()
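# The model classifies one frame at a time, so the per-frame predictions have
# to be pooled into a single utterance-level decision. Averaging the per-frame
# softmax outputs is the pooling strategy assumed in this sketch; it is used
# by detect_emotion() below.
def average_frame_probabilities(per_frame_probs):
    """Average the softmax outputs over all frames of one utterance."""
    return np.mean(per_frame_probs, axis=0)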
def detect_emotion():
    """Classify the extracted features with the loaded model and show the result."""
    global model, extracted_features

    # One sample per frame, shaped (frames, n_features, 1) to match training
    n_features = extracted_features.shape[1]
    input_features = extracted_features.reshape(-1, n_features, 1)

    # Predict per-frame class probabilities and pool them over the utterance
    emotion_probabilities = average_frame_probabilities(model.predict(input_features))

    # Take the label with the highest pooled probability
    emotions = ['anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
    predicted_emotion = emotions[np.argmax(emotion_probabilities)]

    # Show the prediction
    messagebox.showinfo('Prediction', f'Predicted emotion: {predicted_emotion}\nClass probabilities: {emotion_probabilities}')

# Main window
window = tk.Tk()
window.title('Speech Emotion Recognition')

# Create and place the buttons
load_model_button = tk.Button(window, text='Load model',
                              command=lambda: load_saved_model(filedialog.askopenfilename(filetypes=[('HDF5 files', '*.h5')])))
load_model_button.grid(row=0, column=0)

load_wav_button = tk.Button(window, text='Load WAV file', command=load_wav_file)
load_wav_button.grid(row=1, column=0)

extract_features_button = tk.Button(window, text='Extract features', command=extract_features)
extract_features_button.grid(row=3, column=0)

plot_features_button = tk.Button(window, text='Show features', command=plot_features)
plot_features_button.grid(row=4, column=0)

detect_emotion_button = tk.Button(window, text='Detect emotion', command=detect_emotion)
detect_emotion_button.grid(row=5, column=0)

# Run the main loop
window.mainloop()
--------------------------------------------------------------------------------