├── model
│   └── 20160203c.pkl
├── mfcc.py
├── README.md
├── ffmpeg.py
├── LICENSE
├── train.py
├── sh.py
├── predict.py
├── .gitignore
└── run.py

--------------------------------------------------------------------------------
/model/20160203c.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/begeekmyfriend/ezfm_diarisation/HEAD/model/20160203c.pkl
--------------------------------------------------------------------------------
/mfcc.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

from python_speech_features import mfcc
import scipy.io.wavfile as wav
import numpy as np

def extract(input_wav, duration):
    # Extract frame-level MFCCs, then pool each one-second window into the
    # standard deviation of every coefficient.
    (rate, sig) = wav.read(input_wav)
    feature = mfcc(sig, rate)

    # number of MFCC frames per second of audio
    size_per_sec = len(feature) / duration
    time_window = 1
    std_tmp = []
    for j in range(0, len(feature[0])):
        xx = []
        for i in range(0, duration, time_window):
            xx.append(np.std(feature[i * size_per_sec: (i + time_window) * size_per_sec, j]))
        std_tmp.append(xx)

    # transpose to one row per second, one column per coefficient
    std = np.transpose(std_tmp)
    return std
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ezfm_diarisation
Fork of [ezfm_process](https://github.com/zhichenghou/ezfm_process)

Extracts MFCC features from audio and trains a classifier that separates speech from music in the ["飞鱼秀"](http://zaoaoaoaoao.com) radio programme.

# Material
[Using machine learning to classify music and human voice in sound files](http://houzhicheng.com/blog/ml/2015/04/03/machine-learning-audio-process.html)

# Preinstallation
- [FFmpeg](https://ffmpeg.org)
- [python_speech_features](https://github.com/jameslyons/python_speech_features)
- [scikit-learn](https://scikit-learn.org)
- [scipy](https://www.scipy.org)

# Prediction
> python run.py -p input_audio -m model/20160203c.pkl

# Training
> python run.py -t input_audio

No prediction accuracy is guaranteed for audio programmes other than "飞鱼秀"; train your own model if needed.
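A note on the feature representation: rather than feeding raw MFCC frames to the classifier, extract() pools each second of frames into one standard deviation per coefficient, so the classifier sees per-second variability instead of individual frames. A minimal sketch of that pooling on synthetic data (no wav file needed; assumes the python_speech_features default of 13 coefficients and roughly 100 frames per second):

```python
import numpy as np

frames = np.random.randn(3000, 13)   # stand-in for 30 s of 13-dim MFCC frames
duration = 30                        # clip length in seconds
size_per_sec = len(frames) // duration

# one std value per coefficient per second -> shape (duration, 13),
# mirroring what mfcc.extract() returns
std = np.array([[np.std(frames[s * size_per_sec:(s + 1) * size_per_sec, j])
                 for j in range(frames.shape[1])]
                for s in range(duration)])
print(std.shape)  # (30, 13)
```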
--------------------------------------------------------------------------------
/ffmpeg.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

import sh

def split(filepath):
    # 'dir/name.mp3' -> ('name', 'mp3')
    name = filepath.split('/')[-1]
    splits = name.split('.')
    return splits[0], splits[1]

def convert(input_file):
    # Convert any input audio to 16 kHz mono wav via ffmpeg.
    name, suffix = split(input_file)
    wav_file = name + ".wav"
    cmd = "ffmpeg -i " + input_file + " -ar 16000 -ac 1 -f wav " + wav_file
    sh.run(cmd)
    return wav_file

def cut(input_file, delimit_points):
    # Cut the input audio into segments at the given delimiting timestamps.
    if len(delimit_points) <= 2:
        return
    name, suffix = split(input_file)
    for i in range(0, len(delimit_points) - 1):
        cmd = "ffmpeg -i " + input_file + " -acodec copy -ss " + str(delimit_points[i]) \
              + " -to " + str(delimit_points[i + 1]) + " " + name + "_" + str(i) + "." + suffix
        sh.run(cmd)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Leo Ma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

from sklearn import linear_model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib
import numpy as np
import wave
import mfcc

def label(samples, marks, duration):
    # Build per-sample labels: 1 inside the hand-marked [begin_sec, end_sec]
    # song intervals, 0 (speech) everywhere else.
    sam_per_sec = len(samples) / duration
    labels = np.zeros(len(samples), dtype=np.int)

    for begin_sec, end_sec in marks:
        labels[int(begin_sec * sam_per_sec): int(end_sec * sam_per_sec)] = 1

    return labels

def lgr_train(X, y):
    logreg = linear_model.LogisticRegression()
    logreg.fit(X, y)
    return logreg

def svm_train(X, y):
    clf = svm.SVC()
    clf.fit(X, y)
    return clf

def knc_train(X, y):
    knc = KNeighborsClassifier(n_neighbors=3)
    knc.fit(X, y)
    return knc

def run(input_wav, songs):
    f = wave.open(input_wav, "r")
    duration = f.getnframes() / f.getframerate()

    X = mfcc.extract(input_wav, duration)
    y = label(X, songs, duration)

    clf = svm_train(X, y)
    model = "model/" + input_wav.split('.')[0] + ".pkl"
    joblib.dump(clf, model)
    print "Train OK, " + model + " generated."
    return model
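train.py gets its supervision from hand-marked song intervals: label() stamps 1 over every feature row falling inside a [begin_sec, end_sec] pair and leaves the rest at 0. A toy trace of that stamping, independent of any audio:

```python
import numpy as np

samples = np.zeros((10, 13))   # stand-in for 10 one-second feature rows
marks = [[2, 5]]               # one song from second 2 to second 5
duration = 10                  # seconds

sam_per_sec = len(samples) // duration   # 1 row per second here
labels = np.zeros(len(samples), dtype=int)
for begin_sec, end_sec in marks:
    labels[int(begin_sec * sam_per_sec):int(end_sec * sam_per_sec)] = 1
print(labels)  # [0 0 1 1 1 0 0 0 0 0]
```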
--------------------------------------------------------------------------------
/sh.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

import subprocess

def run(cmd, capture=False):
    # Run a shell command; optionally capture stdout/stderr.
    out_stream = subprocess.PIPE if capture else None
    err_stream = subprocess.PIPE if capture else None

    print cmd
    p = subprocess.Popen(cmd, shell=True, stdout=out_stream, stderr=err_stream)
    (stdout, stderr) = p.communicate()

    stdout = stdout.strip() if stdout else ""
    stderr = stderr.strip() if stderr else ""

    return_code = p.returncode
    success = (return_code == 0)

    return Result(cmd, stdout, stderr, success, return_code)

class Result(object):
    def __init__(self, cmd, stdout, stderr, success, return_code):
        self.value = {
            'cmd': cmd,
            'stdout': stdout,
            'stderr': stderr,
            'success': success,
            'return_code': return_code,
        }

    def cmd(self):
        return self.value.get('cmd', '')

    def stdout(self):
        return self.value.get('stdout', '')

    def stderr(self):
        return self.value.get('stderr', '')

    def success(self):
        return self.value.get('success', False)

    def return_code(self):
        return self.value.get('return_code', -1)

    def __repr__(self):
        return self.value.__repr__()
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

from sklearn.externals import joblib
import numpy as np
import wave
import mfcc

# Smooth away tiny intervals:
# 1100000111000 --> 0000000000000 (each run of 1s is shorter than `interval`)
# 0011111000111 --> 1111111111111 (each run of 0s is shorter than `interval`)
def interval_size(Yp, interval):
    last = Yp[0]
    start = 0
    for i in range(0, len(Yp)):
        if Yp[i] != last:
            delta = i - start
            if delta < interval:
                # absorb the too-short run into the label that follows it
                Yp[start:i] = Yp[i]
            else:
                start = i
            last = Yp[i]

# Smooth mislabeled samples that occupy only a small fraction of a sliding window
def window(Y, duration):
    time_window = duration / 60
    if time_window < 5:
        time_window = 5
    threshold = time_window
    Yp = np.ones(len(Y), dtype=np.int)

    for i in range(0, len(Y)):
        d = Y[np.max([0, i - time_window]): np.min([i + time_window, len(Y)])]
        if np.sum(d) < threshold:
            Yp[i] = 0

    # The intervals of songs and speech should not be shorter than the window
    interval_size(Yp, time_window)
    return Yp

def run(input_wav, model):
    f = wave.open(input_wav, "r")
    duration = f.getnframes() / f.getframerate()

    X = mfcc.extract(input_wav, duration)

    clf = joblib.load(model)

    Y = clf.predict(X)
    #np.savetxt("Y.txt", Y)
    Yp = window(Y, duration)
    #np.savetxt("Yp.txt", Yp)

    # collect the indices (in seconds) where the smoothed label flips
    last = Yp[0]
    delimit_point = [0]
    for i in range(0, len(Yp)):
        if Yp[i] != last:
            delimit_point.append(i)
            last = Yp[i]

    return Yp, delimit_point
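predict.py cleans the raw per-second SVM output in two passes: window() takes a majority-style vote over a sliding window, then interval_size() absorbs any remaining run of equal labels shorter than that window. A self-contained trace of the run-absorbing pass (same logic as interval_size(), inlined so it runs without the repo's dependencies):

```python
import numpy as np

Yp = np.array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])
interval = 3   # minimum believable run length, in samples

last, start = Yp[0], 0
for i in range(len(Yp)):
    if Yp[i] != last:
        if i - start < interval:
            Yp[start:i] = Yp[i]   # overwrite the too-short run
        else:
            start = i
        last = Yp[i]

print(Yp)  # [0 0 0 0 0 0 0 1 1 1 0 0 0] -- the leading 2-sample run of 1s is gone
```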
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import ffmpeg
import predict
import train
import sys, getopt
import matplotlib.pyplot as plt

def song_dump(songs):
    for p in songs:
        print str(p[0] / 3600) + ":" + str(p[0] % 3600 / 60) + ":" + str(p[0] % 60)
        print str(p[1] / 3600) + ":" + str(p[1] % 3600 / 60) + ":" + str(p[1] % 60)

def usage():
    print "python run.py -t input.wav | -p input.wav -m model.pkl"
    print "-t    Extract MFCC features from the input audio in standard-deviation mode, train an SVM and generate the model"
    print "-p    Predict the diarisation of the input audio"
    print "-m    Specify the model for prediction"

def main():
    model = ""
    opts, args = getopt.getopt(sys.argv[1:], "ht:p:m:")
    for op, value in opts:
        if op == "-h":
            usage()
            sys.exit()
        elif op == "-t" or op == "-p":
            input_file = value
        elif op == "-m":
            model = value

    input_wav = ffmpeg.convert(input_file)
    trained = (model == "")
    if trained:
        # Training mode: use the bundled 20160203c episode and its hand-marked
        # song intervals ([begin_sec, end_sec] pairs).
        input_wav = "20160203c.wav"
        songs_20160203c = [[0, 265], [1028, 1245], [1440, 1696], [2177, 2693]]
        song_dump(songs_20160203c)
        model = train.run(input_wav, songs_20160203c)
    Y, delimit_points = predict.run(input_wav, model)
    for i in delimit_points:
        print str(i / 3600) + ":" + str(i % 3600 / 60) + ":" + str(i % 60)

    ffmpeg.cut(input_file, delimit_points)

    plt.figure()
    plt.ylim(-0.2, 1.2)
    if trained:
        plt.plot(songs_20160203c, 'b')
    plt.plot(Y, 'r')
    plt.show()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
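A closing note on the timestamp output: song_dump() and the delimit-point loop in run.py both turn a second count into h:m:s with integer division. A worked example (the hms() helper is illustrative and not part of the repo; it adds zero padding that run.py omits):

```python
def hms(sec):
    # e.g. 265 s -> 0 h, 4 min, 25 s
    return "%d:%02d:%02d" % (sec // 3600, sec % 3600 // 60, sec % 60)

for p in [0, 265, 1028, 2693]:   # sample delimit points, in seconds
    print(hms(p))                # 0:00:00, 0:04:25, 0:17:08, 0:44:53
```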