├── model
│   └── 20160203c.pkl
├── mfcc.py
├── README.md
├── ffmpeg.py
├── LICENSE
├── train.py
├── sh.py
├── predict.py
├── .gitignore
└── run.py

--------------------------------------------------------------------------------
/model/20160203c.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/begeekmyfriend/ezfm_diarisation/HEAD/model/20160203c.pkl
--------------------------------------------------------------------------------
/mfcc.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

from python_speech_features import mfcc
import scipy.io.wavfile as wav
import numpy as np

def extract(input_wav, duration):
    # Extract frame-level MFCCs, then pool each one-second window into the
    # standard deviation of every coefficient.
    (rate, sig) = wav.read(input_wav)
    feature = mfcc(sig, rate)

    # number of MFCC frames per second of audio
    size_per_sec = len(feature) / duration
    time_window = 1
    std_tmp = []
    for j in range(0, len(feature[0])):
        xx = []
        for i in range(0, duration, time_window):
            xx.append(np.std(feature[i * size_per_sec: (i + time_window) * size_per_sec, j]))
        std_tmp.append(xx)

    # transpose to one row per second, one column per coefficient
    std = np.transpose(std_tmp)
    return std
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ezfm_diarisation
Fork of [ezfm_process](https://github.com/zhichenghou/ezfm_process)

Extracts MFCC features from audio and trains a classifier that separates speech from music in the ["飞鱼秀"](http://zaoaoaoaoao.com) radio programme.

# Material
[Using machine learning to classify music and human voice in sound files](http://houzhicheng.com/blog/ml/2015/04/03/machine-learning-audio-process.html)

# Preinstallation
- [FFmpeg](https://ffmpeg.org)
- [python_speech_features](https://github.com/jameslyons/python_speech_features)
- [scikit-learn](https://scikit-learn.org)
- [scipy](https://www.scipy.org)

# Prediction
> python run.py -p input_audio -m model/20160203c.pkl

# Training
> python run.py -t input_audio

No prediction accuracy is guaranteed for audio programmes other than "飞鱼秀"; train your own model if needed.
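A note on the feature representation: rather than feeding raw MFCC frames to the classifier, extract() pools each second of frames into one standard deviation per coefficient, so the classifier sees per-second variability instead of individual frames. A minimal sketch of that pooling on synthetic data (no wav file needed; assumes the python_speech_features default of 13 coefficients and roughly 100 frames per second):

```python
import numpy as np

frames = np.random.randn(3000, 13)   # stand-in for 30 s of 13-dim MFCC frames
duration = 30                        # clip length in seconds
size_per_sec = len(frames) // duration

# one std value per coefficient per second -> shape (duration, 13),
# mirroring what mfcc.extract() returns
std = np.array([[np.std(frames[s * size_per_sec:(s + 1) * size_per_sec, j])
                 for j in range(frames.shape[1])]
                for s in range(duration)])
print(std.shape)  # (30, 13)
```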
--------------------------------------------------------------------------------
/ffmpeg.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

import sh

def split(filepath):
    # 'dir/name.mp3' -> ('name', 'mp3')
    name = filepath.split('/')[-1]
    splits = name.split('.')
    return splits[0], splits[1]

def convert(input_file):
    # Convert any input audio to 16 kHz mono wav via ffmpeg.
    name, suffix = split(input_file)
    wav_file = name + ".wav"
    cmd = "ffmpeg -i " + input_file + " -ar 16000 -ac 1 -f wav " + wav_file
    sh.run(cmd)
    return wav_file

def cut(input_file, delimit_points):
    # Cut the input audio into segments at the given delimiting timestamps.
    if len(delimit_points) <= 2:
        return
    name, suffix = split(input_file)
    for i in range(0, len(delimit_points) - 1):
        cmd = "ffmpeg -i " + input_file + " -acodec copy -ss " + str(delimit_points[i]) \
              + " -to " + str(delimit_points[i + 1]) + " " + name + "_" + str(i) + "." + suffix
        sh.run(cmd)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Leo Ma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

from sklearn import linear_model
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.externals import joblib
import numpy as np
import wave
import mfcc

def label(samples, marks, duration):
    # Build per-sample labels: 1 inside the hand-marked [begin_sec, end_sec]
    # song intervals, 0 (speech) everywhere else.
    sam_per_sec = len(samples) / duration
    labels = np.zeros(len(samples), dtype=np.int)

    for begin_sec, end_sec in marks:
        labels[int(begin_sec * sam_per_sec): int(end_sec * sam_per_sec)] = 1

    return labels

def lgr_train(X, y):
    logreg = linear_model.LogisticRegression()
    logreg.fit(X, y)
    return logreg

def svm_train(X, y):
    clf = svm.SVC()
    clf.fit(X, y)
    return clf

def knc_train(X, y):
    knc = KNeighborsClassifier(n_neighbors=3)
    knc.fit(X, y)
    return knc

def run(input_wav, songs):
    f = wave.open(input_wav, "r")
    duration = f.getnframes() / f.getframerate()

    X = mfcc.extract(input_wav, duration)
    y = label(X, songs, duration)

    clf = svm_train(X, y)
    model = "model/" + input_wav.split('.')[0] + ".pkl"
    joblib.dump(clf, model)
    print "Train OK, " + model + " generated."
    return model
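train.py gets its supervision from hand-marked song intervals: label() stamps 1 over every feature row falling inside a [begin_sec, end_sec] pair and leaves the rest at 0. A toy trace of that stamping, independent of any audio:

```python
import numpy as np

samples = np.zeros((10, 13))   # stand-in for 10 one-second feature rows
marks = [[2, 5]]               # one song from second 2 to second 5
duration = 10                  # seconds

sam_per_sec = len(samples) // duration   # 1 row per second here
labels = np.zeros(len(samples), dtype=int)
for begin_sec, end_sec in marks:
    labels[int(begin_sec * sam_per_sec):int(end_sec * sam_per_sec)] = 1
print(labels)  # [0 0 1 1 1 0 0 0 0 0]
```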
--------------------------------------------------------------------------------
/sh.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

import subprocess

def run(cmd, capture=False):
    # Run a shell command; optionally capture stdout/stderr.
    out_stream = subprocess.PIPE if capture else None
    err_stream = subprocess.PIPE if capture else None

    print cmd
    p = subprocess.Popen(cmd, shell=True, stdout=out_stream, stderr=err_stream)
    (stdout, stderr) = p.communicate()

    stdout = stdout.strip() if stdout else ""
    stderr = stderr.strip() if stderr else ""

    return_code = p.returncode
    success = (return_code == 0)

    return Result(cmd, stdout, stderr, success, return_code)

class Result(object):
    def __init__(self, cmd, stdout, stderr, success, return_code):
        self.value = {
            'cmd': cmd,
            'stdout': stdout,
            'stderr': stderr,
            'success': success,
            'return_code': return_code,
        }

    def cmd(self):
        return self.value.get('cmd', '')

    def stdout(self):
        return self.value.get('stdout', '')

    def stderr(self):
        return self.value.get('stderr', '')

    def success(self):
        return self.value.get('success', False)

    def return_code(self):
        return self.value.get('return_code', -1)

    def __repr__(self):
        return self.value.__repr__()
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
# coding=utf-8

__author__ = 'hou'

from sklearn.externals import joblib
import numpy as np
import wave
import mfcc

# Smooth away tiny intervals:
# 1100000111000 --> 0000000000000 (each run of 1s is shorter than `interval`)
# 0011111000111 --> 1111111111111 (each run of 0s is shorter than `interval`)
def interval_size(Yp, interval):
    last = Yp[0]
    start = 0
    for i in range(0, len(Yp)):
        if Yp[i] != last:
            delta = i - start
            if delta < interval:
                # absorb the too-short run into the label that follows it
                Yp[start:i] = Yp[i]
            else:
                start = i
            last = Yp[i]

# Smooth mislabeled samples that occupy only a small fraction of a sliding window
def window(Y, duration):
    time_window = duration / 60
    if time_window < 5:
        time_window = 5
    threshold = time_window
    Yp = np.ones(len(Y), dtype=np.int)

    for i in range(0, len(Y)):
        d = Y[np.max([0, i - time_window]): np.min([i + time_window, len(Y)])]
        if np.sum(d) < threshold:
            Yp[i] = 0

    # The intervals of songs and speech should not be shorter than the window
    interval_size(Yp, time_window)
    return Yp

def run(input_wav, model):
    f = wave.open(input_wav, "r")
    duration = f.getnframes() / f.getframerate()

    X = mfcc.extract(input_wav, duration)

    clf = joblib.load(model)

    Y = clf.predict(X)
    #np.savetxt("Y.txt", Y)
    Yp = window(Y, duration)
    #np.savetxt("Yp.txt", Yp)

    # collect the indices (in seconds) where the smoothed label flips
    last = Yp[0]
    delimit_point = [0]
    for i in range(0, len(Yp)):
        if Yp[i] != last:
            delimit_point.append(i)
            last = Yp[i]

    return Yp, delimit_point
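predict.py cleans the raw per-second SVM output in two passes: window() takes a majority-style vote over a sliding window, then interval_size() absorbs any remaining run of equal labels shorter than that window. A self-contained trace of the run-absorbing pass (same logic as interval_size(), inlined so it runs without the repo's dependencies):

```python
import numpy as np

Yp = np.array([1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])
interval = 3   # minimum believable run length, in samples

last, start = Yp[0], 0
for i in range(len(Yp)):
    if Yp[i] != last:
        if i - start < interval:
            Yp[start:i] = Yp[i]   # overwrite the too-short run
        else:
            start = i
        last = Yp[i]

print(Yp)  # [0 0 0 0 0 0 0 1 1 1 0 0 0] -- the leading 2-sample run of 1s is gone
```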
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import ffmpeg
import predict
import train
import sys, getopt
import matplotlib.pyplot as plt

def song_dump(songs):
    for p in songs:
        print str(p[0] / 3600) + ":" + str(p[0] % 3600 / 60) + ":" + str(p[0] % 60)
        print str(p[1] / 3600) + ":" + str(p[1] % 3600 / 60) + ":" + str(p[1] % 60)

def usage():
    print "python run.py -t input.wav | -p input.wav -m model.pkl"
    print "-t    Extract MFCC features from the input audio in standard-deviation mode, train an SVM and generate the model"
    print "-p    Predict the diarisation of the input audio"
    print "-m    Specify the model for prediction"

def main():
    model = ""
    opts, args = getopt.getopt(sys.argv[1:], "ht:p:m:")
    for op, value in opts:
        if op == "-h":
            usage()
            sys.exit()
        elif op == "-t" or op == "-p":
            input_file = value
        elif op == "-m":
            model = value

    input_wav = ffmpeg.convert(input_file)
    trained = (model == "")
    if trained:
        # Training mode: use the bundled 20160203c episode and its hand-marked
        # song intervals ([begin_sec, end_sec] pairs).
        input_wav = "20160203c.wav"
        songs_20160203c = [[0, 265], [1028, 1245], [1440, 1696], [2177, 2693]]
        song_dump(songs_20160203c)
        model = train.run(input_wav, songs_20160203c)
    Y, delimit_points = predict.run(input_wav, model)
    for i in delimit_points:
        print str(i / 3600) + ":" + str(i % 3600 / 60) + ":" + str(i % 60)

    ffmpeg.cut(input_file, delimit_points)

    plt.figure()
    plt.ylim(-0.2, 1.2)
    if trained:
        plt.plot(songs_20160203c, 'b')
    plt.plot(Y, 'r')
    plt.show()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
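A closing note on the timestamp output: song_dump() and the delimit-point loop in run.py both turn a second count into h:m:s with integer division. A worked example (the hms() helper is illustrative and not part of the repo; it adds zero padding that run.py omits):

```python
def hms(sec):
    # e.g. 265 s -> 0 h, 4 min, 25 s
    return "%d:%02d:%02d" % (sec // 3600, sec % 3600 // 60, sec % 60)

for p in [0, 265, 1028, 2693]:   # sample delimit points, in seconds
    print(hms(p))                # 0:00:00, 0:04:25, 0:17:08, 0:44:53
```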