├── speech.pyc
├── svm-scale
├── utils.pyc
├── mood_svm.pyc
├── spectrogram.pyc
├── record_process.pyc
├── speech_process.pyc
├── dir_process.py
├── data_analyze.py
├── spectrogram.py
├── speech_process.py
├── ReadMe
├── mood_svm.py
├── record_process.py
├── record_analyze.py
├── log_analyze.py
├── utils.py
└── speech.py

/speech.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/speech.pyc
--------------------------------------------------------------------------------

/svm-scale:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/svm-scale
--------------------------------------------------------------------------------

/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/utils.pyc
--------------------------------------------------------------------------------

/mood_svm.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/mood_svm.pyc
--------------------------------------------------------------------------------

/spectrogram.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/spectrogram.pyc
--------------------------------------------------------------------------------

/record_process.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/record_process.pyc
--------------------------------------------------------------------------------

/speech_process.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/willsongrui/signal_process/HEAD/speech_process.pyc
--------------------------------------------------------------------------------

/dir_process.py:
--------------------------------------------------------------------------------
import os
import sys
import speech_process

def usage():
    print "Used to process the wav files in a labeled directory"
    print "Usage: python dir_process.py directory label output_file"

if len(sys.argv) < 4:
    usage()
    sys.exit()
if not os.path.isdir(sys.argv[1]):
    print "%s is not a directory" % sys.argv[1]
    sys.exit()
out = sys.argv[3]
path = sys.argv[1]
files = os.listdir(path)
label = sys.argv[2]
print label
if not path.endswith('/'):
    path = path + '/'

for f in files:
    print f
    f = path + f
    # the keyword must match the speech_process() signature (feature_file, not dataFile)
    speech_process.speech_process(f, label=label, feature_file=out)
--------------------------------------------------------------------------------

/data_analyze.py:
--------------------------------------------------------------------------------
import sys
import os
import numpy as np

# result[label][feature] accumulates six feature values for the
# neutral (0) and angry (1) classes; count tracks samples per class
result = [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
fs = open(sys.argv[1], 'r')
count = [0, 0]
for line in fs.readlines():
    data = line.strip().split(' ')
    label = int(data[0])
    # each field looks like "index:value"; keep only the value
    for i in range(1, 7):
        result[label][i-1] = result[label][i-1] + float(data[i].split(':')[1])
    count[label] = count[label] + 1

print 'neutral:'
for i in range(6):
    print result[0][i]*1.0/count[0]
print 'angry:'
for i in range(6):
    print result[1][i]*1.0/count[1]
--------------------------------------------------------------------------------
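data_analyze.py averages, per class, the features in a libsvm-style file like the one writeToFile() in speech.py produces: each line is a class label (0 = neutral, 1 = angry) followed by index:value pairs, of which this script reads the first six. A made-up two-line sample (the numbers are invented for illustration):

0 1:221.000000 2:180.350000 3:96.000000 4:125.000000 5:3.410000 6:0.412000
1 1:305.000000 2:240.100000 3:150.000000 4:155.000000 5:7.920000 6:0.610000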
/spectrogram.py:
--------------------------------------------------------------------------------
"""
Compute and display a spectrogram.
Takes a WAV file as input.
"""
import matplotlib.pyplot as plt
import scipy.io.wavfile
import numpy as np
import sys


def spectrogram(m):
    sr = m.sampleRate
    x = m.rawData
    ## Parameters: 10 ms step, 30 ms window
    #nstep = int(sr * 0.01)
    #nwin = int(sr * 0.03)
    nstep = m.step
    nwin = m.frameSize
    nfft = nwin
    window = np.hamming(nwin)
    ## will take windows x[n1:n2]; generate
    ## and loop over n2 such that all frames
    ## fit within the waveform
    nn = range(nwin, len(x), nstep)
    X = np.zeros((len(nn), nfft/2))
    for i, n in enumerate(nn):
        xseg = x[n-nwin:n]
        z = np.fft.fft(window * xseg, nfft)
        X[i, :] = np.log(np.abs(z[:nfft/2]))

    plt.imshow(X.T, interpolation='nearest', origin='lower', aspect='auto')
--------------------------------------------------------------------------------

/speech_process.py:
--------------------------------------------------------------------------------
from speech import *

frameSize = 256
overLap = 128


def speech_process(source, nchannels=1, sampleRate=8000, sampleWidth=2,
                   littleEndian=1, minLen=0.2, minSilence=0.3,
                   feature_file="/home/will/Documents/data.txt", label='0'):
    try:
        record = Speech(source, nchannels, sampleRate, sampleWidth, littleEndian)
    except Exception, error:
        print Exception, " : ", error
        return

    record.getSpeechSegmentByAbsVolume(frameSize, overLap, minLen, minSilence)
    record.energyZeroCount()
    record.getFramePitch()
    record.getWordsPerSeg()
    record.freqAnalyze()
    #record.LPC()
    record.getEnergyBelow250()
    record.getSpeechPercentage()
    record.dataProcess()
    record.writeToFile(feature_file, label)

    return record


def predict(record, scale_model='scale_model', model_file='train_model',
            label_file='records_information'):
    record.predict(scale_model, model_file, label_file)
--------------------------------------------------------------------------------

/ReadMe:
--------------------------------------------------------------------------------
class Speech:
    Members:
        nchannels:
            number of audio channels
        sampleWidth:
            width of each sample in bytes
        sampleRate:
            sampling rate
        nframes:
            total number of samples
        rawData:
            the actual sample data
        totalLength:
            duration in seconds
        speechSegment:
            the voiced segments of the call
        frame:
            list holding rawData after framing
        frameSize:
            length of a frame
        overLap:
            length of the overlap between consecutive frames
        step:
            step of each frame (frameSize - overLap)
        frameNum:
            total number of frames
        zcr:
            short-time zero-crossing rate; the computation involves a zcrThread threshold.
        volume:
            volume of each frame, computed as 10*log((frame amplitude / max amplitude)^2)
        speed:
            speaking rate (words per second, per segment)
        isBlank:
            whether the recording is blank

        absVolume:
            absolute volume: the sum of the absolute sample values of each frame
        shortTimeEnergy:
            short-time energy. Difference from volume: volume is normalized by the
            maximum amplitude, short-time energy is not.
        ezr:
            energy-zero-rate
        ezm:
            energy-zero-multiplication
        pitchSeg:
            pitch segments. Each speechSeg has a corresponding pitchSeg, and each
            pitchSeg stores the start and end position of every vowel (voiced stretch).

    Methods:
        getSpeechSegmentByAbsVolume:
            finds the voiced segments of the call from the absolute volume: based on
            absVolume, a double-threshold method (minLen, minSilence) yields the voiced
            regions, which are stored in the speechSegment member.
        energyZeroCount:
            energy to zero-crossing-rate ratio.
        getFramePitch:
--------------------------------------------------------------------------------
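A minimal usage sketch of the class documented above, mirroring the call sequence in speech_process.py (the input file name is a placeholder):

from speech import Speech

record = Speech('example.wav', 1, 8000, 2, 1)            # source, nchannels, sampleRate, sampleWidth, littleEndian
record.getSpeechSegmentByAbsVolume(256, 128, 0.2, 0.3)   # frameSize, overLap, minLen, minSilence
record.energyZeroCount()                                 # getFramePitch() needs self.ezr
record.getFramePitch()
print record.speechSegment                               # list of (begin, end) frame indices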
/mood_svm.py:
--------------------------------------------------------------------------------
import sys
import os
sys.path.append('/home/will/Documents/data/lib/libsvm-3.17/python')
from svmutil import *


def train(model_file, train_data, scale_model):
    # scale the training data and save the scaling parameters in scale_model
    cmd = './svm-scale -s %s %s > %s' % (scale_model, train_data, 'train_scaled_data')
    os.system(cmd)
    y, x = svm_read_problem('train_scaled_data')
    model = svm_train(y, x)
    svm_save_model(model_file, model)

def classify(scale_model, model_file, predict_data, evaluate=False):
    m = svm_load_model(model_file)
    # restore the scaling parameters saved during training
    cmd = './svm-scale -r %s %s > %s' % (scale_model, predict_data, 'predict_scaled_data')
    os.system(cmd)

    y, x = svm_read_problem('predict_scaled_data')
    p_labels, p_acc, p_vals = svm_predict(y, x, m)
    if evaluate == True:
        (acc, mse, scc) = evaluations(y, p_labels)
        print acc, mse, scc
    else:
        return p_labels

def train_and_classify(scale_model, model_file, train_data, predict_data):
    # train() and classify() are the module-level functions above,
    # and they already run svm-scale themselves
    train(model_file, train_data, scale_model)
    return classify(scale_model, model_file, predict_data)
--------------------------------------------------------------------------------

/record_process.py:
--------------------------------------------------------------------------------
import speech_process
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

def usage():
    print "Usage:\n python record_process.py 'record_file' [-cChannelNum] [-rSampleRate] [-wSampleWidth] [-eLittleEndian] [-lLogFile]"
    print "Default Value:\n -c1 -r8000 -w2 -e1 -llog.txt"
    print "Attention:\n Only PCM records need the parameters above"

def record_process(recordFile, logFile, channelNum, sampleRate, sampleWidth, littleEndian):
    # speech_process() takes the output file as feature_file
    speech_process.speech_process(recordFile, nchannels=channelNum, sampleRate=sampleRate,
                                  sampleWidth=sampleWidth, littleEndian=littleEndian,
                                  feature_file=logFile)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
        sys.exit()
    recordFile = sys.argv[1]

    logFile = 'log.txt'
    channelNum = 1
    sampleRate = 8000
    sampleWidth = 2
    littleEndian = 1
    for arg in sys.argv[2:]:
        if arg.startswith('-c'):
            channelNum = int(arg[2:])
        elif arg.startswith('-r'):
            sampleRate = int(arg[2:])
        elif arg.startswith('-w'):
            sampleWidth = int(arg[2:])
        elif arg.startswith('-e'):
            littleEndian = int(arg[2:])
        elif arg.startswith('-l'):
            logFile = str(arg[2:])
        else:
            print "Unrecognized parameter: %s" % arg
            usage()
            sys.exit()

    record_process(recordFile, logFile, channelNum, sampleRate, sampleWidth, littleEndian)
--------------------------------------------------------------------------------
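A typical invocation, assuming an 8 kHz, 16-bit, little-endian mono PCM recording (the file names are illustrative):

python record_process.py call_001.pcm -c1 -r8000 -w2 -e1 -lfeatures.txt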
/record_analyze.py:
--------------------------------------------------------------------------------
import speech_process
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

def usage():
    print "Usage:\n python record_analyze.py 'record_file' [-cChannelNum] [-rSampleRate] [-wSampleWidth] [-eLittleEndian] [-lLogFile]"
    print "Default Value:\n -c1 -r8000 -w2 -e1 -llog.txt"
    print "Attention:\n Only PCM records need the parameters above"

if len(sys.argv) < 2:
    usage()
    sys.exit()

logFile = 'log.txt'
channelNum = 1
sampleRate = 8000
sampleWidth = 2
littleEndian = 1
for arg in sys.argv[2:]:
    if arg.startswith('-c'):
        channelNum = int(arg[2:])
    elif arg.startswith('-r'):
        sampleRate = int(arg[2:])
    elif arg.startswith('-w'):
        sampleWidth = int(arg[2:])
    elif arg.startswith('-e'):
        littleEndian = int(arg[2:])
    elif arg.startswith('-l'):
        logFile = str(arg[2:])
    else:
        print "Unrecognized parameter: %s" % arg
        usage()
        sys.exit()
record = speech_process.speech_process(sys.argv[1], nchannels=channelNum, sampleRate=sampleRate, sampleWidth=sampleWidth, littleEndian=littleEndian, feature_file=logFile)

time = np.arange(record.frameNum)
plt.subplot(511)
plt.plot(time, record.volume)

plt.subplot(512)
plt.plot(time, record.zcr)
plt.subplot(513)
plt.plot(time, record.shortTimeEnergy)
plt.subplot(514)
plt.specgram(record.rawData, Fs=record.sampleRate, scale_by_freq=True)

time2 = np.arange(len(record.pitch))
plt.subplot(515)
#plt.plot(time2, record.pitch)
plt.show()
--------------------------------------------------------------------------------

/log_analyze.py:
--------------------------------------------------------------------------------
#coding=utf-8
import os
import sys

def evaluate(a, b, c):
    return (a + b + c) / 3

def usage():
    print "Usage:\n python log_analyze.py -iInputLog -oOutputFile [-s|-p|-v] [-nSortNum]"
    print " -s: sort by speed variance  -p: by pitch variance  -v: by volume variance"
    print " default: combined ranking over all records"

def log_analyze(log_input, log_output, sortStrategy, sortNum):
    try:
        fs = open(log_input, 'r')
    except:
        print "File not found"
        sys.exit()
    flag = False
    variance = []
    recordNum = 0
    for line in fs.readlines():
        if flag == True:
            count = count + 1
            if count == speechNum + 1:
                p = line.split()
                item = [record, float(p[1]), float(p[2]), float(p[3])]
                variance.append(item)
                recordNum = recordNum + 1
                flag = False
            continue
        words = line.split()
        if len(words) <= 0:
            continue
        if words[0].endswith('pcm') or words[0].endswith('wav'):
            flag = True
            record = words[0]
            speechNum = int(words[1])
            count = 0
    if sortStrategy != 4:
        variance = sorted(variance, key=lambda v: v[sortStrategy], reverse=True)
    else:
        speedMax = max([v[1] for v in variance])
        pitchMax = max([v[2] for v in variance])
        volumeMax = max([v[3] for v in variance])
        for v in variance:
            a = v[1]/speedMax
            b = v[2]/pitchMax
            c = v[3]/volumeMax
            rank = evaluate(a, b, c)
            v.append(rank)
        variance = sorted(variance, key=lambda v: v[4], reverse=True)

    if sortNum == 'INF':
        sortNum = recordNum

    variance = variance[:sortNum]

    fs = open(log_output, 'w')
    if sortStrategy == 1:
        strategy = 'largest speed variance'
    elif sortStrategy == 2:
        strategy = 'largest pitch variance'
    elif sortStrategy == 3:
        strategy = 'largest volume variance'
    else:
        strategy = 'combined'
    info = ' %d records ranked by the "%s" strategy\n' % (recordNum, strategy)
    fs.write(info)
    info = '%-21s%-18s%-18s%-15s\n' % ('file', 'speed', 'pitch', 'volume')
    fs.write(info)
    for v in variance:
        info = '%-17s%-15.2f%-15.2f%-15.2f\n' % (v[0], v[1], v[2], v[3])
        fs.write(info)
    fs.close()

if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
        sys.exit()
    sortStrategy = 4
    sortNum = 'INF'
    log_input = 'NULL'
    log_output = 'NULL'

    for arg in sys.argv[1:]:
        if arg.startswith('-n'):
            sortNum = int(arg[2:])
        elif arg.startswith('-s'):
            sortStrategy = 1
        elif arg.startswith('-p'):
            sortStrategy = 2
        elif arg.startswith('-v'):
            sortStrategy = 3
        elif arg.startswith('-i'):
            log_input = arg[2:]
        elif arg.startswith('-o'):
            log_output = arg[2:]
        else:
            usage()
            sys.exit()
    if sortNum != 'INF' and sortNum < 1:
        print 'Invalid number of records to sort'
        usage()
        sys.exit()
    if log_input == 'NULL' or log_output == 'NULL':
        print 'Input and output files not specified'
        usage()
        sys.exit()
    log_analyze(log_input, log_output, sortStrategy, sortNum)
--------------------------------------------------------------------------------
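For example, to write the ten records with the largest pitch variance from log.txt into ranked.txt (file names are illustrative):

python log_analyze.py -ilog.txt -oranked.txt -p -n10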
/utils.py:
--------------------------------------------------------------------------------
import math
import numpy as np
import pylab as pl
from spectrogram import spectrogram

def plotVolume(m):
    # uses the subplot() helper below, which takes a list of curves
    subplot([m.volume, m.absVolume, m.shortTimeEnergy])

def plotPitch(m):
    pitch = []
    pitchValue = []
    end = 0
    for index, r in enumerate(m.speechSegment):
        pitch = pitch + [0]*(r[0]-end) + m.pitch[index]
        pitchValue = pitchValue + [0]*(r[0]-end) + [t[1] for t in m.tmp[index]]
        end = r[1]
    pitch = pitch + [0]*(m.frameNum-m.speechSegment[-1][1])
    pitchValue = pitchValue + [0]*(m.frameNum-m.speechSegment[-1][1])
    pl.subplot(511)
    pl.ylabel('pitch')
    pl.plot(pitch)
    for index, s in enumerate(m.pitchSeg):
        for p in s:
            pl.plot([p[0]+m.speechSegment[index][0], p[0]+m.speechSegment[index][0]], [0, 500], color='red')
            pl.plot([p[1]+m.speechSegment[index][0], p[1]+m.speechSegment[index][0]], [0, 500], color='green')
    pl.subplot(512)
    for index, s in enumerate(m.pitchSeg):
        for p in s:
            pl.plot([p[0]+m.speechSegment[index][0], p[0]+m.speechSegment[index][0]], [0, max(pitchValue)], color='red')
            pl.plot([p[1]+m.speechSegment[index][0], p[1]+m.speechSegment[index][0]], [0, max(pitchValue)], color='green')
    pl.ylabel('value')
    pl.plot(pitchValue)
    pl.subplot(513)
    for index, s in enumerate(m.pitchSeg):
        for p in s:
            pl.plot([p[0]+m.speechSegment[index][0], p[0]+m.speechSegment[index][0]], [0, max(m.zcr)], color='red')
            pl.plot([p[1]+m.speechSegment[index][0], p[1]+m.speechSegment[index][0]], [0, max(m.zcr)], color='green')
    pl.ylabel('zcr')
    pl.plot(m.zcr)
    pl.subplot(514)
    for index, s in enumerate(m.pitchSeg):
        for p in s:
            pl.plot([p[0]+m.speechSegment[index][0], p[0]+m.speechSegment[index][0]], [0, 1], color='red')
            pl.plot([p[1]+m.speechSegment[index][0], p[1]+m.speechSegment[index][0]], [0, 1], color='green')
    pl.ylabel('volume')
    # absVolume is a plain list, so normalize through numpy
    pl.plot(np.asarray(m.absVolume)/max(m.absVolume))
    pl.subplot(515)
    for index, s in enumerate(m.pitchSeg):
        for p in s:
            pl.plot([p[0]+m.speechSegment[index][0], p[0]+m.speechSegment[index][0]], [0, max(m.ezr)], color='red')
            pl.plot([p[1]+m.speechSegment[index][0], p[1]+m.speechSegment[index][0]], [0, max(m.ezr)], color='green')
    pl.ylabel('ezr')
    pl.plot(m.ezr)
    pl.show()

def plot(m):
    pl.plot(m)
    pl.show()

def subplot(m):
    # lay the curves in m out vertically: len(m) rows, one column
    n = len(m)
    n = n*100 + 10 + 1
    for k in m:
        pl.subplot(n)
        pl.plot(k)
        n = n + 1
    pl.show()

def plotAll(m):
    pitch = []
    end = 0
    for index, r in enumerate(m.speechSegment):
        pitch = pitch + [0]*(r[0]-end) + m.pitch[index]
        end = r[1]
    pitch = pitch + [0]*(m.frameNum-m.speechSegment[-1][1])
    pl.subplot(611)
    pl.plot(m.absVolume)
    #pl.xlabel('Frame Num')
    pl.ylabel('Volume')
    #pl.grid(True)
    for s in m.speechSegment:
        pl.plot([s[0], s[0]], [0, max(m.absVolume)], color='red')
        pl.plot([s[1], s[1]], [0, max(m.absVolume)], color='green')

    pl.subplot(612)
    pl.plot(m.zcr)
    pl.ylabel('Zero Cross Rate')
    pl.subplot(613)
    pl.plot(pitch)
    pl.ylabel('Pitch')
    for index, s in enumerate(m.pitchSeg):
        for p in s:
            pl.plot([p[0]+m.speechSegment[index][0], p[0]+m.speechSegment[index][0]], [0, 500], color='red')
            pl.plot([p[1]+m.speechSegment[index][0], p[1]+m.speechSegment[index][0]], [0, 500], color='green')
    pl.subplot(614)
    pl.plot(m.f1)
    pl.ylabel('Formant 1')
    pl.subplot(615)
    spectrogram(m)
    pl.ylabel("spectrogram")
    pl.xlabel('Frame Num')
    pl.subplot(616)
    pl.ylabel("Energy Below 250Hz")
    pl.plot(m.energyBelow250)
    pl.show()
# method 1: sum of absolute values
def calVolume(waveData, frameSize, overLap):
    wlen = len(waveData)
    step = frameSize - overLap
    frameNum = int(math.ceil(wlen*1.0/step))
    volume = np.zeros((frameNum, 1))
    for i in range(frameNum):
        curFrame = waveData[np.arange(i*step, min(i*step+frameSize, wlen))]
        curFrame = curFrame - np.median(curFrame)  # zero-justified
        volume[i] = np.sum(np.abs(curFrame))
    return volume

# method 2: 10 times log10 of the square sum
def calVolumeDB(waveData, frameSize, overLap):
    wlen = len(waveData)
    step = frameSize - overLap
    frameNum = int(math.ceil(wlen*1.0/step))
    volume = np.zeros((frameNum, 1))
    for i in range(frameNum):
        curFrame = waveData[np.arange(i*step, min(i*step+frameSize, wlen))]
        curFrame = curFrame - np.mean(curFrame)  # zero-justified
        volume[i] = 10*np.log10(np.sum(curFrame*curFrame))
    return volume

def argLocalMax(R):
    # indices of strict local maxima of R
    result = []
    for r in range(1, len(R)-1):
        if R[r-1] < R[r] and R[r] > R[r+1]:
            result.append(r)
    return result

# Auto-Correlation Function
def ACF(frame):
    flen = len(frame)
    acf = np.zeros(flen)
    for i in range(flen):
        try:
            acf[i] = np.dot(frame[i:flen], frame[0:flen-i])
        except:
            print frame[i:flen]
            print i, flen
            raise Exception('Error')

    return acf

def zcr(frame, zcrThread):
    n = len(frame)
    cnt = 0
    for i in range(n-1):
        if ((np.sign(frame[i])*np.sign(frame[i+1]) < 0) and (abs(frame[i]-frame[i+1]) > zcrThread)):
            cnt = cnt + 1
    return cnt

def advancedACF(frame1, frame2):
    # correlate over two consecutive frames so every lag uses the same
    # number of samples
    if len(frame1) != len(frame2):
        return -1
    frame = np.concatenate((frame1, frame2))
    flen = len(frame1)
    acf = np.zeros(flen)
    for k in range(flen/2):
        acf[k] = np.dot(frame[:flen], frame[k:flen+k])
    return acf

# average magnitude difference function
def AMDF(frame):
    flen = len(frame)
    amdf = np.zeros(flen)
    for i in range(flen):
        amdf[i] = np.sum(abs(frame[i:flen]-frame[0:flen-i]))/(flen-i+1)
    return findPitch(amdf)

def findPitch(amdf):
    # skip the first 30 lags so the zero-lag minimum is not picked
    return np.argmin(amdf[30:]) + 30


def complexGreater(a, b):
    return abs(a)**2 > abs(b)**2

def isPositive(frame):
    trans = []
    for i in frame:
        if i >= 0:
            trans.append(1)
        else:
            trans.append(-1)
    return np.array(trans)
--------------------------------------------------------------------------------
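A small self-contained check of the ACF-based pitch logic that Speech.getFramePitch builds on; the 200 Hz test tone is an arbitrary choice, not part of the repository:

import numpy as np
import utils

sr = 8000
t = np.arange(256)/float(sr)
frame = np.sin(2*np.pi*200*t)      # synthetic 200 Hz tone, one 256-sample frame
acf = utils.ACF(frame)
acf[:sr/450] = -abs(acf[0])        # suppress lags above 450 Hz, as getFramePitch does
lag = np.argmax(acf)               # strongest remaining lag = one pitch period
print sr/lag                       # prints 200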
/speech.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import math
import wave
import numpy as np
from math import ceil, log
import utils
import scipy.signal as signal
from scipy.signal import argrelmax
import scipy
import pylab as pl
from scikits.talkbox import lpc as talkboxLpc
from mood_svm import classify


class Speech:
    def __init__(self, source, nchannels, sampleRate, sampleWidth, littleEndian):
        self.error = []
        self.fileName = source
        if source.find('.pcm') != -1:
            if nchannels == 0 or sampleWidth == 0 or sampleRate == 0 or littleEndian == -1:
                raise Exception('WrongParametersForPCM')
            try:
                fw = open(source, 'r')
            except:
                print "File %s can't be found" % (source)
                raise Exception('FileNotFound')
            self.nchannels = nchannels
            self.sampleRate = sampleRate
            self.sampleWidth = sampleWidth
            rawData = fw.read()
            if sampleWidth == 1:
                dtype = np.int8
            elif sampleWidth == 2:
                dtype = np.int16
            elif sampleWidth == 4:
                dtype = np.int32
            self.rawData = np.fromstring(rawData, dtype=dtype)
            self.nframes = len(self.rawData)
            fw.close()

        elif source.find('.wav') != -1:
            try:
                fw = wave.open(source, 'r')
            except:
                print "File %s can't be found" % (source)
                raise Exception('FileNotFound')
            params = fw.getparams()

            self.nchannels, self.sampleWidth, self.sampleRate, self.nframes = params[:4]
            rawData = fw.readframes(self.nframes)
            if self.sampleWidth == 1:
                dtype = np.int8
            elif self.sampleWidth == 2:
                dtype = np.int16
            elif self.sampleWidth == 4:
                dtype = np.int32
            self.rawData = np.fromstring(rawData, dtype=dtype)
            fw.close()

        maxData = max(abs(self.rawData))
        #print maxData
        if maxData < 1200:
            self.isBlank = True
            print 'blank'
        else:
            self.isBlank = False

        self.maxData = maxData
        #self.rawData = self.rawData*1.0/maxData

        self.rawData = self.rawData*1.0
        self.totalLength = self.nframes*1.0/self.nchannels/self.sampleRate
        self.speechSegment = []
        self.frame = []
        self.zcr = []
        self.shortTimeEnergy = []
        self.volume = []
        self.speed = []

    def __del__(self):
        pass

    def getSingleWords(self):
        pass

    def getEnergyBelow250(self):
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return

        self.energyBelow250 = []
        # index of the FFT bin at 250 Hz (the rfft bin spacing is sampleRate/frameSize)
        loc = int(250.0*self.frameSize/self.sampleRate)
        for fftFrame in self.fftFrameAbs:
            totalEnergy = np.sum(fftFrame)
            below250 = np.sum(fftFrame[:loc])
            if totalEnergy == 0:
                totalEnergy = 1
            self.energyBelow250.append(below250/totalEnergy)

    # uses speechSegment, which getSpeechSegmentByAbsVolume fills in
    def getSpeechPercentage(self):
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return

        self.speechPercentage = 0
        speechFrame = 0
        for i in self.speechSegment:
            speechFrame = speechFrame + i[1] - i[0]
        self.speechPercentage = speechFrame*1.0/self.frameNum
        self.speechLength = self.speechPercentage*self.totalLength

    def energyZeroCount(self):
        self.ezr = []
        self.ezm = []
        for i in range(len(self.shortTimeEnergy)):
            if self.zcr[i] != 0:
                ezr = self.shortTimeEnergy[i]/self.zcr[i]/10000000
            else:
                ezr = self.shortTimeEnergy[i]/10000000
            ezm = self.shortTimeEnergy[i]*self.zcr[i]/10000000
            self.ezr.append(ezr)
            self.ezm.append(ezm)

    # computes speechSegment, volume, absVolume, and shortTimeEnergy
    def getSpeechSegmentByAbsVolume(self, frameSize, overLap, minLen, minSilence):
        zcrThread = 0
        if self.isBlank == True:
            return
        if frameSize <= overLap:
            raise Exception('Wrong getFrames parameters')
        self.frameSize = frameSize
        self.overLap = overLap
        self.step = self.frameSize - self.overLap
        self.frameNum = int(ceil(self.nframes/self.step))
        self.absVolume = []
        for i in range(self.frameNum):
            self.frame.append(self.rawData[i*self.step:min(i*self.step+frameSize, self.nframes)])
            #zcrThread = max(self.frame[i])/8
            zcr = utils.zcr(self.frame[i], zcrThread)
            self.zcr.append(zcr)
            self.shortTimeEnergy.append(sum([k**2 for k in self.frame[i]]))
            cal = np.sum(self.frame[i]*self.frame[i]*1.0/self.maxData/self.maxData)
            if cal == 0:
                cal = 0.001
            self.volume.append(10*np.log(cal))
            self.absVolume.append(np.sum(np.abs(self.frame[i])))
        # Two thresholds on absVolume
        tHoldLow = min(max(self.absVolume)/10, 3*self.maxData)
        tHoldHigh = min(max(self.absVolume)/6, 6*self.maxData)
        self.tHoldHigh = tHoldHigh
        self.tHoldLow = tHoldLow
        self.segmentTime = []
        # status of the endpoint-detection state machine
        # 0=>silence 1=>mayBegin 2=>speechSegment 3=>end
        status = 0
        count = 0
        segmentBeg = 0
        silence = 0  # length of the current run of silent frames
        minSilence = int(minSilence*self.sampleRate/self.frameSize)  # after minSilence consecutive silent frames the speech has probably ended
        minLen = int(minLen*self.sampleRate/self.frameSize)  # a speech segment must be at least minLen frames long
        segmentEnd = 0
        for i in range(self.frameNum):
            if (status == 0) or (status == 1):
                if self.absVolume[i] > tHoldHigh:
                    segmentBeg = i - count
                    status = 2
                    silence = 0
                    count = count + 1
                elif self.absVolume[i] > tHoldLow:
                    status = 1
                    count = count + 1
                else:
                    status = 0
                    count = 0
            elif status == 2:
                if self.absVolume[i] > tHoldLow:
                    count = count + 1
                    silence = 0
                else:
                    silence = silence + 1
                    if silence < minSilence:  # silence is not long enough to end the speech
                        count = count + 1
                    elif count < minLen:  # speech is so short that it must be noise
                        status = 0
                        silence = 0
                        count = 0
                    else:
                        segmentEnd = i - minSilence
                        self.speechSegment.append((segmentBeg, segmentEnd))
                        self.segmentTime.append((segmentBeg*self.totalLength/self.frameNum, segmentEnd*self.totalLength/self.frameNum))
                        status = 0
                        count = 0
                        silence = 0
        if status == 2:
            self.speechSegment.append((segmentBeg, self.frameNum))
            self.segmentTime.append((segmentBeg*1.0/self.sampleRate, self.frameNum*1.0/self.sampleRate))
        self.speechTime = sum([v[1]-v[0] for v in self.segmentTime])
        self.totalSeg = len(self.speechSegment)
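    # Worked example (added for clarity) with the defaults from speech_process.py,
    # sampleRate=8000, frameSize=256, minSilence=0.3 s, minLen=0.2 s:
    #   minSilence -> int(0.3*8000/256) = 9: nine consecutive silent frames close a segment
    #   minLen     -> int(0.2*8000/256) = 6: anything shorter than six frames is dropped as noise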
    # 50 Hz ~ 450 Hz
    # To suppress the influence of the formants, use a band-pass filter (60-900 Hz)
    # or center clipping
    def getFramePitch(self):
        # pitchSeg stores the start and end of each voiced pitch stretch.
        # There is one pitchSeg per speechSeg, so its length equals len(self.speechSegment);
        # e.g. self.pitchSeg[m][n] is the (n+1)-th voiced stretch of the (m+1)-th speech segment.
        self.pitchSeg = []
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return

        pitchThread = int(self.sampleRate/450)
        self.pitch = []
        self.tmp = []
        b, a = signal.iirdesign([60.0*2/self.sampleRate, 950.0*2/self.sampleRate], [50.0*2/self.sampleRate, 1000.0*2/self.sampleRate], 2, 40)
        for segTime in self.speechSegment:
            tmp = []
            pitchSum = 0
            beg = segTime[0]
            end = segTime[1]
            curFramePitch = []
            for frame in self.frame[beg:end]:
                #frameFilt = signal.lfilter(b, a, frame)
                pitch = utils.ACF(frame)
                pitch[:pitchThread] = -abs(pitch[0])
                pitchMax = np.argmax(pitch)
                if pitchMax == 0:
                    # degenerate frame: log it and record a zero pitch
                    # instead of dividing by zero below
                    self.error.append(('pitch error', pitch, utils.ACF(frame)))
                    tmp.append((0, 0))
                    curFramePitch.append(0)
                    continue
                tmp.append((self.sampleRate/pitchMax, pitch[pitchMax]/1000000))
                curFramePitch.append(self.sampleRate/pitchMax)
            pitchHigh = np.max(tmp, 0)[1]/12.0
            pitchLow = np.max(tmp, 0)[1]/24.0
            #pitchHigh = 0
            #pitchLow = 0
            ezrLevel = max(self.ezr[beg:end])*0.2
            volumeHigh = np.max(self.absVolume[beg:end])/4
            volumeLow = volumeHigh/2

            #zcrHigh = np.max(self.zcr[beg:end])/2
            #zcrLow = zcrHigh/1
            zcrHigh = 1000
            zcrLow = 1000
            # 0 => unvoiced  1 => possibly voiced  2 => voiced  3 => end of voiced
            status = 0
            trange = []
            count = 0
            silence = 0
            self.pitch.append(curFramePitch)
            for t in range(len(tmp)):
                if tmp[t][1] > pitchHigh and self.ezr[t+beg] > ezrLevel:
                    if status == 0:
                        start = t
                        duration = 0
                        status = 1
                    duration = duration + 1
                else:
                    if status == 1:
                        trange.append((start, t))
                        status = 0
                        duration = 0
            if status == 1:
                trange.append((start, len(tmp)))
            self.pitchSeg.append(trange)
            # exactly one tmp per speech segment (plotPitch relies on this alignment)
            self.tmp.append(tmp)


    def getFramePitchAdvanced(self):
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return

        self.p = []
        b, a = signal.iirdesign([60.0*2/self.sampleRate, 950.0*2/self.sampleRate], [50.0*2/self.sampleRate, 1000.0*2/self.sampleRate], 2, 40)
        for frame in self.frame:
            filt = signal.lfilter(b, a, frame)
            minAMDF = utils.AMDF(filt)
            pitch = self.sampleRate/minAMDF
            self.p.append(pitch)
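    # Note on the band-pass design shared by both pitch methods above:
    # signal.iirdesign(wp, ws, gpass, gstop) takes band edges normalized to the
    # Nyquist frequency (sampleRate/2), so the passband is 60-950 Hz with the
    # stopband starting at 50 Hz and 1000 Hz, at most 2 dB passband ripple and
    # at least 40 dB stopband attenuation.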
    def tt(self, a, b):
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return

        self.getFramePitchAdvanced()
        #self.p = np.clip(self.p, 0, 450)
        pl.subplot(211)
        pl.plot(self.p[a:b])
        pl.subplot(212)
        pl.plot(self.absVolume[a:b])
        pl.show()

    def LPC(self):
        print "LPC"
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return
        self.ar = []
        self.fmt = []
        self.bw = []
        self.frqs = []
        for frame in self.frame:
            #[ar, var, reflec] = yulewalker.aryule(frame, 8)
            [ar, var, reflec] = talkboxLpc(frame, 8)
            self.ar.append(ar)
            rts = np.roots(ar)
            rts = [r for r in rts if np.imag(r) >= 0]
            angz = np.asarray([math.atan2(np.imag(r), np.real(r)) for r in rts])
            angz = angz*self.sampleRate/(np.pi*2)
            frqs = [(angz[i], i) for i in range(len(angz))]
            frqs.sort()
            self.frqs.append(frqs)
            fmt = []
            bandwidth = []
            for kk in range(len(frqs)):
                bw = -1.0/2*(self.sampleRate/(2*np.pi))*np.log(np.abs(rts[frqs[kk][1]]))
                if ((frqs[kk][0] > 90) and (bw < 400)):
                    fmt.append(frqs[kk][0])
                    bandwidth.append(bw)

            fmt.sort()
            fmt = fmt[:3]
            self.fmt.append(fmt)
            self.bw.append(bandwidth)
        self.f1 = []
        for f in self.fmt:
            if len(f) == 0:
                self.f1.append(0)
            else:
                self.f1.append(f[0])

    def freqAnalyze(self):
        print "freqAnalyze"
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return
        self.shortTimeLinjieVector = []
        self.formant = []
        self.fftFrameAbs = []
        # critical-band feature vector of the short-time spectrum
        F = [0]
        fs = self.sampleRate/self.frameSize
        for i in range(1, 19):
            m = int((i+0.53)*1960/(26.81-0.53-i))
            n = m/fs
            F.append(n)
        self.formantValue = []
        self.fftFrame = []
        #h1, f1 = signal.freqz([1, -0.98], [1])
        for frame in self.frame[:-1]:
            # window function
            f = frame*signal.hamming(self.frameSize, sym=0)
            # pre-emphasis
            #f = scipy.signal.lfilter([1, -0.97], 1, f)
            fftFrame = np.fft.rfft(f)/(self.frameSize/2)
            self.fftFrame.append(fftFrame)
            fftFrameAbs = [abs(fft) for fft in fftFrame]
            self.fftFrameAbs.append(fftFrameAbs)
            # critical-band feature vector of the short-time spectrum
            '''
            g = np.zeros(17)
            beg = 1

            for i in range(1, 17):
                for k in range(beg, min(len(fftFrame), F[i]+1)):
                    g[i] = g[i] + abs(fftFrame[k])**2
                beg = F[i]+1
            self.shortTimeLinjieVector.append(g)
            # formants
            g0 = utils.argLocalMax(fftFrameAbs)
            points = [(fftFrameAbs[g], g) for g in g0]
            points.sort()
            m = min(3, len(g0))
            formant = []
            for i in range(m):
                formant.append(points[-i-1][1]*fs)

            self.formantValue.append(points[-1][0])
            formant.sort()
            self.formant.append(formant)
            '''

    # default thresholds: words must span >10 frames, >5 silent frames end a word
    def getWordsPerSeg(self, minLen=10, minSilence=5, preLen=2):
        print "getWordsPerSeg"
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return
        status = 0
        self.segWord = []
        for seg in self.speechSegment:
            status = 0
            silence = 0
            segBeg = seg[0]
            segEnd = seg[1]
            volumeHigh = max(self.volume)/4
            volumeLow = max(self.volume)/8
            zcrHigh = max(self.zcr)/4
            zcrLow = max(self.zcr)/8
            word = 0
            segWord = []
            count = 0
            precount = 0
            crest = 0
            wordBeg = 0
            for frame in range(segBeg, segEnd):
                if status == 0 or status == 1:
                    crest = max(crest, self.volume[frame])
                    if self.volume[frame] >= volumeHigh:
                        status = 2
                        count = precount + 1
                        wordBeg = frame - count
                    elif self.volume[frame] >= volumeLow:
                        status = 1
                        precount = precount + 1
                        if precount >= preLen:
                            status = 2
                            wordBeg = frame - precount
                            count = precount
                            precount = 0
                    else:
                        precount = 0
                        status = 0
                elif status == 2:
                    crest = max(crest, self.volume[frame])
                    if self.volume[frame] >= volumeLow and self.volume[frame] >= crest/2:
                        count = count + 1
                        silence = 0
                    else:
                        silence = silence + 1
                        if silence > minSilence:
                            status = 0
                            crest = 0
                            if frame-wordBeg+1 > minLen:
                                word = word + 1
                                segWord.append((wordBeg, frame))
                            precount = 0
                            count = 0
            if status == 2:
                segWord.append((wordBeg, segEnd))
            self.segWord.append(segWord)
            self.speed.append(len(segWord)*1.0/((segEnd-segBeg)*1.0*(self.frameSize-self.overLap)/self.sampleRate))


    def dataProcess(self):
        # Per-segment features: speaking rate, mean pitch, pitch range, max pitch,
        # min pitch, mean absolute first-order pitch difference, amplitude,
        # amplitude std, amplitude max, fraction of energy below 250 Hz,
        # first formant, first formant range
        # (self.speed, self.pitchAverage, self.pitchRange, self.volumeAverage,
        #  self.volumeStd, self.fmtAverage, self.fmtRange)
        if len(self.speechSegment) == 0 or self.isBlank == True:
            return
        self.num = len(self.speechSegment)

        print "We have %d speech segments in all" % self.num
        self.pitchAverage = 0
        self.pitchAveragePerSeg = []
        self.pitchRange = []
        self.volumeAverage = []
        self.volumeStd = []
        self.fmtAverage = []
        self.pitchMax = []
        self.pitchMin = []
        self.pitchStd = []
        self.volumeMax = []
        self.volumeMin = []
        self.volumeDiff = []
        self.pitchDiff = []
        self.below250 = []
        self.pitchNum = 0
        for i in range(self.num):
            pitchSum = 0
            pitchNum = 0
            pitchMax = 0
            pitchMin = 1000
            pitchDiff = 0
            for k in range(len(self.pitchSeg[i])):
                # pitchSeg holds the start/end points of the stretches of pitch
                # that fall within 50-450 Hz; it can be used to follow how the
                # pitch moves over time
                pitchSum = pitchSum + sum(self.pitch[i][self.pitchSeg[i][k][0]:self.pitchSeg[i][k][1]])
                pitchNum = pitchNum + self.pitchSeg[i][k][1] - self.pitchSeg[i][k][0]

                if self.pitchSeg[i][k][0] == self.pitchSeg[i][k][1]:
                    print self.fileName
                pitchMax = max(max(self.pitch[i][self.pitchSeg[i][k][0]:self.pitchSeg[i][k][1]]), pitchMax)
                pitchMin = min(min(self.pitch[i][self.pitchSeg[i][k][0]:self.pitchSeg[i][k][1]]), pitchMin)
                if k != len(self.pitchSeg[i])-1:
                    pitchDiff = pitchDiff + abs(self.pitch[i][k+1]-self.pitch[i][k])

            if pitchNum == 0:
                self.pitchAveragePerSeg.append(0)
                self.pitchRange.append(0)
                self.pitchMax.append(0)
                self.pitchMin.append(0)
                self.pitchDiff.append(0)
                continue
            self.pitchAverage = self.pitchAverage + pitchSum
            self.pitchNum = self.pitchNum + pitchNum
            self.pitchAveragePerSeg.append(pitchSum*1.0/pitchNum)
            pitchDiff = pitchDiff*1.0/pitchNum
            self.pitchDiff.append(pitchDiff)
            self.pitchRange.append((pitchMax-pitchMin))
            self.pitchMax.append(pitchMax)
            self.pitchMin.append(pitchMin)

        if self.pitchNum != 0:
            self.pitchAverage = self.pitchAverage/self.pitchNum
        for i in range(self.num):
            beg = self.speechSegment[i][0]
            end = self.speechSegment[i][1]
            below250 = np.average(self.energyBelow250[beg:end])
            self.below250.append(below250)
            volume = np.average(self.absVolume[beg:end])
            volumeStd = np.std(self.absVolume[beg:end])
            #fmtAverage = np.average(self.f1[beg:end])
            self.volumeAverage.append(volume)
            self.volumeStd.append(volumeStd)
            self.volumeMax.append(np.max(self.absVolume[beg:end]))
            self.volumeMin.append(np.min(self.absVolume[beg:end]))
            #self.fmtAverage.append(fmtAverage)
            volumeDiff = 0
            for k in range(beg, end-1):
                volumeDiff = volumeDiff + abs(self.absVolume[k+1]-self.absVolume[k])
            self.volumeDiff.append(volumeDiff*1.0/(end-beg))
        self.features = []
        for i in range(self.num):
            features = [self.pitchMax[i], self.pitchAveragePerSeg[i], self.pitchRange[i], self.pitchMin[i], self.pitchDiff[i], self.volumeAverage[i], self.volumeStd[i], self.volumeMax[i], self.volumeDiff[i], self.below250[i]]
            self.features.append(features)
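    # The per-segment feature vector assembled above, with the libsvm indices
    # that writeToFile() assigns:
    #   1 pitchMax   2 pitchAveragePerSeg   3 pitchRange   4 pitchMin   5 pitchDiff
    #   6 volumeAverage   7 volumeStd   8 volumeMax   9 volumeDiff   10 below250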
    def predict(self, scale_model, model_file, label_file):
        self.gender = 'Unknown'
        self.category = -1
        predict_data = 'predict_data'
        # categories: all-silent / single-interaction normal hang-up /
        # single-interaction abnormal hang-up / multi-interaction normal hang-up /
        # multi-interaction abnormal hang-up
        if self.isBlank == True:
            self.category = 'all-silent'
        else:
            if 100 <= self.pitchAverage < 200:
                self.gender = 'male'
            elif self.pitchAverage > 200:
                self.gender = 'female'
            self.writeToFile(predict_data, '0')
            self.labels = classify(scale_model, model_file, predict_data)
            cmd = 'rm %s' % predict_data
            os.system(cmd)
            fs = open(label_file, 'a')
            if len(self.labels) == 1:
                if self.labels[0] == -1:
                    self.category = 'single-interaction normal hang-up'
                else:
                    self.category = 'single-interaction abnormal hang-up'
            else:
                for label in self.labels:
                    if label == 1:
                        self.category = 'multi-interaction abnormal hang-up'
                        break
                if self.category == -1:
                    self.category = 'multi-interaction normal hang-up'
            self.label = np.average(self.labels)

            fs.write('File: %s\nCategory: %s\nLabel: %s\n' % (self.fileName, self.category, self.labels))
            fs.write('total length  speech length  segments  speaker volume  speaker gender  speaker pitch\n')
            fs.write('%-15.2f%-15.2f%-15f%-18.2f%-16s%-20.2f\n' % (self.totalLength, self.speechLength, self.num, np.average(self.volumeAverage), self.gender, self.pitchAverage))
            fs.close()


    # used for machine learning
    def writeToFile(self, dataFile, label='0'):
        # one libsvm-format line per segment:
        # 1:pitchMax 2:pitchAveragePerSeg 3:pitchRange 4:pitchMin 5:pitchDiff
        # 6:volumeAverage 7:volumeStd 8:volumeMax 9:volumeDiff 10:below250
        fs = open(dataFile, 'a')
        cnt = 1
        #fs.write(self.fileName+'\n')
        if self.isBlank == True:
            fs.write('%s %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f\n' % ('0', 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0))
        else:
            for i in range(self.num):
                cnt = cnt + 1
                fs.write('%s %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f %d:%f\n' % (label, 1, self.pitchMax[i], 2, self.pitchAveragePerSeg[i], 3, self.pitchRange[i], 4, self.pitchMin[i], 5, self.pitchDiff[i], 6, self.volumeAverage[i], 7, self.volumeStd[i], 8, self.volumeMax[i], 9, self.volumeDiff[i], 10, self.below250[i]))

        fs.close()

    '''
    def lpc(self):
        print "lpc"
        if len(self.speechSegment)==0 or self.isBlank==True:
            return
        self.numerator = []
        for frame in self.frame:
            acdata = acorr(frame)
            filt = levinson_durbin(acdata, 8)
            self.numerator.append(filt.numerator)


    def pre_getWordsPerSeg(self, a=2, T=3):
        self.transitionTag = []
        for seg in self.speechSegment:
            segBeg = seg[0]
            segEnd = seg[1]
            TshortTimeEnergyBefore = sum(self.speechSegment[segBeg:segBeg+3])
            TshortTimeEnergyAfter = sum(self.speechSegment[segBeg+2:segBeg+5])
            for k in xrange(segBeg+2, segEnd-4):
                curShortEnergy = self.shortTimeEnergy[k]
                curZcr = self.zcr[k]
                flag = False
                if curShortEnergy > a*self.shortTimeEnergy[k+1] or curShortEnergy*a < self.shortTimeEnergy[k+1] or curZcr > a*self.zcr[k+1] or curZcr*a < self.zcr[k+1]: