├── toolbox ├── readme.txt ├── GetLenScp.exe ├── step3_le2be.m ├── le2be_for_all_files_func.m ├── weights │ ├── gen_rand_net │ │ ├── Gen_rand_net │ │ ├── Extend_rand_net │ │ ├── Gen_rand_net.cpp │ │ ├── Extend_rand_net.cpp │ │ ├── Extend_rand_net_boost │ │ └── Extend_rand_net_boost.cpp │ ├── readme.txt │ ├── Gen_rand_wts_for_ReLUs_forCudaTrain.pl │ └── change_cudaSavedModels2matlabWeigths_4layers.m ├── rand_list.pl └── main_dir_le2be.m ├── BPtrain ├── BP_GPU.h ├── BPtrain.cc ├── enh_wav_example ├── readme.txt ├── test1_mySEDNN.wav ├── test1_org_noisy.wav ├── test1_other_cnn.wav ├── test2_noisy_chinese.wav ├── test2_mySEDNN_chinese.wav ├── test3_ForestGump_noisy.wav ├── test3_ForestGump_logMMSE_enh.wav └── test3_ForestGump_Proposed DNN_enh.wav ├── BP_GPU.h.bak ├── Interface.cc ├── BP_GPU.cu.bak ├── BPtrain.cc.bak ├── Interface.cc.bak ├── Makefile.bak ├── Makefile ├── read_htk_fea.m ├── README.md~ ├── Interface.h.bak ├── Interface.h ├── how_to_get_pfile.txt ├── README.md ├── DevFunc.cu.bak ├── finetune_DNN_speech_enhancement_dropout_NAT.pl ├── DevFunc.cu ├── DevFunc.h ├── DevFunc.h.bak └── BP_GPU.cu /toolbox/readme.txt: -------------------------------------------------------------------------------- 1 | some useful tools: get_len, random_list, etc. 2 | -------------------------------------------------------------------------------- /BPtrain: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/BPtrain -------------------------------------------------------------------------------- /BP_GPU.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/BP_GPU.h -------------------------------------------------------------------------------- /BPtrain.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/BPtrain.cc -------------------------------------------------------------------------------- /enh_wav_example/readme.txt: -------------------------------------------------------------------------------- 1 | collected enhanced noisy example, compared with other methods 2 | -------------------------------------------------------------------------------- /BP_GPU.h.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/BP_GPU.h.bak -------------------------------------------------------------------------------- /Interface.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/Interface.cc -------------------------------------------------------------------------------- /BP_GPU.cu.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/BP_GPU.cu.bak -------------------------------------------------------------------------------- /BPtrain.cc.bak: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/BPtrain.cc.bak -------------------------------------------------------------------------------- /Interface.cc.bak: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/Interface.cc.bak -------------------------------------------------------------------------------- /toolbox/GetLenScp.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/GetLenScp.exe -------------------------------------------------------------------------------- /toolbox/step3_le2be.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/step3_le2be.m -------------------------------------------------------------------------------- /enh_wav_example/test1_mySEDNN.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test1_mySEDNN.wav -------------------------------------------------------------------------------- /enh_wav_example/test1_org_noisy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test1_org_noisy.wav -------------------------------------------------------------------------------- /enh_wav_example/test1_other_cnn.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test1_other_cnn.wav -------------------------------------------------------------------------------- /toolbox/le2be_for_all_files_func.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/le2be_for_all_files_func.m -------------------------------------------------------------------------------- /enh_wav_example/test2_noisy_chinese.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test2_noisy_chinese.wav -------------------------------------------------------------------------------- /enh_wav_example/test2_mySEDNN_chinese.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test2_mySEDNN_chinese.wav -------------------------------------------------------------------------------- /enh_wav_example/test3_ForestGump_noisy.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test3_ForestGump_noisy.wav -------------------------------------------------------------------------------- /toolbox/weights/gen_rand_net/Gen_rand_net: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/weights/gen_rand_net/Gen_rand_net -------------------------------------------------------------------------------- /toolbox/weights/gen_rand_net/Extend_rand_net: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/weights/gen_rand_net/Extend_rand_net -------------------------------------------------------------------------------- /toolbox/weights/gen_rand_net/Gen_rand_net.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/weights/gen_rand_net/Gen_rand_net.cpp -------------------------------------------------------------------------------- /enh_wav_example/test3_ForestGump_logMMSE_enh.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test3_ForestGump_logMMSE_enh.wav -------------------------------------------------------------------------------- /toolbox/weights/gen_rand_net/Extend_rand_net.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/weights/gen_rand_net/Extend_rand_net.cpp -------------------------------------------------------------------------------- /toolbox/weights/gen_rand_net/Extend_rand_net_boost: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/weights/gen_rand_net/Extend_rand_net_boost -------------------------------------------------------------------------------- /enh_wav_example/test3_ForestGump_Proposed DNN_enh.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/enh_wav_example/test3_ForestGump_Proposed DNN_enh.wav -------------------------------------------------------------------------------- /toolbox/weights/gen_rand_net/Extend_rand_net_boost.cpp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongxuUSTC/DNN-for-speech-enhancement/HEAD/toolbox/weights/gen_rand_net/Extend_rand_net_boost.cpp -------------------------------------------------------------------------------- /toolbox/weights/readme.txt: -------------------------------------------------------------------------------- 1 | Gen_rand_wts_for_ReLUs_forCudaTrain.pl : generates random ReLU-based weights for training, i.e. initializes the DNN weights randomly 2 | 3 | change_cudaSavedModels2matlabWeigths_4layers.m : converts the CUDA-saved weights back into MATLAB-supported weights for decoding 4 | -------------------------------------------------------------------------------- /toolbox/rand_list.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use List::Util 'shuffle'; 3 | 4 | my ($ilist, $olist) = @ARGV; 5 | my @list; 6 | my @ind; 7 | my @shuffled; 8 | my $num; 9 | my $i; 10 | 11 | open(FILE_IN, "$ilist"); 12 | open(FILE_OUT,">$olist"); 13 | @list = <FILE_IN>; 14 | $num = @list; 15 | @ind = (0..$num-1); 16 | @shuffled = shuffle(@ind); 17 | foreach $i(@shuffled) 18 | { 19 | print FILE_OUT $list[$i]; 20 | } 21 | 22 | close(FILE_IN); 23 | close(FILE_OUT); 24 | 25 | -------------------------------------------------------------------------------- /toolbox/main_dir_le2be.m: -------------------------------------------------------------------------------- 1 | clc 2 | clear all; 3 | 4 | scp_list='CleanTR08.SCP'; 5 | flsp=fopen(scp_list); 6 | tline=fgetl(flsp); 7 |
system('mkdir clean_be'); 8 | line_num=0; 9 | while(tline~=-1) 10 | line_num=line_num+1; 11 | 12 | old_tline=tline; 13 | out_tline=strrep(tline,'clean','clean_be'); 14 | tline=old_tline; 15 | out_tline=strrep(out_tline,'.08','.08'); 16 | 17 | le2be_for_all_files_func(tline, out_tline); 18 | % break; 19 | tline=fgetl(flsp); 20 | end -------------------------------------------------------------------------------- /Makefile.bak: -------------------------------------------------------------------------------- 1 | NVCC := /usr/local/cuda/bin/nvcc 2 | #NVCC := /usr/local/cuda-5.0/bin/nvcc 3 | #NVCC := nvcc 4 | CC := g++ 5 | 6 | all:BPtrain clean 7 | 8 | 9 | BPtrain: BPtrain.cc BP_GPU.o DevFunc.o Interface.o 10 | ${NVCC} BPtrain.cc BP_GPU.o DevFunc.o Interface.o -o BPtrain -L/usr/local/cuda/lib64 -lcublas -lcurand 11 | Interface.o: Interface.h Interface.cc 12 | ${CC} -c Interface.cc 13 | DevFunc.o: DevFunc.h DevFunc.cu 14 | ${NVCC} -c DevFunc.cu 15 | BP_GPU.o: BP_GPU.h BP_GPU.cu DevFunc.o 16 | ${NVCC} -c BP_GPU.cu 17 | 18 | 19 | 20 | clean: 21 | rm DevFunc.o BP_GPU.o Interface.o -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #NVCC := /usr/local/cuda/bin/nvcc 2 | NVCC := /usr/local/cuda/bin/nvcc -I/usr/local/cuda/include 3 | #NVCC := nvcc 4 | CC := g++ -I/usr/local/cuda/include 5 | 6 | all:BPtrain clean 7 | 8 | 9 | BPtrain: BPtrain.cc BP_GPU.o DevFunc.o Interface.o 10 | ${NVCC} BPtrain.cc BP_GPU.o DevFunc.o Interface.o -o BPtrain -L/usr/local/cuda/lib64 -lcublas -lcurand 11 | Interface.o: Interface.h Interface.cc 12 | ${CC} -c Interface.cc 13 | DevFunc.o: DevFunc.h DevFunc.cu 14 | ${NVCC} -c DevFunc.cu 15 | BP_GPU.o: BP_GPU.h BP_GPU.cu DevFunc.o 16 | ${NVCC} -c BP_GPU.cu 17 | 18 | 19 | 20 | clean: 21 | rm DevFunc.o BP_GPU.o Interface.o -------------------------------------------------------------------------------- /toolbox/weights/Gen_rand_wts_for_ReLUs_forCudaTrain.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | my $i; 6 | my $numlayers = 5; 7 | my $beta = 0.5; 8 | my $flag = 1; 9 | my $root_dir = "/disk4/yongxu_d4/step2_BP_GPU_timit_104NT/gen_rand_net"; 10 | my $fname = "Rand_2056_3hid2048_284.belta$beta"; 11 | my $out_wts_dir = "$root_dir/pretraining_weights/$fname"; 12 | my $out_pfilename = "$out_wts_dir/$fname.wts"; 13 | system("mkdir $out_wts_dir"); 14 | my $cmd = "$root_dir/Gen_rand_net $numlayers 2056 2048 2048 2048 284 $out_wts_dir $out_pfilename $flag $beta"; 15 | system($cmd); 16 | $cmd = "cp $out_wts_dir/$fname.wts pretraining_weights/."; 17 | system($cmd); 18 | 19 | -------------------------------------------------------------------------------- /toolbox/weights/change_cudaSavedModels2matlabWeigths_4layers.m: -------------------------------------------------------------------------------- 1 | clc 2 | clear all; 3 | 4 | for i=25 5 | 6 | mat_name=strcat('mlp.',num2str(i),'.wts.mat'); 7 | load(mat_name); 8 | % break; 9 | 10 | w1=[weights12 bias2']; 11 | w1=w1'; 12 | w2=[weights23 bias3']; 13 | w2=w2'; 14 | w3=[weights34 bias4']; 15 | w3=w3'; 16 | % w4=[weights45 bias5']; 17 | % w4=w4'; 18 | % w5=[weights56 bias6']; 19 | % w5=w5'; 20 | % w6=[weights67 bias7']; 21 | % w6=w6'; 22 | 23 | 24 | wts_name=sprintf('se_weights%d',i); 25 | % save (wts_name,'w1','w2'); 26 | save (wts_name,'w1','w2','w3'); 27 | % save (wts_name,'w1','w2','w3','w4'); 28 | % save 
(wts_name,'w1','w2','w3','w4','w5'); 29 | % save (wts_name,'w1','w2','w3','w4','w5','w6'); 30 | 31 | clearvars -EXCEPT i; 32 | end; 33 | 34 | clear all; -------------------------------------------------------------------------------- /read_htk_fea.m: -------------------------------------------------------------------------------- 1 | 2 | %% ************************************************************************ 3 | % readHTK - just incase you ever want to go backwards 4 | %************************************************************************** 5 | function [htkdata,nframes,sampPeriod,sampSize,paramKind] = readHTK_new(filename,byte_order) 6 | 7 | if nargin<2 8 | byte_order = 'be'; 9 | end 10 | 11 | fid = fopen(filename,'r',sprintf('ieee-%s',byte_order)); 12 | 13 | nframes = fread(fid,1,'int32'); 14 | sampPeriod = fread(fid,1,'int32'); 15 | sampSize = fread(fid,1,'int16'); 16 | paramKind = fread(fid,1,'int16'); 17 | 18 | % read the data 19 | 20 | htkdata = fread(fid,nframes*(sampSize/4),'float32'); 21 | htkdata = reshape(htkdata,sampSize/4,nframes); 22 | fclose(fid); 23 | end % ------ OF READHTK 24 | -------------------------------------------------------------------------------- /README.md~: -------------------------------------------------------------------------------- 1 | GPU code for Deep neural network (DNN) based speech enhancement 2 | 3 | How to use? 4 | 5 | 1. make 6 | 7 | 2. use *.pl to call BPtrain 8 | 9 | How to prepare the input and output files ? 10 | 11 | 1. use quicknet toolset to prepare Pfile as the input and the output files, Pfile is the big file of all training features. 12 | 13 | What are the functions in this code ? 14 | 15 | 1. ReLU or Sigmoid 16 | 17 | 2. Noise aware training 18 | 19 | 3. Dropout 20 | 21 | How to do decoding or speech enhancement in the test phase ? 22 | 23 | 1. Please ref: DNN based speech enhancement tool is open now and can be downloaded at https://drive.google.com/file/d/0B5r5bvRpQ5DRXzJXd05BNl95alE/view 24 | 25 | What else can this code use for? 26 | 27 | 1. It is designed for any regression tasks, like speech enhancement, ideal binary/ratio mask (IBM/IRM) estimation, audio/music tagging, acoustic event detection, etc. 28 | 29 | Please cite the following papers if you use this code: 30 | 31 | [1]A Regression Approach to Speech Enhancement Based on Deep Neural Networks.YongXu,JunDu,Li-Rong Dai and Chin-Hui Lee, IEEE/ACM Transactions on Audio,Speech, and Language Processing,P.7-19,Vol.23,No.1, 2015 32 | 33 | [2]An Experimental Study on Speech Enhancement Based on Deep Neural Networks.YongXu, JunDu, Li-Rong Dai and Chin-Hui Lee,IEEE signal processing letters, p. 65-68,vol.21,no. 1,January 2014 34 | 35 | Some DNN based speech enhancemen demos: 36 | 37 | 1. http://home.ustc.edu.cn/~xuyong62/demo/SE_DNN_taslp.html 38 | 39 | 2. 
http://home.ustc.edu.cn/~xuyong62/demo/IS15.html 40 | -------------------------------------------------------------------------------- /Interface.h.bak: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define MAXLAYER 10 5 | #define MAXLINE 1024 6 | #define MAXCHUNK 102400 7 | 8 | struct WorkPara 9 | { 10 | char fea_FN[MAXLINE]; 11 | char fea_normFN[MAXLINE]; 12 | int fea_dim; 13 | int fea_context; 14 | 15 | char targ_FN[MAXLINE]; 16 | int targ_offset; 17 | int dropoutflag; 18 | int traincache; ////frames to memory one time 19 | int bunchsize; 20 | int layersizes[MAXLAYER]; 21 | float momentum; 22 | float weightcost; 23 | float lrate; 24 | 25 | char init_weightFN[MAXLINE]; 26 | char out_weightFN[MAXLINE]; 27 | char log_FN[MAXLINE]; 28 | 29 | char train_sent_range[MAXLINE]; 30 | char cv_sent_range[MAXLINE]; 31 | 32 | int gpu_used; 33 | int init_randem_seed; 34 | float init_randem_weight_min; 35 | float init_randem_weight_max; 36 | float init_randem_bias_max; 37 | float init_randem_bias_min; 38 | 39 | float *indata; 40 | //int *targ; 41 | float *targ;///////////////////////////////////////////////////////////////////////////by yongxu 42 | float *weights[MAXLAYER -1]; 43 | float *bias[MAXLAYER -1]; 44 | }; 45 | 46 | class Interface 47 | { 48 | public: 49 | Interface(); 50 | ~Interface(); 51 | public: 52 | void Initial(int argc, char **argv); 53 | void Writeweights(); 54 | int Readdata(); 55 | void get_pfile_info(); 56 | void get_chunk_info(char *range); 57 | void get_chunk_info_cv(char *range); 58 | int Readchunk(int index); 59 | int Readchunk_cv(int index); 60 | void GetRandIndex(int *vec, int len); 61 | public: 62 | struct WorkPara *para; 63 | 64 | unsigned int total_frames; 65 | unsigned int total_sents; 66 | unsigned int total_chunks; 67 | unsigned int total_samples; 68 | unsigned int cv_total_chunks; 69 | unsigned int cv_total_samples; 70 | 71 | int *framesBeforeSent; 72 | int *chunk_frame_st; 73 | int *cv_chunk_frame_st; 74 | 75 | FILE *fp_log; 76 | int numlayers; 77 | int realbunchsize; 78 | private: 79 | void get_uint(const char* hdr, const char* argname, unsigned int* val); 80 | void read_tail(FILE *fp, long int file_offset, unsigned int sentnum, int *out); 81 | 82 | void GetRandWeight(float *vec, float min, float max, int len); 83 | 84 | FILE *fp_data; 85 | FILE *fp_targ; 86 | FILE *fp_init_weight; 87 | FILE *fp_norm; 88 | FILE *fp_out; 89 | 90 | int data_rand_index[MAXLINE]; 91 | 92 | float *mean; 93 | float *dVar; 94 | 95 | int sent_st, sent_en; 96 | int cv_sent_st, cv_sent_en; 97 | int cur_chunk_index; 98 | int frames_read; 99 | }; 100 | -------------------------------------------------------------------------------- /Interface.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define MAXLAYER 10 5 | #define MAXLINE 1024 6 | #define MAXCHUNK 102400 7 | 8 | struct WorkPara 9 | { 10 | char fea_FN[MAXLINE]; 11 | char fea_normFN[MAXLINE]; 12 | int fea_dim; 13 | int fea_context; 14 | 15 | char targ_FN[MAXLINE]; 16 | int targ_offset; 17 | int dropoutflag; 18 | int traincache; ////frames to memory one time 19 | int bunchsize; 20 | int layersizes[MAXLAYER]; 21 | float momentum; 22 | float weightcost; 23 | float lrate; 24 | float visible_omit; 25 | float hid_omit; 26 | 27 | char init_weightFN[MAXLINE]; 28 | char out_weightFN[MAXLINE]; 29 | char log_FN[MAXLINE]; 30 | 31 | char train_sent_range[MAXLINE]; 32 | char cv_sent_range[MAXLINE]; 33 | 34 | int 
gpu_used; 35 | int init_randem_seed; 36 | float init_randem_weight_min; 37 | float init_randem_weight_max; 38 | float init_randem_bias_max; 39 | float init_randem_bias_min; 40 | 41 | float *indata; 42 | //int *targ; 43 | float *targ;///////////////////////////////////////////////////////////////////////////by yongxu 44 | float *weights[MAXLAYER -1]; 45 | float *bias[MAXLAYER -1]; 46 | }; 47 | 48 | class Interface 49 | { 50 | public: 51 | Interface(); 52 | ~Interface(); 53 | public: 54 | void Initial(int argc, char **argv); 55 | void Writeweights(); 56 | int Readdata(); 57 | void get_pfile_info(); 58 | void get_chunk_info(char *range); 59 | void get_chunk_info_cv(char *range); 60 | int Readchunk(int index); 61 | int Readchunk_cv(int index); 62 | void GetRandIndex(int *vec, int len); 63 | public: 64 | struct WorkPara *para; 65 | 66 | unsigned int total_frames; 67 | unsigned int total_sents; 68 | unsigned int total_chunks; 69 | unsigned int total_samples; 70 | unsigned int cv_total_chunks; 71 | unsigned int cv_total_samples; 72 | 73 | int *framesBeforeSent; 74 | int *chunk_frame_st; 75 | int *cv_chunk_frame_st; 76 | 77 | FILE *fp_log; 78 | int numlayers; 79 | int realbunchsize; 80 | private: 81 | void get_uint(const char* hdr, const char* argname, unsigned int* val); 82 | void read_tail(FILE *fp, long int file_offset, unsigned int sentnum, int *out); 83 | 84 | void GetRandWeight(float *vec, float min, float max, int len); 85 | 86 | FILE *fp_data; 87 | FILE *fp_targ; 88 | FILE *fp_init_weight; 89 | FILE *fp_norm; 90 | FILE *fp_out; 91 | 92 | int data_rand_index[MAXLINE]; 93 | 94 | float *mean; 95 | float *dVar; 96 | 97 | int sent_st, sent_en; 98 | int cv_sent_st, cv_sent_en; 99 | int cur_chunk_index; 100 | int frames_read; 101 | }; 102 | -------------------------------------------------------------------------------- /how_to_get_pfile.txt: -------------------------------------------------------------------------------- 1 | ############################################################################################### 2 | # detailed steps to get a pfile 3 | # step1: use the tool "Wav2LogSpec.exe" to extract all ".lsp" feature files from the RAW audio files. (RAW means no header; wav-format audio files carry header info, and help/wav2raw.exe can strip that header.) 4 | # step2: use "toolbox/le2be.m" to convert the little-endian (le) ".lsp" features into big-endian (be) ".lsp_be" features. This part is a little tricky; maybe you can find a better way. 5 | # step3: use "toolbox/randomlist.pl" to randomize your scp lists 6 | # step4: prepare a ".len" text file (the frame count of each ".lsp" file, one number per line). You can use "toolbox/getlenscp.exe in.scp out.len" 7 | #".len" example: 8 | #120 9 | #234 10 | #451 11 | #99 12 | #... 13 | # step5: use the ".scp" (listing the be-format feature files, not the le-format ones) and the ".len" file to build the pfile as follows. 14 | # summary: "Wav2LogSpec.exe" can only extract "little endian (le)" format features from RAW audio files, but the quicknet tools only accept "big endian (be)" feature files; that is why you need "le2be". Normally, the HTK toolset (e.g. HCopy.exe) can extract "be" format features (log-power spectra) directly, but I have never tried it. 15 | # summary: this part is a little tricky and ugly; I am planning to re-write all of the DNN-SE stuff based on TensorFlow. A rough sketch of steps 1-4 is given right below; step 5 is covered by the Perl example further down.
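# A minimal sketch of steps 1-4, assuming the repo's tools are driven from Perl on Linux. The Wav2LogSpec.exe arguments, the wine/MATLAB invocations and all file names here are illustrative assumptions, not documented interfaces; only the rand_list.pl and GetLenScp.exe argument order follows the descriptions above.
#!/usr/local/bin/perl -w
use strict;

my $TOOLBOX = "toolbox";                  # tools shipped in this repository
my $raw_scp = "train_noisy_raw.scp";      # assumed list of RAW audio files
my $be_scp  = "train_noisy_lsp_be.scp";   # assumed list of big-endian ".lsp_be" feature files

# step1: extract little-endian ".lsp" log-power spectra from RAW audio (Windows tool, e.g. run under wine)
system("wine Wav2LogSpec.exe $raw_scp");  # assumed usage
# step2: convert le ".lsp" into be ".lsp_be" (edit scp_list inside toolbox/main_dir_le2be.m first)
system("matlab -nodisplay -r \"cd $TOOLBOX; main_dir_le2be; exit\"");
# step3: shuffle the feature list (rand_list.pl takes: input-list output-list)
system("perl $TOOLBOX/rand_list.pl $be_scp ${be_scp}.random");
# step4: write one frame count per line, in the same order as the shuffled scp
system("wine $TOOLBOX/GetLenScp.exe ${be_scp}.random ${be_scp}.random.len");
# step5: feed the shuffled ".scp" and ".len" into feacat/pfile_concat as in the example below.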
16 | 17 | ############################################################################# 18 | # An example for a Perl script to get a PFile 19 | # Quicknet tool set is here: http://www1.icsi.berkeley.edu/Speech/qn.html 20 | # http://www1.icsi.berkeley.edu/Speech/icsi-speech-tools.html 21 | ############################################################################# 22 | #!/usr/local/bin/perl -w 23 | 24 | #attentions: please confirm that all the input files from windows system are converted from DOS to UNIX 25 | #attentions: please add path: export PATH=$PATH:/home/xxx/tools/QN/basic/bin 26 | 27 | my $ROOT_DIR = "/disk4/yongxu_d4/get_timit_labpfile/lib"; 28 | my $TOOL_DIR = "/home/yongxu/tools/QN/atlas1/bin"; 29 | my $CF_DIR = "$ROOT_DIR"; 30 | 31 | my $len_scp = "$CF_DIR/timit_115NT_7SNRs_each190utts_clean_lab_be_random_linux.len"; ### frame number for each feature file 32 | 33 | my $fea_scp = "$CF_DIR/timit_115NT_7SNRs_each190utts_clean_lab_be_random_linux.scp"; ### feature file path for each feature file, *.len should be corresponding to the *.scp file (in the same order) 34 | 35 | my $fea_tr = "$CF_DIR/timit_115NT_7SNRs_each190utts_clean_lab_be_random_linux.pfile"; ### output big pfile 36 | 37 | 38 | my $part_num = 9693; ### how many utterances for each parallel part 39 | my $split_num = 10; #46980*4=187920 utts ### parallel computing in 10 parts 40 | my $i; 41 | my @pid; 42 | my $pfile_list; 43 | 44 | 45 | print "ok\n"; 46 | system("split -l $part_num -d -a 1 $fea_scp $fea_scp"); 47 | system("split -l $part_num -d -a 1 $len_scp $len_scp"); 48 | 49 | foreach $i (0..$split_num-1) 50 | { 51 | defined($pid[$i] = fork) or die "can't fork: $!"; 52 | unless ($pid[$i]) 53 | { 54 | system("$TOOL_DIR/feacat -period 16.0 -ipformat htk -deslenfile $len_scp$i -lists -o $fea_tr$i $fea_scp$i");### frame shift = 16ms, attention here 55 | exit(0); 56 | } 57 | } 58 | 59 | $pfile_list = ""; 60 | foreach $i (0..$split_num-1) 61 | { 62 | waitpid($pid[$i], 0); 63 | $pfile_list = $pfile_list . "$fea_tr$i "; 64 | } 65 | 66 | system("$TOOL_DIR/pfile_concat -o $fea_tr $pfile_list"); 67 | 68 | ##foreach $i (0..$split_num-1) 69 | ##{ 70 | ## system("rm -rf $fea_scp$i"); 71 | ## system("rm -rf $len_scp$i"); 72 | ## system("rm -rf $fea_tr$i"); 73 | ##} 74 | ##system("rm -rf $fea_lst"); 75 | 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This code is only for research. If you want to use it for commercial reason, please contact me: yong.xu.ustc@gmail.com 2 | 3 | GPU code for Deep neural network (DNN) based speech enhancement 4 | 5 | How to use? 6 | 7 | 1. make 8 | 9 | 2. use *.pl to call BPtrain 10 | 11 | How to prepare the input and output files ? 12 | 13 | 0. Training clean speech data: standard TIMIT corpus training set (about 4 hours) 14 | 15 | Training noise data: 115 noise types data, you can download here: 16 | http://staff.ustc.edu.cn/~jundu/The%20team/yongxu/demo/115noises.html 17 | 18 | USTC-made 15 noise types: https://pan.baidu.com/s/1dER6UUt or https://drive.google.com/file/d/13CqTkrow_EPdl5x4BQeNHmIdawKRaUcA/view 19 | 20 | 100 Ohio noise types: http://web.cse.ohio-state.edu/pnl/corpus/HuNonspeech/HuCorpus.html 21 | 22 | Test clean speech data: standard TIMIT corpus test set 23 | 24 | Test unseen noise type data: NoiseX-92 (15 types): http://www.speech.cs.cmu.edu/comp.speech/Section1/Data/noisex.html 25 | 26 | 1. 
use the quicknet toolset to prepare the Pfiles used as the input and output files; a Pfile is one big file holding all of the training features. 27 | 28 | The quicknet tool set is here: http://www1.icsi.berkeley.edu/Speech/qn.html 29 | An example Perl script for building a Pfile: https://github.com/yongxuUSTC/DNN-for-speech-enhancement/blob/master/how_to_get_pfile.txt 30 | 31 | What functions does this code provide? 32 | 33 | 1. ReLU or Sigmoid activations 34 | 35 | 2. Noise-aware training (NAT) 36 | 37 | 3. Dropout 38 | 39 | How to do decoding or speech enhancement in the test phase? 40 | 41 | 1. Please refer to the DNN-based speech enhancement tool, which is open now and can be downloaded at https://drive.google.com/file/d/1Wg14r0m41kWv2ja-stBkq2uZgjOP5yME/view?usp=sharing or https://drive.google.com/file/d/0B5r5bvRpQ5DRR1lIV1hpZ0RLQ0E/view?usp=sharing 42 | 43 | or (@ Baidu Yun) 44 | http://pan.baidu.com/s/1eRJGrx4 45 | 46 | What kinds of noisy speech can the DNN-enh tool enhance? 47 | 48 | 1. It can enhance any kind of noisy speech, even real-world noisy speech. One real-world enhancement demo for a movie clip: http://staff.ustc.edu.cn/~jundu/The%20team/yongxu/demo/IS15.html 49 | 50 | 2. The model is trained only on TIMIT data, so it performs best on the TIMIT test set. 51 | 52 | 3. The model performs best on English data because TIMIT is US English, but the tool can still be used to enhance noisy speech in other languages, such as Chinese. 53 | 54 | 4. You can retrain the model with multilingual data to obtain a more general DNN-enh tool. 55 | 56 | 57 | What else can this code be used for? 58 | 59 | 1. It is designed for general regression tasks, such as speech enhancement, ideal binary/ratio mask (IBM/IRM) estimation, audio/music tagging, acoustic event detection, etc. 60 | 61 | Please cite the following papers if you use this code: 62 | 63 | [1] A Regression Approach to Speech Enhancement Based on Deep Neural Networks. 64 | Yong Xu, Jun Du, Li-Rong Dai and Chin-Hui Lee, IEEE/ACM Transactions on Audio, Speech, and Language Processing, pp. 7-19, vol. 23, no. 1, 2015 (2018 IEEE SPS Best Paper Award, citations > 600) 65 | 66 | [2] An Experimental Study on Speech Enhancement Based on Deep Neural Networks. 67 | Yong Xu, Jun Du, Li-Rong Dai and Chin-Hui Lee, IEEE Signal Processing Letters, pp. 65-68, vol. 21, no.
1,January 2014 (citations > 550) 68 | 69 | [3] Multi-Objective Learning and Mask-Based Post-Processing for Deep Neural Network Based Speech Enhancement 70 | Yong Xu, Jun Du, Zhen Huang, Li-Rong Dai, Chin-Hui Lee, Interspeech2015 71 | 72 | Some DNN based speech enhancemen demos: 73 | 74 | http://staff.ustc.edu.cn/~jundu/The%20team/yongxu/demo/SE_DNN_taslp.html 75 | http://staff.ustc.edu.cn/~jundu/The%20team/yongxu/demo/IS15.html 76 | -------------------------------------------------------------------------------- /DevFunc.cu.bak: -------------------------------------------------------------------------------- 1 | #include "DevFunc.h" 2 | #include 3 | 4 | __global__ void kernBinary(int n, float* in_vec, float* rand_vec) 5 | { 6 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 7 | if (i < n) 8 | { 9 | if(in_vec[i] > rand_vec[i]) 10 | { 11 | in_vec[i] = 1.0f; 12 | } 13 | else 14 | { 15 | in_vec[i] = 0.0f; 16 | } 17 | } 18 | } 19 | 20 | __global__ void kernWeightMultiP( int n, float p, float* in_vec ) 21 | { 22 | // int i = (blockIdx.x * blockDim.x) + threadIdx.x; 23 | // int j = (blockIdx.y * blockDim.y) + threadIdx.y; 24 | // if(i < prev_n&& j < cur_n) 25 | // { 26 | // in_vec[i+cur_n*j] = in_vec[i+cur_n*j]*p; 27 | // } 28 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 29 | if( i < n ) 30 | { 31 | in_vec[i]=in_vec[i]*p; 32 | } 33 | } 34 | __global__ void kernDropout(int n, float p ,float* in, float* rand_vec) 35 | { 36 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 37 | if(i < n) 38 | { 39 | if(rand_vec[i]max) 88 | max = val; 89 | } 90 | // Now put exp(in-max) in out 91 | inptr = invec; 92 | outptr = outvec; 93 | float sumexp = 0; 94 | for (i=cols; i!=0; i--) 95 | { 96 | float f, e; 97 | 98 | f = *inptr++; 99 | e = expf(f - max); 100 | *outptr++ = e; 101 | sumexp += e; 102 | } 103 | // Now scale the output 104 | float scale = 1.0f/sumexp; 105 | outptr = outvec; 106 | for (i=cols; i!=0; i--) 107 | { 108 | *outptr = (*outptr) * scale; 109 | outptr++; 110 | } 111 | } 112 | } 113 | 114 | __global__ void kernLinearOutCopy(int rows, int cols, float* in_vec, float* out_vec) 115 | { 116 | int row = (blockIdx.x * blockDim.x) + threadIdx.x; 117 | if (row < rows) 118 | { 119 | //int i; //xuyong 120 | //const int index = row * cols; 121 | //const float* invec = &in_vec[index]; 122 | //float* outvec = &in_vec[index]; 123 | //////////////////////////////////////////////////// 124 | int j; 125 | for(j =0; j< cols;j++) 126 | out_vec[cols *row +j] = in_vec[cols *row +j]; 127 | 128 | } 129 | } 130 | 131 | __global__ void kernMultiCopy(int mat_height, int vec_len, 132 | float* vec, float* mat) 133 | { 134 | int col = (blockIdx.x * blockDim.x) + threadIdx.x; 135 | 136 | if (col < vec_len) 137 | { 138 | int j; 139 | float val = vec[col]; 140 | float* top = &mat[col]; 141 | for (j=mat_height; j!=0; j--) 142 | { 143 | *top = val; 144 | top += vec_len; 145 | } 146 | } 147 | } 148 | 149 | __global__ void kernSumcol(int rows, int cols, float* in, float* res) 150 | { 151 | int col = (blockIdx.x * blockDim.x) + threadIdx.x; 152 | 153 | if (col < cols) 154 | { 155 | int j; 156 | const float* fromp = &in[col]; 157 | float* top = &res[col]; 158 | 159 | (*top) = (*fromp); 160 | fromp +=cols; 161 | for (j=rows-1; j!=0; j--) 162 | { 163 | (*top) += (*fromp); 164 | fromp+=cols; 165 | } 166 | } 167 | } 168 | 169 | __global__ void kernAccSumcol(int rows, int cols, float* in, float* res, float alpha, float beta) 170 | { 171 | int col = (blockIdx.x * blockDim.x) + threadIdx.x; 172 | 173 | if (col < cols) 174 | { 175 | 
int j; 176 | const float* fromp = &in[col]; 177 | float* top = &res[col]; 178 | 179 | (*top) = (*top) *alpha + beta *(*fromp); 180 | fromp +=cols; 181 | for (j=rows-1; j!=0; j--) 182 | { 183 | (*top) += beta *(*fromp); 184 | fromp+=cols; 185 | } 186 | } 187 | } 188 | 189 | __global__ void kernAccSumrow(int rows, int cols, float* in, float* res, float alpha, float beta) 190 | { 191 | int row = (blockIdx.x * blockDim.x) + threadIdx.x; 192 | 193 | if (row < rows) 194 | { 195 | int j; 196 | const float* fromp = &in[row]; 197 | float* top = &res[row]; 198 | 199 | (*top) = (*top) *alpha + beta *(*fromp); 200 | fromp +=rows; 201 | for (j= cols -1; j!=0; j--) 202 | { 203 | (*top) += beta *(*fromp); 204 | fromp += rows; 205 | } 206 | } 207 | } 208 | 209 | __global__ void kernVecMul(int n, float* in_vec1, float* in_vec2, float* out_vec) 210 | { 211 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 212 | 213 | if (i max) 255 | // { 256 | // max = p[j]; 257 | // maxinx = j; 258 | // } 259 | // } 260 | // outvec[i] = maxinx; 261 | // } 262 | //} 263 | 264 | __global__ void kernDivide(int n, float* in_vec, float* out_vec,float beta) 265 | { 266 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 267 | if (i < n) 268 | out_vec[i] = in_vec[i]/beta; 269 | } 270 | 271 | __global__ void kernUpdatedelta(int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost) 272 | { 273 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 274 | if (i < size) 275 | delta[i] = momentum * delta[i] - lr * (gradient[i] / n + weightcost * weights[i]); 276 | } 277 | 278 | -------------------------------------------------------------------------------- /finetune_DNN_speech_enhancement_dropout_NAT.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | 3 | my $i; 4 | my $j; 5 | my $line; 6 | my $curacc; 7 | my $preacc; 8 | #my $threshold=0.1; 9 | 10 | 11 | my $numlayers=5; 12 | 13 | my $lrate=1; 14 | my $layersizes = "1548"; # 129*11+129 15 | for(my $i=0;$i<$numlayers -2;$i++) 16 | { 17 | $layersizes .= ",2048"; 18 | } 19 | $layersizes .= ",129";######################## 20 | 21 | my $node=2048; 22 | 23 | # my $hidname = ""; 24 | # for(my $i=0;$i<$numlayers -2;$i++) 25 | # { 26 | # $hidname .= "_h500"; 27 | # } 28 | 29 | my $exe = "./code_BP_GPU_DNN_Dropout_NAT_speech_enhancement_GPU1/BPtrain"; 30 | my $gpu_used = 1; 31 | # my $numlayers = 4; 32 | # my $layersizes = "429,1024,1024,183"; 33 | 34 | my $bunchsize = 128;#128 35 | my $momentum = 0.5; 36 | my $weightcost = 0; 37 | my $fea_dim = 129;#123 38 | my $fea_context = 11; 39 | my $traincache = 102400; ############ how many samples per chunk #102400 40 | my $init_randem_seed= 27863875; ############ every epoch must change 41 | my $targ_offset = 5; 42 | 43 | # my $CF_DIR = "config"; 44 | # my $norm_file = "$CF_DIR/fea_tr.norm_data_timit_SNR_20_15_10_5_0_-5"; 45 | # my $fea_file = "$CF_DIR/timit_Multi_NT_SNR_100h_all_trainset_25cases_random_ts2000_noisy.pfile"; 46 | # my $targ_file = "$CF_DIR/timit_Multi_NT_SNR_100h_all_trainset_25cases_random_ts2000_clean.pfile";######################## 47 | my $CF_DIR = "/home/yongxu/step1_prepare_data/data_timit_104NT_7SNRs_100h_phase_from18/pretrain_pfile"; 48 | my $norm_file = "$CF_DIR/104NT_7SNRs_2500h_EachCase4H_trainset_random_ts2500.fea_norm"; 49 | my $fea_file = "$CF_DIR/104NT_7SNRs_100h_EachCase4H_trainset_random_ts2500_noisy_linux.pfile"; 50 | my $targ_file = 
"/disk1/yongxu_d1/config/get_100h_104NT_7SNRs_random_ts2500/104NT_7SNRs_100h_EachCase4H_trainset_estIBM_refCLEAN_LC5dB_random_ts2500_noisy_linux.pfile";######################## 51 | 52 | # my $train_sent_range = "0-115499"; 53 | # #my $train_sent_range = "8-9"; 54 | # my $cv_sent_range = "115500-117499"; 55 | # #my $cv_sent_range = "1-1"; 56 | # my $train_sent_range = "0-721874"; #625h 57 | # #my $train_sent_range = "8-9"; 58 | # my $cv_sent_range = "2887500-2889999"; 59 | # #my $cv_sent_range = "1-1"; 60 | #my $train_sent_range = "0-2887499";#2500h 61 | # my $train_sent_range = "0-1443749"; #1250h 62 | # #my $train_sent_range = "8-9"; 63 | # my $cv_sent_range = "2887500-2889999"; 64 | # #my $cv_sent_range = "1-1"; 65 | my $train_sent_range = "0-115499"; #100h 66 | #my $train_sent_range = "8-9"; 67 | my $cv_sent_range = "115500-117999"; 68 | #my $cv_sent_range = "1-1"; 69 | 70 | my $MLP_DIR = "models/104NT_7SNRs_100h_EachCase4H_trainset_random_ts2500_batch$bunchsize\_momentum$momentum\_frContext$fea_context\_lrate$lrate\_node$node\_numlayer$numlayers\-randomPretr-v0.1-h0.2-dropout-NAT-estIBM_refCLEAN_LC5dB-GPU1";########################################################################### 71 | 72 | system("mkdir $MLP_DIR"); 73 | my $outwts_file = "$MLP_DIR/mlp.1.wts"; 74 | my $log_file = "$MLP_DIR/mlp.1.log"; 75 | my $initwts_file = "pretraining_weights/random_1548_2048_2048_2048_129.wts";######################### 76 | ###my $initwts_file = "/home/jiapan/new_BP_Code/BPtrain_v1_mlp/mlp.6.wts.right"; 77 | # 78 | #printf("2"); 79 | print "iter 1 lrate is $lrate\n"; 80 | system("$exe" . 81 | " gpu_used=$gpu_used". 82 | " numlayers=$numlayers". 83 | " layersizes=$layersizes". 84 | " bunchsize=$bunchsize". 85 | " momentum=$momentum". 86 | " weightcost=$weightcost". 87 | " lrate=$lrate". 88 | " fea_dim=$fea_dim". 89 | " fea_context=$fea_context". 90 | " traincache=$traincache". 91 | " init_randem_seed=$init_randem_seed". 92 | " targ_offset=$targ_offset". 93 | " initwts_file=$initwts_file". 94 | " norm_file=$norm_file". 95 | " fea_file=$fea_file". 96 | " targ_file=$targ_file". 97 | " outwts_file=$outwts_file". 98 | " log_file=$log_file". 99 | " train_sent_range=$train_sent_range". 100 | " cv_sent_range=$cv_sent_range". 101 | " dropoutflag=1". 102 | " visible_omit=0.1". 103 | " hid_omit=0.2" 104 | ); 105 | 106 | # die; 107 | # 108 | # my $success=open LOG, "$log_file"; 109 | # if(!$success) 110 | # { 111 | # printf "open log fail\n"; 112 | # } 113 | # while() 114 | # { 115 | # chomp; 116 | # if(/CV over.*/) 117 | # { 118 | # s/CV over\. right num: \d+, ACC: //; 119 | # s/%//; 120 | # $curacc=$_; 121 | # } 122 | # } 123 | # close LOG; 124 | # 125 | $preacc=$curacc; 126 | my $destep=0; 127 | ######################################## 128 | # $init_randem_seed=27865600; 129 | # $momentum=0.7; 130 | ######################################## 131 | for($i= 2;$i <= 10;$i++){ 132 | 133 | $j = $i -1; 134 | $initwts_file = "$MLP_DIR/mlp.$j.wts"; 135 | $outwts_file = "$MLP_DIR/mlp.$i.wts"; 136 | $log_file = "$MLP_DIR/mlp.$i.log"; 137 | $init_randem_seed += 345; 138 | $momentum=$momentum+0.04; 139 | print "iter $i lrate is $lrate\n"; 140 | system("$exe" . 141 | " gpu_used=$gpu_used". 142 | " numlayers=$numlayers". 143 | " layersizes=$layersizes". 144 | " bunchsize=$bunchsize". 145 | " momentum=$momentum". 146 | " weightcost=$weightcost". 147 | " lrate=$lrate". 148 | " fea_dim=$fea_dim". 149 | " fea_context=$fea_context". 150 | " traincache=$traincache". 151 | " init_randem_seed=$init_randem_seed". 
152 | " targ_offset=$targ_offset". 153 | " initwts_file=$initwts_file". 154 | " norm_file=$norm_file". 155 | " fea_file=$fea_file". 156 | " targ_file=$targ_file". 157 | " outwts_file=$outwts_file". 158 | " log_file=$log_file". 159 | " train_sent_range=$train_sent_range". 160 | " cv_sent_range=$cv_sent_range". 161 | " dropoutflag=1". 162 | " visible_omit=0.1". 163 | " hid_omit=0.2" 164 | ); 165 | } 166 | 167 | # my $success=open LOG, "$log_file"; 168 | # if(!$success) 169 | # { 170 | # printf "open log fail\n"; 171 | # } 172 | # while() 173 | # { 174 | # chomp; 175 | # if(/CV over.*/) 176 | # { 177 | # s/CV over\. right num: \d+, ACC: //; 178 | # s/%//; 179 | # $curacc=$_; 180 | # } 181 | # } 182 | # close LOG; 183 | # 184 | # if($curacc<$preacc+$threshold) 185 | # { 186 | # print "iter $i ACC $curacc < iter $j ACC $preacc+threshold($threshold)\n"; 187 | # $destep++; 188 | # print "destep is $destep\n"; 189 | # if($destep>=3) 190 | # { 191 | # 192 | # unlink($outwts_file) or warn "can not delete weights file"; 193 | # unlink($log_file) or warn "can not delete log file"; 194 | # $i+100; 195 | # #print "finetune end\n"; 196 | # last; 197 | # } 198 | # else 199 | # { 200 | # $i--; 201 | # $lrate *=0.5; 202 | # } 203 | # } 204 | # else 205 | # { 206 | # $destep=0; 207 | # $preacc=$curacc; 208 | # print "1\n\n\n\n\n\n\n\n"; 209 | # } 210 | # 211 | # } 212 | # 213 | ## 214 | for($i= 11;$i <= 100;$i++){ 215 | $j = $i -1; 216 | $initwts_file = "$MLP_DIR/mlp.$j.wts"; 217 | $outwts_file = "$MLP_DIR/mlp.$i.wts"; 218 | $log_file = "$MLP_DIR/mlp.$i.log"; 219 | #$lrate *= 0.9; 220 | #$lrate = 0.01; 221 | $momentum=0.9; 222 | $init_randem_seed += 345; 223 | 224 | system("$exe" . 225 | " gpu_used=$gpu_used". 226 | " numlayers=$numlayers". 227 | " layersizes=$layersizes". 228 | " bunchsize=$bunchsize". 229 | " momentum=$momentum". 230 | " weightcost=$weightcost". 231 | " lrate=$lrate". 232 | " fea_dim=$fea_dim". 233 | " fea_context=$fea_context". 234 | " traincache=$traincache". 235 | " init_randem_seed=$init_randem_seed". 236 | " targ_offset=$targ_offset". 237 | " initwts_file=$initwts_file". 238 | " norm_file=$norm_file". 239 | " fea_file=$fea_file". 240 | " targ_file=$targ_file". 241 | " outwts_file=$outwts_file". 242 | " log_file=$log_file". 243 | " train_sent_range=$train_sent_range". 244 | " cv_sent_range=$cv_sent_range". 245 | " dropoutflag=1". 246 | " visible_omit=0.1". 
247 | " hid_omit=0.2" 248 | ); 249 | } 250 | -------------------------------------------------------------------------------- /DevFunc.cu: -------------------------------------------------------------------------------- 1 | #include "DevFunc.h" 2 | #include 3 | 4 | __global__ void kernBinary(int n, float* in_vec, float* rand_vec) 5 | { 6 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 7 | if (i < n) 8 | { 9 | if(in_vec[i] > rand_vec[i]) 10 | { 11 | in_vec[i] = 1.0f; 12 | } 13 | else 14 | { 15 | in_vec[i] = 0.0f; 16 | } 17 | } 18 | } 19 | 20 | __global__ void kernWeightMultiP( int n, float p, float* in_vec ) 21 | { 22 | // int i = (blockIdx.x * blockDim.x) + threadIdx.x; 23 | // int j = (blockIdx.y * blockDim.y) + threadIdx.y; 24 | // if(i < prev_n&& j < cur_n) 25 | // { 26 | // in_vec[i+cur_n*j] = in_vec[i+cur_n*j]*p; 27 | // } 28 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 29 | if( i < n ) 30 | { 31 | in_vec[i]=in_vec[i]*p; 32 | } 33 | } 34 | __global__ void kernDropout(int n, float p ,float* in, float* rand_vec) 35 | { 36 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 37 | if(i < n) 38 | { 39 | if(rand_vec[i]0) 76 | out_vec[i]=in_vec[i]; 77 | else 78 | out_vec[i]=0.0f; 79 | } 80 | 81 | global void kernDsigmoid(int n, float* in_vec, float* out_vec) 82 | { 83 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 84 | 85 | if (i0) 93 | out_vec[i]=1.0f; 94 | else 95 | out_vec[i]=0.0f; 96 | } 97 | } 98 | ////////////////////////////////////////////////////////// 99 | 100 | __global__ void kernSoftmax(int rows, int cols, float* in_vec, float* out_vec) 101 | { 102 | int row = (blockIdx.x * blockDim.x) + threadIdx.x; 103 | if (row < rows) 104 | { 105 | int i; 106 | const int index = row * cols; 107 | const float* invec = &in_vec[index]; 108 | float* outvec = &out_vec[index]; 109 | const float* inptr; 110 | float* outptr; 111 | 112 | // First find the max of each vector 113 | float max; 114 | 115 | inptr = invec; 116 | max = *inptr++; 117 | for (i=cols-1; i!=0; i--) 118 | { 119 | float val; 120 | 121 | val = *inptr++; 122 | if (val>max) 123 | max = val; 124 | } 125 | // Now put exp(in-max) in out 126 | inptr = invec; 127 | outptr = outvec; 128 | float sumexp = 0; 129 | for (i=cols; i!=0; i--) 130 | { 131 | float f, e; 132 | 133 | f = *inptr++; 134 | e = expf(f - max); 135 | *outptr++ = e; 136 | sumexp += e; 137 | } 138 | // Now scale the output 139 | float scale = 1.0f/sumexp; 140 | outptr = outvec; 141 | for (i=cols; i!=0; i--) 142 | { 143 | *outptr = (*outptr) * scale; 144 | outptr++; 145 | } 146 | } 147 | } 148 | 149 | __global__ void kernLinearOutCopy(int rows, int cols, float* in_vec, float* out_vec) 150 | { 151 | int row = (blockIdx.x * blockDim.x) + threadIdx.x; 152 | if (row < rows) 153 | { 154 | //int i; //xuyong 155 | //const int index = row * cols; 156 | //const float* invec = &in_vec[index]; 157 | //float* outvec = &in_vec[index]; 158 | //////////////////////////////////////////////////// 159 | int j; 160 | for(j =0; j< cols;j++) 161 | out_vec[cols *row +j] = in_vec[cols *row +j]; 162 | 163 | } 164 | } 165 | 166 | __global__ void kernMultiCopy(int mat_height, int vec_len, 167 | float* vec, float* mat) 168 | { 169 | int col = (blockIdx.x * blockDim.x) + threadIdx.x; 170 | 171 | if (col < vec_len) 172 | { 173 | int j; 174 | float val = vec[col]; 175 | float* top = &mat[col]; 176 | for (j=mat_height; j!=0; j--) 177 | { 178 | *top = val; 179 | top += vec_len; 180 | } 181 | } 182 | } 183 | 184 | __global__ void kernSumcol(int rows, int cols, float* in, float* res) 185 | { 186 | 
int col = (blockIdx.x * blockDim.x) + threadIdx.x; 187 | 188 | if (col < cols) 189 | { 190 | int j; 191 | const float* fromp = &in[col]; 192 | float* top = &res[col]; 193 | 194 | (*top) = (*fromp); 195 | fromp +=cols; 196 | for (j=rows-1; j!=0; j--) 197 | { 198 | (*top) += (*fromp); 199 | fromp+=cols; 200 | } 201 | } 202 | } 203 | 204 | __global__ void kernAccSumcol(int rows, int cols, float* in, float* res, float alpha, float beta) 205 | { 206 | int col = (blockIdx.x * blockDim.x) + threadIdx.x; 207 | 208 | if (col < cols) 209 | { 210 | int j; 211 | const float* fromp = &in[col]; 212 | float* top = &res[col]; 213 | 214 | (*top) = (*top) *alpha + beta *(*fromp); 215 | fromp +=cols; 216 | for (j=rows-1; j!=0; j--) 217 | { 218 | (*top) += beta *(*fromp); 219 | fromp+=cols; 220 | } 221 | } 222 | } 223 | 224 | __global__ void kernAccSumrow(int rows, int cols, float* in, float* res, float alpha, float beta) 225 | { 226 | int row = (blockIdx.x * blockDim.x) + threadIdx.x; 227 | 228 | if (row < rows) 229 | { 230 | int j; 231 | const float* fromp = &in[row]; 232 | float* top = &res[row]; 233 | 234 | (*top) = (*top) *alpha + beta *(*fromp); 235 | fromp +=rows; 236 | for (j= cols -1; j!=0; j--) 237 | { 238 | (*top) += beta *(*fromp); 239 | fromp += rows; 240 | } 241 | } 242 | } 243 | 244 | __global__ void kernVecMul(int n, float* in_vec1, float* in_vec2, float* out_vec) 245 | { 246 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 247 | 248 | if (i max) 290 | // { 291 | // max = p[j]; 292 | // maxinx = j; 293 | // } 294 | // } 295 | // outvec[i] = maxinx; 296 | // } 297 | //} 298 | 299 | __global__ void kernDivide(int n, float* in_vec, float* out_vec,float beta) 300 | { 301 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 302 | if (i < n) 303 | out_vec[i] = in_vec[i]/beta; 304 | } 305 | 306 | //__global__ void kernUpdatedelta(int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost) 307 | //{ 308 | // int i = (blockIdx.x * blockDim.x) + threadIdx.x; 309 | // if (i < size) 310 | // delta[i] = momentum * delta[i] - lr * (gradient[i] / n + weightcost * weights[i]); 311 | //} 312 | 313 | __global__ void kernUpdatedelta(int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost) 314 | { 315 | int i = (blockIdx.x * blockDim.x) + threadIdx.x; 316 | if (i < size) 317 | delta[i] = momentum * delta[i] - (1-momentum)*lr*(gradient[i] / n + weightcost * weights[i]);//3.16 dropoutʱҪ³Ë1-momentum 318 | } 319 | -------------------------------------------------------------------------------- /DevFunc.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | //#include "/usr/local/cuda-5.0/include/cublas_v2.h" 5 | //#include "/usr/local/cuda/include/cublas_v2.h" 6 | 7 | static const int CUDA_MAXBLOCKS = 65535; 8 | static const int NTHREADS = 256; 9 | static const int BASICSIZE = 32; 10 | 11 | __global__ void kernSigmoid(int n, float* in_vec, float* out_vec); 12 | __global__ void kernBinary(int n, float* in_vec, float* rand_vec); 13 | __global__ void kernMultiCopy(int mat_height, int vec_len, float* vec, float* mat); 14 | __global__ void kernSumcol(int rows, int cols, float* in, float* res); 15 | __global__ void kernAccSumcol(int rows, int cols, float* in, float* res, float alpha, float beta); 16 | __global__ void kernAccSumrow(int rows, int cols, float* in, float* res, float alpha, float beta); 17 | __global__ void kernSoftmax(int rows, int cols, 
float *in_vec, float* out_vec); //kernLinearOutCopy 18 | __global__ void kernLinearOutCopy(int rows, int cols, float *in_vec, float* out_vec); 19 | __global__ void kernDsigmoid(int n, float* in_vec, float* out_vec); 20 | __global__ void kernVecMul(int n, float *in_vec1, float *in_vec2, float *res_vec); 21 | //__global__ void kernSubIndex(int rows , int cols, const float *in_vec1, const int *in_index, float *res_vec); 22 | __global__ void kernSubClean(int rows , int cols, const float *in_vec1, const float *in_clean, float *res_vec); 23 | __global__ void kernAccSum(int n, float* in, float* res, float beta); 24 | //__global__ void kernGetMaxIndex(int rows, int cols, float* invec, int* outvec); 25 | __global__ void kernDivide(int n, float* in_vec, float* out_vec,float beta); 26 | __global__ void kernUpdatedelta(int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost); 27 | __global__ void kernWeightMultiP(int n, float p, float* in_vec); 28 | __global__ void kernDropout(int n, float p ,float* in, float* rand_vec); 29 | inline void SgemmTN(cublasHandle_t handle,int m, int k, 30 | int n, const float* A, const float* B, float* C, 31 | const float alpha, const float beta) 32 | { 33 | cublasStatus_t e =cublasSgemm(handle,CUBLAS_OP_T, CUBLAS_OP_N, 34 | m, n, k, &beta, (float*)A, k, (float*) B, k, &alpha, C, m); 35 | if(e != CUBLAS_STATUS_SUCCESS) 36 | { 37 | printf("%d,%d,%d...........SgemmTN wrong\n",m,k,n); 38 | } 39 | if(e == CUBLAS_STATUS_EXECUTION_FAILED) 40 | { 41 | printf("...........1\n"); 42 | } 43 | } 44 | 45 | inline void SgemmNN(cublasHandle_t handle,int m, int k, 46 | int n, const float* A,const float* B, float* C, 47 | const float alpha, const float beta) 48 | { 49 | cublasStatus_t e =cublasSgemm(handle,CUBLAS_OP_N, CUBLAS_OP_N, 50 | m, n, k, &beta, (float*)A, m, (float*) B, k, &alpha, C, m); 51 | if(e != CUBLAS_STATUS_SUCCESS) 52 | { 53 | printf("...........SgemmNN wrong\n"); 54 | } 55 | } 56 | 57 | inline void SgemmNT(cublasHandle_t handle,int m, int k, 58 | int n, const float* A, 59 | const float* B, float* C, const float alpha, const float beta) 60 | { 61 | cublasStatus_t e =cublasSgemm(handle,CUBLAS_OP_N, CUBLAS_OP_T, 62 | m, n, k, &beta, (float*)A, m, (float*) B, n, &alpha, C, m); 63 | if(e != CUBLAS_STATUS_SUCCESS) 64 | { 65 | printf("...........SgemmNT wrong\n"); 66 | } 67 | } 68 | 69 | inline void DevWeightMultiP(cudaStream_t stream, int n, float p, float* in_vec) 70 | { 71 | int nblocks=(n + NTHREADS-1)/NTHREADS; 72 | if (nblocks > CUDA_MAXBLOCKS) 73 | printf("DevWeightMultiP: nblocks too large\n"); 74 | kernWeightMultiP<<>>( n, p, in_vec ); 75 | // int nblocks=(Prev_n*cur_n + NTHREADS-1)/NTHREADS; 76 | // dim3 dimBlock(NTHREADS,NTHREADS); 77 | // dim3 dimGrid((prev_n+NTHREADS-1)/NTHREADS,(cur_n+NTHREADS-1)/NTHREADS); 78 | // kernWeightMultiP<<>>(p, prev_n, cur_n, in_vec); 79 | } 80 | 81 | inline void DevDropout(cudaStream_t stream, int n,float p, float * in_vec, float* rand_vec) 82 | { 83 | int nblocks=(n + NTHREADS-1)/NTHREADS; 84 | if (nblocks > CUDA_MAXBLOCKS) 85 | printf("DevDropout: nblocks too large\n"); 86 | kernDropout<<>>( n, p, in_vec, rand_vec); 87 | } 88 | 89 | inline void DevSigmoid(cudaStream_t stream, int n, float* in_vec, float* out_vec) 90 | { 91 | int nblocks = (n + NTHREADS-1)/NTHREADS; 92 | if (nblocks > CUDA_MAXBLOCKS) 93 | printf("DevSigmoid: nblocks too large\n"); 94 | kernSigmoid<<>>(n, in_vec, out_vec); 95 | } 96 | 97 | inline void DevDsigmoid(cudaStream_t stream, int n, float* in_vec, float* out_vec) 
98 | { 99 | int nblocks = (n + NTHREADS-1)/NTHREADS; 100 | if (nblocks> CUDA_MAXBLOCKS) 101 | printf("DevDsigmoid: nblocks too large\n"); 102 | kernDsigmoid<<>>(n, in_vec, out_vec); 103 | } 104 | 105 | inline void DevSoftmax(cudaStream_t stream, int rows, int cols, float* in_vecs, float* out_vecs) 106 | { 107 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 108 | if (nblocks > CUDA_MAXBLOCKS) 109 | printf("DevSoftmax: nblocks too large\n"); 110 | kernSoftmax<<>>(rows, cols, in_vecs, out_vecs); 111 | } 112 | 113 | inline void DevLinearOutCopy(cudaStream_t stream, int rows, int cols, float* in_vecs, float* out_vecs) 114 | { 115 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 116 | if (nblocks > CUDA_MAXBLOCKS) 117 | printf("DevLinearOutCopy: nblocks too large\n"); 118 | kernLinearOutCopy<<>>(rows, cols, in_vecs, out_vecs); 119 | } 120 | 121 | inline void DevMultiCopy(cudaStream_t stream,int mat_height, int vec_len, 122 | float* vec, float* mat) 123 | { 124 | int nblocks = (vec_len + NTHREADS-1)/NTHREADS; 125 | if (nblocks>CUDA_MAXBLOCKS) 126 | printf("DevMultiCopy: nblocks too large\n"); 127 | kernMultiCopy<<>>(mat_height, vec_len, vec, mat); 128 | } 129 | 130 | inline void DevSumcol(cudaStream_t stream,int rows, int cols, float* in, float* res) 131 | { 132 | int nblocks = (cols + NTHREADS-1)/NTHREADS; 133 | if (nblocks>CUDA_MAXBLOCKS) 134 | printf("DevSumcol: nblocks too large\n"); 135 | kernSumcol<<>>(rows, cols, in, res); 136 | } 137 | 138 | inline void DevAccSumcol(cudaStream_t stream,int rows, int cols, float* in, float* res, float alpha, float beta) 139 | { 140 | int nblocks = (cols + NTHREADS-1)/NTHREADS; 141 | if (nblocks>CUDA_MAXBLOCKS) 142 | printf("DevSumcol: nblocks too large\n"); 143 | kernAccSumcol<<>>(rows, cols, in, res, alpha, beta); 144 | } 145 | 146 | inline void DevAccSumrow(cudaStream_t stream,int rows, int cols, float* in, float* res, float alpha, float beta) 147 | { 148 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 149 | if (nblocks>CUDA_MAXBLOCKS) 150 | printf("DevSumrow: nblocks too large\n"); 151 | kernAccSumrow<<>>(rows, cols, in, res, alpha, beta); 152 | } 153 | 154 | inline void DevAccSum(cudaStream_t stream, int n, float* in, float* res, float beta) 155 | { 156 | int nblocks = (n + NTHREADS-1)/NTHREADS; 157 | if (nblocks> CUDA_MAXBLOCKS) 158 | printf("DevAccSum: nblocks too large\n"); 159 | kernAccSum<<>>(n, in, res, beta); 160 | } 161 | 162 | inline void DevVecMul(cudaStream_t stream, int n, float *in_vec1, float *in_vec2, float *res_vec) 163 | { 164 | int nblocks = (n + NTHREADS-1)/NTHREADS; 165 | if (nblocks > CUDA_MAXBLOCKS) 166 | printf("DevVecMul: nblocks too large\n"); 167 | kernVecMul<<>>(n, in_vec1, in_vec2, res_vec); 168 | } 169 | 170 | //inline void DevSubIndex(cudaStream_t stream, int rows , int cols, const float *in_vec1, const int *in_index, float *res_vec) 171 | inline void DevSubClean(cudaStream_t stream, int rows , int cols, const float *in_vec1, const float *in_clean, float *res_vec)////////yongxu 172 | { 173 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 174 | if (nblocks > CUDA_MAXBLOCKS) 175 | //printf("DevSubIndex: nblocks too large\n"); 176 | printf("DevSubClean: nblocks too large\n"); 177 | //kernSubIndex<<>>( rows, cols, in_vec1, in_index, res_vec); 178 | kernSubClean<<>>( rows, cols, in_vec1, in_clean, res_vec); 179 | } 180 | 181 | //inline void DevGetMaxIndex(cudaStream_t stream, int rows , int cols, float *invec, int *outvec) 182 | //{ 183 | // int nblocks = (cols + NTHREADS-1)/NTHREADS; 184 | // if (nblocks > CUDA_MAXBLOCKS) 185 | // 
printf("DevSubIndex: nblocks too large\n"); 186 | // kernGetMaxIndex<<>>( rows, cols, invec, outvec); 187 | //} 188 | 189 | inline void DevDivide(cudaStream_t stream, int n, float* in_vec, float* out_vec,float beta) 190 | { 191 | int nblocks = (n + NTHREADS-1)/NTHREADS; 192 | if (nblocks > CUDA_MAXBLOCKS) 193 | printf("DevDevide: nblocks too large\n"); 194 | kernDivide<<>>( n, in_vec, out_vec, beta); 195 | } 196 | 197 | inline void updatedelta(cudaStream_t stream, int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost) 198 | { 199 | int nblocks = (size +NTHREADS-1)/NTHREADS; 200 | if (nblocks > CUDA_MAXBLOCKS) 201 | printf("updatedelta: nblocks too large\n"); 202 | kernUpdatedelta<<>>( size, delta, weights, gradient, n, momentum, lr, weightcost); 203 | } -------------------------------------------------------------------------------- /DevFunc.h.bak: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | //#include 4 | //#include "/usr/local/cuda-5.0/include/cublas_v2.h" 5 | #include "/usr/local/cuda/include/cublas_v2.h" 6 | 7 | static const int CUDA_MAXBLOCKS = 65535; 8 | static const int NTHREADS = 256; 9 | static const int BASICSIZE = 32; 10 | 11 | __global__ void kernSigmoid(int n, float* in_vec, float* out_vec); 12 | __global__ void kernBinary(int n, float* in_vec, float* rand_vec); 13 | __global__ void kernMultiCopy(int mat_height, int vec_len, float* vec, float* mat); 14 | __global__ void kernSumcol(int rows, int cols, float* in, float* res); 15 | __global__ void kernAccSumcol(int rows, int cols, float* in, float* res, float alpha, float beta); 16 | __global__ void kernAccSumrow(int rows, int cols, float* in, float* res, float alpha, float beta); 17 | __global__ void kernSoftmax(int rows, int cols, float *in_vec, float* out_vec); //kernLinearOutCopy 18 | __global__ void kernLinearOutCopy(int rows, int cols, float *in_vec, float* out_vec); 19 | __global__ void kernDsigmoid(int n, float* in_vec, float* out_vec); 20 | __global__ void kernVecMul(int n, float *in_vec1, float *in_vec2, float *res_vec); 21 | //__global__ void kernSubIndex(int rows , int cols, const float *in_vec1, const int *in_index, float *res_vec); 22 | __global__ void kernSubClean(int rows , int cols, const float *in_vec1, const float *in_clean, float *res_vec); 23 | __global__ void kernAccSum(int n, float* in, float* res, float beta); 24 | //__global__ void kernGetMaxIndex(int rows, int cols, float* invec, int* outvec); 25 | __global__ void kernDivide(int n, float* in_vec, float* out_vec,float beta); 26 | __global__ void kernUpdatedelta(int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost); 27 | __global__ void kernWeightMultiP(int n, float p, float* in_vec); 28 | __global__ void kernDropout(int n, float p ,float* in, float* rand_vec); 29 | inline void SgemmTN(cublasHandle_t handle,int m, int k, 30 | int n, const float* A, const float* B, float* C, 31 | const float alpha, const float beta) 32 | { 33 | cublasStatus_t e =cublasSgemm(handle,CUBLAS_OP_T, CUBLAS_OP_N, 34 | m, n, k, &beta, (float*)A, k, (float*) B, k, &alpha, C, m); 35 | if(e != CUBLAS_STATUS_SUCCESS) 36 | { 37 | printf("%d,%d,%d...........SgemmTN wrong\n",m,k,n); 38 | } 39 | if(e == CUBLAS_STATUS_EXECUTION_FAILED) 40 | { 41 | printf("...........1\n"); 42 | } 43 | } 44 | 45 | inline void SgemmNN(cublasHandle_t handle,int m, int k, 46 | int n, const float* A,const float* B, float* C, 47 | const float 
alpha, const float beta) 48 | { 49 | cublasStatus_t e =cublasSgemm(handle,CUBLAS_OP_N, CUBLAS_OP_N, 50 | m, n, k, &beta, (float*)A, m, (float*) B, k, &alpha, C, m); 51 | if(e != CUBLAS_STATUS_SUCCESS) 52 | { 53 | printf("...........SgemmNN wrong\n"); 54 | } 55 | } 56 | 57 | inline void SgemmNT(cublasHandle_t handle,int m, int k, 58 | int n, const float* A, 59 | const float* B, float* C, const float alpha, const float beta) 60 | { 61 | cublasStatus_t e =cublasSgemm(handle,CUBLAS_OP_N, CUBLAS_OP_T, 62 | m, n, k, &beta, (float*)A, m, (float*) B, n, &alpha, C, m); 63 | if(e != CUBLAS_STATUS_SUCCESS) 64 | { 65 | printf("...........SgemmNT wrong\n"); 66 | } 67 | } 68 | 69 | inline void DevWeightMultiP(cudaStream_t stream, int n, float p, float* in_vec) 70 | { 71 | int nblocks=(n + NTHREADS-1)/NTHREADS; 72 | if (nblocks > CUDA_MAXBLOCKS) 73 | printf("DevWeightMultiP: nblocks too large\n"); 74 | kernWeightMultiP<<<nblocks, NTHREADS, 0, stream>>>( n, p, in_vec ); 75 | // int nblocks=(Prev_n*cur_n + NTHREADS-1)/NTHREADS; 76 | // dim3 dimBlock(NTHREADS,NTHREADS); 77 | // dim3 dimGrid((prev_n+NTHREADS-1)/NTHREADS,(cur_n+NTHREADS-1)/NTHREADS); 78 | // kernWeightMultiP<<<dimGrid, dimBlock, 0, stream>>>(p, prev_n, cur_n, in_vec); 79 | } 80 | 81 | inline void DevDropout(cudaStream_t stream, int n,float p, float * in_vec, float* rand_vec) 82 | { 83 | int nblocks=(n + NTHREADS-1)/NTHREADS; 84 | if (nblocks > CUDA_MAXBLOCKS) 85 | printf("DevDropout: nblocks too large\n"); 86 | kernDropout<<<nblocks, NTHREADS, 0, stream>>>( n, p, in_vec, rand_vec); 87 | } 88 | 89 | inline void DevSigmoid(cudaStream_t stream, int n, float* in_vec, float* out_vec) 90 | { 91 | int nblocks = (n + NTHREADS-1)/NTHREADS; 92 | if (nblocks > CUDA_MAXBLOCKS) 93 | printf("DevSigmoid: nblocks too large\n"); 94 | kernSigmoid<<<nblocks, NTHREADS, 0, stream>>>(n, in_vec, out_vec); 95 | } 96 | 97 | inline void DevDsigmoid(cudaStream_t stream, int n, float* in_vec, float* out_vec) 98 | { 99 | int nblocks = (n + NTHREADS-1)/NTHREADS; 100 | if (nblocks> CUDA_MAXBLOCKS) 101 | printf("DevDsigmoid: nblocks too large\n"); 102 | kernDsigmoid<<<nblocks, NTHREADS, 0, stream>>>(n, in_vec, out_vec); 103 | } 104 | 105 | inline void DevSoftmax(cudaStream_t stream, int rows, int cols, float* in_vecs, float* out_vecs) 106 | { 107 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 108 | if (nblocks > CUDA_MAXBLOCKS) 109 | printf("DevSoftmax: nblocks too large\n"); 110 | kernSoftmax<<<nblocks, NTHREADS, 0, stream>>>(rows, cols, in_vecs, out_vecs); 111 | } 112 | 113 | inline void DevLinearOutCopy(cudaStream_t stream, int rows, int cols, float* in_vecs, float* out_vecs) 114 | { 115 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 116 | if (nblocks > CUDA_MAXBLOCKS) 117 | printf("DevLinearOutCopy: nblocks too large\n"); 118 | kernLinearOutCopy<<<nblocks, NTHREADS, 0, stream>>>(rows, cols, in_vecs, out_vecs); 119 | } 120 | 121 | inline void DevMultiCopy(cudaStream_t stream,int mat_height, int vec_len, 122 | float* vec, float* mat) 123 | { 124 | int nblocks = (vec_len + NTHREADS-1)/NTHREADS; 125 | if (nblocks>CUDA_MAXBLOCKS) 126 | printf("DevMultiCopy: nblocks too large\n"); 127 | kernMultiCopy<<<nblocks, NTHREADS, 0, stream>>>(mat_height, vec_len, vec, mat); 128 | } 129 | 130 | inline void DevSumcol(cudaStream_t stream,int rows, int cols, float* in, float* res) 131 | { 132 | int nblocks = (cols + NTHREADS-1)/NTHREADS; 133 | if (nblocks>CUDA_MAXBLOCKS) 134 | printf("DevSumcol: nblocks too large\n"); 135 | kernSumcol<<<nblocks, NTHREADS, 0, stream>>>(rows, cols, in, res); 136 | } 137 | 138 | inline void DevAccSumcol(cudaStream_t stream,int rows, int cols, float* in, float* res, float alpha, float beta) 139 | { 140 | int nblocks = (cols + NTHREADS-1)/NTHREADS; 141 | if (nblocks>CUDA_MAXBLOCKS) 142 | printf("DevSumcol: nblocks too
large\n"); 143 | kernAccSumcol<<>>(rows, cols, in, res, alpha, beta); 144 | } 145 | 146 | inline void DevAccSumrow(cudaStream_t stream,int rows, int cols, float* in, float* res, float alpha, float beta) 147 | { 148 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 149 | if (nblocks>CUDA_MAXBLOCKS) 150 | printf("DevSumrow: nblocks too large\n"); 151 | kernAccSumrow<<>>(rows, cols, in, res, alpha, beta); 152 | } 153 | 154 | inline void DevAccSum(cudaStream_t stream, int n, float* in, float* res, float beta) 155 | { 156 | int nblocks = (n + NTHREADS-1)/NTHREADS; 157 | if (nblocks> CUDA_MAXBLOCKS) 158 | printf("DevAccSum: nblocks too large\n"); 159 | kernAccSum<<>>(n, in, res, beta); 160 | } 161 | 162 | inline void DevVecMul(cudaStream_t stream, int n, float *in_vec1, float *in_vec2, float *res_vec) 163 | { 164 | int nblocks = (n + NTHREADS-1)/NTHREADS; 165 | if (nblocks > CUDA_MAXBLOCKS) 166 | printf("DevVecMul: nblocks too large\n"); 167 | kernVecMul<<>>(n, in_vec1, in_vec2, res_vec); 168 | } 169 | 170 | //inline void DevSubIndex(cudaStream_t stream, int rows , int cols, const float *in_vec1, const int *in_index, float *res_vec) 171 | inline void DevSubClean(cudaStream_t stream, int rows , int cols, const float *in_vec1, const float *in_clean, float *res_vec)////////yongxu 172 | { 173 | int nblocks = (rows + NTHREADS-1)/NTHREADS; 174 | if (nblocks > CUDA_MAXBLOCKS) 175 | //printf("DevSubIndex: nblocks too large\n"); 176 | printf("DevSubClean: nblocks too large\n"); 177 | //kernSubIndex<<>>( rows, cols, in_vec1, in_index, res_vec); 178 | kernSubClean<<>>( rows, cols, in_vec1, in_clean, res_vec); 179 | } 180 | 181 | //inline void DevGetMaxIndex(cudaStream_t stream, int rows , int cols, float *invec, int *outvec) 182 | //{ 183 | // int nblocks = (cols + NTHREADS-1)/NTHREADS; 184 | // if (nblocks > CUDA_MAXBLOCKS) 185 | // printf("DevSubIndex: nblocks too large\n"); 186 | // kernGetMaxIndex<<>>( rows, cols, invec, outvec); 187 | //} 188 | 189 | inline void DevDivide(cudaStream_t stream, int n, float* in_vec, float* out_vec,float beta) 190 | { 191 | int nblocks = (n + NTHREADS-1)/NTHREADS; 192 | if (nblocks > CUDA_MAXBLOCKS) 193 | printf("DevDevide: nblocks too large\n"); 194 | kernDivide<<>>( n, in_vec, out_vec, beta); 195 | } 196 | 197 | inline void updatedelta(cudaStream_t stream, int size, float* delta, float* weights, float* gradient, int n, float momentum, float lr, float weightcost) 198 | { 199 | int nblocks = (size +NTHREADS-1)/NTHREADS; 200 | if (nblocks > CUDA_MAXBLOCKS) 201 | printf("updatedelta: nblocks too large\n"); 202 | kernUpdatedelta<<>>( size, delta, weights, gradient, n, momentum, lr, weightcost); 203 | } -------------------------------------------------------------------------------- /BP_GPU.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include "BP_GPU.h" 6 | #include "DevFunc.h" 7 | 8 | #define THREUSEMULTIGPU 256 9 | 10 | BP_GPU::BP_GPU(int a_GPU_selected, int a_numlayers, int *a_layersizes, int a_bunchsize, float a_lrate, float a_momentum, 11 | float a_weightcost,float **weights, float **bias,int a_dropoutflag,float a_visible_omit,float a_hid_omit) 12 | :GPU_selected(a_GPU_selected),numlayers(a_numlayers),bunchsize(a_bunchsize),momentum(a_momentum),lrate(a_lrate),weightcost(a_weightcost),dropoutflag(a_dropoutflag), visible_omit(a_visible_omit),hid_omit(a_hid_omit) 13 | { 14 | int i,j; 15 | int maxlayersize=0; 16 | //// set GPU num 17 | cudaGetDeviceCount(&GPU_total); 18 | printf("Total GPU 
Device : %d\n",GPU_total); 19 | 20 | if(GPU_selected > GPU_total || GPU_selected < 1) 21 | { 22 | printf("GPU Num %d Not In Range %d-%d\n",GPU_selected,1,GPU_total); 23 | exit(0); 24 | } 25 | printf("Use GPU Device : %d\n",GPU_selected); 26 | 27 | 28 | 29 | int bunch_part[GPU_selected]; 30 | int part = bunchsize/GPU_selected; 31 | 32 | for(i= 0; i< GPU_selected-1;i++) 33 | { 34 | bunch_part[i] = part; 35 | } 36 | bunch_part[GPU_selected -1] = bunchsize -part*(GPU_selected -1); 37 | 38 | 39 | 40 | ////Init cublas && streams 41 | dev = (BP_WorkSpace*) malloc(GPU_selected * sizeof(BP_WorkSpace)); 42 | handles = (cublasHandle_t*) malloc(GPU_selected * sizeof(cublasHandle_t)); 43 | streams = (cudaStream_t*) malloc(GPU_selected * sizeof(cudaStream_t)); 44 | gen = (curandGenerator_t*) malloc(GPU_selected * sizeof(curandGenerator_t)); 45 | 46 | for(i = 0;i < GPU_selected;i++) 47 | { 48 | cudaError_t er; 49 | curandStatus_t eg; 50 | 51 | er = cudaSetDevice(i); 52 | //er = cudaSetDevice(1); 53 | if (er!=cudaSuccess) 54 | printf("cudaSetDevice(%d) failed\n",i); 55 | 56 | 57 | er =cudaStreamCreate(&(streams[i])); 58 | if (er!=cudaSuccess) 59 | printf("cudaStreamCreate(%d) failed\n",i); 60 | 61 | cublasStatus_t eb = cublasCreate(&handles[i]); 62 | if (eb!=CUBLAS_STATUS_SUCCESS) 63 | printf("cublasCreate(%d) failed\n",i); 64 | 65 | eb = cublasSetStream(handles[i],streams[i]); 66 | if (eb!=CUBLAS_STATUS_SUCCESS) 67 | printf("cublasSetStream(handles[%d],streams[%d]) failed\n",i,i); 68 | 69 | eg = curandCreateGenerator(&gen[i] ,CURAND_RNG_PSEUDO_DEFAULT); 70 | if(eg!=CURAND_STATUS_SUCCESS) 71 | printf("curandCreateGenerator(%d) failed\n",i); 72 | 73 | eg = curandSetStream(gen[i],streams[i]); 74 | if(eg!=CURAND_STATUS_SUCCESS) 75 | printf("curandSetStream(%d) failed\n",i); 76 | 77 | srand(unsigned(time(NULL))); 78 | curandSetPseudoRandomGeneratorSeed(gen[i] ,rand()); 79 | } 80 | if(GPU_selected >1) 81 | { 82 | 83 | 84 | for(i =0;i< GPU_selected;i++) 85 | { 86 | 87 | cudaSetDevice(i); 88 | // cudaSetDevice(1); 89 | for(j =0;j< GPU_selected;j++) 90 | { 91 | 92 | if(j != i) 93 | { 94 | int UVA; 95 | cudaDeviceCanAccessPeer(&UVA,j,i); 96 | if(UVA==0) 97 | { 98 | printf("cudaDeviceCanAccessPeer error\n"); 99 | exit(0); 100 | } 101 | else 102 | { 103 | printf("cudaDeviceCanAccessPeer between Device %d and Device %d OK\n",j,i); 104 | cudaDeviceEnablePeerAccess(j, 0); 105 | } 106 | 107 | } 108 | 109 | } 110 | } 111 | } 112 | 113 | //// Alloc device Memory 114 | for(i =0; i < numlayers;i++) 115 | { 116 | layersizes[i] = a_layersizes[i]; 117 | if (maxlayersize1) 175 | { 176 | cudaDeviceSynchronize(); 177 | } 178 | 179 | ////copy weights && biases to devices 180 | for(j =0;j< GPU_selected;j++) 181 | { 182 | 183 | cudaSetDevice(j); 184 | //cudaSetDevice(1); 185 | 186 | for(i = 1; i< numlayers; i++) 187 | { 188 | todev_vf_vf("weights", layersizes[i-1] *layersizes[i], weights[i], dev[j].weights[i], streams[j]); 189 | todev_vf_vf("bias", layersizes[i], bias[i], dev[j].bias[i], streams[j]); 190 | } 191 | } 192 | if(GPU_selected >1) 193 | { 194 | cudaDeviceSynchronize(); 195 | } 196 | printf("Created net with %d layers, bunchsize %d.\n", numlayers, bunchsize); 197 | } 198 | 199 | BP_GPU::~BP_GPU() 200 | { 201 | int i,j; 202 | 203 | ////streams & cublas free 204 | for(j =0;j< GPU_selected;j++) 205 | { 206 | 207 | cudaSetDevice(j); 208 | //cudaSetDevice(1); 209 | 210 | devfree_vf("in", dev[j].in); 211 | devfree_vf("out", dev[j].out); 212 | //devfree_vi("targ", dev[j].targ); 213 | devfree_vf("targ", 
dev[j].targ);/////////////////////////////////////////////////yongxu 214 | devfree_vf("DevRandVector",dev[j].DevRandVector); 215 | devfree_vi("DevSeed", dev[j].DevSeed); 216 | for (i = 1; i< numlayers; i++) 217 | { 218 | devfree_vf("weights", dev[j].weights[i]); 219 | devfree_vf("bias", dev[j].bias[i]); 220 | devfree_vf("delta_weights", dev[j].delta_weights[i]); 221 | devfree_vf("delta_bias", dev[j].delta_bias[i]); 222 | devfree_vf("layer_x", dev[j].layer_x[i]); 223 | devfree_vf("layer_y", dev[j].layer_y[i]); 224 | devfree_vf("layer_dedx", dev[j].layer_dedx[i]); 225 | devfree_vf("layer_dydx", dev[j].layer_dydx[i]); 226 | devfree_vf("layer_dedy", dev[j].layer_dedy[i]); 227 | devfree_vf("layer_ydedx", dev[j].layer_ydedx[i]); 228 | devfree_vf("layer_sumdedx", dev[j].layer_sumdedx[i]); 229 | } 230 | 231 | 232 | cublasDestroy(handles[j]); 233 | cudaStreamDestroy(streams[j]); 234 | curandDestroyGenerator(gen[j]); 235 | } 236 | delete[] dev; 237 | } 238 | 239 | //void BP_GPU::train(int n_frames, const float* in, const int *targ) 240 | //void BP_GPU::train(int n_frames, const float* in, const float *targ)////////////////////////////by yongxu 241 | void BP_GPU::train(int n_frames, float* in, const float *targ) 242 | { 243 | 244 | int i,j; 245 | //int t; 246 | int frames_this_bunch; // Number of frames to handle this bunch 247 | int n_input = layersizes[0]; 248 | int out_dims= layersizes[numlayers-1]; 249 | 250 | float **realin = new float*[GPU_selected]; 251 | //int **realtarg = new int*[GPU_selected]; 252 | float **realtarg = new float*[GPU_selected];///////////////////////////////////by yongxu 253 | //float *realin; 254 | //int *realtarg; 255 | 256 | int n_frames_part = n_frames/GPU_selected; 257 | 258 | // for (t=0;t<517;t++) 259 | // { printf("in[%d]=%f,",t,in[t]); 260 | // } 261 | // printf ("\n"); 262 | // 263 | // for (t=0;t<200;t++) 264 | // {printf("targ[%d]=%f,",t,targ[t]); 265 | // } 266 | // printf ("\n"); 267 | 268 | // First copy data to GPU 269 | for(i= 0; i< GPU_selected;i++) 270 | { 271 | 272 | cudaSetDevice(i); 273 | //cudaSetDevice(1); 274 | todev_vf_vf("in",n_frames_part * n_input, in + i* n_frames_part* n_input, dev[i].in, streams[i]); 275 | //todev_vi_vi("targ", n_frames_part, targ + i* n_frames_part, dev[i].targ, streams[i]); 276 | todev_vf_vf("targ", n_frames_part * out_dims, targ + i* n_frames_part * out_dims, dev[i].targ, streams[i]); // per-GPU target offset includes out_dims 277 | } 278 | if(GPU_selected >1) 279 | { 280 | cudaDeviceSynchronize(); 281 | } 282 | //printf("Copy Data Success , %d Frames\n",n_frames); 283 | 284 | 285 | for(i= 0; i< GPU_selected;i++) 286 | { 287 | realin[i] = dev[i].in; 288 | realtarg[i] = dev[i].targ; 289 | } 290 | 291 | 292 | 293 | //printf("GPU_selected : %d\n",GPU_selected); 294 | for (i=0; i< n_frames; i+= bunchsize) 295 | { 296 | //printf("i=%d\n",i); 297 | frames_this_bunch = (bunchsize > n_frames - i)?(n_frames - i):bunchsize; 298 | if(frames_this_bunch == bunchsize) 299 | { 300 | //printf("in \n"); 301 | if(GPU_selected == 1) 302 | { 303 | //printf("in-in \n"); 304 | //printf("realin[0][1]=%f,realtarg[0][1]=%f\n",realin[0][1],realtarg[0][1]);// nothing gets printed here, and no error is reported either 305 | //printf("dev[0].in[1]=%f,dev[0].targ[1]=%f\n",in[1],targ[1]); 306 | //printf("begin to run train_bunch_single\n"); 307 | 308 | train_bunch_single(frames_this_bunch, realin[0], realtarg[0]);//[0] means CUDA device 0,//realin[0], realtarg[0] 309 | // run one batch at a time here; realin and realtarg are advanced so they point at each batch on the GPU 310 | //printf("complete train_bunch_single\n"); 311 | } 312 | //else 313 | //train_bunch_multi(frames_this_bunch, realin, realtarg); 314 |
} 315 | else 316 | { 317 | printf("this bunch has only %d samples and is ignored.\n",frames_this_bunch); 318 | } 319 | 320 | for(j= 0; j< GPU_selected;j++) 321 | { 322 | realin[j] += n_input * frames_this_bunch/GPU_selected; 323 | realtarg[j] += out_dims * frames_this_bunch/GPU_selected; 324 | } 325 | 326 | } 327 | //printf("end here before\n"); 328 | delete[] realin; 329 | delete[] realtarg; 330 | //printf("end here\n"); 331 | } 332 | 333 | ////void BP_GPU::train(int n_frames, const float* in, const int *targ) 334 | ////徐勇写,将上面的多个GPU去跑的程序注释掉,以免发生混乱 335 | //void BP_GPU::train(int n_frames, const float* in, const float *targ)////////////////////////////by yongxu 336 | //{ 337 | // 338 | // int i,t; 339 | // int frames_this_bunch; // Number of frames to handle this bunch 340 | // int n_input = layersizes[0]; 341 | // float *realin = new float[GPU_selected]; 342 | // //int **realtarg = new int*[GPU_selected]; 343 | // float *realtarg = new float[GPU_selected];///////////////////////////////////by yongxu 344 | // //float *realin; 345 | // //int *realtarg; 346 | // 347 | // int n_frames_part = n_frames/1; 348 | // 349 | //// for (t=0;t<560;t++)//这里check了,拼帧后,训练集三帧对应target一帧的现象 350 | //// { printf("in[%d]=%f,",t,in[t]); 351 | //// } 352 | //// printf ("\n"); 353 | //// 354 | //// for (t=0;t<200;t++) 355 | //// {printf("targ[%d]=%f,",t,targ[t]); 356 | //// } 357 | //// printf ("\n"); 358 | // 359 | // // First copy data to GPU 360 | // cudaSetDevice(0); 361 | // todev_vf_vf("in",n_frames_part * n_input, in + 0* n_frames_part* n_input, dev[0].in, streams[0]); 362 | // //todev_vi_vi("targ", n_frames_part, targ + i* n_frames_part, dev[i].targ, streams[i]); 363 | // todev_vf_vf("targ", n_frames_part * out_dims, targ + 0* n_frames_part * out_dims, dev[0].targ, streams[0]); 364 | // 365 | // printf("Copy Data Sucess , %d Frames\n",n_frames); 366 | // 367 | // realin = dev[0].in; 368 | // realtarg = dev[0].targ; 369 | // 370 | // printf("GPU_selected : %d\n",GPU_selected); 371 | // for (i=0; i< n_frames; i+= bunchsize) 372 | // { 373 | // printf("i=%d\n",i); 374 | // frames_this_bunch = (bunchsize > n_frames - i)?(n_frames - i):bunchsize; 375 | // if(frames_this_bunch == bunchsize) 376 | // { 377 | // printf("in \n"); 378 | // 379 | // //printf("realin[0]=%f,realtarg[0]=%f\n",realin[0],realtarg[0]);//这个地方输不出来,也不报错 380 | // //printf("dev[0].in[1]=%f,dev[0].targ[1]=%f\n",in[1],targ[1]); 381 | // printf("begin to run train_bunch_single\n"); 382 | // 383 | // train_bunch_single(frames_this_bunch, realin, realtarg);//[0]表示第0块cuda device 384 | // printf("complete train_bunch_single\n"); 385 | // 386 | // 387 | // } 388 | // else 389 | // { 390 | // printf("this bunch has only %d samples and is ignored.\n",frames_this_bunch); 391 | // } 392 | // 393 | // 394 | // realin += n_input * frames_this_bunch/1; 395 | // realtarg += out_dims * frames_this_bunch/1; 396 | // 397 | // 398 | // } 399 | // printf("this train end\n"); 400 | // delete[] realin; 401 | // delete[] realtarg; 402 | // 403 | // printf("this train end 2 \n"); 404 | // 405 | //} 406 | 407 | //int BP_GPU::CrossValid(int n_frames, const float* in, const int *targ) 408 | float BP_GPU::CrossValid(int n_frames, const float* in, const float *targ)/////////////////////////////////////by yongxu 409 | { 410 | //only use one GPU 411 | //int correct_samples =0; 412 | float squared_err=0.0f;/////////////////////////////////////////////by yongxu 413 | //int *out = new int [bunchsize]; 414 | int out_dims= layersizes[numlayers-1]; 415 | 416 | float *out = new 
float [bunchsize*out_dims];///////////////////////////////by yongxu, 这个地方是一个二维特征(batch*feadim) 417 | //int *out; 418 | //cudaMallocHost((void**)&out, bunchsize * sizeof(int)); 419 | int i,j,d; 420 | //int t; 421 | int frames_this_bunch; // Number of frames to handle this bunch 422 | int n_input = layersizes[0];//输入的特征维数(可能是扩展帧的) 423 | float *realin; 424 | 425 | 426 | // 427 | // for (t=0;t<560;t++)//这里check了,拼帧后,训练集三帧对应target一帧的现象 428 | // { printf("in[%d]=%f,",t,in[t]); 429 | // } 430 | // printf ("\n"); 431 | // 432 | // for (t=0;t<200;t++) 433 | // {printf("targ[%d]=%f,",t,targ[t]); 434 | // } 435 | // printf ("\n"); 436 | 437 | 438 | // First copy data to GPU 439 | cudaSetDevice(0); 440 | //cudaSetDevice(1); 441 | todev_vf_vf("in", n_frames* n_input, in, dev[0].in, streams[0]); 442 | 443 | realin = dev[0].in; 444 | 445 | FILE *fp=fopen("CV_out.txt","w"); 446 | 447 | for (i=0; i< n_frames; i+= bunchsize)//n_frames是该CV集的总帧数;bunchsize指的是一个bunch里有多少帧;然后每个bunch分别计算 448 | { 449 | 450 | frames_this_bunch = (bunchsize > n_frames - i)?(n_frames - i):bunchsize; 451 | 452 | //cv_bunch_single(frames_this_bunch, realin, out[i]); 453 | cv_bunch_single(frames_this_bunch, realin, out); 454 | 455 | //// compute correct_samples 456 | ////////////compute squared error 457 | //fprintf(fp,"%d\n\n",frames_this_bunch); 458 | for(j =0; j< frames_this_bunch;j++) 459 | { 460 | for(d=0;d0; cur_layer--) 589 | { 590 | //printf("Backward ing\n"); 591 | prev_layer = cur_layer - 1; 592 | cur_layer_units = layersizes[cur_layer]; 593 | prev_layer_units = layersizes[prev_layer]; 594 | cur_layer_size = cur_layer_units * n_frames; 595 | cur_layer_y = dev[0].layer_y[cur_layer]; 596 | if (cur_layer==1) 597 | prev_layer_y = in; 598 | else 599 | prev_layer_y = dev[0].layer_y[prev_layer]; 600 | cur_layer_dydx = dev[0].layer_dydx[cur_layer]; 601 | cur_layer_dedy = dev[0].layer_dedy[cur_layer]; 602 | prev_layer_dedy = dev[0].layer_dedy[prev_layer]; 603 | cur_layer_dedx = dev[0].layer_dedx[cur_layer]; 604 | cur_layer_ydedx = dev[0].layer_ydedx[cur_layer]; 605 | cur_layer_sumdedx = dev[0].layer_sumdedx[cur_layer]; 606 | cur_layer_bias = dev[0].bias[cur_layer]; 607 | cur_layer_delta_bias = dev[0].delta_bias[cur_layer]; 608 | cur_layer_delta_weights = dev[0].delta_weights[cur_layer]; 609 | cur_weights = dev[0].weights[cur_layer]; 610 | 611 | if (cur_layer != numlayers - 1) 612 | { 613 | //printf("former layers' sigmoid\n"); 614 | DevDsigmoid(streams[0], cur_layer_size, cur_layer_y, cur_layer_dydx); 615 | DevVecMul(streams[0], cur_layer_size, cur_layer_dydx, cur_layer_dedy, cur_layer_dedx); 616 | } 617 | //else 618 | //{ 619 | 620 | //DevSubIndex(streams[0], n_frames, cur_layer_units, dev[0].out, targ, cur_layer_dedx); 621 | //从cpu复制到gpu 622 | // DevLinearOutCopy(streams[0], n_frames, cur_layer_units, dev[0].out, targ, cur_layer_dedx); 623 | //} 624 | //对平方误差求导,//////////////////////////////////////////yongxu 625 | else 626 | { 627 | //printf("begin to cal squared error\n"); 628 | //printf("targ[0]=%f,targ[1]=%f\n",targ[0],targ[1]); 629 | //DevSubClean(streams[0], n_frames, cur_layer_units, dev[0].layer_x[numlayers - 1], targ, cur_layer_dedx); 630 | DevSubClean(streams[0], n_frames, cur_layer_units, dev[0].out, targ, cur_layer_dedx); 631 | //dev[0].layer_x[numlayers - 1] 632 | } 633 | 634 | if (cur_layer != 1) 635 | { 636 | SgemmTN(handles[0], prev_layer_units, cur_layer_units, n_frames, cur_weights, cur_layer_dedx, prev_layer_dedy, zero, one); 637 | } 638 | 639 | // Update weights. 
640 | //printf("Update weights\n"); 641 | //SgemmNT(handles[0], cur_layer_units, n_frames, prev_layer_units, cur_layer_dedx, prev_layer_y, cur_layer_delta_weights ,momentum, -cur_lrate/n_frames); 642 | SgemmNT(handles[0], cur_layer_units, n_frames, prev_layer_units, cur_layer_dedx, prev_layer_y, cur_layer_ydedx ,zero, one); 643 | updatedelta(streams[0], cur_layer_units * prev_layer_units, cur_layer_delta_weights, cur_weights, cur_layer_ydedx, n_frames, momentum, cur_lrate, weightcost); 644 | //cublasSaxpy(handles[0],cur_layer_units *prev_layer_units, &cur_lr_wc, cur_weights,1,cur_layer_delta_weights ,1); 645 | 646 | //DevAccSumrow(streams[0], cur_layer_units, n_frames, cur_layer_dedx, cur_layer_delta_bias, momentum, -cur_lrate/n_frames); 647 | DevAccSumrow(streams[0], cur_layer_units, n_frames, cur_layer_dedx, cur_layer_sumdedx, zero, one); 648 | updatedelta(streams[0], cur_layer_units, cur_layer_delta_bias, cur_layer_bias, cur_layer_sumdedx, n_frames, momentum, cur_lrate, zero); 649 | //cublasSaxpy(handles[0],cur_layer_units, &cur_lr_wc, cur_layer_bias,1,cur_layer_delta_bias ,1); 650 | 651 | DevAccSum(streams[0], cur_layer_units *prev_layer_units, cur_layer_delta_weights, cur_weights, 1.0); 652 | DevAccSum(streams[0], cur_layer_units, cur_layer_delta_bias, cur_layer_bias, 1.0); 653 | 654 | /// 655 | /* 656 | if(cur_layer ==1){ 657 | float *tmpout = new float[1 *cur_layer_units]; 658 | fromdev_vf_vf("data",1 *cur_layer_units, cur_layer_bias,tmpout, streams[0]); 659 | for(int tmpj =0 ;tmpj < cur_layer_units ;tmpj ++) 660 | { 661 | for(int tmpi =0;tmpi< 1; tmpi++) 662 | { 663 | printf("%f\n",(tmpout[tmpj + tmpi *cur_layer_units])); 664 | } 665 | } 666 | delete [] tmpout; 667 | exit(0);} 668 | */ 669 | /// 670 | //printf("come to end\n"); 671 | } 672 | //fclose(fp); 673 | } 674 | 675 | //void BP_GPU::cv_bunch_single(int n_frames, const float *in, int* out) 676 | void BP_GPU::cv_bunch_single(int n_frames, const float *in, float* out)///////////////////////////////by yongxu 677 | { 678 | 679 | 680 | const float one = 1.0f; 681 | //const float zero = 0.0f; 682 | //int i,j; 683 | int cur_layer; // The index of the current layer. 684 | int prev_layer; // The index of the previous layer. 685 | int cur_layer_units; // The number of units in the current layer. 686 | int prev_layer_units; // The number of units in the previous layer. 687 | int cur_layer_size; // The size of the current layer. 688 | int out_dims= layersizes[numlayers-1]; 689 | 690 | float* cur_layer_x; 691 | float* cur_layer_y; // Output from the current layer 692 | const float* prev_layer_y; // Output from the previous non-linearity. 693 | float* cur_layer_bias; // Biases for the current layer. 694 | float* cur_weights; // Weights inputing to the current layer. 
695 | 696 | //int *devout; 697 | //devnew_vi("devout", n_frames, &devout); 698 | float *devout;/////////////////////////////////by yongxu 699 | devnew_vf("devout", n_frames*out_dims, &devout); 700 | 701 | //dropout参数 702 | int weight_size; 703 | float vis_keep; 704 | float hid_keep; 705 | vis_keep=1.0f-visible_omit; 706 | hid_keep=1.0f-hid_omit; 707 | 708 | //// Forward 709 | for (cur_layer=1; cur_layer< numlayers; cur_layer++) 710 | { 711 | prev_layer = cur_layer - 1; 712 | cur_layer_units = layersizes[cur_layer]; 713 | prev_layer_units = layersizes[prev_layer]; 714 | cur_layer_size = cur_layer_units * n_frames; 715 | cur_layer_x = dev[0].layer_x[cur_layer]; 716 | cur_layer_y = dev[0].layer_y[cur_layer]; 717 | 718 | weight_size=prev_layer_units*cur_layer_units; 719 | 720 | if (cur_layer==1) 721 | prev_layer_y = in; 722 | else 723 | prev_layer_y = dev[0].layer_y[prev_layer]; 724 | cur_layer_bias = dev[0].bias[cur_layer]; 725 | 726 | if (dropoutflag==1) 727 | { 728 | if(cur_layer==1) 729 | DevWeightMultiP(streams[0], weight_size, vis_keep, dev[0].weights[cur_layer]); 730 | else 731 | DevWeightMultiP(streams[0], weight_size, hid_keep, dev[0].weights[cur_layer]); 732 | } 733 | 734 | cur_weights = dev[0].weights[cur_layer]; 735 | 736 | DevMultiCopy(streams[0],n_frames, cur_layer_units, cur_layer_bias, cur_layer_x); 737 | SgemmNN(handles[0],cur_layer_units, prev_layer_units, n_frames, cur_weights, prev_layer_y, cur_layer_x, one, one); 738 | 739 | if (dropoutflag==1) 740 | { 741 | if(cur_layer==1) 742 | DevWeightMultiP(streams[0], weight_size, 1.0f/vis_keep, dev[0].weights[cur_layer]); 743 | else 744 | DevWeightMultiP(streams[0], weight_size, 1.0f/hid_keep, dev[0].weights[cur_layer]); 745 | 746 | } 747 | 748 | if (cur_layer != numlayers - 1){ 749 | DevSigmoid(streams[0],cur_layer_size, cur_layer_x, cur_layer_y); 750 | } 751 | else{ /////////////////////////////////////////yongxu 注释掉就可以得到一个线性输出 752 | // DevSoftmax(streams[0],n_frames, cur_layer_units, cur_layer_x, dev[0].out); 753 | //DevSigmoid(streams[0],cur_layer_size, cur_layer_x, cur_layer_y); 754 | // DevGetMaxIndex(streams[0], cur_layer_units, n_frames, dev[0].out, devout); 755 | //DevLinearOutCopy(streams[0],n_frames, cur_layer_units, cur_layer_x, dev[0].out); 756 | //cudaMemcpy(dev[0].out,cur_layer_x,n_frames*cur_layer_units*sizeof(float),cudaMemcpyDeviceToDevice); 757 | cudaMemcpy(devout,cur_layer_x,n_frames*cur_layer_units*sizeof(float),cudaMemcpyDeviceToDevice); 758 | } 759 | } 760 | //fromdev_vi_vi("devout",n_frames,devout,out, streams[0]); 761 | //devfree_vi("devout",devout);/////////////////////////////////////////yongxu 762 | fromdev_vf_vf("devout",n_frames*out_dims,devout,out, streams[0]); 763 | devfree_vf("devout",devout); 764 | 765 | //// 766 | // float *asf = new float[cur_layer_units* n_frames]; 767 | // //fromdev_vf_vf("out", cur_layer_units* n_frames, dev[0].out ,asf, streams[0]); 768 | // for(int tmp=0;tmp < n_frames;tmp++) 769 | // printf("%d\n",out[tmp]); 770 | // delete []asf; 771 | // exit(0); 772 | 773 | } 774 | 775 | ////void BP_GPU::train_bunch_multi(int n_frames, float **in, int** targ) 776 | //void BP_GPU::train_bunch_multi(int n_frames, float **in, float** targ)/////////////////////yongxu 777 | //{ 778 | // const float one = 1.0f; 779 | // const float zero = 0.0f; 780 | // int i; 781 | // int cur_layer; // The index of the current layer. 782 | // int prev_layer; // The index of the previous layer. 
783 | // 784 | // float cur_lrate = lrate; 785 | // 786 | // int n_frames_part[GPU_selected]; 787 | // int part = bunchsize/GPU_selected; 788 | // 789 | // for(i= 0; i< GPU_selected;i++) 790 | // { 791 | // n_frames_part[i] = part; 792 | // } 793 | // n_frames_part[GPU_selected -1] = n_frames -part*(GPU_selected -1); 794 | // 795 | // for(i=0;i0; cur_layer--) 821 | // { 822 | // prev_layer = cur_layer - 1; 823 | // 824 | // 825 | // if (cur_layer != numlayers - 1) 826 | // { 827 | // DevDsigmoid(streams[i], layersizes[cur_layer] * n_frames_part[i], dev[i].layer_y[cur_layer], dev[i].layer_dydx[cur_layer]); 828 | // DevVecMul(streams[i], layersizes[cur_layer] * n_frames_part[i], dev[i].layer_dydx[cur_layer], dev[i].layer_dedy[cur_layer], dev[i].layer_dedx[cur_layer]); 829 | // 830 | // } 831 | // //else/////////////////////////////////yongxu, 注释掉就能得到线性输出吗? 832 | // //{ 833 | // // 834 | // // DevSubIndex(streams[i], n_frames_part[i], layersizes[cur_layer], dev[i].out, targ[i], dev[i].layer_dedx[cur_layer]); 835 | // // 836 | // //} 837 | // //对平方误差求导,//////////////////////////////////////////yongxu 838 | // else 839 | // { 840 | // 841 | // DevSubClean(streams[i], n_frames_part[i], layersizes[cur_layer], dev[i].layer_x[numlayers - 1], targ[i], dev[i].layer_dedx[cur_layer]); 842 | // 843 | // } 844 | // 845 | // if (cur_layer != 1) 846 | // { 847 | // SgemmTN(handles[i], layersizes[prev_layer], layersizes[cur_layer], n_frames_part[i], dev[i].weights[cur_layer], dev[i].layer_dedx[cur_layer], dev[i].layer_dedy[prev_layer], zero, one); 848 | // 849 | // } 850 | // 851 | // // Update weights. 852 | // if (cur_layer ==1) 853 | // SgemmNT(handles[i], layersizes[cur_layer], n_frames_part[i], layersizes[prev_layer], dev[i].layer_dedx[cur_layer], in[i], dev[i].layer_ydedx[cur_layer] ,zero, one); 854 | // else 855 | // SgemmNT(handles[i], layersizes[cur_layer], n_frames_part[i], layersizes[prev_layer], dev[i].layer_dedx[cur_layer], dev[i].layer_y[prev_layer], dev[i].layer_ydedx[cur_layer] ,zero, one); 856 | // DevAccSumrow(streams[i], layersizes[cur_layer], n_frames_part[i], dev[i].layer_dedx[cur_layer], dev[i].layer_sumdedx[cur_layer], zero, one); 857 | // 858 | // } 859 | // } 860 | // cudaDeviceSynchronize(); 861 | // cudaSetDevice(0); 862 | // 863 | // for(i= 1; i< GPU_selected;i++) 864 | // { 865 | // cudaDeviceEnablePeerAccess(i, 0); 866 | // for (cur_layer=1; cur_layer< numlayers; cur_layer++) 867 | // { 868 | // prev_layer = cur_layer - 1; 869 | // 870 | // cublasSaxpy(handles[0],layersizes[cur_layer] * layersizes[prev_layer], &one, dev[i].layer_ydedx[cur_layer], 1, dev[0].layer_ydedx[cur_layer] , 1); 871 | // cublasSaxpy(handles[0],layersizes[cur_layer], &one, dev[i].layer_sumdedx[cur_layer], 1, dev[0].layer_sumdedx[cur_layer] , 1); 872 | // 873 | // } 874 | // } 875 | // cudaDeviceSynchronize(); 876 | // for (cur_layer=1; cur_layer< numlayers; cur_layer++) 877 | // { 878 | // prev_layer = cur_layer - 1; 879 | // 880 | // updatedelta(streams[0], layersizes[cur_layer] * layersizes[prev_layer], dev[0].delta_weights[cur_layer], dev[0].weights[cur_layer], dev[0].layer_ydedx[cur_layer], n_frames, momentum, cur_lrate, weightcost); 881 | // updatedelta(streams[0], layersizes[cur_layer], dev[0].delta_bias[cur_layer], dev[0].bias[cur_layer], dev[0].layer_sumdedx[cur_layer], n_frames, momentum, cur_lrate, zero); 882 | // DevAccSum(streams[0], layersizes[cur_layer] * layersizes[prev_layer], dev[0].delta_weights[cur_layer], dev[0].weights[cur_layer], 1.0); 883 | // DevAccSum(streams[0], 
layersizes[cur_layer], dev[0].delta_bias[cur_layer], dev[0].bias[cur_layer], 1.0); 884 | // } 885 | // //cudaStreamSynchronize(streams[0]); 886 | // 887 | // ////copy paras to other gpus 888 | // for(i= 1; i< GPU_selected;i++) 889 | // { 890 | // //cudaSetDevice(i); 891 | // //cudaDeviceEnablePeerAccess(i, 0); 892 | // for (cur_layer=1; cur_layer< numlayers; cur_layer++) 893 | // { 894 | // prev_layer = cur_layer - 1; 895 | // 896 | // cublasScopy(handles[0], layersizes[cur_layer] * layersizes[prev_layer], dev[0].weights[cur_layer],1,dev[i].weights[cur_layer] ,1); 897 | // cublasScopy(handles[0], layersizes[cur_layer], dev[0].bias[cur_layer],1, dev[i].bias[cur_layer] ,1); 898 | // 899 | // 900 | // cublasScopy(handles[0],layersizes[cur_layer] * layersizes[prev_layer], dev[0].delta_weights[cur_layer],1,dev[i].delta_weights[cur_layer] ,1); 901 | // cublasScopy(handles[0],layersizes[cur_layer], dev[0].delta_bias[cur_layer],1, dev[i].delta_bias[cur_layer] ,1); 902 | // } 903 | // 904 | // } 905 | // cudaStreamSynchronize(streams[0]); 906 | // cudaDeviceSynchronize(); 907 | // 908 | //} 909 | 910 | void BP_GPU::returnWeights(float **weights, float **bias) 911 | { 912 | int i; 913 | ////copy weights && biases to devices 914 | 915 | cudaSetDevice(0); 916 | // cudaSetDevice(1); 917 | 918 | for(i = 1; i< numlayers; i++) 919 | { 920 | fromdev_vf_vf("weights", layersizes[i-1] *layersizes[i], dev[0].weights[i], weights[i], streams[0]); 921 | fromdev_vf_vf("bias", layersizes[i], dev[0].bias[i], bias[i], streams[0]); 922 | } 923 | } 924 | 925 | ///// following are alloc and free functions 926 | void BP_GPU::devnew_vf(const char* varname, int n, float **devptr) 927 | { 928 | cudaError_t cudaStat = cudaMalloc((void **) devptr, n* sizeof(float)); 929 | if(cudaStat !=cudaSuccess ) 930 | { 931 | printf("%s device momory alloc error\n", varname); 932 | exit(0); 933 | } 934 | //float *zero = new float [n]; 935 | float *zero; 936 | cudaMallocHost((void**)&zero,n*sizeof(float)); 937 | 938 | for(int i=0;i< n;i++) 939 | zero[i] = 0.0f; 940 | cublasSetVector(n,sizeof(float),zero,1,(*devptr),1); 941 | //delete []zero; 942 | cudaFreeHost(zero); 943 | } 944 | 945 | void BP_GPU::devnew_vi(const char* varname, int n, int **devptr) 946 | { 947 | cudaError_t cudaStat = cudaMalloc((void **) devptr, n* sizeof(int)); 948 | if(cudaStat !=cudaSuccess ) 949 | { 950 | printf( "%s device momory alloc error\n", varname); 951 | exit(0); 952 | } 953 | //int *zero = new int [n]; 954 | int *zero; 955 | cudaMallocHost((void**)&zero,n*sizeof(int)); 956 | 957 | for(int i=0;i< n;i++) 958 | zero[i] = 0; 959 | cublasSetVector(n,sizeof(int),zero,1,(*devptr),1); 960 | //delete []zero; 961 | cudaFreeHost(zero); 962 | } 963 | 964 | void BP_GPU::devfree_vf(const char* varname, float* devptr) 965 | { 966 | cudaFree((void *) devptr); 967 | } 968 | 969 | void BP_GPU::devfree_vi(const char* varname, int* devptr) 970 | { 971 | cudaFree((void *) devptr); 972 | } 973 | 974 | void BP_GPU::todev_vf_vf(const char* varname, int n, const float* from, float* devto, cudaStream_t stream) 975 | { 976 | cublasStatus_t e = cublasSetVectorAsync(n, sizeof(float), from, 1, devto, 1, stream); 977 | if (e != CUBLAS_STATUS_SUCCESS) 978 | { 979 | printf("cuda blas todev_vf_vf error variable %s\n",varname); 980 | exit(0); 981 | } 982 | } 983 | 984 | void BP_GPU::fromdev_vf_vf(const char* varname, int n, const float* devfrom, float* to, cudaStream_t stream) 985 | { 986 | cublasStatus_t e = cublasGetVectorAsync(n, sizeof(float), devfrom, 1, to, 1, stream); 987 | if (e != 
CUBLAS_STATUS_SUCCESS) 988 | { 989 | printf("cuda blas fromdev_vf_vf error variable %s\n",varname); 990 | exit(0); 991 | } 992 | } 993 | 994 | //void BP_GPU::todev_vi_vi(const char* varname, int n, const int* from,int *devto, cudaStream_t stream) 995 | //{ 996 | // cublasStatus_t e = cublasSetVectorAsync(n, sizeof(int), from, 1, devto, 1, stream); 997 | // if (e != CUBLAS_STATUS_SUCCESS) 998 | // { 999 | // printf("cuda blas todev_vi_vi error variable %s\n", varname); 1000 | // exit(0); 1001 | // } 1002 | //} 1003 | 1004 | //void BP_GPU::fromdev_vi_vi(const char* varname, int n,const int* devfrom, int* to, cudaStream_t stream) 1005 | //{ 1006 | // cublasStatus_t e = cublasGetVectorAsync(n, sizeof(int), devfrom, 1, to, 1, stream); 1007 | // if (e != CUBLAS_STATUS_SUCCESS) 1008 | // { 1009 | // printf("cuda blas fromdev_vi_vi error variable %s\n", varname); 1010 | // exit(0); 1011 | // } 1012 | //} 1013 | --------------------------------------------------------------------------------
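Note on the update rule used in BP_GPU.cu above: every Dev* wrapper in DevFunc.h launches one thread per array element (nblocks = (n + NTHREADS-1)/NTHREADS blocks of NTHREADS threads on the caller's stream), and in train_bunch_single the pair updatedelta() followed by DevAccSum() performs SGD with momentum and L2 weight decay on each weight matrix and bias vector. The kernel bodies live in DevFunc.cu and are not reproduced in this listing, so what follows is only a minimal, self-contained sketch of that launch pattern and the assumed update rule; the kernel names (sketchUpdateDelta, sketchAccSum), the exact placement of the 1/n and weightcost terms, and the hyper-parameter values are illustrative assumptions, not the repository's own code.

// Sketch: per-parameter momentum + weight-decay update, one thread per element.
// Assumed rule: delta[i] = momentum*delta[i] - lr*(gradient[i]/n + weightcost*weights[i]);
//               weights[i] += delta[i];
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void sketchUpdateDelta(int size, float* delta, const float* weights,
                                  const float* gradient, int n,
                                  float momentum, float lr, float weightcost)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size)
        delta[i] = momentum * delta[i]
                 - lr * (gradient[i] / n + weightcost * weights[i]);
}

__global__ void sketchAccSum(int size, const float* delta, float* weights)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size)
        weights[i] += delta[i];   // corresponds to DevAccSum(..., 1.0) in the training loop
}

int main()
{
    const int size = 4;
    const int NTHREADS = 256;                      // same block size as DevFunc.h
    float h_w[size] = {0.1f, -0.2f, 0.3f, 0.0f};   // toy weights
    float h_g[size] = {0.5f, 0.5f, -0.5f, 1.0f};   // toy summed gradients
    float h_d[size] = {0.0f, 0.0f, 0.0f, 0.0f};    // momentum buffer starts at zero
    float *w, *g, *d;
    cudaMalloc((void**)&w, size * sizeof(float));
    cudaMalloc((void**)&g, size * sizeof(float));
    cudaMalloc((void**)&d, size * sizeof(float));
    cudaMemcpy(w, h_w, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(g, h_g, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d, h_d, size * sizeof(float), cudaMemcpyHostToDevice);

    // Same one-thread-per-element launch shape as the Dev* wrappers.
    int nblocks = (size + NTHREADS - 1) / NTHREADS;
    sketchUpdateDelta<<<nblocks, NTHREADS>>>(size, d, w, g, /*n=*/1,
                                             /*momentum=*/0.9f, /*lr=*/0.01f,
                                             /*weightcost=*/1e-4f);
    sketchAccSum<<<nblocks, NTHREADS>>>(size, d, w);

    cudaMemcpy(h_w, w, size * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < size; i++)
        printf("w[%d] = %f\n", i, h_w[i]);
    cudaFree(w); cudaFree(g); cudaFree(d);
    return 0;
}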