├── .gitignore ├── LICENSE ├── cuda ├── cuda_stream.cu ├── lr │ ├── build.sh │ ├── dot.cc │ ├── dot.cu │ └── dot_cusparse.cu ├── seq.cu └── seq_runtime.cu ├── matrix └── matrix.cpp └── mxnet ├── lstm_sort ├── infer_sort.py ├── lstm_sort.py └── sort_io.py ├── nce-loss ├── lstm_word.py ├── nce.py ├── toy_nce.py ├── toy_softmax.py └── wordvec.py ├── ocr ├── cnn_ocr.py └── infer_cnn_ocr.py └── triple-loss ├── test.py └── triplet_loss.py /.gitignore: -------------------------------------------------------------------------------- 1 | run 2 | tensorflow 3 | mxnet/word2vec 4 | *.pyc 5 | *.bin 6 | *.run 7 | warpctc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Xiang Liang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
// --------------------------------------------------------------------------
// /cuda/cuda_stream.cu
// Multi-GPU streams demo: for each visible device, asynchronously copy a
// buffer up, run a kernel whose net effect is doubling every element, copy
// the result back, and time each phase with CUDA events recorded on that
// device's stream.
// --------------------------------------------------------------------------
#include <iostream>
#include <cstdio>
#include <cstdlib>
using namespace std;

#include <cuda_runtime.h>

// Report success/failure of a CUDA runtime call, tagged with `name`.
#define CUDA_CALL(func, name) \
{ \
    cudaError_t e = (func); \
    if(e != cudaSuccess) \
        cout << "CUDA: " << cudaGetErrorString(e) << ": " << (name) << endl; \
    else \
        cout << "CUDA SUCC: " << (name) << endl; \
}

// Fill data[0..num) with the sequence 0, 1, 2, ...
void fill_array(int * data, const int num) {
    for(int i = 0; i < num; i++){
        data[i] = i;
    }
}

// Verify the round trip: every element must come back doubled (data[i] == i*2).
void check_array(char * device_prefix,
                 int * data,
                 const int num) {
    bool error_found = false;
    for(int i = 0; i < num; i++) {
        if(data[i] != i * 2){
            cout << "error: " << device_prefix << "\t" << i << "\t" << data[i] << endl;
            error_found = true;
        }
    }
    if (!error_found)
        cout << "passed: " << device_prefix << endl;
}

// Busy-work kernel; the net effect is data[tid] *= 2.
// Launched so grid*block == NUM_ELEM exactly (NUM_ELEM is a multiple of the
// block size), hence no explicit bounds check is needed.
__global__ void gpu_test_kernel(int * data) {
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for(int i = 0; i < 10000; i++){
        data[tid] *= 2;
        data[tid] /= 2;
    }
    data[tid] *= 2;
}

#define MAX_NUM_DEVICES (4)
#define NUM_ELEM (1024*1024*8)

cudaStream_t stream[MAX_NUM_DEVICES];

char device_prefix[MAX_NUM_DEVICES][300];

int * gpu_data[MAX_NUM_DEVICES];
int * cpu_src_data[MAX_NUM_DEVICES];
int * cpu_dst_data[MAX_NUM_DEVICES];

cudaEvent_t kernel_start_event[MAX_NUM_DEVICES];
cudaEvent_t memcpy_to_start_event[MAX_NUM_DEVICES];
cudaEvent_t memcpy_from_start_event[MAX_NUM_DEVICES];
cudaEvent_t memcpy_from_stop_event[MAX_NUM_DEVICES];

// Issue the copy/kernel/copy pipeline on every device, then collect timings
// and verify results in a second pass so the devices run concurrently.
__host__ void gpu_kernel(void) {

    const int shared_memory_usage = 0;
    const size_t single_gpu_chunk_size = sizeof(int) * NUM_ELEM;
    const int num_threads = 256;
    const int num_blocks = ((NUM_ELEM + (num_threads - 1)) / num_threads);
    cout << "begin" << endl;

    int num_devices;
    CUDA_CALL(cudaGetDeviceCount(&num_devices), "cudaGetDeviceCount");
    if(num_devices > MAX_NUM_DEVICES)
        num_devices = MAX_NUM_DEVICES;

    cout << "num devices: " << num_devices << endl;

    for(int device_num = 0; device_num < num_devices; device_num++) {
        CUDA_CALL(cudaSetDevice(device_num), "cudaSetDevice");

        struct cudaDeviceProp device_prop;
        CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num),
                  "cudaGetDeviceProperties");
        sprintf(&device_prefix[device_num][0], "\nID: %d %s : ", device_num,
                device_prop.name);

        CUDA_CALL(cudaStreamCreate(&stream[device_num]), "cudaStreamCreate");
        CUDA_CALL(cudaMalloc((void**)&gpu_data[device_num], single_gpu_chunk_size),
                  "cudaMalloc");

        // Pinned host buffers so cudaMemcpyAsync can truly overlap.
        CUDA_CALL(cudaMallocHost((void**)&cpu_src_data[device_num],
                                 single_gpu_chunk_size), "cudaMallocHost");
        CUDA_CALL(cudaMallocHost((void**)&cpu_dst_data[device_num],
                                 single_gpu_chunk_size), "cudaMallocHost");

        fill_array(cpu_src_data[device_num], NUM_ELEM);

        CUDA_CALL(cudaEventCreate(&memcpy_to_start_event[device_num]), "create memcpy_to_start_event");
        CUDA_CALL(cudaEventCreate(&kernel_start_event[device_num]), "create kernel_start_event");
        CUDA_CALL(cudaEventCreate(&memcpy_from_start_event[device_num]), "create memcpy_from_start_event");
        CUDA_CALL(cudaEventCreate(&memcpy_from_stop_event[device_num]), "create memcpy_from_stop_event");

        // BUGFIX: every event must be recorded on the SAME stream as the
        // async work it brackets.  The original recorded on the default
        // stream (no stream argument), so the elapsed times did not measure
        // the copies/kernel issued on stream[device_num].
        CUDA_CALL(cudaEventRecord(memcpy_to_start_event[device_num],
                                  stream[device_num]), "memcpy_to_start_event");
        CUDA_CALL(cudaMemcpyAsync(gpu_data[device_num],
                                  cpu_src_data[device_num],
                                  single_gpu_chunk_size,
                                  cudaMemcpyHostToDevice,
                                  stream[device_num]), "cudaMemcpyAsync H2D");

        CUDA_CALL(cudaEventRecord(kernel_start_event[device_num],
                                  stream[device_num]), "kernel_start_event");

        gpu_test_kernel<<<num_blocks, num_threads,
                          shared_memory_usage,
                          stream[device_num]>>>(gpu_data[device_num]);

        CUDA_CALL(cudaEventRecord(memcpy_from_start_event[device_num],
                                  stream[device_num]), "memcpy_from_start_event");
        CUDA_CALL(cudaMemcpyAsync(cpu_dst_data[device_num],
                                  gpu_data[device_num],
                                  single_gpu_chunk_size,
                                  cudaMemcpyDeviceToHost,
                                  stream[device_num]), "cudaMemcpyAsync D2H");
        CUDA_CALL(cudaEventRecord(memcpy_from_stop_event[device_num],
                                  stream[device_num]), "memcpy_from_stop_event");
    }

    // Second pass: drain each device's stream, read timings, verify, clean up.
    for(int device_num = 0; device_num < num_devices; device_num++) {
        CUDA_CALL(cudaSetDevice(device_num), "cudaSetDevice");
        CUDA_CALL(cudaStreamSynchronize(stream[device_num]), "cudaStreamSynchronize");

        float time_copy_to_ms;
        CUDA_CALL(cudaEventElapsedTime(&time_copy_to_ms,
                                       memcpy_to_start_event[device_num],
                                       kernel_start_event[device_num]), "elapsed copy_to");

        float time_kernel_ms;
        CUDA_CALL(cudaEventElapsedTime(&time_kernel_ms,
                                       kernel_start_event[device_num],
                                       memcpy_from_start_event[device_num]), "elapsed kernel");

        float time_copy_from_ms;
        CUDA_CALL(cudaEventElapsedTime(&time_copy_from_ms,
                                       memcpy_from_start_event[device_num],
                                       memcpy_from_stop_event[device_num]), "elapsed copy_from");

        float time_exec_ms;
        CUDA_CALL(cudaEventElapsedTime(&time_exec_ms,
                                       memcpy_to_start_event[device_num],
                                       memcpy_from_stop_event[device_num]), "elapsed total");

        // BUGFIX: the original leaked four events per device.
        CUDA_CALL(cudaEventDestroy(memcpy_to_start_event[device_num]), "destroy event");
        CUDA_CALL(cudaEventDestroy(kernel_start_event[device_num]), "destroy event");
        CUDA_CALL(cudaEventDestroy(memcpy_from_start_event[device_num]), "destroy event");
        CUDA_CALL(cudaEventDestroy(memcpy_from_stop_event[device_num]), "destroy event");

        CUDA_CALL(cudaStreamDestroy(stream[device_num]), "cudaStreamDestroy");
        CUDA_CALL(cudaFree(gpu_data[device_num]), "cudaFree");

        check_array(device_prefix[device_num],
                    cpu_dst_data[device_num],
                    NUM_ELEM);

        CUDA_CALL(cudaFreeHost(cpu_src_data[device_num]), "cudaFreeHost");
        CUDA_CALL(cudaFreeHost(cpu_dst_data[device_num]), "cudaFreeHost");
        CUDA_CALL(cudaDeviceReset(), "cudaDeviceReset");

        cout << time_copy_to_ms << "\t" << time_kernel_ms << "\t" << time_copy_from_ms << "\n"
             << time_exec_ms << "\t" << time_copy_to_ms + time_kernel_ms + time_copy_from_ms << endl;
    }
}

int main(){
    gpu_kernel();
}
// --------------------------------------------------------------------------
// /cuda/lr/build.sh:
//   nvcc dot.cu -o dot.run -L/usr/local/cuda/lib64/ -lcusparse -lcurand
// --------------------------------------------------------------------------
// /cuda/lr/dot.cc
// CPU reference implementation of logistic regression trained by SGD on
// sparse samples, used to sanity-check the GPU versions (dot.cu,
// dot_cusparse.cu).  A sample is a vector of (feature_id, value) pairs.
#include <iostream>
#include <vector>
#include <utility>
#include <cstdlib>
#include <ctime>
#include <cmath>
using namespace std;

// Sparse dot product: sum of value * b[feature_id] over the sample's entries.
double cpu_dot(const vector< pair<int, double> > & a, const vector<double> & b) {
    double ret = 0.0;
    for(vector< pair<int, double> >::const_iterator i = a.begin();
        i != a.end(); i++) {
        ret += i->second * b[i->first];
    }
    return ret;
}

// Dot product of every sample in the batch against the weight vector b.
vector<double> cpu_batch_dot(const vector< vector< pair<int, double> > > & data,
                             const vector<double> & b) {
    vector<double> rets(data.size(), 0);
    for(size_t i = 0; i < data.size(); i++){
        rets[i] = cpu_dot(data[i], b);
    }
    return rets;
}

double sigmoid(double x) {
    return 1.0 / (1.0 + exp(-1.0 * x));
}

// One SGD step for one sample.  wtx is the precomputed dot product w.x.
// Returns |label - sigmoid(wtx)| so callers can average the absolute error.
double cpu_grad(const vector< pair<int, double> > & x,
                const double wtx,
                const int label,
                vector<double> & w,
                const double learning_rate,
                const double lambda) {
    double err = (double)label - sigmoid(wtx);
    for(vector< pair<int, double> >::const_iterator i = x.begin();
        i != x.end(); i++) {
        // BUGFIX/generalization: the LR gradient for feature i is
        // err * x_i, so the update must scale by the feature value
        // (i->second).  The original omitted it, which was only correct
        // because mock_sample always emits value 1.0.
        w[i->first] += learning_rate * (err * i->second - lambda * w[i->first]);
    }
    // BUGFIX: use fabs, not abs; the int overload of abs would truncate the
    // fractional error to 0 almost always.
    return fabs(err);
}

// One SGD step over a whole batch; returns the mean absolute error.
double cpu_batch_grad(const vector< vector< pair<int, double> > > & data,
                      const vector< int > & label,
                      vector<double> & b,
                      const double learning_rate,
                      const double lambda) {
    vector<double> dot = cpu_batch_dot(data, b);
    double err = 0.;
    double total = 0.;
    for(size_t i = 0; i < data.size(); i++) {
        err += cpu_grad(data[i], dot[i], label[i], b, learning_rate, lambda);
        total += 1.;
    }
    return err / total;
}

// Generate a random sparse sample of 10..109 features with value 1.0.
// The label is 1 iff more even feature ids were drawn than odd ones, so a
// perfect model puts positive weight on even features.
void mock_sample(const int max_feature_id, vector< pair<int, double> > & out,
                 int * label) {
    int count = rand() % 100 + 10;
    int ret = 0;
    for(int i = 0; i < count; i++) {
        int fid = rand() % max_feature_id;
        if(fid % 2 == 0) ret += 1;
        else ret -= 1;
        out.push_back(make_pair(fid, 1.0));
    }
    *label = (ret > 0) ? 1 : 0;
}

// Train on n_batch random batches, logging error and the first 10 weights
// every 10000 batches.
void cpu_lr(const int max_feature_id, const int n_batch, const int batch_size) {
    double learning_rate = 0.01;
    double lambda = 0.01;
    vector<double> model(max_feature_id + 1, 0);
    // Random init in (-0.5, 0.5].
    for(size_t i = 0; i < model.size(); i++) {
        model[i] = 0.5 - (double)(rand() % 10000) / 10000.0;
    }
    for(int i = 0; i < n_batch; i++) {
        vector< vector< pair<int, double> > > batch_data;
        vector< int > batch_label;
        for(int j = 0; j < batch_size; j++) {
            vector< pair<int, double> > x;
            int l;
            mock_sample(max_feature_id, x, &l);
            batch_data.push_back(x);
            batch_label.push_back(l);
        }

        double err = cpu_batch_grad(batch_data, batch_label, model,
                                    learning_rate, lambda);

        if(i % 10000 == 0){
            cout << "iter " << i << "\t" << err << endl;
            for(int k = 0; k < 10; k++) {
                cout << model[k] << "\t";
            }
            cout << endl;
        }
    }
}

int main() {
    cpu_lr(1000000, 1000000, 50);
}
// --------------------------------------------------------------------------
// /cuda/lr/dot.cu
// Logistic regression trained on the GPU with hand-written kernels.
// Each batch of sparse samples is packed into a COO matrix; the forward
// pass scatters v * w[col] into per-row activations with atomicAdd, and the
// backward pass scatters gradients back into w with atomicAdd.
// --------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <utility>
#include <cstdlib>
#include <ctime>
#include <cmath>
using namespace std;

#include <cuda_runtime.h>
#include <curand.h>

// Print-and-continue error check for CUDA runtime calls.
#define CUDA_CALL(func) \
{ \
    cudaError_t e = (func); \
    if(e != cudaSuccess) \
        cout << "CUDA: " << cudaGetErrorString(e) << endl; \
}

// Print-and-continue error check for cuRAND calls.
#define CURAND_CALL(func) \
{ \
    curandStatus_t e = (func); \
    if(e != CURAND_STATUS_SUCCESS) \
        cout << "CURAND: " << e << endl; \
}

#define NUM_THREADS 1024

// Single non-blocking stream used for every copy and kernel in this file.
cudaStream_t stream;

// Copy a host float vector into freshly allocated device memory
// (async on `stream`; caller owns the returned pointer).
float * copyFloatToGPU(const vector<float> & x) {
    float * ret;
    CUDA_CALL(cudaMalloc((void**)&ret, sizeof(float) * x.size()));
    CUDA_CALL(cudaMemcpyAsync(ret, x.data(), sizeof(float) * x.size(),
                              cudaMemcpyHostToDevice, stream));
    return ret;
}

// Same as copyFloatToGPU, for int vectors.
int * copyIntToGPU(const vector<int> & x) {
    int * ret;
    CUDA_CALL(cudaMalloc((void**)&ret, sizeof(int) * x.size()));
    CUDA_CALL(cudaMemcpyAsync(ret, x.data(), sizeof(int) * x.size(),
                              cudaMemcpyHostToDevice, stream));
    return ret;
}

// Total number of non-zero entries across a batch of sparse samples.
int total_count(const vector< vector< pair<int, float> > > & data) {
    int ret = 0;
    for(size_t i = 0; i < data.size(); i++) ret += data[i].size();
    return ret;
}

// Device-side COO batch.  val/row_ind/col_ind/err are per-entry (nnz used,
// max_length allocated); act/label are per-row (per-sample).
struct CooMatrix {
    float *val, *err, *label, *act;
    int *row_ind, *col_ind;
    int nnz, nrow;
    int max_length;
};

// Forward pass: ret[r] += v * w[c] for every stored entry (r, c, v).
// One thread per non-zero; atomicAdd because rows span many threads.
__global__ void dot(float * val, int *row_ind, int *col_ind, int nnz,
                    float * ret, float * w) {
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid < nnz) {
        int r = row_ind[tid];
        int c = col_ind[tid];
        float v = val[tid];
        atomicAdd(&ret[r], v * w[c]);
    }
}

// In-place sigmoid, clamped to avoid overflow in expf.
__global__ void vec_sigmoid(float * d, int num) {
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid < num) {
        if(d[tid] > 10.0f) d[tid] = 1.0f;
        else if(d[tid] < -10.0f) d[tid] = 0.0f;
        else d[tid] = 1.0f / (1.0f + expf(-d[tid]));   // float math, not double
    }
}

// Backward pass: w[c] += lr * v * (label[r] - act[r]) per entry, and record
// the per-entry absolute error for host-side reporting.
__global__ void grad(float * val, int * row_ind, int *col_ind, float * mat_err,
                     int nnz, float *act, float *label,
                     float *w, float learning_rate) {
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid < nnz) {
        int r = row_ind[tid];
        int c = col_ind[tid];
        float v = val[tid];
        mat_err[tid] = fabsf(label[r] - act[r]);
        float err = v * (label[r] - act[r]);
        atomicAdd(&w[c], learning_rate * err);
    }
}

// Allocate a device COO batch sized for batch_size rows / max_length entries.
CooMatrix zeroCooMatrix(int batch_size, int max_length) {
    CooMatrix mat;
    mat.max_length = max_length;
    mat.nnz = 0;
    mat.nrow = 0;
    CUDA_CALL(cudaMalloc((void**)&mat.val, max_length * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&mat.act, batch_size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&mat.label, batch_size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&mat.err, max_length * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&mat.row_ind, max_length * sizeof(int)));
    CUDA_CALL(cudaMalloc((void**)&mat.col_ind, max_length * sizeof(int)));
    return mat;
}

void freeCooMatrix(CooMatrix * mat) {
    CUDA_CALL(cudaFree(mat->val));
    CUDA_CALL(cudaFree(mat->act));
    CUDA_CALL(cudaFree(mat->label));
    CUDA_CALL(cudaFree(mat->err));
    CUDA_CALL(cudaFree(mat->row_ind));
    CUDA_CALL(cudaFree(mat->col_ind));
}

// Pinned host-side staging buffers mirroring CooMatrix's entry arrays,
// so the H2D copies in vec2coo can be asynchronous.
struct CooMatrixHost {
    float * val;
    int *row_ind;
    int *col_ind;
    int max_length;
    int nnz;
};

CooMatrixHost zeroCooMatrixHost(int batch_size, int max_length) {
    CooMatrixHost mat;
    mat.max_length = max_length;
    mat.nnz = 0;
    CUDA_CALL(cudaMallocHost((void**)&mat.val, sizeof(float)*max_length));
    CUDA_CALL(cudaMallocHost((void**)&mat.row_ind, sizeof(int)*max_length));
    CUDA_CALL(cudaMallocHost((void**)&mat.col_ind, sizeof(int)*max_length));
    return mat;
}

void freeCooMatrixHost(CooMatrixHost * mat){
    CUDA_CALL(cudaFreeHost(mat->val));
    CUDA_CALL(cudaFreeHost(mat->row_ind));
    CUDA_CALL(cudaFreeHost(mat->col_ind));
}

// Pack the batch into the staging buffers and start the H2D copies.
// NOTE(review): if nnz ever exceeded max_length this would overflow the
// buffers; with mock_sample (<= ~110 entries/sample) and capacity
// batch_size*256 it cannot, so we only log the anomaly like the original.
void vec2coo(const vector< vector< pair<int, float> > > & data,
             CooMatrixHost * mat_host, CooMatrix * mat) {
    int nnz = total_count(data);
    if(nnz > mat->max_length) cout << nnz << "\t" << mat->max_length << endl;
    mat->nnz = nnz;
    mat->nrow = data.size();
    // BUGFIX: memset must be ordered on `stream` (the stream is created
    // non-blocking, so legacy-default-stream cudaMemset would not be
    // ordered with the kernels below).
    CUDA_CALL(cudaMemsetAsync(mat->err, 0, mat->max_length * sizeof(float), stream));

    int n = 0;
    for(size_t i = 0; i < data.size(); i++){
        for(vector< pair<int, float> >::const_iterator j = data[i].begin();
            j != data[i].end(); j++) {
            mat_host->val[n] = j->second;
            mat_host->row_ind[n] = i;
            mat_host->col_ind[n] = j->first;
            ++n;
        }
    }

    CUDA_CALL(cudaMemcpyAsync(mat->val, mat_host->val, nnz*sizeof(float),
                              cudaMemcpyHostToDevice, stream));
    CUDA_CALL(cudaMemcpyAsync(mat->row_ind, mat_host->row_ind, nnz*sizeof(int),
                              cudaMemcpyHostToDevice, stream));
    CUDA_CALL(cudaMemcpyAsync(mat->col_ind, mat_host->col_ind, nnz*sizeof(int),
                              cudaMemcpyHostToDevice, stream));
}

// One SGD step on one batch: upload, forward (dot + sigmoid), backward
// (grad).  Every 10000 batches, pull the per-entry errors back and print
// the mean absolute error.
void lr(const vector< vector< pair<int, float> > > & data,
        const vector<float> & label,
        CooMatrixHost * coo_mat_host,
        CooMatrix * coo_mat,
        float * w, int ncol, int batch) {
    vec2coo(data, coo_mat_host, coo_mat);
    CUDA_CALL(cudaMemcpyAsync(coo_mat->label, label.data(),
                              sizeof(float) * label.size(),
                              cudaMemcpyHostToDevice, stream));
    CUDA_CALL(cudaMemsetAsync(coo_mat->act, 0, sizeof(float) * data.size(), stream));

    const int shared_memory_usage = 0;   // BUGFIX: was 1; no shared memory is used
    int num_blocks = ((coo_mat->nnz + (NUM_THREADS - 1)) / NUM_THREADS);
    dot<<<num_blocks, NUM_THREADS, shared_memory_usage, stream>>>(
        coo_mat->val,
        coo_mat->row_ind,
        coo_mat->col_ind,
        coo_mat->nnz,
        coo_mat->act, w);

    num_blocks = ((data.size() + (NUM_THREADS - 1)) / NUM_THREADS);
    vec_sigmoid<<<num_blocks, NUM_THREADS, shared_memory_usage, stream>>>(
        coo_mat->act, data.size());

    num_blocks = ((coo_mat->nnz + (NUM_THREADS - 1)) / NUM_THREADS);
    grad<<<num_blocks, NUM_THREADS, shared_memory_usage, stream>>>(
        coo_mat->val,
        coo_mat->row_ind,
        coo_mat->col_ind,
        coo_mat->err,
        coo_mat->nnz,
        coo_mat->act,
        coo_mat->label,
        w, 0.01);
    if (batch % 10000 == 0){
        float * err = (float*) malloc(sizeof(float) * coo_mat->nnz);
        CUDA_CALL(cudaMemcpyAsync(err, coo_mat->err,
                                  sizeof(float) * coo_mat->nnz,
                                  cudaMemcpyDeviceToHost, stream));
        // BUGFIX: the original read `err` immediately after the async copy
        // with no synchronization, i.e. it printed garbage.
        CUDA_CALL(cudaStreamSynchronize(stream));
        float total = 0.f;
        for(int i = 0; i < coo_mat->nnz; i++) total += err[i];
        cout << total / (float) coo_mat->nnz << endl;
        free(err);   // BUGFIX: was leaked
    }
}

// Random sparse sample with value-1.0 features; label is 1 iff more even
// than odd feature ids were drawn (early exit once |margin| > 10).
void mock_sample(const int max_feature_id, vector< pair<int, float> > & out,
                 int * label) {
    int count = rand() % 100 + 10;
    int ret = 0;
    for(int i = 0; i < count; i++) {
        int fid = rand() % max_feature_id;
        if(fid % 2 == 0) ret += 1;
        else ret -= 1;
        if(abs(ret) > 10) break;
        out.push_back(make_pair(fid, 1.0f));
    }
    *label = (ret > 0) ? 1 : 0;
}

#define MODEL_SIZE 1000000

// w[tid] = val for tid < size.
__global__ void fill(float * w, float val, int size) {
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid < size) w[tid] = val;
}

int main(int argc, char ** argv) {
    // BUGFIX: the original dereferenced argv[1] unconditionally.
    if (argc < 2) {
        cout << "usage: " << argv[0] << " batch_size" << endl;
        return 1;
    }
    srand(time(NULL));
    CUDA_CALL(cudaSetDevice(1));   // NOTE: hard-coded to the second GPU
    CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    float * w;
    CUDA_CALL(cudaMalloc((void**)&w, sizeof(float) * MODEL_SIZE));
    CUDA_CALL(cudaMemset(w, 0, sizeof(float) * MODEL_SIZE));
    const int shared_memory_usage = 0;
    const int num_blocks = ((MODEL_SIZE + (NUM_THREADS - 1)) / NUM_THREADS);
    // Kept from the original: fill with 1s, immediately overwritten by the
    // normal init below (harmless but redundant).
    fill<<<num_blocks, NUM_THREADS, shared_memory_usage, stream>>>(w, 1, MODEL_SIZE);

    curandGenerator_t rand_gen;
    const curandRngType_t gen_type = CURAND_RNG_PSEUDO_DEFAULT;

    CURAND_CALL(curandCreateGenerator(&rand_gen, gen_type));
    CURAND_CALL(curandSetStream(rand_gen, stream));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(rand_gen, time(NULL)));
    CURAND_CALL(curandGenerateNormal(rand_gen, w, MODEL_SIZE, 0, 0.1));

    int batch_size = atoi(argv[1]);
    if (batch_size <= 0) batch_size = 50;   // guard against bad input
    int total_batch = 1024 * 1024 / batch_size;
    CooMatrix mat = zeroCooMatrix(batch_size, batch_size * 256);
    CooMatrixHost mat_host = zeroCooMatrixHost(batch_size, batch_size * 256);
    for(int batch = 0; batch < total_batch; batch++){
        vector< vector< pair<int, float> > > samples;
        vector<float> labels;
        for(int i = 0; i < batch_size; i++){
            vector< pair<int, float> > sample;
            int label;
            mock_sample(MODEL_SIZE, sample, &label);
            samples.push_back(sample);
            labels.push_back((float)label);
        }
        lr(samples, labels, &mat_host, &mat, w, MODEL_SIZE, batch);
    }
    // Drain pending work before tearing anything down.
    CUDA_CALL(cudaStreamSynchronize(stream));
    CURAND_CALL(curandDestroyGenerator(rand_gen));   // BUGFIX: was leaked
    freeCooMatrix(&mat);
    freeCooMatrixHost(&mat_host);
    CUDA_CALL(cudaFree(w));                          // BUGFIX: was leaked
    CUDA_CALL(cudaStreamDestroy(stream));            // destroy the stream last
}
// --------------------------------------------------------------------------
// /cuda/lr/dot_cusparse.cu
// Logistic regression where the forward pass (X * w) is computed by
// cuSPARSE csrmv and the weight update by cusparseSaxpyi one sample at a
// time.  NOTE(review): cusparseScsrmv / cusparseSaxpyi are the legacy
// cuSPARSE API, removed in CUDA 11 — kept to match the original build.
// --------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include <utility>
#include <cstdlib>
#include <ctime>
#include <cmath>
using namespace std;

#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <curand.h>
#include <cstdio>

// Print-and-continue error checks for the three libraries in use.
#define CUDA_CALL(func) \
{ \
    cudaError_t e = (func); \
    if(e != cudaSuccess) \
        cout << "CUDA: " << cudaGetErrorString(e) << endl; \
}

#define CUSP_CALL(func) \
{ \
    cusparseStatus_t e = (func); \
    if(e != CUSPARSE_STATUS_SUCCESS) \
        cout << "CUSP: " << e << endl; \
}

#define CURAND_CALL(func) \
{ \
    curandStatus_t e = (func); \
    if(e != CURAND_STATUS_SUCCESS) \
        cout << "CURAND: " << e << endl; \
}

// Stream and cuSPARSE handle shared by the whole program.
cudaStream_t stream;
cusparseHandle_t sparse_handle;

float sigmoid(float x) {
    return 1.0f / (1.0f + expf(-x));
}

// Copy a host float vector into freshly allocated device memory
// (async on `stream`; caller owns and must cudaFree the result).
float * copyFloatToGPU(const vector<float> & x) {
    float * ret;
    CUDA_CALL(cudaMalloc((void**)&ret, sizeof(float) * x.size()));
    CUDA_CALL(cudaMemcpyAsync(ret, x.data(), sizeof(float) * x.size(),
                              cudaMemcpyHostToDevice, stream));
    return ret;
}

int * copyIntToGPU(const vector<int> & x) {
    int * ret;
    CUDA_CALL(cudaMalloc((void**)&ret, sizeof(int) * x.size()));
    CUDA_CALL(cudaMemcpyAsync(ret, x.data(), sizeof(int) * x.size(),
                              cudaMemcpyHostToDevice, stream));
    return ret;
}

// Total number of non-zeros across the batch.
int total_count(const vector< vector< pair<int, float> > > & data) {
    int ret = 0;
    for(size_t i = 0; i < data.size(); i++) ret += data[i].size();
    return ret;
}

// Device-side COO matrix (one row per sample).
struct CooMatrix {
    float *val;
    int *row_ind, *col_ind;
    int nnz, nrow;
};

// Build a device COO matrix from the batch.  Allocates fresh device
// buffers each call; grad() frees them.
CooMatrix vec2coo(const vector< vector< pair<int, float> > > & data) {
    int nnz = total_count(data);
    CooMatrix mat;
    mat.nnz = nnz;
    mat.nrow = data.size();
    CUDA_CALL(cudaMalloc((void**)&mat.val, nnz * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&mat.row_ind, nnz * sizeof(int)));
    CUDA_CALL(cudaMalloc((void**)&mat.col_ind, nnz * sizeof(int)));

    vector<float> val(nnz, 0);
    vector<int> row_ind(nnz, 0), col_ind(nnz, 0);
    int n = 0;
    for(size_t i = 0; i < data.size(); i++){
        for(vector< pair<int, float> >::const_iterator j = data[i].begin();
            j != data[i].end(); j++) {
            val[n] = j->second;
            row_ind[n] = i;
            col_ind[n] = j->first;
            ++n;
        }
    }

    CUDA_CALL(cudaMemcpyAsync(mat.val, val.data(), nnz*sizeof(float),
                              cudaMemcpyHostToDevice, stream));
    CUDA_CALL(cudaMemcpyAsync(mat.row_ind, row_ind.data(), nnz*sizeof(int),
                              cudaMemcpyHostToDevice, stream));
    CUDA_CALL(cudaMemcpyAsync(mat.col_ind, col_ind.data(), nnz*sizeof(int),
                              cudaMemcpyHostToDevice, stream));
    // NOTE(review): host vectors val/row_ind/col_ind go out of scope here;
    // the copies above are still in flight on `stream`.  grad() synchronizes
    // the stream before reading results, which also covers these copies in
    // practice, but a sync here would be strictly safer — confirm intent.
    CUDA_CALL(cudaStreamSynchronize(stream));

    return mat;
}

// CSR view sharing val/col_ind with the source COO matrix; only row_ptr
// is newly allocated.
struct CsrMatrix {
    float *val;
    int *row_ptr, *col_ind;
    int nnz, nrow;
};

CsrMatrix coo2csr(CooMatrix coo_mat) {
    CsrMatrix csr_mat;
    csr_mat.nnz = coo_mat.nnz;
    csr_mat.val = coo_mat.val;           // shared, not copied
    csr_mat.col_ind = coo_mat.col_ind;   // shared, not copied
    csr_mat.nrow = coo_mat.nrow;
    int byte_size = (coo_mat.nrow + 1) * sizeof(int);
    CUDA_CALL(cudaMalloc((void**)&csr_mat.row_ptr, byte_size));
    CUSP_CALL(cusparseXcoo2csr(sparse_handle, coo_mat.row_ind, coo_mat.nnz,
                               coo_mat.nrow, csr_mat.row_ptr,
                               CUSPARSE_INDEX_BASE_ZERO));
    return csr_mat;
}

// One SGD step over a batch: y = X*w via csrmv, then per sample apply
// w += 0.01 * err * x via Saxpyi.  Returns the mean absolute error.
float grad(const vector< vector< pair<int, float> > > & data,
           const vector<float> & label,
           float * b, int ncol) {
    CooMatrix coo_mat = vec2coo(data);
    CsrMatrix csr_mat = coo2csr(coo_mat);

    float alpha = 1.0;
    float beta = 0.0;
    float *y;

    cusparseMatDescr_t descr = 0;
    CUSP_CALL(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    CUDA_CALL(cudaMalloc((void**)&y, sizeof(float)*data.size()));
    CUSP_CALL(cusparseScsrmv(sparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             csr_mat.nrow, ncol, csr_mat.nnz, &alpha,
                             descr, csr_mat.val,
                             csr_mat.row_ptr, csr_mat.col_ind, b, &beta, y));
    float *pred = (float *)malloc(data.size() * sizeof(float));
    CUDA_CALL(cudaMemcpyAsync(pred, y, data.size() * sizeof(float),
                              cudaMemcpyDeviceToHost, stream));
    // BUGFIX: the original read `pred` right after the async copy without
    // synchronizing the stream — the host saw stale/garbage predictions.
    CUDA_CALL(cudaStreamSynchronize(stream));

    float total_err = 0.;
    for(size_t i = 0; i < data.size(); i++) {
        float err = label[i] - sigmoid(pred[i]);
        total_err += fabsf(err);
        vector<int> xind;
        vector<float> xval;
        for(vector< pair<int, float> >::const_iterator j = data[i].begin();
            j != data[i].end(); j++) {
            xind.push_back(j->first);
            xval.push_back(0.01f * err);   // learning rate folded into the values
        }
        int * gpu_xind = copyIntToGPU(xind);
        float * gpu_xval = copyFloatToGPU(xval);
        float a = 1;
        CUSP_CALL(cusparseSaxpyi(sparse_handle, xind.size(), &a,
                                 gpu_xval, gpu_xind, b, CUSPARSE_INDEX_BASE_ZERO));
        // cudaFree implicitly synchronizes, so the Saxpyi above has
        // completed before its operands are released.
        CUDA_CALL(cudaFree(gpu_xind));
        CUDA_CALL(cudaFree(gpu_xval));
    }
    free(pred);
    // BUGFIX: the original leaked every one of these on every batch
    // (100000 batches => unbounded device-memory growth).
    CUSP_CALL(cusparseDestroyMatDescr(descr));
    CUDA_CALL(cudaFree(y));
    CUDA_CALL(cudaFree(coo_mat.val));       // also csr_mat.val
    CUDA_CALL(cudaFree(coo_mat.row_ind));
    CUDA_CALL(cudaFree(coo_mat.col_ind));   // also csr_mat.col_ind
    CUDA_CALL(cudaFree(csr_mat.row_ptr));
    return total_err / (float)data.size();
}

// Random sparse sample of 100..199 value-1.0 features; label is 1 iff more
// even than odd feature ids were drawn.
void mock_sample(const int max_feature_id, vector< pair<int, float> > & out,
                 int * label) {
    int count = rand() % 100 + 100;
    int ret = 0;
    for(int i = 0; i < count; i++) {
        int fid = rand() % max_feature_id;
        if(fid % 2 == 0) ret += 1;
        else ret -= 1;
        out.push_back(make_pair(fid, 1.0f));
    }
    *label = (ret > 0) ? 1 : 0;
}

#define MODEL_SIZE 1000000

// w[tid] = val for tid < size.
__global__ void fill(float * w, float val, int size) {
    const int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (tid < size) w[tid] = val;
}

int main() {
    srand(time(NULL));
    CUDA_CALL(cudaSetDevice(1));   // NOTE: hard-coded to the second GPU
    CUDA_CALL(cudaStreamCreate(&stream));
    CUSP_CALL(cusparseCreate(&sparse_handle));
    CUSP_CALL(cusparseSetStream(sparse_handle, stream));

    float * w;
    CUDA_CALL(cudaMalloc((void**)&w, sizeof(float) * MODEL_SIZE));
    CUDA_CALL(cudaMemset(w, 0, sizeof(float) * MODEL_SIZE));
    const int shared_memory_usage = 0;
    const int num_threads = 256;
    const int num_blocks = ((MODEL_SIZE + (num_threads - 1)) / num_threads);
    // Kept from the original: fill with 1s, then overwritten by the normal
    // init below (harmless but redundant).
    fill<<<num_blocks, num_threads, shared_memory_usage, stream>>>(w, 1, MODEL_SIZE);

    curandGenerator_t rand_gen;
    const curandRngType_t gen_type = CURAND_RNG_PSEUDO_DEFAULT;

    CURAND_CALL(curandCreateGenerator(&rand_gen, gen_type));
    CURAND_CALL(curandSetStream(rand_gen, stream));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(rand_gen, time(NULL)));
    CURAND_CALL(curandGenerateNormal(rand_gen, w, MODEL_SIZE, 0, 0.1));

    for(int batch = 0; batch < 100000; batch++){
        vector< vector< pair<int, float> > > samples;
        vector<float> labels;
        for(int i = 0; i < 50; i++){
            vector< pair<int, float> > sample;
            int label;
            mock_sample(MODEL_SIZE, sample, &label);
            samples.push_back(sample);
            labels.push_back((float)label);
        }

        float err = grad(samples, labels, w, MODEL_SIZE);
        if(batch % 1000 == 0) cout << err << endl;
    }

    // BUGFIX: teardown that the original omitted entirely.
    CURAND_CALL(curandDestroyGenerator(rand_gen));
    CUDA_CALL(cudaFree(w));
    CUSP_CALL(cusparseDestroy(sparse_handle));
    CUDA_CALL(cudaStreamDestroy(stream));
}
// --------------------------------------------------------------------------
// /cuda/seq.cu
// Thrust smoke test: fill 0..N-1 on the device, reduce, compare with the
// CPU sum.
// --------------------------------------------------------------------------
#include <iostream>
using namespace std;

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>

int main()
{
    const int N = 50000;
    thrust::device_vector<int> a(N);
    thrust::sequence(a.begin(), a.end(), 0);          // a = 0, 1, ..., N-1
    int sumA = thrust::reduce(a.begin(), a.end(), 0);

    int sumCheck = 0;
    for (int i = 0; i < N; i++) sumCheck += i;

    if (sumA == sumCheck) cout << "Test Succeeded!" << endl;
    else {
        cout << "Test Failed" << endl;
        return 1;
    }
    return 0;
}

// --------------------------------------------------------------------------
// /cuda/seq_runtime.cu
// Same check, but the sequence is produced by a hand-written CUDA kernel
// writing through thrust::raw_pointer_cast.
// --------------------------------------------------------------------------
#include <iostream>
using namespace std;

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>

// a[tid] = tid for tid < n.
__global__ void fillKernel(int *a, int n) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) a[tid] = tid;
}

// Host wrapper: ceil-div launch over n elements on the default stream.
void fill(int *d_a, int n) {
    int nThreadsPerBlock = 512;
    int nBlocks = n / nThreadsPerBlock + ((n % nThreadsPerBlock) ? 1 : 0);
    fillKernel <<< nBlocks, nThreadsPerBlock >>> (d_a, n);
}

int main() {
    const int N = 50000;
    thrust::device_vector<int> a(N);
    fill(thrust::raw_pointer_cast(&a[0]), N);

    int sumA = thrust::reduce(a.begin(), a.end(), 0);
    int sumCheck = 0;
    for(int i = 0; i < N; i++) sumCheck += i;

    if(sumA == sumCheck) cout << "ok" << endl;
    else cout << "fail" << endl;
    return 0;
}

// --------------------------------------------------------------------------
// /matrix/matrix.cpp
// Naive vs SSE-blocked single-precision GEMM benchmark.  gemm1 requires n
// to be a multiple of 4; every benchmark size (200..900 step 100) is.
// --------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <chrono>
#include <xmmintrin.h>

#define NOW() std::chrono::high_resolution_clock::now()
// n flops in t microseconds: n * 1e-3 / t_us == GFLOP/s.
#define ELAPSED(msg, x, n) std::cout << msg << ": " \
    << (float)(n) * 0.001 / (float)(std::chrono::duration_cast<std::chrono::microseconds>(NOW() - x).count()) \
    << "GFPS" << std::endl

// Reference triple-loop C += A * B (row-major, n x n).
void gemm(float * a, float * b, float * c, int n) {
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            for (int p = 0; p < n; ++p) {
                c[i * n + j] += a[i * n + p] * b[p * n + j];
            }
        }
    }
}

// SSE version: 4 columns of C per iteration, inner p-loop unrolled by 4.
// BUGFIX/robustness: use _mm_loadu_ps/_mm_storeu_ps instead of
// dereferencing raw __m128* casts — plain malloc gives no 16-byte
// alignment guarantee, and an unaligned *(__m128*) access faults.
// Results are bit-identical to the aligned path.
void gemm1(float * a, float * b, float * c, int n) {
    for (int j = 0; j < n; j += 4) {
        for (int i = 0; i < n; ++i) {
            float * cij = c + i * n + j;
            float * ai = a + i * n;
            __m128 cij4 = _mm_loadu_ps(cij);
            float * bpj = b + j;
            for (int p = 0; p < n; p += 4){
                __m128 aip4 = _mm_set_ps1(*ai);   // broadcast a[i][p]
                cij4 = _mm_add_ps(_mm_mul_ps(aip4, _mm_loadu_ps(bpj)), cij4);
                ++ai;
                bpj += n;

                aip4 = _mm_set_ps1(*ai);
                cij4 = _mm_add_ps(_mm_mul_ps(aip4, _mm_loadu_ps(bpj)), cij4);
                ++ai;
                bpj += n;

                aip4 = _mm_set_ps1(*ai);
                cij4 = _mm_add_ps(_mm_mul_ps(aip4, _mm_loadu_ps(bpj)), cij4);
                ++ai;
                bpj += n;

                aip4 = _mm_set_ps1(*ai);
                cij4 = _mm_add_ps(_mm_mul_ps(aip4, _mm_loadu_ps(bpj)), cij4);
                ++ai;
                bpj += n;
            }
            _mm_storeu_ps(cij, cij4);
        }
    }
}

int main(int argc, char ** argv) {
    srand(time(NULL));
    for (int size = 200; size < 1000; size += 100) {
        int sz = size * size;
        float * a = (float*) malloc(sz * sizeof(float));
        float * b = (float*) malloc(sz * sizeof(float));
        float * c = (float*) malloc(sz * sizeof(float));
        // Deterministic pseudo-random fill in (-0.5, 0.5).
        for (int i = 0; i < size * size; ++i) {
            a[i] = (float((i * 11) % 10000) / 10000.0) - 0.5;
            b[i] = (float((i * 7) % 10000) / 10000.0) - 0.5;
            c[i] = 0.0;
        }

        // flop count = n^3 multiply-adds (counted as in the original).
        float total = size;
        total *= (float)size;
        total *= (float)size;
        auto start = NOW();
        gemm1(a, b, c, size);
        ELAPSED(std::to_string(size), start, total);

        /*
        // Optional correctness check of gemm1 against the naive gemm.
        if (size == 200) {
            float * c1 = new float[size * size];
            for (int i = 0; i < size * size; ++i) c1[i] = 0.0;
            gemm(a, b, c1, size);
            for (int i = 0; i < size * size; ++i) {
                if (fabs(c[i] - c1[i]) > 1e-9) {
                    std::cout << fabs(c[i] - c1[i]) << std::endl;
                }
            }
            delete []c1;
        }
        */
        free(a);
        free(b);
        free(c);
    }
}
<< fabs(c[i] - c1[i]) << std::endl; 91 | } 92 | } 93 | delete []c1; 94 | } 95 | */ 96 | free(a); 97 | free(b); 98 | free(c); 99 | } 100 | } 101 | 102 | -------------------------------------------------------------------------------- /mxnet/lstm_sort/infer_sort.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme 2 | # pylint: disable=superfluous-parens, no-member, invalid-name 3 | import sys 4 | sys.path.insert(0, "../../python") 5 | import numpy as np 6 | import mxnet as mx 7 | 8 | from lstm import lstm_unroll 9 | from sort_io import BucketSentenceIter, default_build_vocab 10 | from rnn_model import LSTMInferenceModel 11 | 12 | def MakeInput(char, vocab, arr): 13 | idx = vocab[char] 14 | tmp = np.zeros((1,)) 15 | tmp[0] = idx 16 | arr[:] = tmp 17 | 18 | if __name__ == '__main__': 19 | batch_size = 32 20 | #buckets = [10, 20, 30, 40, 50, 60] 21 | #buckets = [32] 22 | buckets = [] 23 | num_hidden = 200 24 | num_embed = 200 25 | num_lstm_layer = 3 26 | 27 | num_epoch = 3 28 | learning_rate = 0.1 29 | momentum = 0.9 30 | 31 | # dummy data is used to test speed without IO 32 | dummy_data = False 33 | 34 | contexts = [mx.context.gpu(i) for i in range(1)] 35 | 36 | vocab = default_build_vocab("./data/sort.train.txt") 37 | rvocab = {} 38 | for k, v in vocab.items(): 39 | rvocab[v] = k 40 | def sym_gen(seq_len): 41 | return lstm_unroll(num_lstm_layer, seq_len, len(vocab), 42 | num_hidden=num_hidden, num_embed=num_embed, 43 | num_label=len(vocab)) 44 | 45 | init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)] 46 | init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)] 47 | init_states = init_c + init_h 48 | 49 | if len(buckets) == 1: 50 | # only 1 bucket, disable bucketing 51 | symbol = sym_gen(buckets[0]) 52 | else: 53 | symbol = sym_gen 54 | 55 | _, arg_params, 
def Perplexity(label, pred):
    """Evaluation metric: perplexity = exp(mean negative log-likelihood).

    `label` holds the true class id for each row of `pred`; probabilities
    are clipped at 1e-10 so log never sees zero.
    """
    flat = label.T.reshape((-1,))
    total = 0.
    for i in range(pred.shape[0]):
        p = pred[i][int(flat[i])]
        total -= np.log(max(1e-10, p))
    return np.exp(total / label.size)
def default_read_content(path):
    """Read `path` and normalize newlines and '. ' separators to spaces."""
    with open(path) as ins:
        text = ins.read()
        return text.replace('\n', ' ').replace('. ', ' ')

def default_build_vocab(path):
    """Build a word -> id map from a text file; id 0 is reserved for padding."""
    tokens = default_read_content(path).split(' ')
    # dummy ' ' entry keeps len(vocab) consistent with the padding id
    the_vocab = {' ': 0}
    idx = 1
    for word in sorted(set(t for t in tokens if len(t) > 0)):
        if word not in the_vocab:
            the_vocab[word] = idx
            idx += 1
    return the_vocab

def default_text2id(sentence, the_vocab):
    """Map a sentence to ids, then append an equal-length run of 0 padding
    (the target slots for the sort task)."""
    ids = [the_vocab[w] for w in sentence.split(' ') if len(w) > 0]
    return ids + [0] * len(ids)

def default_gen_buckets(sentences, batch_size, the_vocab):
    """Greedy bucket generation: emit a bucket boundary whenever the running
    sentence count reaches batch_size. (Python 2: uses dict.iteritems.)"""
    len_dict = {}
    max_len = -1
    for sentence in sentences:
        lw = len(default_text2id(sentence, the_vocab))
        if lw == 0:
            continue
        if lw > max_len:
            max_len = lw
        len_dict[lw] = len_dict.get(lw, 0) + 1
    print(len_dict)

    tl = 0
    buckets = []
    for l, n in len_dict.iteritems():  # TODO: There are better heuristic ways to do this
        if n + tl >= batch_size:
            buckets.append(l)
            tl = 0
        else:
            tl += n
    if tl > 0:
        buckets.append(max_len)
    return buckets
class DummyIter(mx.io.DataIter):
    """A dummy iterator that always returns the same batch; used to measure
    training speed with the IO cost removed."""
    def __init__(self, real_iter):
        super(DummyIter, self).__init__()
        self.real_iter = real_iter
        # mirror the wrapped iterator's interface
        self.provide_data = real_iter.provide_data
        self.provide_label = real_iter.provide_label
        self.batch_size = real_iter.batch_size
        # cache exactly one real batch and serve it forever
        self.the_batch = next(iter(real_iter))

    def __iter__(self):
        return self

    def next(self):
        # Python 2 iterator protocol: always the same cached batch
        return self.the_batch
enumerate(buckets): 153 | if bkt >= len(sentence): 154 | self.data[i].append(sentence) 155 | break 156 | # we just ignore the sentence it is longer than the maximum 157 | # bucket size here 158 | 159 | # convert data into ndarrays for better speed during training 160 | data = [np.zeros((len(x), buckets[i])) for i, x in enumerate(self.data)] 161 | for i_bucket in range(len(self.buckets)): 162 | for j in range(len(self.data[i_bucket])): 163 | sentence = self.data[i_bucket][j] 164 | data[i_bucket][j, :len(sentence)] = sentence 165 | self.data = data 166 | 167 | # Get the size of each bucket, so that we could sample 168 | # uniformly from the bucket 169 | bucket_sizes = [len(x) for x in self.data] 170 | 171 | print("Summary of dataset ==================") 172 | for bkt, size in zip(buckets, bucket_sizes): 173 | print("bucket of len %3d : %d samples" % (bkt, size)) 174 | 175 | self.batch_size = batch_size 176 | self.make_data_iter_plan() 177 | 178 | self.init_states = init_states 179 | self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states] 180 | 181 | self.provide_data = [('data', (batch_size, self.default_bucket_key))] + init_states 182 | self.provide_label = [('softmax_label', (self.batch_size, self.default_bucket_key))] 183 | 184 | def make_data_iter_plan(self): 185 | "make a random data iteration plan" 186 | # truncate each bucket into multiple of batch-size 187 | bucket_n_batches = [] 188 | for i in range(len(self.data)): 189 | bucket_n_batches.append(len(self.data[i]) / self.batch_size) 190 | self.data[i] = self.data[i][:int(bucket_n_batches[i]*self.batch_size)] 191 | 192 | bucket_plan = np.hstack([np.zeros(n, int)+i for i, n in enumerate(bucket_n_batches)]) 193 | np.random.shuffle(bucket_plan) 194 | 195 | bucket_idx_all = [np.random.permutation(len(x)) for x in self.data] 196 | 197 | self.bucket_plan = bucket_plan 198 | self.bucket_idx_all = bucket_idx_all 199 | self.bucket_curr_idx = [0 for x in self.data] 200 | 201 | self.data_buffer = [] 202 | 
self.label_buffer = [] 203 | for i_bucket in range(len(self.data)): 204 | data = np.zeros((self.batch_size, self.buckets[i_bucket])) 205 | label = np.zeros((self.batch_size, self.buckets[i_bucket])) 206 | self.data_buffer.append(data) 207 | self.label_buffer.append(label) 208 | 209 | def __iter__(self): 210 | init_state_names = [x[0] for x in self.init_states] 211 | 212 | for i_bucket in self.bucket_plan: 213 | data = self.data_buffer[i_bucket] 214 | label = self.label_buffer[i_bucket] 215 | 216 | i_idx = self.bucket_curr_idx[i_bucket] 217 | idx = self.bucket_idx_all[i_bucket][i_idx:i_idx+self.batch_size] 218 | self.bucket_curr_idx[i_bucket] += self.batch_size 219 | data[:] = self.data[i_bucket][idx] 220 | 221 | for k in range(len(data)): 222 | label[k] = sorted(data[k]) 223 | 224 | #label[:, :-1] = data[:, 1:] 225 | #label[:, -1] = 0 226 | 227 | data_all = [mx.nd.array(data)] + self.init_state_arrays 228 | label_all = [mx.nd.array(label)] 229 | data_names = ['data'] + init_state_names 230 | label_names = ['softmax_label'] 231 | 232 | data_batch = SimpleBatch(data_names, data_all, label_names, label_all, 233 | self.buckets[i_bucket]) 234 | yield data_batch 235 | 236 | def reset(self): 237 | self.bucket_curr_idx = [0 for x in self.data] 238 | -------------------------------------------------------------------------------- /mxnet/nce-loss/lstm_word.py: -------------------------------------------------------------------------------- 1 | # pylint:skip-file 2 | import sys, random, time, math 3 | sys.path.insert(0, "../../python") 4 | import mxnet as mx 5 | import numpy as np 6 | from collections import namedtuple 7 | from nce import * 8 | from operator import itemgetter 9 | 10 | LSTMState = namedtuple("LSTMState", ["c", "h"]) 11 | LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias", 12 | "h2h_weight", "h2h_bias"]) 13 | LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol", 14 | "init_states", "last_states", 15 | "seq_data", "seq_labels", "seq_outputs", 16 
| "param_blocks"]) 17 | 18 | def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.): 19 | """LSTM Cell symbol""" 20 | if dropout > 0.: 21 | indata = mx.sym.Dropout(data=indata, p=dropout) 22 | i2h = mx.sym.FullyConnected(data=indata, 23 | weight=param.i2h_weight, 24 | bias=param.i2h_bias, 25 | num_hidden=num_hidden * 4, 26 | name="t%d_l%d_i2h" % (seqidx, layeridx)) 27 | h2h = mx.sym.FullyConnected(data=prev_state.h, 28 | weight=param.h2h_weight, 29 | bias=param.h2h_bias, 30 | num_hidden=num_hidden * 4, 31 | name="t%d_l%d_h2h" % (seqidx, layeridx)) 32 | gates = i2h + h2h 33 | slice_gates = mx.sym.SliceChannel(gates, num_outputs=4, 34 | name="t%d_l%d_slice" % (seqidx, layeridx)) 35 | in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid") 36 | in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh") 37 | forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid") 38 | out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid") 39 | next_c = (forget_gate * prev_state.c) + (in_gate * in_transform) 40 | next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh") 41 | return LSTMState(c=next_c, h=next_h) 42 | 43 | 44 | def get_net(vocab_size, seq_len, num_label, num_lstm_layer, num_hidden): 45 | param_cells = [] 46 | last_states = [] 47 | for i in range(num_lstm_layer): 48 | param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i), 49 | i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i), 50 | h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i), 51 | h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i))) 52 | state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i), 53 | h=mx.sym.Variable("l%d_init_h" % i)) 54 | last_states.append(state) 55 | 56 | data = mx.sym.Variable('data') 57 | label = mx.sym.Variable('label') 58 | label_weight = mx.sym.Variable('label_weight') 59 | embed_weight = mx.sym.Variable('embed_weight') 60 | label_embed_weight = mx.sym.Variable('label_embed_weight') 61 | data_embed = 
def load_data(name):
    """Load a whitespace-separated corpus.

    Returns (data, negative, vocab, freq):
      data     - the corpus as a sequence of word ids
      vocab    - word -> id (ids start at 1; 0 is reserved)
      freq     - freq[id] = occurrence count (freq[0] unused)
      negative - negative-sampling pool: each id with count >= 5 repeated
                 int(count ** 0.75) times
    """
    vocab = {}
    freq = [0]
    data = []
    for tk in open(name).read().split(' '):
        if not tk:
            continue
        if tk not in vocab:
            vocab[tk] = len(vocab) + 1
            freq.append(0)
        wid = vocab[tk]
        data.append(wid)
        freq[wid] += 1
    negative = []
    for wid, cnt in enumerate(freq):
        if wid == 0 or cnt < 5:
            continue
        # unigram^0.75 weighting, as in word2vec
        negative.extend([wid] * int(math.pow(cnt * 1.0, 0.75)))
    return data, negative, vocab, freq
class DataIter(mx.io.DataIter):
    # Iterates a corpus as seq_len-long windows with NCE labels per position.
    def __init__(self, name, batch_size, seq_len, num_label, init_states):
        super(DataIter, self).__init__()
        self.batch_size = batch_size
        # data: corpus as id sequence; negative: sampling pool; freq: id counts
        self.data, self.negative, self.vocab, self.freq = load_data(name)
        self.vocab_size = 1 + len(self.vocab)  # +1 for the reserved id 0
        print self.vocab_size
        self.seq_len = seq_len
        # num_label = 1 true label + (num_label - 1) sampled negatives
        self.num_label = num_label
        self.init_states = init_states
        self.init_state_names = [x[0] for x in self.init_states]
        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
        self.provide_data = [('data', (batch_size, seq_len))] + init_states
        self.provide_label = [('label', (self.batch_size, seq_len, num_label)),
                              ('label_weight', (self.batch_size, seq_len, num_label))]

    def sample_ne(self):
        # Draw one negative word id uniformly from the freq^0.75 pool.
        return self.negative[random.randint(0, len(self.negative) - 1)]
class NceAuc(mx.metric.EvalMetric):
    """AUC over (label_weight, prediction) pairs pooled from a whole batch."""
    def __init__(self):
        super(NceAuc, self).__init__('nce-auc')

    def update(self, labels, preds):
        """Accumulate one batch: AUC = P(positive ranked above negative),
        computed by sorting predictions descending and counting pairs."""
        preds = np.array([x.asnumpy() for x in preds]).transpose()
        label_weight = labels[1].asnumpy().reshape((preds.shape[0], preds.shape[1]))
        tmp = []
        for i in range(preds.shape[0]):
            for j in range(preds.shape[1]):
                tmp.append((label_weight[i][j], preds[i][j]))
        # rank by prediction, highest first
        tmp = sorted(tmp, key=itemgetter(1), reverse=True)
        m = 0.0   # number of positives
        n = 0.0   # number of negatives
        z = 0.0   # running "positive above negative" pair count
        k = 0
        for a, b in tmp:
            if a > 0.5:
                m += 1.0
                z += len(tmp) - k
            else:
                n += 1.0
            k += 1
        # guard: AUC is undefined without both positives and negatives;
        # the original divided by zero here on such a batch
        if m == 0.0 or n == 0.0:
            return
        z -= m * (m + 1.0) / 2.0
        z /= m
        z /= n
        self.sum_metric += z
        self.num_inst += 1
def nce_loss(data, label, label_weight, embed_weight, vocab_size, num_hidden, num_label):
    # NCE loss: score each of num_label candidate labels (1 true + sampled
    # negatives) against `data` by dot product with the label's embedding,
    # training each score as an independent logistic regression whose target
    # comes from label_weight (1.0 for the true label, 0.0 for negatives).
    label_embed = mx.sym.Embedding(data = label, input_dim = vocab_size,
                                   weight = embed_weight,
                                   output_dim = num_hidden, name = 'label_embed')
    # split the (num_label, num_hidden) embeddings into per-candidate symbols
    label_embed = mx.sym.SliceChannel(data = label_embed,
                                      num_outputs = num_label,
                                      squeeze_axis = 1, name = 'label_slice')
    label_weight = mx.sym.SliceChannel(data = label_weight,
                                       num_outputs = num_label,
                                       squeeze_axis = 1)
    probs = []
    for i in range(num_label):
        vec = label_embed[i]
        # dot(data, label_embedding) -> one scalar score per example
        vec = vec * data
        vec = mx.sym.sum(vec, axis = 1)
        sm = mx.sym.LogisticRegressionOutput(data = vec,
                                             label = label_weight[i])
        probs.append(sm)
    # one output symbol per candidate label
    return probs
9 | ToyModel = namedtuple("ToyModel", ["ex", "symbol", "param_blocks"]) 10 | 11 | def get_net(vocab_size, num_label): 12 | data = mx.sym.Variable('data') 13 | label = mx.sym.Variable('label') 14 | label_weight = mx.sym.Variable('label_weight') 15 | embed_weight = mx.sym.Variable('embed_weight') 16 | pred = mx.sym.FullyConnected(data = data, num_hidden = 100) 17 | probs = nce_loss(data = pred, 18 | label = label, 19 | label_weight = label_weight, 20 | embed_weight = embed_weight, 21 | vocab_size = vocab_size, 22 | num_hidden = 100, 23 | num_label = num_label) 24 | return mx.sym.Group(probs) 25 | 26 | class SimpleBatch(object): 27 | def __init__(self, data_names, data, label_names, label): 28 | self.data = data 29 | self.label = label 30 | self.data_names = data_names 31 | self.label_names = label_names 32 | 33 | @property 34 | def provide_data(self): 35 | return [(n, x.shape) for n, x in zip(self.data_names, self.data)] 36 | 37 | @property 38 | def provide_label(self): 39 | return [(n, x.shape) for n, x in zip(self.label_names, self.label)] 40 | 41 | 42 | class DataIter(mx.io.DataIter): 43 | def __init__(self, count, batch_size, vocab_size, num_label, feature_size): 44 | super(DataIter, self).__init__() 45 | self.batch_size = batch_size 46 | self.count = count 47 | self.vocab_size = vocab_size 48 | self.num_label = num_label 49 | self.feature_size = feature_size 50 | self.provide_data = [('data', (batch_size, feature_size))] 51 | self.provide_label = [('label', (self.batch_size, num_label)), 52 | ('label_weight', (self.batch_size, num_label))] 53 | 54 | def mock_sample(self): 55 | ret = np.zeros(self.feature_size) 56 | rn = set() 57 | while len(rn) < 3: 58 | rn.add(random.randint(0, self.feature_size - 1)) 59 | s = 0 60 | for k in rn: 61 | ret[k] = 1.0 62 | s *= self.feature_size 63 | s += k 64 | la = [s % self.vocab_size] +\ 65 | [random.randint(0, self.vocab_size - 1) for _ in range(self.num_label - 1)] 66 | return ret, la 67 | 68 | def __iter__(self): 69 | for _ 
in range(self.count / self.batch_size): 70 | data = [] 71 | label = [] 72 | label_weight = [] 73 | for i in range(self.batch_size): 74 | d, l = self.mock_sample() 75 | data.append(d) 76 | label.append(l) 77 | label_weight.append([1.0] + [0.0 for _ in range(self.num_label - 1)]) 78 | data_all = [mx.nd.array(data)] 79 | label_all = [mx.nd.array(label), mx.nd.array(label_weight)] 80 | data_names = ['data'] 81 | label_names = ['label', 'label_weight'] 82 | yield SimpleBatch(data_names, data_all, label_names, label_all) 83 | 84 | def reset(self): 85 | pass 86 | 87 | class NceAccuracy(mx.metric.EvalMetric): 88 | def __init__(self): 89 | super(NceAccuracy, self).__init__('nce-accuracy') 90 | 91 | def update(self, labels, preds): 92 | label_weight = labels[1].asnumpy() 93 | preds = np.array([x.asnumpy() for x in preds]).transpose() 94 | for i in range(preds.shape[0]): 95 | if np.argmax(label_weight[i]) == np.argmax(preds[i]): 96 | self.sum_metric += 1 97 | self.num_inst += 1 98 | 99 | if __name__ == '__main__': 100 | batch_size = 128 101 | vocab_size = 10000 102 | feature_size = 100 103 | num_label = 6 104 | 105 | data_train = DataIter(100000, batch_size, vocab_size, num_label, feature_size) 106 | data_test = DataIter(1000, batch_size, vocab_size, num_label, feature_size) 107 | 108 | network = get_net(vocab_size, num_label) 109 | devs = [mx.gpu(0)] 110 | model = mx.model.FeedForward(ctx = devs, 111 | symbol = network, 112 | num_epoch = 20, 113 | learning_rate = 0.03, 114 | momentum = 0.9, 115 | wd = 0.0000, 116 | initializer=mx.init.Xavier(factor_type="in", magnitude=2.34)) 117 | import logging 118 | head = '%(asctime)-15s %(message)s' 119 | logging.basicConfig(level=logging.DEBUG, format=head) 120 | 121 | metric = NceAccuracy() 122 | model.fit(X = data_train, eval_data = data_test, 123 | eval_metric = metric, 124 | batch_end_callback = mx.callback.Speedometer(batch_size, 50),) 125 | 126 | -------------------------------------------------------------------------------- 
class DataIter(mx.io.DataIter):
    """Mock data iterator for the softmax toy: random 3-hot feature vectors
    with a deterministic class label."""
    def __init__(self, count, batch_size, vocab_size, num_label, feature_size):
        super(DataIter, self).__init__()
        self.batch_size = batch_size
        self.count = count
        self.vocab_size = vocab_size
        self.num_label = num_label
        self.feature_size = feature_size
        self.provide_data = [('data', (batch_size, feature_size))]
        self.provide_label = [('label', (self.batch_size,))]

    def mock_sample(self):
        """One example: a 3-hot vector plus its label (the chosen positions
        folded into a base-feature_size number, mod vocab_size)."""
        ret = np.zeros(self.feature_size)
        rn = set()
        while len(rn) < 3:
            rn.add(random.randint(0, self.feature_size - 1))
        s = 0
        for k in rn:
            ret[k] = 1.0
            s *= self.feature_size
            s += k
        return ret, s % self.vocab_size

    def __iter__(self):
        """Yield SimpleBatch objects of mock (data, label)."""
        # floor division: '/' yields a float under Python 3, which range()
        # rejects; '//' is identical for Python 2 ints
        for _ in range(self.count // self.batch_size):
            data = []
            label = []
            for i in range(self.batch_size):
                d, l = self.mock_sample()
                data.append(d)
                label.append(l)
            data_all = [mx.nd.array(data)]
            label_all = [mx.nd.array(label)]
            data_names = ['data']
            label_names = ['label']
            yield SimpleBatch(data_names, data_all, label_names, label_all)

    def reset(self):
        """No state to reset; mock data is regenerated on each pass."""
        pass
def load_data(name):
    """Load a whitespace-separated corpus (same contract as lstm_word.py).

    Returns (data, negative, vocab, freq): the corpus as an id sequence, the
    negative-sampling pool (ids with count >= 5 repeated int(count**0.75)
    times), the word -> id map (ids from 1; 0 reserved), and per-id counts.
    """
    vocab = {}
    freq = [0]
    data = []
    tokens = open(name).read().split(' ')
    for tk in tokens:
        if len(tk) == 0:
            continue
        wid = vocab.setdefault(tk, len(vocab) + 1)
        if wid == len(freq):
            freq.append(0)
        data.append(wid)
        freq[wid] += 1
    negative = []
    for wid in range(1, len(freq)):
        cnt = freq[wid]
        if cnt < 5:
            continue
        # unigram^0.75 weighting, as in word2vec
        negative += [wid] * int(math.pow(cnt * 1.0, 0.75))
    return data, negative, vocab, freq
class NceAuc(mx.metric.EvalMetric):
    """AUC over (label_weight, prediction) pairs for NCE training.

    label_weight is 1.0 on the true target and 0.0 on sampled negatives,
    so it serves as the binary relevance label; the metric counts
    correctly-ordered positive/negative pairs (rank-sum formulation).
    """

    def __init__(self):
        super(NceAuc, self).__init__('nce-auc')

    def update(self, labels, preds):
        # labels[1] is the label_weight blob; preds arrive one output per
        # label slot, transposed to (batch, num_label).
        label_weight = labels[1].asnumpy()
        preds = np.array([x.asnumpy() for x in preds]).transpose()
        tmp = []
        for i in range(preds.shape[0]):
            for j in range(preds.shape[1]):
                tmp.append((label_weight[i][j], preds[i][j]))
        # Highest score first; z accumulates positives' rank credit.
        tmp = sorted(tmp, key = itemgetter(1), reverse = True)
        m = 0.0  # number of positives seen
        n = 0.0  # number of negatives seen
        z = 0.0
        k = 0
        for a, b in tmp:
            if a > 0.5:
                m += 1.0
                z += len(tmp) - k
            else:
                n += 1.0
            k += 1
        # Remove self-pairs: z now counts positive-above-negative pairs.
        z -= m * (m + 1.0) / 2.0
        # Guard the normalization: a batch with no positives or no
        # negatives previously raised ZeroDivisionError.
        if m > 0 and n > 0:
            z /= m
            z /= n
            self.sum_metric += z
        self.num_inst += 1
def gen_rand(num_digits = 4):
    """Return a random numeric captcha string.

    Parameters:
        num_digits: number of digits to generate (default 4, matching the
                    four softmax heads in get_ocrnet). The default keeps
                    all existing gen_rand() callers working unchanged.

    Returns a string of num_digits characters, each in '0'..'9'.
    """
    # ''.join over a generator avoids the quadratic string concatenation
    # of the original += loop.
    return ''.join(str(random.randint(0, 9)) for _ in range(num_digits))
def Accuracy(label, pred, num_label = 4):
    """Fraction of consecutive num_label-row groups of `pred` whose
    argmaxes all match the corresponding entries of label.T flattened.

    Parameters:
        label: (batch, num_label) array of digit class ids.
        pred:  (batch * num_label, 10) array of per-digit probabilities.
        num_label: digits per captcha (default 4, matching get_ocrnet);
                   the default preserves the original 2-arg call from
                   model.fit(eval_metric=Accuracy).

    Returns a float in [0, 1]; 0.0 for an empty batch.

    NOTE(review): get_ocrnet concatenates the digit heads along dim 0,
    so for batch_size > 1 consecutive rows of pred are the same digit
    position across different samples — confirm this grouping matches
    the intended per-captcha accuracy.
    """
    label = label.T.reshape((-1, ))
    hit = 0
    total = 0
    # // keeps the bound an int under both Python 2 and 3 (a bare /
    # yields a float under Python 3 and breaks range()).
    for i in range(pred.shape[0] // num_label):
        ok = True
        for j in range(num_label):
            k = i * num_label + j
            if np.argmax(pred[k]) != int(label[k]):
                ok = False
                break
        if ok:
            hit += 1
        total += 1
    # Empty batch previously raised ZeroDivisionError.
    if total == 0:
        return 0.0
    return 1.0 * hit / total
if __name__ == '__main__':
    # Generate one captcha image to run inference on; gen_sample returns
    # (digit_string, CHW float image).
    captcha = ImageCaptcha(fonts=['./data/Xerox.ttf'])
    num, img = gen_sample(captcha, 80, 30)
    print 'gen captcha:', num

    batch_size = 1
    # Load the epoch-1 weights saved by cnn_ocr.py under prefix "cnn-ocr".
    _, arg_params, __ = mx.model.load_checkpoint("cnn-ocr", 1)
    data_shape = [("data", (batch_size, 3, 30, 80))]
    input_shapes = dict(data_shape)
    sym = get_ocrnet()
    executor = sym.simple_bind(ctx = mx.cpu(), **input_shapes)
    # Copy the trained parameters into the bound executor.
    for key in executor.arg_dict.keys():
        if key in arg_params:
            arg_params[key].copyto(executor.arg_dict[key])

    # NOTE(review): is_train=True during pure inference looks unintended;
    # confirm whether is_train=False was meant here.
    executor.forward(is_train = True, data = mx.nd.array([img]))
    probs = executor.outputs[0].asnumpy()
    # fc2 concatenates the four digit heads along dim 0, so with
    # batch_size == 1 each row of probs is one digit's distribution.
    line = ''
    for i in range(probs.shape[0]):
        line += str(np.argmax(probs[i]))
    print 'predicted: ' + line
# Nearest-neighbour sanity check for a trained triplet embedding: embed
# one random source image, rank candidate images from ten classes by the
# learned distance, and save the ten closest as PNG files.
import sys, random, os
sys.path.insert(0, '../../python')
import mxnet as mx
import numpy as np
import imgnet_triloss, cv2
from operator import itemgetter

# argv[2] is the checkpoint prefix; epoch 100 must exist on disk.
_, arg_params, __ = mx.model.load_checkpoint(sys.argv[2], 100)

batch_size = 1
# Distance network: outputs the squared L2 distance between the
# embeddings of its two inputs (parameters shared with the trained net).
network = imgnet_triloss.get_sim_net()

input_shapes = dict([('same', (batch_size, 3, 32, 32)),\
        ('diff', (batch_size, 3, 32, 32))])
executor = network.simple_bind(ctx = mx.gpu(), **input_shapes)
# Copy the trained parameters into the bound executor.
for key in executor.arg_dict.keys():
    if key in arg_params:
        print key, arg_params[key].shape, executor.arg_dict[key].shape
        arg_params[key].copyto(executor.arg_dict[key])

# argv[1] is a directory of per-class .npy image stacks.
root = sys.argv[1]
names = []
for fn in os.listdir(root):
    if fn.endswith('.npy'):
        names.append(root + '/' + fn)
random.shuffle(names)

# Keep ten randomly chosen classes in memory.
imgs = []
for i in range(10):
    imgs.append(np.load(names[i]))

def save_img(fname, im):
    # Images are stored CHW in [0, 1]; scale and transpose to HWC for cv2.
    a = np.copy(im) * 255.0
    cv2.imwrite(fname, a.transpose(1, 2, 0))

# One random query image from the first class.
src = imgs[0][random.randint(0, len(imgs[0]) - 1)]
save_img("src.png", src)
dsts = []
# Score 128 random candidates from each of the ten classes.
for i in range(10):
    for j in range(128):
        k = random.randint(0, len(imgs[i]) - 1)
        dst = imgs[i][k]
        outputs = executor.forward(is_train = True, same = mx.nd.array([src]),
                diff = mx.nd.array([dst]))
        dis = outputs[0].asnumpy()[0]
        dsts.append((dst, dis, i))

# Save the ten nearest candidates; class-0 hits suggest a good embedding.
i = 0
for img, w, la in sorted(dsts, key = itemgetter(1))[:10]:
    print w, la
    save_img("dst_" + str(i) + ".png", img)
    i += 1
class Batch(object):
    """Minimal data batch for mx.io.DataIter: holds parallel lists of
    named data and label arrays and exposes the (name, shape) metadata
    that the MXNet executor binding expects."""

    def __init__(self, data_names, data, label_names, label):
        # Names and arrays are parallel lists; order must match.
        self.data_names = data_names
        self.data = data
        self.label_names = label_names
        self.label = label

    @property
    def provide_data(self):
        """(name, shape) pairs describing the data arrays."""
        described = []
        for name, array in zip(self.data_names, self.data):
            described.append((name, array.shape))
        return described

    @property
    def provide_label(self):
        """(name, shape) pairs describing the label arrays."""
        described = []
        for name, array in zip(self.label_names, self.label):
            described.append((name, array.shape))
        return described
def get_conv(data, conv_weight, conv_bias, fc_weight, fc_bias):
    """Embedding tower: three conv/avg-pool/relu stages, a 1024-unit
    fully connected layer, then L2 normalization.

    All weights and biases are passed in explicitly so several towers
    (anchor/same/diff) can share the same parameters.
    """
    cdata = data
    ks = [5, 3, 3]  # kernel size per conv stage
    for i in range(3):
        cdata = mx.sym.Convolution(data=cdata, kernel=(ks[i],ks[i]), num_filter=32,
                                   weight = conv_weight[i], bias = conv_bias[i],
                                   name = 'conv' + str(i))
        cdata = mx.sym.Pooling(data=cdata, pool_type="avg", kernel=(2,2), stride=(1, 1))
        cdata = mx.sym.Activation(data=cdata, act_type="relu")

    cdata = mx.sym.Flatten(data = cdata)
    cdata = mx.sym.FullyConnected(data = cdata, num_hidden = 1024,
                                  weight = fc_weight, bias = fc_bias, name='fc')
    # Unit-norm embedding: squared L2 distances are then bounded by 4.
    cdata = mx.sym.L2Normalization(data = cdata)
    return cdata

def get_sim_net():
    """Inference network: squared L2 distance between the embeddings of
    inputs 'same' and 'diff', with both towers sharing one parameter set
    (the same variable names the training net binds)."""
    same = mx.sym.Variable('same')
    diff = mx.sym.Variable('diff')
    conv_weight = []
    conv_bias = []
    for i in range(3):
        conv_weight.append(mx.sym.Variable('conv' + str(i) + '_weight'))
        conv_bias.append(mx.sym.Variable('conv' + str(i) + '_bias'))
    fc_weight = mx.sym.Variable('fc_weight')
    fc_bias = mx.sym.Variable('fc_bias')
    fs = get_conv(same, conv_weight, conv_bias, fc_weight, fc_bias)
    fd = get_conv(diff, conv_weight, conv_bias, fc_weight, fc_bias)
    # Elementwise (fs - fd)^2 summed over the feature axis.
    fs = fs - fd
    fs = fs * fs
    return mx.sym.sum(fs, axis = 1)
class Auc(mx.metric.EvalMetric):
    """Reports the mean of the raw per-sample triplet loss values.

    NOTE(review): despite the name this is not an AUC — update() simply
    averages the MakeLoss outputs; lower is better.
    """

    def __init__(self):
        super(Auc, self).__init__('auc')

    def update(self, labels, preds):
        # preds[0] is the relu(1 - (d_diff - d_same)) loss per sample.
        pred = preds[0].asnumpy().reshape(-1)
        self.sum_metric += np.sum(pred)
        self.num_inst += len(pred)

if __name__ == '__main__':
    batch_size = 128
    network = get_net(batch_size)
    # Trains on GPU 2 only.
    devs = [mx.gpu(2)]
    model = mx.model.FeedForward(ctx = devs,
                                 symbol = network,
                                 num_epoch = 100,
                                 learning_rate = 0.01,
                                 wd = 0.00001,
                                 initializer = mx.init.Xavier(factor_type="in", magnitude=2.34),
                                 momentum = 0.0)
    # argv[1]: directory of per-class .npy image stacks;
    # argv[2]: output checkpoint prefix.
    names = []
    root = sys.argv[1]
    for fn in os.listdir(root):
        if fn.endswith('.npy'):
            names.append(root + '/' + fn)
    print len(names)
    data_train = DataIter(names, batch_size)

    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    metric = Auc()
    model.fit(X = data_train,
              eval_metric = metric,
              kvstore = 'local_allreduce_device',
              batch_end_callback=mx.callback.Speedometer(batch_size, 50),)

    model.save(sys.argv[2])