├── Alphabet.h ├── AttRecursiveGatedNN.h ├── AttentionPooling.h ├── AvgPerceptron1O.h ├── BiLayer.h ├── CheckGrad.h ├── Concat.h ├── Dropout.h ├── GRNN.h ├── GatedPooling.h ├── Hash_map.hpp ├── IO.h ├── LSTM.h ├── LSTM_CHD.h ├── LSTM_KER.h ├── LSTM_STD.h ├── LookupTable.h ├── MLCRFLoss.h ├── MMCRFLoss.h ├── Metric.h ├── MyLib.h ├── N3L.h ├── NRMat.h ├── Pooling.h ├── README.md ├── RNN.h ├── RecursiveGatedNN.h ├── RecursiveNN.h ├── SoftMaxLoss.h ├── SparseUniLayer.h ├── SparseUniLayer1O.h ├── TensorLayer.h ├── TriLayer.h ├── TriLayerLSTM.h ├── UniLayer.h ├── UniLayer1O.h ├── Utils.h ├── Utiltensor.h ├── Windowlized.h └── description(expect for lrec2016).pdf /Alphabet.h: -------------------------------------------------------------------------------- 1 | #ifndef _ALPHABET_ 2 | #define _ALPHABET_ 3 | 4 | #include "MyLib.h" 5 | #include "Hash_map.hpp" 6 | #include "IO.h" 7 | 8 | /* 9 | This class serializes feature from string to int. 10 | Index starts from 0. 11 | */ 12 | 13 | /** 14 | * The basic class of quark class. 15 | * @param std::string String class name to be used. 16 | * @param int ID class name to be used. 17 | * @author Naoaki Okazaki 18 | */ 19 | class basic_quark { 20 | protected: 21 | typedef hash_map StringToId; 22 | typedef std::vector IdToString; 23 | 24 | StringToId m_string_to_id; 25 | IdToString m_id_to_string; 26 | bool m_b_fixed; 27 | int m_size; 28 | 29 | public: 30 | /** 31 | * Construct. 32 | */ 33 | basic_quark() 34 | { 35 | clear(); 36 | } 37 | 38 | /** 39 | * Destruct. 40 | */ 41 | virtual ~basic_quark() 42 | { 43 | } 44 | 45 | /** 46 | * Map a string to its associated ID. 47 | * If string-to-integer association does not exist, allocate a new ID. 48 | * @param str String value. 49 | * @return Associated ID for the string value. 50 | */ 51 | int operator[](const std::string& str) 52 | { 53 | typename StringToId::const_iterator it = m_string_to_id.find(str); 54 | if (it != m_string_to_id.end()) { 55 | return it->second; 56 | } else if (!m_b_fixed){ 57 | int newid = m_size; 58 | m_id_to_string.push_back(str); 59 | m_string_to_id.insert(std::pair(str, newid)); 60 | m_size++; 61 | return newid; 62 | } 63 | else 64 | { 65 | return -1; 66 | } 67 | } 68 | 69 | 70 | /** 71 | * Convert ID value into the associated string value. 72 | * @param qid ID. 73 | * @param def Default value if the ID was out of range. 74 | * @return String value associated with the ID. 75 | */ 76 | const std::string& from_id(const int& qid, const std::string& def = "") const 77 | { 78 | if (qid < 0 || m_size <= qid) { 79 | return def; 80 | } else { 81 | return m_id_to_string[qid]; 82 | } 83 | } 84 | 85 | 86 | 87 | /** 88 | * Convert string value into the associated ID value. 89 | * @param str String value. 90 | * @return ID if any, otherwise -1. 91 | */ 92 | int from_string(const std::string& str) 93 | { 94 | typename StringToId::const_iterator it = m_string_to_id.find(str); 95 | if (it != m_string_to_id.end()) { 96 | return it->second; 97 | } else if (!m_b_fixed){ 98 | int newid = m_size; 99 | m_id_to_string.push_back(str); 100 | m_string_to_id.insert(std::pair(str, newid)); 101 | m_size++; 102 | return newid; 103 | } 104 | else 105 | { 106 | return -1; 107 | } 108 | } 109 | 110 | void clear() 111 | { 112 | m_string_to_id.clear(); 113 | m_id_to_string.clear(); 114 | m_b_fixed = false; 115 | m_size = 0; 116 | } 117 | 118 | void set_fixed_flag(bool bfixed) 119 | { 120 | m_b_fixed = bfixed; 121 | } 122 | 123 | /** 124 | * Get the number of string-to-id associations. 
125 | * @return The number of association. 126 | */ 127 | size_t size() const 128 | { 129 | return m_size; 130 | } 131 | 132 | 133 | void read(std::ifstream &inf) 134 | { 135 | clear(); 136 | static string tmp; 137 | my_getline(inf, tmp); 138 | chomp(tmp); 139 | m_size = atoi(tmp.c_str()); 140 | std::vector featids; 141 | for (int i = 0; i < m_size; ++i) { 142 | 143 | my_getline(inf, tmp); 144 | split_bychars(tmp, featids); 145 | m_string_to_id[featids[0]] = i; 146 | assert(atoi(featids[1].c_str()) == i); 147 | } 148 | } 149 | 150 | void write(std::ofstream &outf) const 151 | { 152 | outf << m_size << std::endl; 153 | for (int i=0; i 22 | class AttRecursiveGatedNN { 23 | public: 24 | BiLayer _reset_left; 25 | BiLayer _reset_right; 26 | BiLayer _update_left; 27 | BiLayer _update_right; 28 | BiLayer _update_tilde; 29 | BiLayer _recursive_tilde; 30 | 31 | 32 | Tensor nxl; 33 | Tensor nxr; 34 | Tensor sum; 35 | 36 | Tensor pxl; 37 | Tensor pxr; 38 | Tensor pmy; 39 | 40 | 41 | Tensor lrxl; 42 | Tensor lrxr; 43 | Tensor lmy; 44 | Tensor luxl; 45 | Tensor luxr; 46 | Tensor lumy; 47 | 48 | Tensor lnxl; 49 | Tensor lnxr; 50 | Tensor lsum; 51 | 52 | Tensor lpxl; 53 | Tensor lpxr; 54 | Tensor lpmy; 55 | 56 | 57 | public: 58 | AttRecursiveGatedNN() { 59 | } 60 | 61 | inline void initial(int dimension, int attDim, int seed = 0) { 62 | _reset_left.initial(dimension, dimension, attDim, false, seed, 1); 63 | _reset_right.initial(dimension, dimension, attDim, false, seed + 10, 1); 64 | _update_left.initial(dimension, dimension, attDim, false, seed + 20, 3); 65 | _update_right.initial(dimension, dimension, attDim, false, seed + 30, 3); 66 | _update_tilde.initial(dimension, dimension, attDim, false, seed + 40, 3); 67 | _recursive_tilde.initial(dimension, dimension, dimension, false, seed + 50, 0); 68 | 69 | nxl = NewTensor(Shape2(1, dimension), d_zero); 70 | nxr = NewTensor(Shape2(1, dimension), d_zero); 71 | sum = NewTensor(Shape2(1, dimension), d_zero); 72 | 73 | pxl = NewTensor(Shape2(1, dimension), d_zero); 74 | pxr = NewTensor(Shape2(1, dimension), d_zero); 75 | pmy = NewTensor(Shape2(1, dimension), d_zero); 76 | 77 | 78 | lrxl = NewTensor(Shape2(1, dimension), d_zero); 79 | lrxr = NewTensor(Shape2(1, dimension), d_zero); 80 | lmy = NewTensor(Shape2(1, dimension), d_zero); 81 | luxl = NewTensor(Shape2(1, dimension), d_zero); 82 | luxr = NewTensor(Shape2(1, dimension), d_zero); 83 | lumy = NewTensor(Shape2(1, dimension), d_zero); 84 | 85 | lnxl = NewTensor(Shape2(1, dimension), d_zero); 86 | lnxr = NewTensor(Shape2(1, dimension), d_zero); 87 | lsum = NewTensor(Shape2(1, dimension), d_zero); 88 | 89 | lpxl = NewTensor(Shape2(1, dimension), d_zero); 90 | lpxr = NewTensor(Shape2(1, dimension), d_zero); 91 | lpmy = NewTensor(Shape2(1, dimension), d_zero); 92 | } 93 | 94 | 95 | inline void initial(Tensor rW1, Tensor rU1, 96 | Tensor rW2, Tensor rU2, 97 | Tensor uW1, Tensor uU1, 98 | Tensor uW2, Tensor uU2, 99 | Tensor uW3, Tensor uU3, 100 | Tensor W1, Tensor W2, Tensor W3,Tensor b) { 101 | _reset_left.initial(rW1, rU1, 1); 102 | _reset_right.initial(rW2, rU2, 1); 103 | 104 | _update_left.initial(uW1, uU1, 3); 105 | _update_right.initial(uW2, uU2, 3); 106 | _update_tilde.initial(uW3, uU3, 3); 107 | 108 | _recursive_tilde.initial(W1, W2, W3, b, 0); 109 | } 110 | 111 | inline void release() { 112 | _reset_left.release(); 113 | _reset_right.release(); 114 | 115 | _update_left.release(); 116 | _update_right.release(); 117 | _update_tilde.release(); 118 | 119 | _recursive_tilde.release(); 120 | 121 | 
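// Free the forward/backward scratch tensors (nxl ... lpmy) that initial() allocated with NewTensor.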
FreeSpace(&nxl); 122 | FreeSpace(&nxr); 123 | FreeSpace(&sum); 124 | FreeSpace(&pxl); 125 | FreeSpace(&pxr); 126 | FreeSpace(&pmy); 127 | FreeSpace(&lnxl); 128 | FreeSpace(&lnxr); 129 | FreeSpace(&lsum); 130 | FreeSpace(&lpxl); 131 | FreeSpace(&lpxr); 132 | FreeSpace(&lpmy); 133 | FreeSpace(&lrxl); 134 | FreeSpace(&lrxr); 135 | FreeSpace(&lmy); 136 | FreeSpace(&luxl); 137 | FreeSpace(&luxr); 138 | FreeSpace(&lumy); 139 | } 140 | 141 | virtual ~AttRecursiveGatedNN() { 142 | // TODO Auto-generated destructor stub 143 | } 144 | 145 | inline dtype squarenormAll() { 146 | dtype norm = _reset_left.squarenormAll(); 147 | norm += _reset_right.squarenormAll(); 148 | norm += _update_left.squarenormAll(); 149 | norm += _update_right.squarenormAll(); 150 | norm += _update_tilde.squarenormAll(); 151 | norm += _recursive_tilde.squarenormAll(); 152 | 153 | return norm; 154 | } 155 | 156 | inline void scaleGrad(dtype scale) { 157 | _reset_left.scaleGrad(scale); 158 | _reset_right.scaleGrad(scale); 159 | 160 | _update_left.scaleGrad(scale); 161 | _update_right.scaleGrad(scale); 162 | _update_tilde.scaleGrad(scale); 163 | 164 | _recursive_tilde.scaleGrad(scale); 165 | } 166 | 167 | public: 168 | 169 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor a, 170 | Tensor rxl, Tensor rxr, Tensor my, 171 | Tensor uxl, Tensor uxr, Tensor umy, 172 | Tensor y) { 173 | 174 | nxl = 0.0; 175 | nxr = 0.0; 176 | sum = 0.0; 177 | 178 | pxl = 0.0; 179 | pxr = 0.0; 180 | pmy = 0.0; 181 | 182 | _reset_left.ComputeForwardScore(xl, a, rxl); 183 | _reset_right.ComputeForwardScore(xr, a, rxr); 184 | 185 | 186 | nxl = rxl * xl; 187 | nxr = rxr * xr; 188 | 189 | _recursive_tilde.ComputeForwardScore(nxl, nxr, my); 190 | 191 | 192 | _update_left.ComputeForwardScore(xl, a, uxl); 193 | _update_right.ComputeForwardScore(xr, a, uxr); 194 | _update_tilde.ComputeForwardScore(my, a, umy); 195 | 196 | sum = uxl + uxr + umy; 197 | 198 | pxl = uxl / sum; 199 | pxr = uxr / sum; 200 | pmy = umy / sum; 201 | 202 | y = pxl * xl + pxr * xr + pmy * my; 203 | 204 | } 205 | 206 | //please allocate the memory outside here 207 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor a, 208 | Tensor rxl, Tensor rxr, Tensor my, 209 | Tensor uxl, Tensor uxr, Tensor umy, 210 | Tensor y, Tensor ly, 211 | Tensor lxl, Tensor lxr, Tensor la, 212 | bool bclear = false) { 213 | if (bclear){ 214 | lxl = 0.0; lxr = 0.0; la = 0.0; 215 | } 216 | 217 | nxl = 0.0; 218 | nxr = 0.0; 219 | sum = 0.0; 220 | 221 | pxl = 0.0; 222 | pxr = 0.0; 223 | pmy = 0.0; 224 | 225 | 226 | lrxl = 0.0; 227 | lrxr = 0.0; 228 | lmy = 0.0; 229 | luxl = 0.0; 230 | luxr = 0.0; 231 | lumy = 0.0; 232 | 233 | lnxl = 0.0; 234 | lnxr = 0.0; 235 | lsum = 0.0; 236 | 237 | lpxl = 0.0; 238 | lpxr = 0.0; 239 | lpmy = 0.0; 240 | 241 | nxl = rxl * xl; 242 | nxr = rxr * xr; 243 | 244 | sum = uxl + uxr + umy; 245 | 246 | pxl = uxl / sum; 247 | pxr = uxr / sum; 248 | pmy = umy / sum; 249 | 250 | 251 | lpxl += ly * xl; 252 | lxl += ly * pxl; 253 | 254 | lpxr += ly * xr; 255 | lxr += ly * pxr; 256 | 257 | lpmy += ly * my; 258 | lmy += ly * pmy; 259 | 260 | 261 | 262 | luxl += lpxl / sum; 263 | luxr += lpxr / sum; 264 | lumy += lpmy / sum; 265 | 266 | lsum -= lpxl * pxl / sum; 267 | lsum -= lpxr * pxr / sum; 268 | lsum -= lpmy * pmy / sum; 269 | 270 | 271 | luxl += lsum; 272 | luxr += lsum; 273 | lumy += lsum; 274 | 275 | _update_left.ComputeBackwardLoss(xl, a, uxl, luxl, lxl, la); 276 | _update_right.ComputeBackwardLoss(xr, a, uxr, luxr, lxr, la); 277 | _update_tilde.ComputeBackwardLoss(my, a, 
umy, lumy, lmy, la); 278 | 279 | _recursive_tilde.ComputeBackwardLoss(nxl, nxr, my, lmy, lnxl, lnxr); 280 | 281 | lrxl += lnxl * xl; 282 | lxl += lnxl * rxl; 283 | 284 | lrxr += lnxr * xr; 285 | lxr += lnxr * rxr; 286 | 287 | _reset_left.ComputeBackwardLoss(xl, a, rxl, lrxl, lxl, la); 288 | _reset_right.ComputeBackwardLoss(xr, a, rxr, lrxr, lxr, la); 289 | 290 | } 291 | 292 | 293 | inline void randomprint(int num) { 294 | _reset_left.randomprint(num); 295 | _reset_right.randomprint(num); 296 | 297 | _update_left.randomprint(num); 298 | _update_right.randomprint(num); 299 | _update_tilde.randomprint(num); 300 | 301 | _recursive_tilde.randomprint(num); 302 | } 303 | 304 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 305 | _reset_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 306 | _reset_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 307 | 308 | _update_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 309 | _update_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 310 | _update_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 311 | 312 | _recursive_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 313 | } 314 | 315 | void writeModel(LStream &outf) { 316 | _reset_left.writeModel(outf); 317 | _reset_right.writeModel(outf); 318 | _update_left.writeModel(outf); 319 | _update_right.writeModel(outf); 320 | _update_tilde.writeModel(outf); 321 | _recursive_tilde.writeModel(outf); 322 | 323 | SaveBinary(outf, nxl); 324 | SaveBinary(outf, nxr); 325 | SaveBinary(outf, sum); 326 | 327 | SaveBinary(outf, pxl); 328 | SaveBinary(outf, pxr); 329 | SaveBinary(outf, pmy); 330 | 331 | SaveBinary(outf, lrxl); 332 | SaveBinary(outf, lrxr); 333 | SaveBinary(outf, lmy); 334 | SaveBinary(outf, luxl); 335 | SaveBinary(outf, luxr); 336 | SaveBinary(outf, lumy); 337 | 338 | SaveBinary(outf, lnxl); 339 | SaveBinary(outf, lnxr); 340 | SaveBinary(outf, lsum); 341 | 342 | SaveBinary(outf, lpxl); 343 | SaveBinary(outf, lpxr); 344 | SaveBinary(outf, lpmy); 345 | 346 | } 347 | 348 | void loadModel(LStream &inf) { 349 | 350 | _reset_left.loadModel(inf); 351 | _reset_right.loadModel(inf); 352 | _update_left.loadModel(inf); 353 | _update_right.loadModel(inf); 354 | _update_tilde.loadModel(inf); 355 | _recursive_tilde.loadModel(inf); 356 | 357 | 358 | LoadBinary(inf, &nxl, false); 359 | LoadBinary(inf, &nxr, false); 360 | LoadBinary(inf, &sum, false); 361 | 362 | LoadBinary(inf, &pxl, false); 363 | LoadBinary(inf, &pxr, false); 364 | LoadBinary(inf, &pmy, false); 365 | 366 | LoadBinary(inf, &lrxl, false); 367 | LoadBinary(inf, &lrxr, false); 368 | LoadBinary(inf, &lmy, false); 369 | LoadBinary(inf, &luxl, false); 370 | LoadBinary(inf, &luxr, false); 371 | LoadBinary(inf, &lumy, false); 372 | 373 | LoadBinary(inf, &lnxl, false); 374 | LoadBinary(inf, &lnxr, false); 375 | LoadBinary(inf, &lsum, false); 376 | 377 | LoadBinary(inf, &lpxl, false); 378 | LoadBinary(inf, &lpxr, false); 379 | LoadBinary(inf, &lpmy, false); 380 | 381 | } 382 | }; 383 | 384 | 385 | 386 | #endif /* SRC_AttRecursiveGatedNN_H_ */ 387 | -------------------------------------------------------------------------------- /AttentionPooling.h: -------------------------------------------------------------------------------- 1 | /* 2 | * AttentionPooling.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_AttentionPooling_H_ 9 | #define SRC_AttentionPooling_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | 
#include "MyLib.h" 14 | #include "Utiltensor.h" 15 | #include "Pooling.h" 16 | #include "UniLayer.h" 17 | 18 | using namespace mshadow; 19 | using namespace mshadow::expr; 20 | using namespace mshadow::utils; 21 | 22 | // For simpleness, we do not provide pooling on specified words, 23 | // which has been implemented in Pooling.h 24 | 25 | 26 | template 27 | class AttentionPooling { 28 | 29 | public: 30 | BiLayer _bi_gates; 31 | UniLayer _uni_gates; 32 | 33 | public: 34 | AttentionPooling() { 35 | } 36 | 37 | inline void initial(int hiddenSize, int attentionSize, bool bUseB = true, int seed = 0) { 38 | _bi_gates.initial(hiddenSize, hiddenSize, attentionSize, bUseB, seed); 39 | _uni_gates.initial(hiddenSize, hiddenSize, false, seed + 10, 3); 40 | } 41 | 42 | inline void initial(Tensor W1, Tensor W2, Tensor W3, Tensor b, bool bUseB = true) { 43 | _bi_gates.initial(W1, W2); 44 | _uni_gates.initial(W3, b, false, 3); 45 | 46 | } 47 | 48 | 49 | inline void release() { 50 | _bi_gates.release(); 51 | _uni_gates.release(); 52 | } 53 | 54 | virtual ~AttentionPooling() { 55 | // TODO Auto-generated destructor stub 56 | } 57 | 58 | inline dtype squarenormAll() { 59 | return _bi_gates.squarenormAll() + _uni_gates.squarenormAll(); 60 | } 61 | 62 | inline void scaleGrad(dtype scale) { 63 | _bi_gates.scaleGrad(scale); 64 | _uni_gates.scaleGrad(scale); 65 | } 66 | 67 | public: 68 | // xExp, xSumIndex, xSum ad xPoolIndex are temporal variables, which reduce computation in back-propagation 69 | inline void ComputeForwardScore(Tensor x, Tensor xAtt, 70 | Tensor xMExp, Tensor xExp, 71 | Tensor xSum, Tensor xPoolIndex, Tensor y) { 72 | y = 0.0; 73 | int seq_size = x.size(0); 74 | if(seq_size == 0) return; 75 | int dim1 = x.size(1), dim2 = x.size(2); 76 | int odim1 = y.size(0), odim2 = y.size(1); 77 | 78 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 79 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 80 | } 81 | 82 | _bi_gates.ComputeForwardScore(x, xAtt, xMExp); 83 | _uni_gates.ComputeForwardScore(xMExp, xExp); 84 | 85 | sumpool_forward(xExp, xSum); 86 | for (int idx = 0; idx < seq_size; idx++) { 87 | xPoolIndex[idx] = xExp[idx] / xSum; 88 | } 89 | for (int idx = 0; idx < seq_size; idx++) { 90 | y += x[idx] * xPoolIndex[idx]; 91 | } 92 | } 93 | 94 | inline void ComputeForwardScore(const std::vector >& x, const std::vector >& xAtt, 95 | std::vector >& xMExp, std::vector >& xExp, Tensor xSum, 96 | std::vector >& xPoolIndex, Tensor y) { 97 | y = 0.0; 98 | int seq_size = x.size(); 99 | if(seq_size == 0) return; 100 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 101 | int odim1 = y.size(0), odim2 = y.size(1); 102 | 103 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 104 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 105 | } 106 | 107 | _bi_gates.ComputeForwardScore(x, xAtt, xMExp); 108 | _uni_gates.ComputeForwardScore(xMExp, xExp); 109 | 110 | sumpool_forward(xExp, xSum); 111 | for (int idx = 0; idx < seq_size; idx++) { 112 | xPoolIndex[idx] = xExp[idx] / xSum; 113 | } 114 | for (int idx = 0; idx < seq_size; idx++) { 115 | y += x[idx] * xPoolIndex[idx]; 116 | } 117 | } 118 | 119 | 120 | // xExp, xSumIndex, xSum ad xPoolIndex are temporal variables, which reduce computation in back-propagation 121 | inline void ComputeForwardScore(Tensor x, Tensor xAtt, 122 | Tensor xMExp, Tensor xExp, 123 | Tensor xSum, Tensor xPoolIndex, Tensor y) { 124 | y = 0.0; 125 | int seq_size = x.size(0); 126 | if(seq_size == 0) return; 127 | int dim1 = x.size(1), dim2 = 
x.size(2); 128 | int odim1 = y.size(0), odim2 = y.size(1); 129 | 130 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 131 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 132 | } 133 | 134 | for (int idx = 0; idx < seq_size; idx++) { 135 | _bi_gates.ComputeForwardScore(x[idx], xAtt, xMExp[idx]); 136 | } 137 | _uni_gates.ComputeForwardScore(xMExp, xExp); 138 | 139 | sumpool_forward(xExp, xSum); 140 | for (int idx = 0; idx < seq_size; idx++) { 141 | xPoolIndex[idx] = xExp[idx] / xSum; 142 | } 143 | for (int idx = 0; idx < seq_size; idx++) { 144 | y += x[idx] * xPoolIndex[idx]; 145 | } 146 | } 147 | 148 | inline void ComputeForwardScore(const std::vector >& x, Tensor xAtt, 149 | std::vector >& xMExp, std::vector >& xExp, Tensor xSum, 150 | std::vector >& xPoolIndex, Tensor y) { 151 | y = 0.0; 152 | int seq_size = x.size(); 153 | if(seq_size == 0) return; 154 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 155 | int odim1 = y.size(0), odim2 = y.size(1); 156 | 157 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 158 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 159 | } 160 | 161 | for (int idx = 0; idx < seq_size; idx++) { 162 | _bi_gates.ComputeForwardScore(x[idx], xAtt, xMExp[idx]); 163 | } 164 | _uni_gates.ComputeForwardScore(xMExp, xExp); 165 | 166 | sumpool_forward(xExp, xSum); 167 | for (int idx = 0; idx < seq_size; idx++) { 168 | xPoolIndex[idx] = xExp[idx] / xSum; 169 | } 170 | for (int idx = 0; idx < seq_size; idx++) { 171 | y += x[idx] * xPoolIndex[idx]; 172 | } 173 | } 174 | 175 | 176 | //please allocate the memory outside here 177 | inline void ComputeBackwardLoss(Tensor x, Tensor xAtt, 178 | Tensor xMExp, Tensor xExp, 179 | Tensor xSum, Tensor xPoolIndex, Tensor y, 180 | Tensor ly, Tensor lx, Tensor lxAtt, bool bclear = false) { 181 | int seq_size = x.size(0); 182 | if(seq_size == 0) return; 183 | int dim1 = x.size(1), dim2 = x.size(2); 184 | int odim1 = y.size(0), odim2 = y.size(1); 185 | 186 | if(bclear) lx = 0.0; 187 | if(bclear) lxAtt = 0.0; 188 | 189 | Tensor xMExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 190 | Tensor xExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 191 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 192 | Tensor xPoolIndexLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 193 | 194 | for (int idx = 0; idx < seq_size; idx++) { 195 | xPoolIndexLoss[idx] = ly * x[idx]; 196 | lx[idx] += ly * xPoolIndex[idx]; 197 | } 198 | 199 | for (int idx = 0; idx < seq_size; idx++) { 200 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 201 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 202 | } 203 | 204 | sumpool_backward(xSumLoss, xExpLoss); 205 | 206 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 207 | _bi_gates.ComputeBackwardLoss(x, xAtt, xMExp, xMExpLoss, lx, lxAtt); 208 | 209 | FreeSpace(&xMExpLoss); 210 | FreeSpace(&xExpLoss); 211 | FreeSpace(&xSumLoss); 212 | FreeSpace(&xPoolIndexLoss); 213 | } 214 | 215 | inline void ComputeBackwardLoss(const std::vector >& x, std::vector >& xAtt, 216 | std::vector >& xMExp, std::vector >& xExp, 217 | Tensor xSum, std::vector >& xPoolIndex, Tensor y, 218 | Tensor ly, std::vector >& lx, std::vector >& lxAtt, bool bclear = false) { 219 | int seq_size = x.size(); 220 | if(seq_size == 0) return; 221 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 222 | int odim1 = y.size(0), odim2 = y.size(1); 223 | 224 | 225 | if(bclear){ 226 | for (int idx = 0; idx < seq_size; idx++) { 227 | lx[idx] = 0.0; 228 | 
lxAtt[idx] = 0.0; 229 | } 230 | } 231 | 232 | vector > xMExpLoss(seq_size), xExpLoss(seq_size), xPoolIndexLoss(seq_size); 233 | for (int idx = 0; idx < seq_size; idx++) { 234 | xMExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 235 | xExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 236 | xPoolIndexLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 237 | } 238 | 239 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 240 | 241 | for (int idx = 0; idx < seq_size; idx++) { 242 | xPoolIndexLoss[idx] = ly * x[idx]; 243 | lx[idx] += ly * xPoolIndex[idx]; 244 | } 245 | 246 | for (int idx = 0; idx < seq_size; idx++) { 247 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 248 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 249 | } 250 | 251 | sumpool_backward(xSumLoss, xExpLoss); 252 | 253 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 254 | _bi_gates.ComputeBackwardLoss(x, xAtt, xMExp, xMExpLoss, lx, lxAtt); 255 | 256 | FreeSpace(&xSumLoss); 257 | for (int idx = 0; idx < seq_size; idx++) { 258 | FreeSpace(&(xMExpLoss[idx])); 259 | FreeSpace(&(xExpLoss[idx])); 260 | FreeSpace(&(xPoolIndexLoss[idx])); 261 | } 262 | } 263 | 264 | //please allocate the memory outside here 265 | inline void ComputeBackwardLoss(Tensor x, Tensor xAtt, 266 | Tensor xMExp, Tensor xExp, 267 | Tensor xSum, Tensor xPoolIndex, Tensor y, 268 | Tensor ly, Tensor lx, Tensor lxAtt, bool bclear = false) { 269 | int seq_size = x.size(0); 270 | if(seq_size == 0) return; 271 | int dim1 = x.size(1), dim2 = x.size(2); 272 | int odim1 = y.size(0), odim2 = y.size(1); 273 | 274 | if(bclear) lx = 0.0; 275 | if(bclear) lxAtt = 0.0; 276 | 277 | Tensor xMExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 278 | Tensor xExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 279 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 280 | Tensor xPoolIndexLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 281 | 282 | for (int idx = 0; idx < seq_size; idx++) { 283 | xPoolIndexLoss[idx] = ly * x[idx]; 284 | lx[idx] += ly * xPoolIndex[idx]; 285 | } 286 | 287 | for (int idx = 0; idx < seq_size; idx++) { 288 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 289 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 290 | } 291 | 292 | sumpool_backward(xSumLoss, xExpLoss); 293 | 294 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 295 | for (int idx = 0; idx < seq_size; idx++) { 296 | _bi_gates.ComputeBackwardLoss(x[idx], xAtt, xMExp[idx], xMExpLoss[idx], lx[idx], lxAtt); 297 | } 298 | 299 | FreeSpace(&xMExpLoss); 300 | FreeSpace(&xExpLoss); 301 | FreeSpace(&xSumLoss); 302 | FreeSpace(&xPoolIndexLoss); 303 | } 304 | 305 | inline void ComputeBackwardLoss(const std::vector >& x, Tensor xAtt, 306 | std::vector >& xMExp, std::vector >& xExp, 307 | Tensor xSum, std::vector >& xPoolIndex, Tensor y, 308 | Tensor ly, std::vector >& lx, Tensor lxAtt, bool bclear = false) { 309 | int seq_size = x.size(); 310 | if(seq_size == 0) return; 311 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 312 | int odim1 = y.size(0), odim2 = y.size(1); 313 | 314 | 315 | if(bclear){ 316 | for (int idx = 0; idx < seq_size; idx++) { 317 | lx[idx] = 0.0; 318 | lxAtt[idx] = 0.0; 319 | } 320 | } 321 | 322 | vector > xMExpLoss(seq_size), xExpLoss(seq_size), xPoolIndexLoss(seq_size); 323 | for (int idx = 0; idx < seq_size; idx++) { 324 | xMExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 325 | xExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 326 | xPoolIndexLoss[idx] = 
NewTensor(Shape2(dim1, dim2), d_zero); 327 | } 328 | 329 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 330 | 331 | for (int idx = 0; idx < seq_size; idx++) { 332 | xPoolIndexLoss[idx] = ly * x[idx]; 333 | lx[idx] += ly * xPoolIndex[idx]; 334 | } 335 | 336 | for (int idx = 0; idx < seq_size; idx++) { 337 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 338 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 339 | } 340 | 341 | sumpool_backward(xSumLoss, xExpLoss); 342 | 343 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 344 | for (int idx = 0; idx < seq_size; idx++) { 345 | _bi_gates.ComputeBackwardLoss(x[idx], xAtt, xMExp[idx], xMExpLoss[idx], lx[idx], lxAtt); 346 | } 347 | 348 | FreeSpace(&xSumLoss); 349 | for (int idx = 0; idx < seq_size; idx++) { 350 | FreeSpace(&(xExpLoss[idx])); 351 | FreeSpace(&(xPoolIndexLoss[idx])); 352 | } 353 | } 354 | 355 | inline void randomprint(int num) { 356 | _bi_gates.randomprint(num); 357 | _uni_gates.randomprint(num); 358 | } 359 | 360 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 361 | _bi_gates.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 362 | _uni_gates.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 363 | } 364 | 365 | void writeModel(LStream &outf) { 366 | _bi_gates.writeModel(outf); 367 | _uni_gates.writeModel(outf); 368 | 369 | } 370 | 371 | void loadModel(LStream &inf) { 372 | _bi_gates.loadModel(inf); 373 | _uni_gates.loadModel(inf); 374 | 375 | } 376 | }; 377 | 378 | #endif /* SRC_AttentionPooling_H_ */ 379 | -------------------------------------------------------------------------------- /AvgPerceptron1O.h: -------------------------------------------------------------------------------- 1 | /* 2 | * AvgPerceptron1O.h 3 | * 4 | * Created on: Oct 22, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef AVGPERCEPTRON1O_H_ 9 | #define AVGPERCEPTRON1O_H_ 10 | 11 | #include "tensor.h" 12 | #include "Utiltensor.h" 13 | #include "MyLib.h" 14 | 15 | using namespace mshadow; 16 | using namespace mshadow::expr; 17 | using namespace mshadow::utils; 18 | 19 | // Weight updating process implemented without theory support, 20 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 21 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 
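// A minimal, self-contained sketch (plain std::vector, hypothetical names; not part of
// the original header) of the lazy-averaging trick the class below relies on: rather
// than adding the whole weight vector to the running sum after every update, each
// feature folds in its pending contribution only when it is next updated or queried.
#include <vector>
#include <cstddef>

struct LazyAveragedWeights {
  std::vector<double> w;     // current weights
  std::vector<double> wsum;  // accumulated weights, refreshed lazily
  std::vector<int> last;     // step at which wsum[i] was last refreshed
  int step;                  // global update counter

  explicit LazyAveragedWeights(std::size_t n)
      : w(n, 0.0), wsum(n, 0.0), last(n, 0), step(0) {}

  // Fold the contribution of w[i] for every step since last[i] into wsum[i].
  void refresh(std::size_t i) {
    wsum[i] += (step - last[i]) * w[i];
    last[i] = step;
  }

  // Perceptron-style update of one (sparse) feature; each call counts as one step here.
  void update(std::size_t i, double delta) {
    ++step;
    refresh(i);
    w[i] += delta;
  }

  // Averaged weight for test time; the class below keeps only the raw sum (see sumWeight()).
  double averaged(std::size_t i) {
    refresh(i);
    return step > 0 ? wsum[i] / step : 0.0;
  }
};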
22 | 23 | template 24 | class AvgPerceptron1O { 25 | 26 | public: 27 | 28 | hash_set _indexers; 29 | 30 | Tensor _W; 31 | 32 | Tensor _gradW; 33 | 34 | Tensor _sumW; 35 | 36 | int _max_update; 37 | NRVec _last_update; 38 | 39 | public: 40 | 41 | AvgPerceptron1O() { 42 | _indexers.clear(); 43 | } 44 | 45 | inline void initial(int nISize, int seed = 0) { 46 | dtype bound = sqrt(6.0 / (nISize + 1)); 47 | //dtype bound = 0.01; 48 | 49 | _W = NewTensor(Shape1(nISize), d_zero); 50 | _gradW = NewTensor(Shape1(nISize), d_zero); 51 | _sumW = NewTensor(Shape1(nISize), d_one); 52 | 53 | _max_update = 0; 54 | _last_update.resize(nISize); 55 | _last_update = 0; 56 | } 57 | 58 | inline void initial(Tensor W) { 59 | static int nOSize, nISize; 60 | nISize = W.size(0); 61 | 62 | _W = NewTensor(Shape1(nISize), d_zero); 63 | _gradW = NewTensor(Shape1(nISize), d_zero); 64 | _sumW = NewTensor(Shape1(nISize), d_one); 65 | Copy(_W, W); 66 | 67 | _max_update = 0; 68 | _last_update.resize(nISize); 69 | _last_update = 0; 70 | } 71 | 72 | inline void release() { 73 | FreeSpace(&_W); 74 | FreeSpace(&_gradW); 75 | FreeSpace(&_sumW); 76 | _indexers.clear(); 77 | } 78 | 79 | virtual ~AvgPerceptron1O() { 80 | // TODO Auto-generated destructor stub 81 | } 82 | 83 | inline dtype squarenormAll() { 84 | dtype result = squarenorm(_gradW); 85 | 86 | return result; 87 | } 88 | 89 | inline void scaleGrad(dtype scale) { 90 | _gradW = _gradW * scale; 91 | } 92 | 93 | public: 94 | void ComputeForwardScore(const std::vector& x, dtype& y, bool bTrain = false) { 95 | static long long featNum, featId; 96 | featNum = x.size(); 97 | y = 0.0; 98 | for (int idx = 0; idx < featNum; idx++) { 99 | featId = x[idx]; 100 | if (featId >= _W.size(0)) 101 | continue; 102 | if (bTrain) 103 | y += _W[featId]; 104 | else 105 | y += sumWeight(featId); 106 | //y += _W[featId]; 107 | } 108 | } 109 | 110 | // loss is stopped at this layer, since the input is one-hold alike 111 | void ComputeBackwardLoss(const std::vector& x, dtype ly) { 112 | //_gradW 113 | static long long featNum, featId; 114 | featNum = x.size(); 115 | for (int idx = 0; idx < featNum; idx++) { 116 | featId = x[idx]; 117 | if (featId >= _W.size(0)) 118 | continue; 119 | _indexers.insert(featId); 120 | _gradW[featId] += ly; 121 | } 122 | } 123 | 124 | void randomprint(int num) { 125 | static int nISize; 126 | nISize = _W.size(0); 127 | 128 | int count = 0; 129 | while (count < num) { 130 | int idx = rand() % nISize; 131 | std::cout << "_W[" << idx << "]=" << _W[idx] << " "; 132 | count++; 133 | } 134 | 135 | std::cout << std::endl; 136 | } 137 | 138 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 139 | static int startPos; 140 | 141 | static hash_set::iterator it; 142 | 143 | _max_update++; 144 | 145 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 146 | int index = *it; 147 | _sumW[index] += (_max_update - _last_update[index]) * _W[index] - _gradW[index]; 148 | _W[index] = _W[index] - _gradW[index]; 149 | _last_update[index] = _max_update; 150 | } 151 | 152 | clearGrad(); 153 | } 154 | 155 | void clearGrad() { 156 | static hash_set::iterator it; 157 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 158 | int index = *it; 159 | _gradW[index] = 0.0; 160 | } 161 | _indexers.clear(); 162 | 163 | } 164 | 165 | dtype sumWeight(int featId) { 166 | if (_last_update[featId] < _max_update) { 167 | int times = _max_update - _last_update[featId]; 168 | _sumW[featId] += _W[featId] * times; 169 | _last_update[featId] = _max_update; 170 | } 171 
| 172 | return _sumW[featId]; 173 | } 174 | 175 | void writeModel(LStream &outf) { 176 | SaveBinary(outf, _W); 177 | SaveBinary(outf, _gradW); 178 | SaveBinary(outf, _sumW); 179 | WriteBinary(outf, _max_update); 180 | WriteVector(outf, _last_update); 181 | 182 | } 183 | 184 | void loadModel(LStream &inf) { 185 | LoadBinary(inf, &_W, false); 186 | LoadBinary(inf, &_gradW, false); 187 | LoadBinary(inf, &_sumW, false); 188 | ReadBinary(inf, _max_update); 189 | ReadVector(inf, _last_update); 190 | } 191 | }; 192 | 193 | #endif /* AVGPERCEPTRON1O_H_ */ 194 | -------------------------------------------------------------------------------- /BiLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BiLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_BiLayer_H_ 9 | #define SRC_BiLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class BiLayer { 20 | 21 | public: 22 | 23 | Tensor _WL; 24 | Tensor _WR; 25 | Tensor _b; 26 | 27 | Tensor _gradWL; 28 | Tensor _gradWR; 29 | Tensor _gradb; 30 | 31 | Tensor _eg2WL; 32 | Tensor _eg2WR; 33 | Tensor _eg2b; 34 | 35 | bool _bUseB; 36 | 37 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 38 | 39 | public: 40 | BiLayer() { 41 | } 42 | 43 | inline void initial(int nOSize, int nLISize, int nRISize, bool bUseB = true, int seed = 0, int funcType = 0) { 44 | dtype bound = sqrt(6.0 / (nOSize + nLISize + nRISize + 1)); 45 | //dtype bound = 0.01; 46 | 47 | _WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 48 | _gradWL = NewTensor(Shape2(nOSize, nLISize), d_zero); 49 | _eg2WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 50 | 51 | _WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 52 | _gradWR = NewTensor(Shape2(nOSize, nRISize), d_zero); 53 | _eg2WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 54 | 55 | _b = NewTensor(Shape2(1, nOSize), d_zero); 56 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 57 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 58 | 59 | random(_WL, -1.0 * bound, 1.0 * bound, seed); 60 | random(_WR, -1.0 * bound, 1.0 * bound, seed+1); 61 | random(_b, -1.0 * bound, 1.0 * bound, seed+2); 62 | 63 | _bUseB = bUseB; 64 | _funcType = funcType; 65 | } 66 | 67 | inline void initial(Tensor WL, Tensor WR, Tensor b, bool bUseB = true, int funcType = 0) { 68 | static int nOSize, nLISize, nRISize; 69 | nOSize = WL.size(0); 70 | nLISize = WL.size(1); 71 | nRISize = WR.size(1); 72 | 73 | _WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 74 | _gradWL = NewTensor(Shape2(nOSize, nLISize), d_zero); 75 | _eg2WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 76 | Copy(_WL, WL); 77 | 78 | _WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 79 | _gradWR = NewTensor(Shape2(nOSize, nRISize), d_zero); 80 | _eg2WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 81 | Copy(_WR, WR); 82 | 83 | _b = NewTensor(Shape2(1, nOSize), d_zero); 84 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 85 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 86 | 87 | if (bUseB) 88 | Copy(_b, b); 89 | 90 | _bUseB = bUseB; 91 | _funcType = funcType; 92 | } 93 | 94 | 95 | inline void initial(Tensor WL, Tensor WR, int funcType = 0) { 96 | static int nOSize, nLISize, nRISize; 97 | nOSize = WL.size(0); 98 | nLISize = WL.size(1); 99 | nRISize = WR.size(1); 100 | 101 | _WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 102 | _gradWL = 
NewTensor(Shape2(nOSize, nLISize), d_zero); 103 | _eg2WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 104 | Copy(_WL, WL); 105 | 106 | _WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 107 | _gradWR = NewTensor(Shape2(nOSize, nRISize), d_zero); 108 | _eg2WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 109 | Copy(_WR, WR); 110 | 111 | _b = NewTensor(Shape2(1, nOSize), d_zero); 112 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 113 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 114 | 115 | 116 | _bUseB = false; 117 | _funcType = funcType; 118 | } 119 | 120 | inline void release() { 121 | FreeSpace(&_WL); 122 | FreeSpace(&_gradWL); 123 | FreeSpace(&_eg2WL); 124 | FreeSpace(&_WR); 125 | FreeSpace(&_gradWR); 126 | FreeSpace(&_eg2WR); 127 | FreeSpace(&_b); 128 | FreeSpace(&_gradb); 129 | FreeSpace(&_eg2b); 130 | } 131 | 132 | virtual ~BiLayer() { 133 | // TODO Auto-generated destructor stub 134 | } 135 | 136 | inline dtype squarenormAll() { 137 | dtype result = squarenorm(_gradWL); 138 | result += squarenorm(_gradWR); 139 | if (_bUseB) { 140 | result += squarenorm(_gradb); 141 | } 142 | 143 | return result; 144 | } 145 | 146 | inline void scaleGrad(dtype scale) { 147 | _gradWL = _gradWL * scale; 148 | _gradWR = _gradWR * scale; 149 | if (_bUseB) { 150 | _gradb = _gradb * scale; 151 | } 152 | } 153 | 154 | public: 155 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor y) { 156 | y = dot(xl, _WL.T()); 157 | y += dot(xr, _WR.T()); 158 | if (_bUseB) 159 | y = y + _b; 160 | if (_funcType == 0) 161 | y = F(y); 162 | else if (_funcType == 1) 163 | y = F(y); 164 | else if (_funcType == 3) 165 | y = F(y); 166 | } 167 | 168 | 169 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor y) { 170 | int seq_size = y.size(0); 171 | for(int id = 0; id < seq_size; id++){ 172 | y[id] = dot(xl[id], _WL.T()); 173 | y[id] += dot(xr[id], _WR.T()); 174 | if (_bUseB) 175 | y[id] = y[id] + _b; 176 | if (_funcType == 0) 177 | y[id] = F(y[id]); 178 | else if (_funcType == 1) 179 | y[id] = F(y[id]); 180 | else if (_funcType == 3) 181 | y[id] = F(y[id]); 182 | } 183 | } 184 | 185 | inline void ComputeForwardScore(const std::vector >& xl, const std::vector >& xr, 186 | std::vector > &y) { 187 | int seq_size = y.size(); 188 | for(int id = 0; id < seq_size; id++){ 189 | y[id] = dot(xl[id], _WL.T()); 190 | y[id] += dot(xr[id], _WR.T()); 191 | if (_bUseB) 192 | y[id] = y[id] + _b; 193 | if (_funcType == 0) 194 | y[id] = F(y[id]); 195 | else if (_funcType == 1) 196 | y[id] = F(y[id]); 197 | else if (_funcType == 3) 198 | y[id] = F(y[id]); 199 | } 200 | } 201 | 202 | //please allocate the memory outside here 203 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor y, Tensor ly, 204 | Tensor lxl, Tensor lxr, bool bclear = false) { 205 | //_gradW 206 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 207 | AllocSpace(&deri_yx); 208 | AllocSpace(&cly); 209 | if(bclear){ 210 | lxl = 0.0; 211 | lxr = 0.0; 212 | } 213 | if (_funcType == 0) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 1) { 217 | deri_yx = F(y); 218 | cly = ly * deri_yx; 219 | } else if (_funcType == 3) { 220 | cly = ly * y; 221 | } else { 222 | //cly = ly; 223 | Copy(cly, ly); 224 | } 225 | //_gradW 226 | _gradWL += dot(cly.T(), xl); 227 | _gradWR += dot(cly.T(), xr); 228 | 229 | //_gradb 230 | if (_bUseB) 231 | _gradb += cly; 232 | 233 | //lx 234 | lxl += dot(cly, _WL); 235 | lxr += dot(cly, _WR); 236 | 237 | FreeSpace(&deri_yx); 238 | FreeSpace(&cly); 239 | } 240 | 241 
| 242 | //please allocate the memory outside here 243 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor y, Tensor ly, 244 | Tensor lxl, Tensor lxr, bool bclear = false) { 245 | int seq_size = y.size(0); 246 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 247 | //_gradW 248 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 249 | AllocSpace(&deri_yx); 250 | AllocSpace(&cly); 251 | 252 | if(bclear){ 253 | lxl = 0.0; 254 | lxr = 0.0; 255 | } 256 | for (int id = 0; id < seq_size; id++) { 257 | if (_funcType == 0) { 258 | deri_yx = F(y[id]); 259 | cly = ly[id] * deri_yx; 260 | } else if (_funcType == 1) { 261 | deri_yx = F(y[id]); 262 | cly = ly[id] * deri_yx; 263 | } else if (_funcType == 3) { 264 | cly = ly[id] * y[id]; 265 | } else { 266 | //cly = ly; 267 | Copy(cly, ly[id]); 268 | } 269 | //_gradW 270 | _gradWL += dot(cly.T(), xl[id]); 271 | _gradWR += dot(cly.T(), xr[id]); 272 | 273 | //_gradb 274 | if (_bUseB) 275 | _gradb += cly; 276 | 277 | //lx 278 | lxl[id] += dot(cly, _WL); 279 | lxr[id] += dot(cly, _WR); 280 | } 281 | 282 | FreeSpace(&deri_yx); 283 | FreeSpace(&cly); 284 | } 285 | 286 | inline void ComputeBackwardLoss(const std::vector > &xl, const std::vector > &xr, 287 | const std::vector > &y, const std::vector > &ly, 288 | std::vector > &lxl, std::vector > &lxr, bool bclear = false) { 289 | int seq_size = y.size(); 290 | assert(seq_size > 0); 291 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 292 | //_gradW 293 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 294 | AllocSpace(&deri_yx); 295 | AllocSpace(&cly); 296 | 297 | if(bclear){ 298 | for (int id = 0; id < seq_size; id++) { 299 | lxl[id] = 0.0; 300 | lxr[id] = 0.0; 301 | } 302 | } 303 | for (int id = 0; id < seq_size; id++) { 304 | if (_funcType == 0) { 305 | deri_yx = F(y[id]); 306 | cly = ly[id] * deri_yx; 307 | } else if (_funcType == 1) { 308 | deri_yx = F(y[id]); 309 | cly = ly[id] * deri_yx; 310 | } else if (_funcType == 3) { 311 | cly = ly[id] * y[id]; 312 | } else { 313 | //cly = ly; 314 | Copy(cly, ly[id]); 315 | } 316 | //_gradW 317 | _gradWL += dot(cly.T(), xl[id]); 318 | _gradWR += dot(cly.T(), xr[id]); 319 | 320 | //_gradb 321 | if (_bUseB) 322 | _gradb += cly; 323 | 324 | //lx 325 | lxl[id] += dot(cly, _WL); 326 | lxr[id] += dot(cly, _WR); 327 | } 328 | 329 | FreeSpace(&deri_yx); 330 | FreeSpace(&cly); 331 | } 332 | 333 | inline void randomprint(int num) { 334 | static int nOSize, nLISize, nRISize; 335 | nOSize = _WL.size(0); 336 | nLISize = _WL.size(1); 337 | nRISize = _WR.size(1); 338 | int count = 0; 339 | while (count < num) { 340 | int idxl = rand() % nOSize; 341 | int idyl = rand() % nLISize; 342 | int idxr = rand() % nOSize; 343 | int idyr = rand() % nRISize; 344 | 345 | std::cout << "_WL[" << idxl << "," << idyl << "]=" << _WL[idxl][idyl] << " "; 346 | std::cout << "_WR[" << idxr << "," << idyr << "]=" << _WR[idxr][idyr] << " "; 347 | 348 | if (_bUseB) { 349 | int idz = rand() % nOSize; 350 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 351 | } 352 | count++; 353 | } 354 | 355 | std::cout << std::endl; 356 | } 357 | 358 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 359 | _gradWL = _gradWL + _WL * regularizationWeight; 360 | _eg2WL = _eg2WL + _gradWL * _gradWL; 361 | _WL = _WL - _gradWL * adaAlpha / F(_eg2WL + adaEps); 362 | 363 | _gradWR = _gradWR + _WR * regularizationWeight; 364 | _eg2WR = _eg2WR + _gradWR * _gradWR; 365 | _WR = _WR - _gradWR * adaAlpha / F(_eg2WR + adaEps); 366 | 367 | 
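// Each parameter block follows the same AdaGrad-with-L2 recipe (applied to _WL and _WR
// above, and to _b below when _bUseB is set):
//   g <- g + lambda * w        add the L2 regularization term to the gradient
//   G <- G + g * g             accumulate squared gradients (_eg2WL / _eg2WR / _eg2b)
//   w <- w - alpha * g / sqrt(G + eps)
// where the F(...) call is assumed to be an element-wise square root whose template
// argument was lost in this dump.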
if (_bUseB) { 368 | _gradb = _gradb + _b * regularizationWeight; 369 | _eg2b = _eg2b + _gradb * _gradb; 370 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 371 | } 372 | 373 | clearGrad(); 374 | } 375 | 376 | inline void clearGrad() { 377 | _gradWL = 0; 378 | _gradWR = 0; 379 | if (_bUseB) 380 | _gradb = 0; 381 | } 382 | 383 | void writeModel(LStream &outf) { 384 | SaveBinary(outf, _WL); 385 | SaveBinary(outf, _WR); 386 | SaveBinary(outf, _b); 387 | SaveBinary(outf, _gradWL); 388 | SaveBinary(outf, _gradWR); 389 | SaveBinary(outf, _gradb); 390 | SaveBinary(outf, _eg2WL); 391 | SaveBinary(outf, _eg2WR); 392 | SaveBinary(outf, _eg2b); 393 | 394 | WriteBinary(outf, _bUseB); 395 | WriteBinary(outf, _funcType); 396 | // cout << "Bilayer " << _bUseB << _funcType << endl; 397 | // cout << "Bilayer value: " << _WR[1][1] << endl; 398 | 399 | } 400 | 401 | void loadModel(LStream &inf) { 402 | LoadBinary(inf, &_WL, false); 403 | LoadBinary(inf, &_WR, false); 404 | LoadBinary(inf, &_b, false); 405 | LoadBinary(inf, &_gradWL, false); 406 | LoadBinary(inf, &_gradWR, false); 407 | LoadBinary(inf, &_gradb, false); 408 | LoadBinary(inf, &_eg2WL, false); 409 | LoadBinary(inf, &_eg2WR, false); 410 | LoadBinary(inf, &_eg2b, false); 411 | 412 | ReadBinary(inf, _bUseB); 413 | ReadBinary(inf, _funcType); 414 | // cout << "Bilayer " << _bUseB << _funcType << endl; 415 | // cout << "Bilayer value: " << _WR[1][1] << endl; 416 | } 417 | 418 | }; 419 | 420 | #endif /* SRC_BiLayer_H_ */ 421 | -------------------------------------------------------------------------------- /CheckGrad.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CheckGrad.h 3 | * 4 | * Created on: Dec 4, 2015 5 | * Author: mason 6 | */ 7 | 8 | #ifndef BASIC_CHECKGRAD_H_ 9 | #define BASIC_CHECKGRAD_H_ 10 | 11 | #include 12 | #include "tensor.h" 13 | #include "MyLib.h" 14 | 15 | using namespace nr; 16 | using namespace std; 17 | using namespace mshadow; 18 | using namespace mshadow::expr; 19 | using namespace mshadow::utils; 20 | 21 | template 22 | void checkgrad(Classifier* classifier, const vector& examples, Tensor& Wd, 23 | const Tensor& gradWd, const string& mark, int iter) { 24 | int charseed = mark.length(); 25 | for (int i = 0; i < mark.length(); i++) { 26 | charseed = (int) (mark[i]) * 5 + charseed; 27 | } 28 | srand(iter + charseed); 29 | std::vector idRows, idCols; 30 | idRows.clear(); 31 | idCols.clear(); 32 | for (int i = 0; i < Wd.size(0); ++i) 33 | idRows.push_back(i); 34 | for (int idx = 0; idx < Wd.size(1); idx++) 35 | idCols.push_back(idx); 36 | 37 | random_shuffle(idRows.begin(), idRows.end()); 38 | random_shuffle(idCols.begin(), idCols.end()); 39 | 40 | int check_i = idRows[0], check_j = idCols[0]; 41 | 42 | dtype orginValue = Wd[check_i][check_j]; 43 | 44 | Wd[check_i][check_j] = orginValue + 0.001; 45 | dtype lossAdd = 0.0; 46 | for (int i = 0; i < examples.size(); i++) { 47 | Example oneExam = examples[i]; 48 | lossAdd += classifier->computeScore(oneExam); 49 | } 50 | 51 | Wd[check_i][check_j] = orginValue - 0.001; 52 | dtype lossPlus = 0.0; 53 | for (int i = 0; i < examples.size(); i++) { 54 | Example oneExam = examples[i]; 55 | lossPlus += classifier->computeScore(oneExam); 56 | } 57 | 58 | dtype mockGrad = (lossAdd - lossPlus) / 0.002; 59 | mockGrad = mockGrad / examples.size(); 60 | dtype computeGrad = gradWd[check_i][check_j]; 61 | 62 | printf("Iteration %d, Checking gradient for %s[%d][%d]:\t", iter, 63 | mark.c_str(), check_i, check_j); 64 | printf("mock grad = %.18f, 
computed grad = %.18f\n", mockGrad, computeGrad); 65 | 66 | Wd[check_i][check_j] = orginValue; 67 | } 68 | 69 | template 70 | void checkgrad(Classifier* classifier, const vector& examples, Tensor& Wd, 71 | const Tensor& gradWd, const string& mark, int iter, 72 | const hash_set& indexes, bool bRow = true) { 73 | if (indexes.size() == 0) 74 | return; 75 | int charseed = mark.length(); 76 | for (int i = 0; i < mark.length(); i++) { 77 | charseed = (int) (mark[i]) * 5 + charseed; 78 | } 79 | srand(iter + charseed); 80 | std::vector idRows, idCols; 81 | idRows.clear(); 82 | idCols.clear(); 83 | static hash_set::iterator it; 84 | if (bRow) { 85 | for (it = indexes.begin(); it != indexes.end(); ++it) 86 | idRows.push_back(*it); 87 | for (int idx = 0; idx < Wd.size(1); idx++) 88 | idCols.push_back(idx); 89 | } else { 90 | for (it = indexes.begin(); it != indexes.end(); ++it) 91 | idCols.push_back(*it); 92 | for (int idx = 0; idx < Wd.size(0); idx++) 93 | idRows.push_back(idx); 94 | } 95 | 96 | random_shuffle(idRows.begin(), idRows.end()); 97 | random_shuffle(idCols.begin(), idCols.end()); 98 | 99 | int check_i = idRows[0], check_j = idCols[0]; 100 | 101 | dtype orginValue = Wd[check_i][check_j]; 102 | 103 | Wd[check_i][check_j] = orginValue + 0.001; 104 | dtype lossAdd = 0.0; 105 | for (int i = 0; i < examples.size(); i++) { 106 | Example oneExam = examples[i]; 107 | lossAdd += classifier->computeScore(oneExam); 108 | } 109 | 110 | Wd[check_i][check_j] = orginValue - 0.001; 111 | dtype lossPlus = 0.0; 112 | for (int i = 0; i < examples.size(); i++) { 113 | Example oneExam = examples[i]; 114 | lossPlus += classifier->computeScore(oneExam); 115 | } 116 | 117 | dtype mockGrad = (lossAdd - lossPlus) / 0.002; 118 | mockGrad = mockGrad / examples.size(); 119 | dtype computeGrad = gradWd[check_i][check_j]; 120 | 121 | printf("Iteration %d, Checking gradient for %s[%d][%d]:\t", iter, 122 | mark.c_str(), check_i, check_j); 123 | printf("mock grad = %.18f, computed grad = %.18f\n", mockGrad, computeGrad); 124 | 125 | Wd[check_i][check_j] = orginValue; 126 | 127 | } 128 | 129 | 130 | #endif /* BASIC_CHECKGRAD_H_ */ 131 | -------------------------------------------------------------------------------- /Dropout.h: -------------------------------------------------------------------------------- 1 | #ifndef DROPOUT 2 | #define DROPOUT 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | 7 | 8 | using namespace std; 9 | using namespace mshadow; 10 | using namespace mshadow::expr; 11 | using namespace mshadow::utils; 12 | 13 | 14 | template 15 | inline void dropoutcol(Tensor w, dtype dropOut) 16 | { 17 | w = 1.0; 18 | std::vector indexes; 19 | for (int i = 0; i < w.size(1); ++i) 20 | indexes.push_back(i); 21 | int dropNum = (int) (w.size(1) * dropOut); 22 | 23 | for(int idx = 0; idx < w.size(0); idx++) 24 | { 25 | random_shuffle(indexes.begin(), indexes.end()); 26 | for(int idy = 0; idy < dropNum; idy++) 27 | { 28 | w[idx][indexes[idy]] = 0.0; 29 | } 30 | } 31 | } 32 | 33 | 34 | template 35 | inline void dropoutrow(Tensor w, dtype dropOut) 36 | { 37 | w = 1.0; 38 | std::vector indexes; 39 | for (int i = 0; i < w.size(0); ++i) 40 | indexes.push_back(i); 41 | int dropNum = (int) (w.size(0) * dropOut); 42 | 43 | for(int idx = 0; idx < w.size(1); idx++) 44 | { 45 | random_shuffle(indexes.begin(), indexes.end()); 46 | for(int idy = 0; idy < dropNum; idy++) 47 | { 48 | w[indexes[idy]][idx] = 0.0; 49 | } 50 | } 51 | } 52 | 53 | 54 | #endif 55 | 
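// Usage sketch for the dropout helpers above (not part of the original header).
// Assumptions: the angle-bracketed template arguments (e.g. Tensor<xpu, 2, dtype>,
// NewTensor<xpu>) were stripped from this dump and are restored here by analogy with
// mshadow; applyColDropout and the 0.2 rate are illustrative names/values only.
template<typename xpu>
inline void applyColDropout(Tensor<xpu, 2, dtype> rep, dtype dropRate) {
  Tensor<xpu, 2, dtype> mask = NewTensor<xpu>(Shape2(rep.size(0), rep.size(1)), d_one);
  dropoutcol(mask, dropRate);   // zero a random dropRate fraction of the columns in each row
  rep = rep * mask;             // element-wise product, as used throughout this library
  FreeSpace(&mask);
}
// e.g. applyColDropout(wordRepresentation, 0.2), during training only.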
-------------------------------------------------------------------------------- /GRNN.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GRNN.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_GRNN_H_ 9 | #define SRC_GRNN_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | #include "MyLib.h" 14 | #include "Utiltensor.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | 21 | template 22 | class GRNN { 23 | public: 24 | BiLayer _rnn_update; 25 | BiLayer _rnn_reset; 26 | BiLayer _rnn; 27 | bool _left2right; 28 | 29 | Tensor _null, _nullLoss; 30 | 31 | public: 32 | GRNN() { 33 | } 34 | 35 | inline void initial(int outputsize, int inputsize, int seed = 0) { 36 | _left2right = true; 37 | 38 | _rnn_update.initial(outputsize, outputsize, inputsize, true, seed, 1); 39 | _rnn_reset.initial(outputsize, outputsize, inputsize, true, seed + 10, 1); 40 | _rnn.initial(outputsize, outputsize, inputsize, true, seed + 20, 0); 41 | 42 | _null = NewTensor(Shape2(1, outputsize), d_zero); 43 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 44 | 45 | } 46 | 47 | inline void initial(int outputsize, int inputsize, bool left2right, int seed = 0) { 48 | _left2right = left2right; 49 | 50 | _rnn_update.initial(outputsize, outputsize, inputsize, true, seed, 1); 51 | _rnn_reset.initial(outputsize, outputsize, inputsize, true, seed + 10, 1); 52 | _rnn.initial(outputsize, outputsize, inputsize, true, seed + 20, 0); 53 | 54 | _null = NewTensor(Shape2(1, outputsize), d_zero); 55 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 56 | 57 | } 58 | 59 | inline void initial(Tensor WL, Tensor WR, Tensor b, Tensor uWL, Tensor uWR, 60 | Tensor ub, Tensor rWL, Tensor rWR, Tensor rb, bool left2right = true) { 61 | _left2right = left2right; 62 | 63 | _rnn_update.initial(uWL, uWR, ub, true, 1); 64 | _rnn_reset.initial(rWL, rWR, rb, true, 1); 65 | _rnn.initial(WL, WR, b, true); 66 | 67 | _null = NewTensor(Shape2(1, b.size(1)), d_zero); 68 | _nullLoss = NewTensor(Shape2(1, b.size(1)), d_zero); 69 | } 70 | 71 | inline void release() { 72 | _rnn_update.release(); 73 | _rnn_reset.release(); 74 | _rnn.release(); 75 | 76 | FreeSpace(&_null); 77 | FreeSpace(&_nullLoss); 78 | } 79 | 80 | virtual ~GRNN() { 81 | // TODO Auto-generated destructor stub 82 | } 83 | 84 | inline dtype squarenormAll() { 85 | dtype norm = _rnn_update.squarenormAll(); 86 | norm += _rnn_reset.squarenormAll(); 87 | norm += _rnn.squarenormAll(); 88 | 89 | return norm; 90 | } 91 | 92 | inline void scaleGrad(dtype scale) { 93 | _rnn_update.scaleGrad(scale); 94 | _rnn_reset.scaleGrad(scale); 95 | _rnn.scaleGrad(scale); 96 | } 97 | 98 | public: 99 | 100 | inline void ComputeForwardScore(Tensor x, Tensor mry, Tensor ry, Tensor uy, 101 | Tensor cy, Tensor y) { 102 | mry = 0.0; 103 | ry = 0.0; 104 | uy = 0.0; 105 | cy = 0.0; 106 | y = 0.0; 107 | int seq_size = x.size(0); 108 | if (seq_size == 0) 109 | return; 110 | 111 | if (_left2right) { 112 | for (int idx = 0; idx < seq_size; idx++) { 113 | if (idx == 0) { 114 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 115 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 116 | y[idx] = uy[idx] * cy[idx]; 117 | } else { 118 | _rnn_reset.ComputeForwardScore(y[idx - 1], x[idx], mry[idx]); 119 | ry[idx] = mry[idx] * y[idx - 1]; 120 | _rnn_update.ComputeForwardScore(y[idx - 1], x[idx], uy[idx]); 121 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 122 | 
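// GRU-style interpolation: with update gate uy[idx] (z_t) and candidate state cy[idx]
// (computed from the reset-gated history ry[idx] = mry[idx] * y[idx-1]), the new
// hidden state is y_t = (1 - z_t) * y_{t-1} + z_t * c_t, element-wise: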
y[idx] = (1.0 - uy[idx]) * y[idx - 1] + uy[idx] * cy[idx]; 123 | } 124 | } 125 | } else { 126 | for (int idx = seq_size - 1; idx >= 0; idx--) { 127 | if (idx == seq_size - 1) { 128 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 129 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 130 | y[idx] = uy[idx] * cy[idx]; 131 | } else { 132 | _rnn_reset.ComputeForwardScore(y[idx + 1], x[idx], mry[idx]); 133 | ry[idx] = mry[idx] * y[idx + 1]; 134 | _rnn_update.ComputeForwardScore(y[idx + 1], x[idx], uy[idx]); 135 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 136 | y[idx] = (1.0 - uy[idx]) * y[idx + 1] + uy[idx] * cy[idx]; 137 | } 138 | } 139 | } 140 | } 141 | 142 | inline void ComputeForwardScore(const vector > &x, vector > &mry, vector > &ry, 143 | vector > &uy, vector > &cy, vector > &y) { 144 | assign(mry, 0.0); 145 | assign(ry, 0.0); 146 | assign(uy, 0.0); 147 | assign(cy, 0.0); 148 | assign(y, 0.0); 149 | int seq_size = x.size(); 150 | if (seq_size == 0) 151 | return; 152 | 153 | if (_left2right) { 154 | for (int idx = 0; idx < seq_size; idx++) { 155 | if (idx == 0) { 156 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 157 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 158 | y[idx] = uy[idx] * cy[idx]; 159 | } else { 160 | _rnn_reset.ComputeForwardScore(y[idx - 1], x[idx], mry[idx]); 161 | ry[idx] = mry[idx] * y[idx - 1]; 162 | _rnn_update.ComputeForwardScore(y[idx - 1], x[idx], uy[idx]); 163 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 164 | y[idx] = (1.0 - uy[idx]) * y[idx - 1] + uy[idx] * cy[idx]; 165 | } 166 | } 167 | } else { 168 | for (int idx = seq_size - 1; idx >= 0; idx--) { 169 | if (idx == seq_size - 1) { 170 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 171 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 172 | y[idx] = uy[idx] * cy[idx]; 173 | } else { 174 | _rnn_reset.ComputeForwardScore(y[idx + 1], x[idx], mry[idx]); 175 | ry[idx] = mry[idx] * y[idx + 1]; 176 | _rnn_update.ComputeForwardScore(y[idx + 1], x[idx], uy[idx]); 177 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 178 | y[idx] = (1.0 - uy[idx]) * y[idx + 1] + uy[idx] * cy[idx]; 179 | } 180 | } 181 | } 182 | } 183 | 184 | 185 | // This function is used for computing hidden values incrementally at the start position 186 | // It is applied only when the sequential inputs are not fixed in advance, 187 | // which can vary during decoding. 188 | // We need not provide a backward function, since during backward, inputs will be given. 189 | inline void ComputeForwardScoreIncremental(Tensor x, Tensor mry, Tensor ry, 190 | Tensor uy, Tensor cy, Tensor y) { 191 | assert(_left2right); 192 | _rnn_update.ComputeForwardScore(_null, x, uy); 193 | _rnn.ComputeForwardScore(_null, x, cy); 194 | y = uy * cy; 195 | } 196 | 197 | 198 | // This function is used for computing hidden values incrementally at the non-start position 199 | // It is applied only when the sequential inputs are not fixed in advance, 200 | // which can vary during decoding. 201 | // We need not provide a backward function, since during backward, inputs will be given. 
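// A decoding loop would typically call the start-position overload once for the first
// input and then the overload below for each following input, feeding the previous
// hidden state back in; an illustrative (non-original) sketch:
//   rnn.ComputeForwardScoreIncremental(x[0], mry[0], ry[0], uy[0], cy[0], y[0]);
//   for (int t = 1; t < n; t++)
//     rnn.ComputeForwardScoreIncremental(y[t-1], x[t], mry[t], ry[t], uy[t], cy[t], y[t]);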
202 | inline void ComputeForwardScoreIncremental(Tensor py, Tensor x, Tensor mry, Tensor ry, 203 | Tensor uy, Tensor cy, Tensor y) { 204 | assert(_left2right); 205 | _rnn_reset.ComputeForwardScore(py, x, mry); 206 | ry = mry * py; 207 | _rnn_update.ComputeForwardScore(py, x, uy); 208 | _rnn.ComputeForwardScore(ry, x, cy); 209 | y = (1.0 - uy) * py + uy * cy; 210 | } 211 | 212 | //please allocate the memory outside here 213 | inline void ComputeBackwardLoss(Tensor x, Tensor mry, Tensor ry, Tensor uy, 214 | Tensor cy, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 215 | int seq_size = x.size(0); 216 | if (seq_size == 0) 217 | return; 218 | 219 | if (bclear) 220 | lx = 0.0; 221 | //left rnn 222 | Tensor lfy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 223 | Tensor luy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 224 | Tensor lcy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 225 | Tensor lry = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 226 | Tensor lmry = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 227 | 228 | if (_left2right) { 229 | for (int idx = seq_size - 1; idx >= 0; idx--) { 230 | if (idx < seq_size - 1) 231 | ly[idx] = ly[idx] + lfy[idx]; 232 | 233 | if (idx == 0) { 234 | luy[idx] = ly[idx] * cy[idx]; 235 | lcy[idx] = ly[idx] * uy[idx]; 236 | 237 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 238 | 239 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 240 | } else { 241 | luy[idx] = ly[idx] * (cy[idx] - y[idx - 1]); 242 | lfy[idx - 1] = ly[idx] * (1.0 - uy[idx]); 243 | lcy[idx] = ly[idx] * uy[idx]; 244 | 245 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 246 | _rnn_update.ComputeBackwardLoss(y[idx - 1], x[idx], uy[idx], luy[idx], lfy[idx - 1], lx[idx]); 247 | 248 | lmry[idx] = lry[idx] * y[idx - 1]; 249 | lfy[idx - 1] += lry[idx] * mry[idx]; 250 | 251 | _rnn_reset.ComputeBackwardLoss(y[idx - 1], x[idx], mry[idx], lmry[idx], lfy[idx - 1], lx[idx]); 252 | } 253 | } 254 | } else { 255 | // right rnn 256 | for (int idx = 0; idx < seq_size; idx++) { 257 | if (idx > 0) 258 | ly[idx] = ly[idx] + lfy[idx]; 259 | 260 | if (idx == seq_size - 1) { 261 | luy[idx] = ly[idx] * cy[idx]; 262 | lcy[idx] = ly[idx] * uy[idx]; 263 | 264 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 265 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 266 | } else { 267 | luy[idx] = ly[idx] * (cy[idx] - y[idx + 1]); 268 | lfy[idx + 1] = ly[idx] * (1.0 - uy[idx]); 269 | lcy[idx] = ly[idx] * uy[idx]; 270 | 271 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 272 | _rnn_update.ComputeBackwardLoss(y[idx + 1], x[idx], uy[idx], luy[idx], lfy[idx + 1], lx[idx]); 273 | 274 | lmry[idx] = lry[idx] * y[idx + 1]; 275 | lfy[idx + 1] += lry[idx] * mry[idx]; 276 | 277 | _rnn_reset.ComputeBackwardLoss(y[idx + 1], x[idx], mry[idx], lmry[idx], lfy[idx + 1], lx[idx]); 278 | } 279 | } 280 | } 281 | 282 | FreeSpace(&lfy); 283 | FreeSpace(&luy); 284 | FreeSpace(&lcy); 285 | FreeSpace(&lry); 286 | FreeSpace(&lmry); 287 | } 288 | 289 | //please allocate the memory outside here 290 | inline void ComputeBackwardLoss(const vector > &x, const vector > &mry, const vector > &ry, 291 | const vector > &uy, const vector > &cy, const vector > &y, 292 | vector > &ly, vector > &lx, bool bclear = false) { 293 | int seq_size = x.size(); 294 | if (seq_size 
== 0) 295 | return; 296 | 297 | if (bclear) 298 | assign(lx, 0.0); 299 | 300 | vector > lfy(seq_size), lcy(seq_size), luy(seq_size), lry(seq_size), lmry(seq_size); 301 | for (int idx = 0; idx < seq_size; idx++) { 302 | lfy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 303 | lcy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 304 | luy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 305 | lry[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 306 | lmry[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 307 | } 308 | 309 | if (_left2right) { 310 | for (int idx = seq_size - 1; idx >= 0; idx--) { 311 | if (idx < seq_size - 1) 312 | ly[idx] = ly[idx] + lfy[idx]; 313 | 314 | if (idx == 0) { 315 | luy[idx] = ly[idx] * cy[idx]; 316 | lcy[idx] = ly[idx] * uy[idx]; 317 | 318 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 319 | 320 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 321 | } else { 322 | luy[idx] = ly[idx] * (cy[idx] - y[idx - 1]); 323 | lfy[idx - 1] = ly[idx] * (1.0 - uy[idx]); 324 | lcy[idx] = ly[idx] * uy[idx]; 325 | 326 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 327 | _rnn_update.ComputeBackwardLoss(y[idx - 1], x[idx], uy[idx], luy[idx], lfy[idx - 1], lx[idx]); 328 | 329 | lmry[idx] = lry[idx] * y[idx - 1]; 330 | lfy[idx - 1] += lry[idx] * mry[idx]; 331 | 332 | _rnn_reset.ComputeBackwardLoss(y[idx - 1], x[idx], mry[idx], lmry[idx], lfy[idx - 1], lx[idx]); 333 | } 334 | } 335 | } else { 336 | // right rnn 337 | for (int idx = 0; idx < seq_size; idx++) { 338 | if (idx > 0) 339 | ly[idx] = ly[idx] + lfy[idx]; 340 | 341 | if (idx == seq_size - 1) { 342 | luy[idx] = ly[idx] * cy[idx]; 343 | lcy[idx] = ly[idx] * uy[idx]; 344 | 345 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 346 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 347 | } else { 348 | luy[idx] = ly[idx] * (cy[idx] - y[idx + 1]); 349 | lfy[idx + 1] = ly[idx] * (1.0 - uy[idx]); 350 | lcy[idx] = ly[idx] * uy[idx]; 351 | 352 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 353 | _rnn_update.ComputeBackwardLoss(y[idx + 1], x[idx], uy[idx], luy[idx], lfy[idx + 1], lx[idx]); 354 | 355 | lmry[idx] = lry[idx] * y[idx + 1]; 356 | lfy[idx + 1] += lry[idx] * mry[idx]; 357 | 358 | _rnn_reset.ComputeBackwardLoss(y[idx + 1], x[idx], mry[idx], lmry[idx], lfy[idx + 1], lx[idx]); 359 | } 360 | } 361 | } 362 | 363 | for (int idx = 0; idx < seq_size; idx++) { 364 | FreeSpace(&(lfy[idx])); 365 | FreeSpace(&(lcy[idx])); 366 | FreeSpace(&(luy[idx])); 367 | FreeSpace(&(lry[idx])); 368 | FreeSpace(&(lmry[idx])); 369 | } 370 | } 371 | 372 | inline void randomprint(int num) { 373 | _rnn_update.randomprint(num); 374 | _rnn_reset.randomprint(num); 375 | _rnn.randomprint(num); 376 | } 377 | 378 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 379 | _rnn_update.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 380 | _rnn_reset.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 381 | _rnn.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 382 | } 383 | 384 | void writeModel(LStream &outf) { 385 | _rnn_update.writeModel(outf); 386 | _rnn_reset.writeModel(outf); 387 | _rnn.writeModel(outf); 388 | 389 | WriteBinary(outf, _left2right); 390 | 391 | SaveBinary(outf, _null); 392 | SaveBinary(outf, _nullLoss); 393 | } 
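// Rough usage sketch for this gated recurrent layer (variable names here are
// illustrative and not part of this file; all tensors are assumed to be allocated
// by the caller, per the "please allocate the memory outside here" convention used above):
//   grnn.ComputeForwardScore(x, mry, ry, uy, cy, y);          // full-sequence forward defined earlier in this file (presumably mirroring the backward argument list)
//   // ... compute the output loss ly, e.g. with softmax_loss() from SoftMaxLoss.h ...
//   grnn.ComputeBackwardLoss(x, mry, ry, uy, cy, y, ly, lx);  // gradients w.r.t. the inputs; parameter gradients accumulate inside the sub-layers
//   grnn.updateAdaGrad(lambda, alpha, eps);                   // AdaGrad step on _rnn, _rnn_update and _rnn_reset
// writeModel/loadModel serialize the three sub-layers together with _left2right, _null and _nullLoss.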
394 | 395 | void loadModel(LStream &inf) { 396 | _rnn_update.loadModel(inf); 397 | _rnn_reset.loadModel(inf); 398 | _rnn.loadModel(inf); 399 | 400 | ReadBinary(inf, _left2right); 401 | 402 | LoadBinary(inf, &_null, false); 403 | LoadBinary(inf, &_nullLoss, false); 404 | } 405 | }; 406 | 407 | #endif /* SRC_GRNN_H_ */ 408 | -------------------------------------------------------------------------------- /GatedPooling.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GatedPooling.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_GatedPooling_H_ 9 | #define SRC_GatedPooling_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | #include "Pooling.h" 14 | #include "UniLayer.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | // For simpleness, we do not provide pooling on specified words, 21 | // which has been implemented in Pooling.h 22 | 23 | 24 | template 25 | class GatedPooling { 26 | 27 | public: 28 | UniLayer _uni_gates; 29 | 30 | public: 31 | GatedPooling() { 32 | } 33 | 34 | inline void initial(int hiddenSize, int seed = 0) { 35 | _uni_gates.initial(hiddenSize, hiddenSize, false, seed, 3); 36 | } 37 | 38 | inline void initial(Tensor W) { 39 | _uni_gates.initial(W, 3); 40 | } 41 | 42 | 43 | inline void release() { 44 | _uni_gates.release(); 45 | } 46 | 47 | virtual ~GatedPooling() { 48 | // TODO Auto-generated destructor stub 49 | } 50 | 51 | inline dtype squarenormAll() { 52 | return _uni_gates.squarenormAll(); 53 | } 54 | 55 | inline void scaleGrad(dtype scale) { 56 | _uni_gates.scaleGrad(scale); 57 | } 58 | 59 | public: 60 | // xExp, xSumIndex, xSum ad xPoolIndex are temporal variables, which reduce computation in back-propagation 61 | inline void ComputeForwardScore(Tensor x, Tensor xExp, 62 | Tensor xSum, Tensor xPoolIndex, Tensor y) { 63 | y = 0.0; 64 | int seq_size = x.size(0); 65 | if(seq_size == 0) return; 66 | int dim1 = x.size(1), dim2 = x.size(2); 67 | int odim1 = y.size(0), odim2 = y.size(1); 68 | 69 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 70 | std::cerr << "GatedPooling Forward error: dim invalid" << std::endl; 71 | } 72 | 73 | _uni_gates.ComputeForwardScore(x, xExp); 74 | 75 | sumpool_forward(xExp, xSum); 76 | for (int idx = 0; idx < seq_size; idx++) { 77 | xPoolIndex[idx] = xExp[idx] / xSum; 78 | } 79 | for (int idx = 0; idx < seq_size; idx++) { 80 | y += x[idx] * xPoolIndex[idx]; 81 | } 82 | } 83 | 84 | inline void ComputeForwardScore(const std::vector >& x, std::vector >& xExp, 85 | Tensor xSum, std::vector >& xPoolIndex, Tensor y) { 86 | y = 0.0; 87 | int seq_size = x.size(); 88 | if(seq_size == 0) return; 89 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 90 | int odim1 = y.size(0), odim2 = y.size(1); 91 | 92 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 93 | std::cerr << "GatedPooling Forward error: dim invalid" << std::endl; 94 | } 95 | 96 | _uni_gates.ComputeForwardScore(x, xExp); 97 | 98 | sumpool_forward(xExp, xSum); 99 | for (int idx = 0; idx < seq_size; idx++) { 100 | xPoolIndex[idx] = xExp[idx] / xSum; 101 | } 102 | for (int idx = 0; idx < seq_size; idx++) { 103 | y += x[idx] * xPoolIndex[idx]; 104 | } 105 | } 106 | 107 | 108 | //please allocate the memory outside here 109 | inline void ComputeBackwardLoss(Tensor x, Tensor xExp, 110 | Tensor xSum, Tensor xPoolIndex, Tensor y, 111 | Tensor ly, Tensor lx, bool bclear = false) { 112 | int seq_size = x.size(0); 
113 | if(seq_size == 0) return; 114 | int dim1 = x.size(1), dim2 = x.size(2); 115 | int odim1 = y.size(0), odim2 = y.size(1); 116 | 117 | if(bclear) lx = 0.0; 118 | 119 | Tensor xExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 120 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 121 | Tensor xPoolIndexLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 122 | 123 | for (int idx = 0; idx < seq_size; idx++) { 124 | xPoolIndexLoss[idx] = ly * x[idx]; 125 | lx[idx] += ly * xPoolIndex[idx]; 126 | } 127 | 128 | for (int idx = 0; idx < seq_size; idx++) { 129 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 130 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 131 | } 132 | 133 | sumpool_backward(xSumLoss, xExpLoss); 134 | 135 | _uni_gates.ComputeBackwardLoss(x, xExp, xExpLoss, lx); 136 | 137 | FreeSpace(&xExpLoss); 138 | FreeSpace(&xSumLoss); 139 | FreeSpace(&xPoolIndexLoss); 140 | } 141 | 142 | inline void ComputeBackwardLoss(const std::vector >& x, std::vector >& xExp, 143 | Tensor xSum, std::vector >& xPoolIndex, Tensor y, 144 | Tensor ly, std::vector >& lx, bool bclear = false) { 145 | int seq_size = x.size(); 146 | if(seq_size == 0) return; 147 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 148 | int odim1 = y.size(0), odim2 = y.size(1); 149 | 150 | 151 | if(bclear){ 152 | for (int idx = 0; idx < seq_size; idx++) { 153 | lx[idx] = 0.0; 154 | } 155 | } 156 | 157 | vector > xExpLoss(seq_size), xPoolIndexLoss(seq_size); 158 | for (int idx = 0; idx < seq_size; idx++) { 159 | xExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 160 | xPoolIndexLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 161 | } 162 | 163 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 164 | 165 | for (int idx = 0; idx < seq_size; idx++) { 166 | xPoolIndexLoss[idx] = ly * x[idx]; 167 | lx[idx] += ly * xPoolIndex[idx]; 168 | } 169 | 170 | for (int idx = 0; idx < seq_size; idx++) { 171 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 172 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 173 | } 174 | 175 | sumpool_backward(xSumLoss, xExpLoss); 176 | 177 | _uni_gates.ComputeBackwardLoss(x, xExp, xExpLoss, lx); 178 | 179 | FreeSpace(&xSumLoss); 180 | for (int idx = 0; idx < seq_size; idx++) { 181 | FreeSpace(&(xExpLoss[idx])); 182 | FreeSpace(&(xPoolIndexLoss[idx])); 183 | } 184 | } 185 | 186 | inline void randomprint(int num) { 187 | _uni_gates.randomprint(num); 188 | } 189 | 190 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 191 | _uni_gates.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 192 | } 193 | 194 | void writeModel(LStream &outf) { 195 | _uni_gates.writeModel(outf); 196 | 197 | } 198 | 199 | void loadModel(LStream &inf) { 200 | _uni_gates.loadModel(inf); 201 | } 202 | 203 | }; 204 | 205 | #endif /* SRC_GatedPooling_H_ */ 206 | -------------------------------------------------------------------------------- /Hash_map.hpp: -------------------------------------------------------------------------------- 1 | //========================================================= 2 | // @Modify: Chen Xin (xchen@ir.hit.edu.cn) 3 | // @Date: 2011/03/11 4 | // @Brief: Change default hash func to BKDR. 5 | //========================================================== 6 | 7 | 8 | /************************************************************ 9 | unsigned int BKDRHash(const std::string& str) 10 | { 11 | unsigned int seed = 131; // 31 131 1313 13131 131313 etc.. 
12 | unsigned int hash = 0; 13 | for(std::size_t i = 0; i < str.length(); i++) 14 | { 15 | hash = (hash * seed) + str[i]; 16 | } 17 | return hash; 18 | } 19 | *************************************************************/ 20 | 21 | /* 22 | * vi:ts=4:tw=78:shiftwidth=4:expandtab 23 | * vim600:fdm=marker 24 | * 25 | * hash_map.hpp - wrapper header as a workaround for several different ways 26 | * of using hash_map/hash_set since this is not ISO standard. 27 | * 28 | * After inclusion of this file hash and hash_map are exported into the global 29 | * namespace. 30 | * 31 | * Copyright (C) 2004 by Zhang Le 32 | * Begin : 26-Jun-2004 33 | * Last Change : 25-Dec-2004. 34 | * 35 | * Permission is hereby granted, free of charge, to any person obtaining a 36 | * copy of this software and associated documentation files (the "Software"), 37 | * to deal in the Software without restriction, including without limitation 38 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 39 | * and/or sell copies of the Software, and to permit persons to whom the 40 | * Software is furnished to do so, subject to the following conditions: 41 | * 42 | * The above copyright notice and this permission notice shall be included in 43 | * all copies or substantial portions of the Software. 44 | * 45 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 46 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 47 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 48 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 49 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 50 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 51 | * DEALINGS IN THE SOFTWARE. 52 | */ 53 | 54 | #ifndef HASH_MAP_HPP 55 | #define HASH_MAP_HPP 56 | 57 | #ifdef HAVE_CONFIG_H 58 | #include 59 | #endif 60 | 61 | #include 62 | 63 | #if defined(_STLPORT_VERSION) 64 | #include 65 | #include 66 | using std::hash; 67 | using std::hash_map; 68 | using hash_set; 69 | #else // not using STLPORT 70 | 71 | #ifdef __GNUC__ 72 | #if __GNUC__ >= 3 73 | #include 74 | #include 75 | namespace __gnu_cxx { 76 | template <> 77 | struct hash { 78 | size_t operator()(const std::string& s) const { 79 | unsigned int _seed = 131; // 31 131 1313 13131 131313 etc.. 80 | unsigned int _hash = 0; 81 | for(std::size_t i = 0; i < s.size(); i++) 82 | { 83 | _hash = (_hash * _seed) + s[i]; 84 | } 85 | return size_t(_hash); 86 | } 87 | }; 88 | }; 89 | using __gnu_cxx::hash_map; 90 | using __gnu_cxx::hash_set; 91 | using __gnu_cxx::hash; 92 | #else // GCC 2.x 93 | #include 94 | #include 95 | namespace std { 96 | struct hash { 97 | size_t operator()(const std::string& s) const { 98 | unsigned int _seed = 131; // 31 131 1313 13131 131313 etc.. 99 | unsigned int _hash = 0; 100 | for(std::size_t i = 0; i < s.size(); i++) 101 | { 102 | _hash = (_hash * _seed) + s[i]; 103 | } 104 | return size_t(_hash); 105 | } 106 | }; 107 | }; 108 | using std::hash_map; 109 | using hash_set; 110 | using std::hash; 111 | #endif // end GCC >= 3 112 | #elif defined(_MSC_VER) && ((_MSC_VER >= 1300) || defined(__INTEL_COMPILER)) 113 | // we only support MSVC7+ and Intel C++ 8.0 114 | #include 115 | #include 116 | namespace stdext { 117 | inline size_t hash_value(const std::string& s) { 118 | unsigned int _seed = 131; // 31 131 1313 13131 131313 etc.. 
119 | unsigned int _hash = 0; 120 | for(std::size_t i = 0; i < s.size(); i++) 121 | { 122 | _hash = (_hash * _seed) + s[i]; 123 | } 124 | return size_t(_hash); 125 | } 126 | } 127 | using stdext::hash_map; // _MSC_EXTENSIONS, though DEPRECATED 128 | using stdext::hash_set; 129 | #else 130 | #error unknown compiler 131 | #endif //GCC or MSVC7+ 132 | #endif // end STLPORT 133 | 134 | #endif /* ifndef HASH_MAP_HPP */ 135 | 136 | -------------------------------------------------------------------------------- /IO.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file IO.h 3 | * \brief definitions of I/O functions for LibN3L 4 | * \author Jie 5 | */ 6 | #ifndef LIBN3L_IO_H_ 7 | #define LIBN3L_IO_H_ 8 | #include 9 | #include "tensor.h" 10 | #include "io.h" 11 | #include "Utiltensor.h" 12 | #include "Utils.h" 13 | 14 | 15 | class LStream : public IStream { 16 | public: 17 | FILE *fp_; 18 | size_t sz_; 19 | 20 | public: 21 | LStream(const string &fname, const char *mode) { 22 | const char *newname = &fname[0]; 23 | Open(newname, mode); 24 | } 25 | void Open(const char *fname, const char *mode) { 26 | fp_ = FopenCheck(fname, mode); 27 | fseek(fp_, 0L, SEEK_END); 28 | sz_ = ftell(fp_); 29 | fseek(fp_, 0L, SEEK_SET); 30 | } 31 | size_t Read(void *ptr, size_t size) { 32 | return fread(ptr, size, 1, fp_); 33 | } 34 | void Write(const void *ptr, size_t size) { 35 | fwrite(ptr, size, 1, fp_); 36 | } 37 | 38 | // size_t StringRead(string &sentence) { 39 | // char buff[100]; 40 | // return fread(ptr, size, 1, fp_); 41 | // } 42 | // void StringWrite(const string &sentence) { 43 | // fputs(sentence,fp_); 44 | // } 45 | 46 | inline void Close(void) { 47 | if (fp_ != NULL){ 48 | fclose(fp_); fp_ = NULL; 49 | } 50 | } 51 | inline size_t Size() { 52 | return sz_; 53 | } 54 | virtual ~LStream(void) { 55 | this->Close(); 56 | } 57 | 58 | inline std::FILE *FopenCheck(const char *fname, const char *flag) { 59 | std::FILE *fp = fopen(fname, flag); 60 | Check(fp != NULL, "can not open file \"%s\"\n", fname); 61 | return fp; 62 | } 63 | 64 | 65 | }; 66 | 67 | 68 | template 69 | inline void WriteBinary(TStream &fo, const DType &target) { 70 | fo.Write(&target, sizeof(target)); 71 | } 72 | 73 | template 74 | inline void ReadBinary(TStream &fo, DType &target) { 75 | fo.Read(&target, sizeof(DType)); 76 | } 77 | 78 | 79 | 80 | template 81 | inline void WriteString(TStream &fo, const string &target) { 82 | int string_size = target.size(); 83 | fo.Write(&string_size, sizeof(string_size)); 84 | if (string_size > 0) { 85 | int char_size = sizeof(target[0]); 86 | fo.Write(&char_size, sizeof(char_size)); 87 | for (int idx = 0; idx < string_size; idx++) { 88 | fo.Write(&target[idx], sizeof(target[idx])); 89 | } 90 | } 91 | } 92 | 93 | template 94 | inline void ReadString(TStream &fo, string &target) { 95 | int string_size; 96 | fo.Read(&string_size, sizeof(int)); 97 | if (string_size > 0) { 98 | int char_size; 99 | fo.Read(&char_size, sizeof(int)); 100 | char character[string_size]; 101 | for (int idx = 0; idx < string_size; idx++) { 102 | fo.Read(&character[idx], char_size); 103 | } 104 | target = string(character, string_size); 105 | assert(target.size()==string_size); 106 | } 107 | } 108 | 109 | 110 | template 111 | inline void WriteVector(TStream &fo, vector &target) { 112 | int vector_size = target.size(); 113 | fo.Write(&vector_size, sizeof(vector_size)); 114 | if (vector_size > 0) { 115 | int element_size = sizeof(target[0]); 116 | fo.Write(&element_size, 
sizeof(element_size)); 117 | for (int idx = 0; idx < vector_size; idx++) { 118 | fo.Write(&target[idx], sizeof(target[idx])); 119 | // cout << target[idx] << endl; 120 | } 121 | } 122 | } 123 | 124 | template 125 | inline void ReadVector(TStream &fo, vector &target) { 126 | int vector_size; 127 | fo.Read(&vector_size, sizeof(int)); 128 | if (vector_size > 0) { 129 | int element_size; 130 | fo.Read(&element_size, sizeof(int)); 131 | target.resize(vector_size); 132 | for (int idx = 0; idx < vector_size; idx++) { 133 | fo.Read(&target[idx], element_size); 134 | // cout << target[idx] << endl; 135 | } 136 | assert(target.size()== vector_size); 137 | } 138 | } 139 | 140 | template 141 | inline void WriteVector(TStream &fo, vector &target) { 142 | int vector_size = target.size(); 143 | fo.Write(&vector_size, sizeof(vector_size)); 144 | if (vector_size > 0) { 145 | for (int idx = 0; idx < vector_size; idx++) { 146 | WriteString(fo, target[idx]); 147 | // cout << target[idx] << endl; 148 | } 149 | } 150 | } 151 | 152 | template 153 | inline void ReadVector(TStream &fo, vector &target) { 154 | target.clear(); 155 | int vector_size; 156 | string tmp_target; 157 | fo.Read(&vector_size, sizeof(int)); 158 | // cout << "vector_size " << vector_size << endl; 159 | if (vector_size > 0) { 160 | for (int idx = 0; idx < vector_size; idx++) { 161 | ReadString(fo, tmp_target); 162 | target.push_back(tmp_target); 163 | // cout << target[idx] << endl; 164 | } 165 | } 166 | assert(target.size()== vector_size); 167 | } 168 | 169 | 170 | template 171 | inline void WriteVector(TStream &fo, NRVec &target) { 172 | int vector_size = target.size(); 173 | WriteBinary(fo, vector_size); 174 | if (vector_size > 0) { 175 | for (int idx = 0; idx < vector_size; idx++) { 176 | WriteBinary(fo, target[idx]); 177 | } 178 | } 179 | } 180 | 181 | template 182 | inline void ReadVector(TStream &fo, NRVec &target) { 183 | int vector_size; 184 | ReadBinary(fo, vector_size); 185 | if (vector_size > 0) { 186 | target.resize(vector_size); 187 | for (int idx = 0; idx < vector_size; idx++) { 188 | ReadBinary(fo, target[idx]); 189 | } 190 | assert(target.size()== vector_size); 191 | } 192 | } 193 | 194 | 195 | 196 | #endif // LIBN3L_IO_H_ 197 | -------------------------------------------------------------------------------- /LookupTable.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LookupTable.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_LookupTable_H_ 9 | #define SRC_LookupTable_H_ 10 | #include "tensor.h" 11 | #include "Utiltensor.h" 12 | #include "MyLib.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | // Weight updating process implemented without theory support, 19 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 20 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 
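// In equation form, the per-row update implemented in updateAdaGrad() below is, for each
// touched embedding row w with accumulated gradient g_w (lambda = regularization weight,
// alpha = learning rate, eps = adaEps; all operations element-wise):
//   eg2_w <- eg2_w + g_w * g_w
//   s_w    = sqrt(eg2_w + eps)
//   E_w   <- (E_w * s_w - alpha * g_w) / (alpha * lambda + s_w)
//   ft_w  <- s_w / (alpha * lambda + s_w)
// Rows that receive no gradient are left untouched at update time; updateSparseWeight()
// later applies the skipped shrinkage lazily, scaling E_w by ft_w once for every update
// step the row missed.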
21 | 22 | template 23 | class LookupTable { 24 | 25 | public: 26 | 27 | hash_set _indexers; 28 | 29 | Tensor _E; 30 | Tensor _gradE; 31 | Tensor _eg2E; 32 | 33 | Tensor _ftE; 34 | 35 | bool _bFineTune; 36 | int _nDim; 37 | int _nVSize; 38 | 39 | int _max_update; 40 | NRVec _last_update; 41 | 42 | public: 43 | 44 | LookupTable() { 45 | _indexers.clear(); 46 | } 47 | 48 | 49 | inline void initial(const NRMat& wordEmb) { 50 | _nVSize = wordEmb.nrows(); 51 | _nDim = wordEmb.ncols(); 52 | 53 | _E = NewTensor(Shape2(_nVSize, _nDim), d_zero); 54 | _gradE = NewTensor(Shape2(_nVSize, _nDim), d_zero); 55 | _eg2E = NewTensor(Shape2(_nVSize, _nDim), d_zero); 56 | _ftE = NewTensor(Shape2(_nVSize, _nDim), d_one); 57 | assign(_E, wordEmb); 58 | for (int idx = 0; idx < _nVSize; idx++) { 59 | norm2one(_E, idx); 60 | } 61 | 62 | _bFineTune = true; 63 | 64 | _max_update = 0; 65 | _last_update.resize(_nVSize); 66 | _last_update = 0; 67 | } 68 | 69 | inline void setEmbFineTune(bool bFineTune) { 70 | _bFineTune = bFineTune; 71 | } 72 | 73 | inline void release() { 74 | FreeSpace(&_E); 75 | FreeSpace(&_gradE); 76 | FreeSpace(&_eg2E); 77 | FreeSpace(&_ftE); 78 | _indexers.clear(); 79 | } 80 | 81 | virtual ~LookupTable() { 82 | // TODO Auto-generated destructor stub 83 | } 84 | 85 | inline dtype squarenormAll() { 86 | dtype result = 0; 87 | static hash_set::iterator it; 88 | for (int idx = 0; idx < _nDim; idx++) { 89 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 90 | result += _gradE[*it][idx] * _gradE[*it][idx]; 91 | } 92 | } 93 | 94 | 95 | return result; 96 | } 97 | 98 | inline void scaleGrad(dtype scale) { 99 | static hash_set::iterator it; 100 | for (int idx = 0; idx < _nDim; idx++) { 101 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 102 | _gradE[*it][idx] = _gradE[*it][idx] * scale; 103 | } 104 | } 105 | 106 | } 107 | 108 | inline bool bEmbFineTune() 109 | { 110 | return _bFineTune; 111 | } 112 | 113 | public: 114 | void GetEmb(int id, Tensor y) { 115 | updateSparseWeight(id); 116 | assert(y.size(0) == 1); 117 | y = 0.0; 118 | y[0] += _E[id]; 119 | } 120 | 121 | // loss is stopped at this layer, since the input is one-hold alike 122 | void EmbLoss(int id, Tensor ly) { 123 | if(!_bFineTune) return; 124 | //_gradE 125 | assert(ly.size(0) == 1); 126 | _gradE[id] += ly[0]; 127 | _indexers.insert(id); 128 | 129 | } 130 | 131 | 132 | void randomprint(int num) { 133 | static int _nVSize, _nDim; 134 | _nVSize = _E.size(0); 135 | _nDim = _E.size(1); 136 | int count = 0; 137 | while (count < num) { 138 | int idx = rand() % _nVSize; 139 | int idy = rand() % _nDim; 140 | 141 | std::cout << "_E[" << idx << "," << idy << "]=" << _E[idx][idy] << " "; 142 | 143 | count++; 144 | } 145 | 146 | std::cout << std::endl; 147 | } 148 | 149 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 150 | 151 | if(!_bFineTune) return; 152 | static hash_set::iterator it; 153 | _max_update++; 154 | 155 | Tensor sqrt_eg2E = NewTensor(Shape1(_E.size(1)), d_zero); 156 | 157 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 158 | int index = *it; 159 | _eg2E[index] = _eg2E[index] + _gradE[index] * _gradE[index]; 160 | sqrt_eg2E = F(_eg2E[index] + adaEps); 161 | _E[index] = (_E[index] * sqrt_eg2E - _gradE[index] * adaAlpha) / (adaAlpha * regularizationWeight + sqrt_eg2E); 162 | _ftE[index] = sqrt_eg2E / (adaAlpha * regularizationWeight + sqrt_eg2E); 163 | } 164 | 165 | FreeSpace(&sqrt_eg2E); 166 | 167 | clearGrad(); 168 | } 169 | 170 | void clearGrad() { 171 | static 
hash_set::iterator it; 172 | 173 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 174 | int index = *it; 175 | _gradE[index] = 0.0; 176 | } 177 | 178 | _indexers.clear(); 179 | 180 | } 181 | 182 | void updateSparseWeight(int wordId) { 183 | if(!_bFineTune) return; 184 | if (_last_update[wordId] < _max_update) { 185 | int times = _max_update - _last_update[wordId]; 186 | _E[wordId] = _E[wordId] * F(times * F(_ftE[wordId])); 187 | _last_update[wordId] = _max_update; 188 | } 189 | } 190 | 191 | void writeModel(LStream &outf) { 192 | SaveBinary(outf, _E); 193 | SaveBinary(outf, _gradE); 194 | SaveBinary(outf, _eg2E); 195 | SaveBinary(outf, _ftE); 196 | 197 | WriteBinary(outf, _bFineTune); 198 | WriteBinary(outf, _nDim); 199 | WriteBinary(outf, _nVSize); 200 | WriteBinary(outf, _max_update); 201 | WriteVector(outf, _last_update); 202 | } 203 | void loadModel(LStream &inf) { 204 | LoadBinary(inf, &_E, false); 205 | LoadBinary(inf, &_gradE, false); 206 | LoadBinary(inf, &_eg2E, false); 207 | LoadBinary(inf, &_ftE, false); 208 | 209 | ReadBinary(inf, _bFineTune); 210 | ReadBinary(inf, _nDim); 211 | ReadBinary(inf, _nVSize); 212 | ReadBinary(inf, _max_update); 213 | 214 | ReadVector(inf, _last_update); 215 | } 216 | 217 | }; 218 | 219 | #endif /* SRC_LookupTable_H_ */ 220 | -------------------------------------------------------------------------------- /Metric.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Metric.h 3 | * 4 | * Created on: Mar 17, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_METRIC_H_ 9 | #define SRC_METRIC_H_ 10 | #include "IO.h" 11 | 12 | using namespace std; 13 | 14 | class Metric { 15 | 16 | public: 17 | int overall_label_count; 18 | int correct_label_count; 19 | int predicated_label_count; 20 | 21 | public: 22 | Metric() 23 | { 24 | overall_label_count = 0; 25 | correct_label_count = 0; 26 | predicated_label_count = 0; 27 | } 28 | 29 | ~Metric(){} 30 | 31 | void reset() 32 | { 33 | overall_label_count = 0; 34 | correct_label_count = 0; 35 | predicated_label_count = 0; 36 | } 37 | 38 | bool bIdentical() 39 | { 40 | if(predicated_label_count == 0) 41 | { 42 | if(overall_label_count == correct_label_count) 43 | { 44 | return true; 45 | } 46 | return false; 47 | } 48 | else 49 | { 50 | if(overall_label_count == correct_label_count && predicated_label_count == correct_label_count) 51 | { 52 | return true; 53 | } 54 | return false; 55 | } 56 | } 57 | 58 | double getAccuracy() 59 | { 60 | if(predicated_label_count == 0) 61 | { 62 | return correct_label_count*1.0/overall_label_count; 63 | } 64 | else 65 | { 66 | return correct_label_count*2.0/(overall_label_count + predicated_label_count); 67 | } 68 | } 69 | 70 | 71 | void print() 72 | { 73 | if(predicated_label_count == 0) 74 | { 75 | std::cout << "Accuracy:\tP=" << correct_label_count << "/" << overall_label_count 76 | << "=" << correct_label_count*1.0/overall_label_count << std::endl; 77 | } 78 | else 79 | { 80 | std::cout << "Recall:\tP=" << correct_label_count << "/" << overall_label_count << "=" << correct_label_count*1.0/overall_label_count 81 | << ", " << "Accuracy:\tP=" << correct_label_count << "/" << predicated_label_count << "=" << correct_label_count*1.0/predicated_label_count 82 | << ", " << "Fmeasure:\t" << correct_label_count*2.0/(overall_label_count + predicated_label_count) << std::endl; 83 | } 84 | } 85 | 86 | void loadModel(LStream &inf) { 87 | ReadBinary(inf, overall_label_count); 88 | ReadBinary(inf, correct_label_count); 89 | ReadBinary(inf, 
predicated_label_count); 90 | // cout << overall_label_count << correct_label_count << predicated_label_count < 21 | class RNN { 22 | public: 23 | BiLayer _rnn; 24 | bool _left2right; 25 | 26 | Tensor _null, _nullLoss; 27 | 28 | public: 29 | RNN() { 30 | } 31 | 32 | inline void initial(int outputsize, int inputsize, int seed = 0) { 33 | _left2right = true; 34 | _rnn.initial(outputsize, outputsize, inputsize, true, seed, 0); 35 | 36 | _null = NewTensor(Shape2(1, outputsize), d_zero); 37 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 38 | 39 | } 40 | 41 | inline void initial(int outputsize, int inputsize, bool left2right, int seed = 0) { 42 | _left2right = left2right; 43 | _rnn.initial(outputsize, outputsize, inputsize, true, seed, 0); 44 | 45 | _null = NewTensor(Shape2(1, outputsize), d_zero); 46 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 47 | 48 | } 49 | 50 | inline void initial(Tensor WL, Tensor WR, Tensor b, bool left2right = true) { 51 | _left2right = left2right; 52 | _rnn.initial(WL, WR, b, true); 53 | 54 | _null = NewTensor(Shape2(1, b.size(1)), d_zero); 55 | _nullLoss = NewTensor(Shape2(1, b.size(1)), d_zero); 56 | } 57 | 58 | inline void release() { 59 | _rnn.release(); 60 | 61 | FreeSpace(&_null); 62 | FreeSpace(&_nullLoss); 63 | } 64 | 65 | virtual ~RNN() { 66 | // TODO Auto-generated destructor stub 67 | } 68 | 69 | inline dtype squarenormAll() { 70 | dtype norm = _rnn.squarenormAll(); 71 | 72 | return norm; 73 | } 74 | 75 | inline void scaleGrad(dtype scale) { 76 | _rnn.scaleGrad(scale); 77 | } 78 | 79 | public: 80 | 81 | inline void ComputeForwardScore(Tensor x, Tensor y) { 82 | y = 0.0; 83 | int seq_size = x.size(0); 84 | if (seq_size == 0) 85 | return; 86 | 87 | if (_left2right) { 88 | for (int idx = 0; idx < seq_size; idx++) { 89 | if (idx == 0) { 90 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 91 | } else 92 | _rnn.ComputeForwardScore(y[idx - 1], x[idx], y[idx]); 93 | } 94 | } else { 95 | for (int idx = seq_size - 1; idx >= 0; idx--) { 96 | if (idx == seq_size - 1) 97 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 98 | else 99 | _rnn.ComputeForwardScore(y[idx + 1], x[idx], y[idx]); 100 | } 101 | } 102 | } 103 | 104 | inline void ComputeForwardScore(const vector > &x, vector > &y) { 105 | assign(y, 0.0); 106 | int seq_size = x.size(); 107 | if (seq_size == 0) 108 | return; 109 | 110 | if (_left2right) { 111 | for (int idx = 0; idx < seq_size; idx++) { 112 | if (idx == 0) { 113 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 114 | } else 115 | _rnn.ComputeForwardScore(y[idx - 1], x[idx], y[idx]); 116 | } 117 | } else { 118 | for (int idx = seq_size - 1; idx >= 0; idx--) { 119 | if (idx == seq_size - 1) 120 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 121 | else 122 | _rnn.ComputeForwardScore(y[idx + 1], x[idx], y[idx]); 123 | } 124 | } 125 | } 126 | 127 | // This function is used for computing hidden values incrementally at the start position 128 | // It is applied only when the sequential inputs are not fixed in advance, 129 | // which can vary during decoding. 130 | // We need not provide a backward function, since during backward, inputs will be given. 
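// For example, when decoding left to right with inputs that arrive one step at a time
// (the names below are illustrative only):
//   rnn.ComputeForwardScoreIncremental(x0, h0);      // first position: the previous hidden state is _null
//   rnn.ComputeForwardScoreIncremental(h0, x1, h1);  // later positions: pass the previously computed hidden state
// The caller allocates each h_t; once the whole input sequence is fixed, the regular
// ComputeForwardScore / ComputeBackwardLoss pair in this file is used for training.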
131 | inline void ComputeForwardScoreIncremental(Tensor x, Tensor y) { 132 | assert(_left2right); 133 | y = 0.0; 134 | _rnn.ComputeForwardScore(_null, x, y); 135 | } 136 | 137 | 138 | // This function is used for computing hidden values incrementally at the non-start position 139 | // It is applied only when the sequential inputs are not fixed in advance, 140 | // which can vary during decoding. 141 | // We need not provide a backward function, since during backward, inputs will be given. 142 | inline void ComputeForwardScoreIncremental(Tensor py, Tensor x, Tensor y) { 143 | assert(_left2right); 144 | y = 0.0; 145 | _rnn.ComputeForwardScore(py, x, y); 146 | } 147 | 148 | //please allocate the memory outside here 149 | inline void ComputeBackwardLoss(Tensor x, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 150 | int seq_size = x.size(0); 151 | if (seq_size == 0) 152 | return; 153 | 154 | if (bclear) 155 | lx = 0.0; 156 | //left rnn 157 | Tensor lfy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 158 | if (_left2right) { 159 | for (int idx = seq_size - 1; idx >= 0; idx--) { 160 | if (idx < seq_size - 1) 161 | ly[idx] = ly[idx] + lfy[idx]; 162 | 163 | if (idx == 0) 164 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 165 | else 166 | _rnn.ComputeBackwardLoss(y[idx - 1], x[idx], y[idx], ly[idx], lfy[idx - 1], lx[idx]); 167 | } 168 | } else { 169 | // right rnn 170 | for (int idx = 0; idx < seq_size; idx++) { 171 | if (idx > 0) 172 | ly[idx] = ly[idx] + lfy[idx]; 173 | 174 | if (idx == seq_size - 1) 175 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 176 | else 177 | _rnn.ComputeBackwardLoss(y[idx + 1], x[idx], y[idx], ly[idx], lfy[idx + 1], lx[idx]); 178 | } 179 | } 180 | 181 | FreeSpace(&lfy); 182 | } 183 | 184 | //please allocate the memory outside here 185 | inline void ComputeBackwardLoss(const vector > &x, const vector > &y, 186 | vector > &ly, vector > &lx, bool bclear = false) { 187 | int seq_size = x.size(); 188 | if (seq_size == 0) 189 | return; 190 | 191 | if (bclear) 192 | assign(lx, 0.0); 193 | 194 | vector > lfy(seq_size); 195 | for (int idx = 0; idx < seq_size; idx++) { 196 | lfy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 197 | } 198 | 199 | if (_left2right) { 200 | //left rnn 201 | for (int idx = seq_size - 1; idx >= 0; idx--) { 202 | if (idx < seq_size - 1) 203 | ly[idx] = ly[idx] + lfy[idx]; 204 | 205 | if (idx == 0) 206 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 207 | else 208 | _rnn.ComputeBackwardLoss(y[idx - 1], x[idx], y[idx], ly[idx], lfy[idx - 1], lx[idx]); 209 | } 210 | } else { 211 | // right rnn 212 | for (int idx = 0; idx < seq_size; idx++) { 213 | if (idx > 0) 214 | ly[idx] = ly[idx] + lfy[idx]; 215 | 216 | if (idx == seq_size - 1) 217 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 218 | else 219 | _rnn.ComputeBackwardLoss(y[idx + 1], x[idx], y[idx], ly[idx], lfy[idx + 1], lx[idx]); 220 | } 221 | } 222 | 223 | for (int idx = 0; idx < seq_size; idx++) { 224 | FreeSpace(&(lfy[idx])); 225 | } 226 | } 227 | 228 | inline void randomprint(int num) { 229 | _rnn.randomprint(num); 230 | } 231 | 232 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 233 | _rnn.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 234 | } 235 | 236 | void writeModel(LStream &outf) { 237 | _rnn.writeModel(outf); 238 | 239 | SaveBinary(outf, _null); 240 | SaveBinary(outf, _nullLoss); 
241 | 242 | WriteBinary(outf, _left2right); 243 | } 244 | 245 | void loadModel(LStream &inf) { 246 | _rnn.loadModel(inf); 247 | LoadBinary(inf, &_null, false); 248 | LoadBinary(inf, &_nullLoss, false); 249 | 250 | ReadBinary(inf, _left2right); 251 | } 252 | 253 | }; 254 | 255 | #endif /* SRC_RNN_H_ */ 256 | -------------------------------------------------------------------------------- /RecursiveGatedNN.h: -------------------------------------------------------------------------------- 1 | /* 2 | * RecursiveGatedNN.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_RecursiveGatedNN_H_ 9 | #define SRC_RecursiveGatedNN_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | #include "UniLayer.h" 14 | #include "MyLib.h" 15 | #include "Utiltensor.h" 16 | 17 | using namespace mshadow; 18 | using namespace mshadow::expr; 19 | using namespace mshadow::utils; 20 | 21 | template 22 | class RecursiveGatedNN { 23 | public: 24 | UniLayer _reset_left; 25 | UniLayer _reset_right; 26 | UniLayer _update_left; 27 | UniLayer _update_right; 28 | UniLayer _update_tilde; 29 | BiLayer _recursive_tilde; 30 | 31 | 32 | Tensor nxl; 33 | Tensor nxr; 34 | Tensor sum; 35 | 36 | Tensor pxl; 37 | Tensor pxr; 38 | Tensor pmy; 39 | 40 | 41 | Tensor lrxl; 42 | Tensor lrxr; 43 | Tensor lmy; 44 | Tensor luxl; 45 | Tensor luxr; 46 | Tensor lumy; 47 | 48 | Tensor lnxl; 49 | Tensor lnxr; 50 | Tensor lsum; 51 | 52 | Tensor lpxl; 53 | Tensor lpxr; 54 | Tensor lpmy; 55 | 56 | 57 | public: 58 | RecursiveGatedNN() { 59 | } 60 | 61 | inline void initial(int dimension, int seed = 0) { 62 | _reset_left.initial(dimension, dimension, false, seed, 1); 63 | _reset_right.initial(dimension, dimension, false, seed + 10, 1); 64 | _update_left.initial(dimension, dimension, false, seed + 20, 3); 65 | _update_right.initial(dimension, dimension, false, seed + 30, 3); 66 | _update_tilde.initial(dimension, dimension, false, seed + 40, 3); 67 | _recursive_tilde.initial(dimension, dimension, dimension, false, seed + 50, 0); 68 | 69 | nxl = NewTensor(Shape2(1, dimension), d_zero); 70 | nxr = NewTensor(Shape2(1, dimension), d_zero); 71 | sum = NewTensor(Shape2(1, dimension), d_zero); 72 | 73 | pxl = NewTensor(Shape2(1, dimension), d_zero); 74 | pxr = NewTensor(Shape2(1, dimension), d_zero); 75 | pmy = NewTensor(Shape2(1, dimension), d_zero); 76 | 77 | 78 | lrxl = NewTensor(Shape2(1, dimension), d_zero); 79 | lrxr = NewTensor(Shape2(1, dimension), d_zero); 80 | lmy = NewTensor(Shape2(1, dimension), d_zero); 81 | luxl = NewTensor(Shape2(1, dimension), d_zero); 82 | luxr = NewTensor(Shape2(1, dimension), d_zero); 83 | lumy = NewTensor(Shape2(1, dimension), d_zero); 84 | 85 | lnxl = NewTensor(Shape2(1, dimension), d_zero); 86 | lnxr = NewTensor(Shape2(1, dimension), d_zero); 87 | lsum = NewTensor(Shape2(1, dimension), d_zero); 88 | 89 | lpxl = NewTensor(Shape2(1, dimension), d_zero); 90 | lpxr = NewTensor(Shape2(1, dimension), d_zero); 91 | lpmy = NewTensor(Shape2(1, dimension), d_zero); 92 | } 93 | 94 | 95 | inline void initial(Tensor rW1, Tensor rW2, 96 | Tensor uW1, Tensor uW2, Tensor uW3, 97 | Tensor W1, Tensor W2, Tensor W3,Tensor b) { 98 | _reset_left.initial(rW1, 1); 99 | _reset_right.initial(rW2, 1); 100 | 101 | _update_left.initial(uW1, 3); 102 | _update_right.initial(uW2, 3); 103 | _update_tilde.initial(uW3, 3); 104 | 105 | _recursive_tilde.initial(W1, W2, W3, b, 0); 106 | } 107 | 108 | inline void release() { 109 | _reset_left.release(); 110 | _reset_right.release(); 111 | 112 | 
_update_left.release(); 113 | _update_right.release(); 114 | _update_tilde.release(); 115 | 116 | _recursive_tilde.release(); 117 | 118 | FreeSpace(&nxl); 119 | FreeSpace(&nxr); 120 | FreeSpace(&sum); 121 | FreeSpace(&pxl); 122 | FreeSpace(&pxr); 123 | FreeSpace(&pmy); 124 | FreeSpace(&lnxl); 125 | FreeSpace(&lnxr); 126 | FreeSpace(&lsum); 127 | FreeSpace(&lpxl); 128 | FreeSpace(&lpxr); 129 | FreeSpace(&lpmy); 130 | FreeSpace(&lrxl); 131 | FreeSpace(&lrxr); 132 | FreeSpace(&lmy); 133 | FreeSpace(&luxl); 134 | FreeSpace(&luxr); 135 | FreeSpace(&lumy); 136 | } 137 | 138 | virtual ~RecursiveGatedNN() { 139 | // TODO Auto-generated destructor stub 140 | } 141 | 142 | inline dtype squarenormAll() { 143 | dtype norm = _reset_left.squarenormAll(); 144 | norm += _reset_right.squarenormAll(); 145 | norm += _update_left.squarenormAll(); 146 | norm += _update_right.squarenormAll(); 147 | norm += _update_tilde.squarenormAll(); 148 | norm += _recursive_tilde.squarenormAll(); 149 | 150 | return norm; 151 | } 152 | 153 | inline void scaleGrad(dtype scale) { 154 | _reset_left.scaleGrad(scale); 155 | _reset_right.scaleGrad(scale); 156 | 157 | _update_left.scaleGrad(scale); 158 | _update_right.scaleGrad(scale); 159 | _update_tilde.scaleGrad(scale); 160 | 161 | _recursive_tilde.scaleGrad(scale); 162 | } 163 | 164 | public: 165 | 166 | inline void ComputeForwardScore(Tensor xl, Tensor xr, 167 | Tensor rxl, Tensor rxr, Tensor my, 168 | Tensor uxl, Tensor uxr, Tensor umy, 169 | Tensor y) { 170 | 171 | nxl = 0.0; 172 | nxr = 0.0; 173 | sum = 0.0; 174 | 175 | pxl = 0.0; 176 | pxr = 0.0; 177 | pmy = 0.0; 178 | 179 | _reset_left.ComputeForwardScore(xl, rxl); 180 | _reset_right.ComputeForwardScore(xr, rxr); 181 | 182 | 183 | nxl = rxl * xl; 184 | nxr = rxr * xr; 185 | 186 | _recursive_tilde.ComputeForwardScore(nxl, nxr, my); 187 | 188 | 189 | _update_left.ComputeForwardScore(xl, uxl); 190 | _update_right.ComputeForwardScore(xr, uxr); 191 | _update_tilde.ComputeForwardScore(my, umy); 192 | 193 | sum = uxl + uxr + umy; 194 | 195 | pxl = uxl / sum; 196 | pxr = uxr / sum; 197 | pmy = umy / sum; 198 | 199 | y = pxl * xl + pxr * xr + pmy * my; 200 | 201 | } 202 | 203 | //please allocate the memory outside here 204 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, 205 | Tensor rxl, Tensor rxr, Tensor my, 206 | Tensor uxl, Tensor uxr, Tensor umy, 207 | Tensor y, Tensor ly, 208 | Tensor lxl, Tensor lxr, 209 | bool bclear = false) { 210 | if (bclear){ 211 | lxl = 0.0; lxr = 0.0; 212 | } 213 | 214 | nxl = 0.0; 215 | nxr = 0.0; 216 | sum = 0.0; 217 | 218 | pxl = 0.0; 219 | pxr = 0.0; 220 | pmy = 0.0; 221 | 222 | 223 | lrxl = 0.0; 224 | lrxr = 0.0; 225 | lmy = 0.0; 226 | luxl = 0.0; 227 | luxr = 0.0; 228 | lumy = 0.0; 229 | 230 | lnxl = 0.0; 231 | lnxr = 0.0; 232 | lsum = 0.0; 233 | 234 | lpxl = 0.0; 235 | lpxr = 0.0; 236 | lpmy = 0.0; 237 | 238 | nxl = rxl * xl; 239 | nxr = rxr * xr; 240 | 241 | sum = uxl + uxr + umy; 242 | 243 | pxl = uxl / sum; 244 | pxr = uxr / sum; 245 | pmy = umy / sum; 246 | 247 | 248 | lpxl += ly * xl; 249 | lxl += ly * pxl; 250 | 251 | lpxr += ly * xr; 252 | lxr += ly * pxr; 253 | 254 | lpmy += ly * my; 255 | lmy += ly * pmy; 256 | 257 | 258 | 259 | luxl += lpxl / sum; 260 | luxr += lpxr / sum; 261 | lumy += lpmy / sum; 262 | 263 | lsum -= lpxl * pxl / sum; 264 | lsum -= lpxr * pxr / sum; 265 | lsum -= lpmy * pmy / sum; 266 | 267 | 268 | luxl += lsum; 269 | luxr += lsum; 270 | lumy += lsum; 271 | 272 | _update_left.ComputeBackwardLoss(xl, uxl, luxl, lxl); 273 | 
_update_right.ComputeBackwardLoss(xr, uxr, luxr, lxr); 274 | _update_tilde.ComputeBackwardLoss(my, umy, lumy, lmy); 275 | 276 | _recursive_tilde.ComputeBackwardLoss(nxl, nxr, my, lmy, lnxl, lnxr); 277 | 278 | lrxl += lnxl * xl; 279 | lxl += lnxl * rxl; 280 | 281 | lrxr += lnxr * xr; 282 | lxr += lnxr * rxr; 283 | 284 | _reset_left.ComputeBackwardLoss(xl, rxl, lrxl, lxl); 285 | _reset_right.ComputeBackwardLoss(xr, rxr, lrxr, lxr); 286 | 287 | } 288 | 289 | 290 | inline void randomprint(int num) { 291 | _reset_left.randomprint(num); 292 | _reset_right.randomprint(num); 293 | 294 | _update_left.randomprint(num); 295 | _update_right.randomprint(num); 296 | _update_tilde.randomprint(num); 297 | 298 | _recursive_tilde.randomprint(num); 299 | } 300 | 301 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 302 | _reset_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 303 | _reset_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 304 | 305 | _update_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 306 | _update_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 307 | _update_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 308 | 309 | _recursive_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 310 | } 311 | 312 | 313 | void writeModel(LStream &outf) { 314 | 315 | _reset_left.writeModel(outf); 316 | _reset_right.writeModel(outf); 317 | _update_left.writeModel(outf); 318 | _update_right.writeModel(outf); 319 | _update_tilde.writeModel(outf); 320 | _recursive_tilde.writeModel(outf); 321 | 322 | SaveBinary(outf, nxl); 323 | SaveBinary(outf, nxr); 324 | SaveBinary(outf, sum); 325 | 326 | SaveBinary(outf, pxl); 327 | SaveBinary(outf, pxr); 328 | SaveBinary(outf, pmy); 329 | 330 | SaveBinary(outf, lrxl); 331 | SaveBinary(outf, lrxr); 332 | SaveBinary(outf, lmy); 333 | SaveBinary(outf, luxl); 334 | SaveBinary(outf, luxr); 335 | SaveBinary(outf, lumy); 336 | 337 | SaveBinary(outf, lnxl); 338 | SaveBinary(outf, lnxr); 339 | SaveBinary(outf, lsum); 340 | 341 | SaveBinary(outf, lpxl); 342 | SaveBinary(outf, lpxr); 343 | SaveBinary(outf, lpmy); 344 | 345 | } 346 | 347 | void loadModel(LStream &inf) { 348 | 349 | _reset_left.loadModel(inf); 350 | _reset_right.loadModel(inf); 351 | _update_left.loadModel(inf); 352 | _update_right.loadModel(inf); 353 | _update_tilde.loadModel(inf); 354 | _recursive_tilde.loadModel(inf); 355 | 356 | 357 | LoadBinary(inf, &nxl, false); 358 | LoadBinary(inf, &nxr, false); 359 | LoadBinary(inf, &sum, false); 360 | 361 | LoadBinary(inf, &pxl, false); 362 | LoadBinary(inf, &pxr, false); 363 | LoadBinary(inf, &pmy, false); 364 | 365 | LoadBinary(inf, &lrxl, false); 366 | LoadBinary(inf, &lrxr, false); 367 | LoadBinary(inf, &lmy, false); 368 | LoadBinary(inf, &luxl, false); 369 | LoadBinary(inf, &luxr, false); 370 | LoadBinary(inf, &lumy, false); 371 | 372 | LoadBinary(inf, &lnxl, false); 373 | LoadBinary(inf, &lnxr, false); 374 | LoadBinary(inf, &lsum, false); 375 | 376 | LoadBinary(inf, &lpxl, false); 377 | LoadBinary(inf, &lpxr, false); 378 | LoadBinary(inf, &lpmy, false); 379 | 380 | } 381 | 382 | 383 | }; 384 | 385 | #endif /* SRC_RecursiveGatedNN_H_ */ 386 | -------------------------------------------------------------------------------- /RecursiveNN.h: -------------------------------------------------------------------------------- 1 | /* 2 | * RecursiveNN.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_RecursiveNN_H_ 9 | #define 
SRC_RecursiveNN_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | #include "MyLib.h" 14 | #include "Utiltensor.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | // Actually, we do not need such a class, BiLayer satisfies it 21 | 22 | template 23 | class RecursiveNN { 24 | public: 25 | BiLayer _rnn; 26 | 27 | public: 28 | RecursiveNN() { 29 | } 30 | 31 | inline void initial(int dimension, int seed = 0) { 32 | _rnn.initial(dimension, dimension, dimension, true, seed, 0); 33 | } 34 | 35 | 36 | inline void initial(Tensor WL, Tensor WR, Tensor b) { 37 | _rnn.initial(WL, WR, b, true); 38 | } 39 | 40 | inline void release() { 41 | _rnn.release(); 42 | } 43 | 44 | virtual ~RecursiveNN() { 45 | // TODO Auto-generated destructor stub 46 | } 47 | 48 | inline dtype squarenormAll() { 49 | dtype norm = _rnn.squarenormAll(); 50 | 51 | return norm; 52 | } 53 | 54 | inline void scaleGrad(dtype scale) { 55 | _rnn.scaleGrad(scale); 56 | } 57 | 58 | public: 59 | 60 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor y) { 61 | y = 0.0; 62 | _rnn.ComputeForwardScore(xl, xr, y); 63 | 64 | } 65 | 66 | //please allocate the memory outside here 67 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor y, Tensor ly, 68 | Tensor lxl, Tensor lxr, bool bclear = false) { 69 | if (bclear){ 70 | lxl = 0.0; lxr = 0.0; 71 | } 72 | _rnn.ComputeBackwardLoss(xl, xr, y, ly, lxl, lxr); 73 | } 74 | 75 | 76 | inline void randomprint(int num) { 77 | _rnn.randomprint(num); 78 | } 79 | 80 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 81 | _rnn.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 82 | } 83 | 84 | void writeModel(LStream &outf) { 85 | _rnn.writeModel(outf); 86 | } 87 | 88 | void loadModel(LStream &inf) { 89 | _rnn.loadModel(inf); 90 | } 91 | 92 | }; 93 | 94 | #endif /* SRC_RecursiveNN_H_ */ 95 | -------------------------------------------------------------------------------- /SoftMaxLoss.h: -------------------------------------------------------------------------------- 1 | #ifndef SOFTMAXLOSS 2 | #define SOFTMAXLOSS 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | #include "Metric.h" 7 | 8 | using namespace std; 9 | using namespace mshadow; 10 | using namespace mshadow::expr; 11 | using namespace mshadow::utils; 12 | 13 | template 14 | inline dtype softmax_loss(const vector > &output, const vector > &answers, vector > &loutput, 15 | Metric & eval, int batchsize = 1) { 16 | int seqsize = output.size(); 17 | if (answers.size() != seqsize || seqsize == 0) { 18 | std::cerr << "softmax_loss error: vector size or context size invalid" << std::endl; 19 | } 20 | 21 | int dim1 = output[0].size(0), dim2 = output[0].size(1); 22 | int odim1 = loutput[0].size(0), odim2 = loutput[0].size(1); 23 | int labelsize = answers[0].size(); 24 | 25 | if (labelsize != odim2 || dim2 != odim2 || dim1 != 1 || odim1 != 1) { 26 | std::cerr << "softmax_loss error: dim size invalid" << std::endl; 27 | } 28 | 29 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 30 | 31 | for (int idx = 0; idx < seqsize; idx++) { 32 | loutput[idx] = 0.0; 33 | } 34 | 35 | dtype cost = 0.0; 36 | static int optLabel; 37 | for (int idx = 0; idx < seqsize; idx++) { 38 | optLabel = -1; 39 | for (int i = 0; i < dim2; ++i) { 40 | if (answers[idx][i] >= 0) { 41 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 42 | optLabel = i; 43 | } 44 | } 45 | 46 | dtype sum1 = 0.0; 47 | dtype sum2 = 0.0; 48 
| dtype maxScore = output[idx][0][optLabel]; 49 | for (int i = 0; i < dim2; ++i) { 50 | scores[idx][0][i] = -1e10; 51 | if (answers[idx][i] >= 0) { 52 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 53 | if (answers[idx][i] == 1) 54 | sum1 += scores[idx][0][i]; 55 | sum2 += scores[idx][0][i]; 56 | } 57 | } 58 | cost += (log(sum2) - log(sum1)) / (batchsize * seqsize); 59 | if (answers[idx][optLabel] == 1) 60 | eval.correct_label_count++; 61 | eval.overall_label_count++; 62 | 63 | for (int i = 0; i < dim2; ++i) { 64 | if (answers[idx][i] >= 0) { 65 | loutput[idx][0][i] = (scores[idx][0][i] / sum2 - answers[idx][i]) / (batchsize * seqsize); 66 | } 67 | } 68 | 69 | } 70 | 71 | FreeSpace(&scores); 72 | return cost; 73 | } 74 | 75 | template 76 | inline dtype softmax_cost(const vector > &output, const vector > &answers) { 77 | int seqsize = output.size(); 78 | if (answers.size() != seqsize || seqsize == 0) { 79 | std::cerr << "softmax_cost error: vector size or context size invalid" << std::endl; 80 | } 81 | 82 | int dim1 = output[0].size(0), dim2 = output[0].size(1); 83 | int labelsize = answers[0].size(); 84 | 85 | if (labelsize != dim2 || dim1 != 1) { 86 | std::cerr << "softmax_cost error: dim size invalid" << std::endl; 87 | } 88 | 89 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 90 | 91 | dtype cost = 0.0; 92 | static int optLabel; 93 | for (int idx = 0; idx < seqsize; idx++) { 94 | optLabel = -1; 95 | for (int i = 0; i < dim2; ++i) { 96 | if (answers[idx][i] >= 0) { 97 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 98 | optLabel = i; 99 | } 100 | } 101 | 102 | dtype sum1 = 0.0; 103 | dtype sum2 = 0.0; 104 | dtype maxScore = output[idx][0][optLabel]; 105 | for (int i = 0; i < dim2; ++i) { 106 | scores[idx][0][i] = -1e10; 107 | if (answers[idx][i] >= 0) { 108 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 109 | if (answers[idx][i] == 1) 110 | sum1 += scores[idx][0][i]; 111 | sum2 += scores[idx][0][i]; 112 | } 113 | } 114 | cost += (log(sum2) - log(sum1)) / seqsize; 115 | } 116 | 117 | FreeSpace(&scores); 118 | return cost; 119 | } 120 | 121 | template 122 | inline void softmax_predict(const vector > &output, vector& results) { 123 | int seqsize = output.size(); 124 | if (seqsize == 0) { 125 | std::cerr << "softmax_predict error: vector size or context size invalid" << std::endl; 126 | } 127 | 128 | int dim1 = output[0].size(0), dim2 = output[0].size(1); 129 | if (dim1 != 1) { 130 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 131 | } 132 | 133 | results.resize(seqsize); 134 | 135 | static int optLabel; 136 | for (int idx = 0; idx < seqsize; idx++) { 137 | optLabel = -1; 138 | for (int i = 0; i < dim2; ++i) { 139 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 140 | optLabel = i; 141 | } 142 | results[idx] = optLabel; 143 | } 144 | 145 | } 146 | 147 | template 148 | inline dtype softmax_loss(Tensor output, const vector > &answers, Tensor loutput, Metric & eval, int batchsize = 1) { 149 | int seqsize = output.size(0); 150 | if (answers.size() != seqsize || seqsize == 0) { 151 | std::cerr << "softmax_loss error: vector size or context size invalid" << std::endl; 152 | } 153 | 154 | int dim1 = output.size(1), dim2 = output.size(2); 155 | int odim1 = loutput.size(1), odim2 = loutput.size(2); 156 | int labelsize = answers[0].size(); 157 | 158 | if (labelsize != odim2 || dim2 != odim2 || dim1 != 1 || odim1 != 1) { 159 | std::cerr << "softmax_loss error: dim size invalid" << std::endl; 160 | } 161 | 
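// What the loop below computes, per position idx, is a max-shifted softmax restricted to
// the labels with answers[idx][i] >= 0 (entries < 0 are excluded from the normalization):
//   p_i       = exp(o_i - max_o) / sum_{valid j} exp(o_j - max_o)
//   cost     += -log( sum_{i: answers[idx][i]==1} p_i ) / (batchsize * seqsize)
//   loutput_i = (p_i - answers[idx][i]) / (batchsize * seqsize)   for each valid i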
162 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 163 | 164 | loutput = 0.0; 165 | dtype cost = 0.0; 166 | static int optLabel; 167 | for (int idx = 0; idx < seqsize; idx++) { 168 | optLabel = -1; 169 | for (int i = 0; i < dim2; ++i) { 170 | if (answers[idx][i] >= 0) { 171 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 172 | optLabel = i; 173 | } 174 | } 175 | 176 | dtype sum1 = 0.0; 177 | dtype sum2 = 0.0; 178 | dtype maxScore = output[idx][0][optLabel]; 179 | for (int i = 0; i < dim2; ++i) { 180 | scores[idx][0][i] = -1e10; 181 | if (answers[idx][i] >= 0) { 182 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 183 | if (answers[idx][i] == 1) 184 | sum1 += scores[idx][0][i]; 185 | sum2 += scores[idx][0][i]; 186 | } 187 | } 188 | cost += (log(sum2) - log(sum1)) / (batchsize * seqsize); 189 | if (answers[idx][optLabel] == 1) 190 | eval.correct_label_count++; 191 | eval.overall_label_count++; 192 | 193 | for (int i = 0; i < dim2; ++i) { 194 | if (answers[idx][i] >= 0) { 195 | loutput[idx][0][i] = (scores[idx][0][i] / sum2 - answers[idx][i]) / (batchsize * seqsize); 196 | } 197 | } 198 | 199 | } 200 | 201 | FreeSpace(&scores); 202 | return cost; 203 | } 204 | 205 | template 206 | inline dtype softmax_cost(Tensor output, const vector > &answers, int batchsize = 1) { 207 | int seqsize = output.size(0); 208 | if (answers.size() != seqsize || seqsize == 0) { 209 | std::cerr << "softmax_cost error: vector size or context size invalid" << std::endl; 210 | } 211 | 212 | int dim1 = output.size(1), dim2 = output.size(2); 213 | int labelsize = answers[0].size(); 214 | 215 | if (labelsize != dim2 || dim1 != 1) { 216 | std::cerr << "softmax_cost error: dim size invalid" << std::endl; 217 | } 218 | 219 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 220 | 221 | dtype cost = 0.0; 222 | static int optLabel; 223 | for (int idx = 0; idx < seqsize; idx++) { 224 | optLabel = -1; 225 | for (int i = 0; i < dim2; ++i) { 226 | if (answers[idx][i] >= 0) { 227 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 228 | optLabel = i; 229 | } 230 | } 231 | 232 | dtype sum1 = 0.0; 233 | dtype sum2 = 0.0; 234 | dtype maxScore = output[idx][0][optLabel]; 235 | for (int i = 0; i < dim2; ++i) { 236 | scores[idx][0][i] = -1e10; 237 | if (answers[idx][i] >= 0) { 238 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 239 | if (answers[idx][i] == 1) 240 | sum1 += scores[idx][0][i]; 241 | sum2 += scores[idx][0][i]; 242 | } 243 | } 244 | cost += (log(sum2) - log(sum1)) / (batchsize * seqsize); 245 | } 246 | 247 | FreeSpace(&scores); 248 | return cost; 249 | } 250 | 251 | template 252 | inline void softmax_predict(Tensor output, vector& results) { 253 | int seqsize = output.size(0); 254 | if (seqsize == 0) { 255 | std::cerr << "softmax_predict error: vector size or context size invalid" << std::endl; 256 | } 257 | 258 | int dim1 = output.size(1), dim2 = output.size(2); 259 | if (dim1 != 1) { 260 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 261 | } 262 | 263 | results.resize(seqsize); 264 | 265 | static int optLabel; 266 | for (int idx = 0; idx < seqsize; idx++) { 267 | optLabel = -1; 268 | for (int i = 0; i < dim2; ++i) { 269 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 270 | optLabel = i; 271 | } 272 | results[idx] = optLabel; 273 | } 274 | 275 | } 276 | 277 | template 278 | inline dtype softmax_loss(Tensor output, const vector &answer, Tensor loutput, Metric & eval, int batchsize = 1) { 279 | int dim1 
= output.size(0), dim2 = output.size(1); 280 | int odim1 = loutput.size(0), odim2 = loutput.size(1); 281 | int labelsize = answer.size(); 282 | 283 | if (labelsize != odim2 || dim2 != odim2 || dim1 != 1 || odim1 != 1) { 284 | std::cerr << "softmax_loss error: dim size invalid" << std::endl; 285 | } 286 | 287 | Tensor scores = NewTensor(Shape2(1, dim2), d_zero); 288 | 289 | loutput = 0.0; 290 | dtype cost = 0.0; 291 | 292 | int optLabel = -1; 293 | for (int i = 0; i < dim2; ++i) { 294 | if (answer[i] >= 0) { 295 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 296 | optLabel = i; 297 | } 298 | } 299 | 300 | dtype sum1 = 0.0; 301 | dtype sum2 = 0.0; 302 | dtype maxScore = output[0][optLabel]; 303 | for (int i = 0; i < dim2; ++i) { 304 | scores[0][i] = -1e10; 305 | if (answer[i] >= 0) { 306 | scores[0][i] = exp(output[0][i] - maxScore); 307 | if (answer[i] == 1) 308 | sum1 += scores[0][i]; 309 | sum2 += scores[0][i]; 310 | } 311 | } 312 | cost += (log(sum2) - log(sum1)) / batchsize; 313 | if (answer[optLabel] == 1) 314 | eval.correct_label_count++; 315 | eval.overall_label_count++; 316 | 317 | for (int i = 0; i < dim2; ++i) { 318 | if (answer[i] >= 0) { 319 | loutput[0][i] = (scores[0][i] / sum2 - answer[i]) / batchsize; 320 | } 321 | } 322 | 323 | FreeSpace(&scores); 324 | return cost; 325 | } 326 | 327 | template 328 | inline dtype softmax_cost(Tensor output, const vector &answer, int batchsize = 1) { 329 | int dim1 = output.size(0), dim2 = output.size(1); 330 | int labelsize = answer.size(); 331 | 332 | if (labelsize != dim2 || dim1 != 1) { 333 | std::cerr << "softmax_cost error: dim size invalid" << std::endl; 334 | } 335 | 336 | Tensor scores = NewTensor(Shape2(1, dim2), d_zero); 337 | 338 | dtype cost = 0.0; 339 | 340 | int optLabel = -1; 341 | for (int i = 0; i < dim2; ++i) { 342 | if (answer[i] >= 0) { 343 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 344 | optLabel = i; 345 | } 346 | } 347 | 348 | dtype sum1 = 0.0; 349 | dtype sum2 = 0.0; 350 | dtype maxScore = output[0][optLabel]; 351 | for (int i = 0; i < dim2; ++i) { 352 | scores[0][i] = -1e10; 353 | if (answer[i] >= 0) { 354 | scores[0][i] = exp(output[0][i] - maxScore); 355 | if (answer[i] == 1) 356 | sum1 += scores[0][i]; 357 | sum2 += scores[0][i]; 358 | } 359 | } 360 | cost += (log(sum2) - log(sum1)) / batchsize; 361 | 362 | FreeSpace(&scores); 363 | return cost; 364 | } 365 | 366 | template 367 | inline void softmax_predict(Tensor output, int& result) { 368 | int dim1 = output.size(0), dim2 = output.size(1); 369 | if (dim1 != 1) { 370 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 371 | } 372 | 373 | int optLabel = -1; 374 | for (int i = 0; i < dim2; ++i) { 375 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 376 | optLabel = i; 377 | } 378 | result = optLabel; 379 | 380 | } 381 | 382 | template 383 | inline int softmax_predict(Tensor output, vector& results) { 384 | int dim1 = output.size(0), dim2 = output.size(1); 385 | if (dim1 != 1) { 386 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 387 | } 388 | 389 | int optLabel = -1; 390 | for (int i = 0; i < dim2; ++i) { 391 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 392 | optLabel = i; 393 | } 394 | 395 | dtype maxScore = output[0][optLabel]; 396 | results.resize(dim2); 397 | 398 | dtype sum = 0.0; 399 | for (int i = 0; i < dim2; ++i) { 400 | results[i] = exp(output[0][i] - maxScore); 401 | sum += results[i]; 402 | } 403 | 404 | for (int i = 0; i < dim2; ++i) { 405 | results[i] = 
results[i]/sum; 406 | } 407 | 408 | return optLabel; 409 | 410 | } 411 | 412 | #endif 413 | -------------------------------------------------------------------------------- /SparseUniLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SparseUniLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_SparseUniLayer_H_ 9 | #define SRC_SparseUniLayer_H_ 10 | #include "tensor.h" 11 | #include "Utiltensor.h" 12 | #include "MyLib.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | // Weight updating process implemented without theory support, 19 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 20 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 21 | 22 | template 23 | class SparseUniLayer { 24 | 25 | public: 26 | 27 | hash_set _indexers; 28 | 29 | Tensor _W; 30 | Tensor _b; 31 | 32 | Tensor _gradW; 33 | Tensor _gradb; 34 | 35 | Tensor _eg2W; 36 | Tensor _eg2b; 37 | 38 | Tensor _ftW; 39 | 40 | bool _bUseB; 41 | 42 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 43 | 44 | int _max_update; 45 | NRVec _last_update; 46 | 47 | 48 | public: 49 | 50 | SparseUniLayer() { 51 | _indexers.clear(); 52 | } 53 | 54 | inline void initial(int nOSize, int nISize, bool bUseB = true, int seed = 0, int funcType = 0) { 55 | dtype bound = sqrt(6.0 / (nOSize + nISize + 1)); 56 | //dtype bound = 0.01; 57 | 58 | _W = NewTensor(Shape2(nISize, nOSize), d_zero); 59 | _gradW = NewTensor(Shape2(nISize, nOSize), d_zero); 60 | _eg2W = NewTensor(Shape2(nISize, nOSize), d_zero); 61 | _ftW = NewTensor(Shape2(nISize, nOSize), d_one); 62 | 63 | _b = NewTensor(Shape2(1, nOSize), d_zero); 64 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 65 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 66 | 67 | random(_W, -1.0 * bound, 1.0 * bound, seed); 68 | random(_b, -1.0 * bound, 1.0 * bound, seed + 1); 69 | 70 | _bUseB = bUseB; 71 | _funcType = funcType; 72 | 73 | _max_update = 0; 74 | _last_update.resize(nISize); 75 | _last_update = 0; 76 | } 77 | 78 | inline void initial(Tensor W, Tensor b, bool bUseB = true, int funcType = 0) { 79 | static int nOSize, nISize; 80 | nISize = W.size(0); 81 | nOSize = W.size(1); 82 | 83 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 84 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 85 | _eg2W = NewTensor(Shape2(nOSize, nISize), d_zero); 86 | _ftW = NewTensor(Shape2(nOSize, nISize), d_one); 87 | Copy(_W, W); 88 | 89 | _b = NewTensor(Shape2(1, nOSize), d_zero); 90 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 91 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 92 | 93 | if (bUseB) 94 | Copy(_b, b); 95 | 96 | _bUseB = bUseB; 97 | _funcType = funcType; 98 | 99 | _max_update = 0; 100 | _last_update.resize(nISize); 101 | _last_update = 0; 102 | } 103 | 104 | inline void release() { 105 | FreeSpace(&_W); 106 | FreeSpace(&_gradW); 107 | FreeSpace(&_eg2W); 108 | FreeSpace(&_ftW); 109 | FreeSpace(&_b); 110 | FreeSpace(&_gradb); 111 | FreeSpace(&_eg2b); 112 | _indexers.clear(); 113 | } 114 | 115 | virtual ~SparseUniLayer() { 116 | // TODO Auto-generated destructor stub 117 | } 118 | 119 | inline dtype squarenormAll() { 120 | dtype result = squarenorm(_gradW); 121 | 122 | if (_bUseB) { 123 | result += squarenorm(_gradb); 124 | } 125 | 126 | return result; 127 | } 128 | 129 | inline void scaleGrad(dtype scale) { 130 | _gradW = _gradW * scale; 131 | 
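// ---- Editorial note (illustrative sketch, not part of the original source) ----
// squarenormAll() and scaleGrad() are the two pieces a trainer needs for
// gradient clipping by global norm. A minimal sketch of how a caller might
// combine them with the AdaGrad update defined further below in this class;
// `layer`, `maxNorm`, `regWeight`, `alpha` and `eps` are assumed names:
#if 0
  dtype norm = sqrt(layer.squarenormAll());      // L2 norm of all stored gradients
  if (norm > maxNorm)
    layer.scaleGrad(maxNorm / norm);             // shrink every gradient in place
  layer.updateAdaGrad(regWeight, alpha, eps);    // then apply the (clipped) update
#endif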
if (_bUseB) { 132 | _gradb = _gradb * scale; 133 | } 134 | } 135 | 136 | public: 137 | void ComputeForwardScore(const std::vector& x, Tensor y) { 138 | static long long featNum, featId; 139 | featNum = x.size(); 140 | y = 0.0; 141 | for (int idx = 0; idx < featNum; idx++) { 142 | featId = x[idx]; 143 | updateSparseWeight(featId); 144 | y[0] += _W[featId]; 145 | } 146 | 147 | if (_bUseB) 148 | y = y + _b; 149 | if (_funcType == 0) 150 | y = F(y); 151 | else if (_funcType == 1) 152 | y = F(y); 153 | else if (_funcType == 3) 154 | y = F(y); 155 | } 156 | 157 | void ComputeForwardScore(const std::vector >& x, Tensor y) { 158 | static long long featNum, featId; 159 | 160 | int seq_size = y.size(0); 161 | 162 | for (int id = 0; id < seq_size; id++) { 163 | featNum = x[id].size(); 164 | y[id] = 0.0; 165 | for (int idx = 0; idx < featNum; idx++) { 166 | featId = x[id][idx]; 167 | updateSparseWeight(featId); 168 | y[id][0] += _W[featId]; 169 | } 170 | 171 | if (_bUseB) 172 | y[id] = y[id] + _b; 173 | if (_funcType == 0) 174 | y[id] = F(y[id]); 175 | else if (_funcType == 1) 176 | y[id] = F(y[id]); 177 | else if (_funcType == 3) 178 | y[id] = F(y[id]); 179 | } 180 | } 181 | 182 | void ComputeForwardScore(const std::vector >& x, std::vector > &y) { 183 | static long long featNum, featId; 184 | int seq_size = y.size(); 185 | 186 | for (int id = 0; id < seq_size; id++) { 187 | featNum = x[id].size(); 188 | y[id] = 0.0; 189 | for (int idx = 0; idx < featNum; idx++) { 190 | featId = x[id][idx]; 191 | updateSparseWeight(featId); 192 | y[id][0] += _W[featId]; 193 | } 194 | 195 | if (_bUseB) 196 | y[id] = y[id] + _b; 197 | if (_funcType == 0) 198 | y[id] = F(y[id]); 199 | else if (_funcType == 1) 200 | y[id] = F(y[id]); 201 | else if (_funcType == 3) 202 | y[id] = F(y[id]); 203 | } 204 | } 205 | // loss is stopped at this layer, since the input is one-hold alike 206 | void ComputeBackwardLoss(const std::vector& x, Tensor y, Tensor ly) { 207 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 208 | AllocSpace(&deri_yx); 209 | AllocSpace(&cly); 210 | if (_funcType == 0) { 211 | deri_yx = F(y); 212 | cly = ly * deri_yx; 213 | } else if (_funcType == 1) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 3) { 217 | cly = ly * y; 218 | } else { 219 | //cly = ly; 220 | Copy(cly, ly); 221 | } 222 | 223 | //_gradW 224 | static long long featNum, featId; 225 | featNum = x.size(); 226 | for (int idx = 0; idx < featNum; idx++) { 227 | featId = x[idx]; 228 | _indexers.insert(featId); 229 | _gradW[featId] += cly[0]; 230 | } 231 | 232 | if (_bUseB) 233 | _gradb = _gradb + cly; 234 | 235 | FreeSpace(&deri_yx); 236 | FreeSpace(&cly); 237 | } 238 | 239 | void ComputeBackwardLoss(const std::vector >& x, Tensor y, Tensor ly) { 240 | int seq_size = y.size(0); 241 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 242 | 243 | static long long featNum, featId; 244 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 245 | AllocSpace(&deri_yx); 246 | AllocSpace(&cly); 247 | 248 | for (int id = 0; id < seq_size; id++) { 249 | if (_funcType == 0) { 250 | deri_yx = F(y[id]); 251 | cly = ly[id] * deri_yx; 252 | } else if (_funcType == 1) { 253 | deri_yx = F(y[id]); 254 | cly = ly[id] * deri_yx; 255 | } else if (_funcType == 3) { 256 | cly = ly[id] * y[id]; 257 | } else { 258 | //cly = ly; 259 | Copy(cly, ly[id]); 260 | } 261 | //_gradW 262 | featNum = x[id].size(); 263 | for (int idx = 0; idx < featNum; idx++) { 264 | featId = x[id][idx]; 265 | 
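// ---- Editorial note (illustrative sketch, not part of the original source) ----
// The sparse forward pass above computes y = f( sum_{i in x} W[i] + b ): each
// active feature id selects one row of _W, the selected rows are summed, the
// bias is added, and the activation chosen by _funcType is applied. A minimal
// usage sketch; `layer`, `featIds`, `out` and the sizes are assumptions:
#if 0
  SparseUniLayer<cpu> layer;
  layer.initial(/*nOSize=*/50, /*nISize=*/100000, /*bUseB=*/true, /*seed=*/0, /*funcType=*/0);
  std::vector<int> featIds;                       // active (one-hot style) feature ids
  featIds.push_back(3);
  featIds.push_back(42);
  Tensor<cpu, 2, dtype> out = NewTensor<cpu>(Shape2(1, 50), d_zero);
  layer.ComputeForwardScore(featIds, out);        // out = tanh(W[3] + W[42] + b)
  FreeSpace(&out);
#endif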
_indexers.insert(featId); 266 | _gradW[featId] += cly[0]; 267 | } 268 | 269 | if (_bUseB) 270 | _gradb = _gradb + cly; 271 | } 272 | 273 | FreeSpace(&deri_yx); 274 | FreeSpace(&cly); 275 | } 276 | 277 | void ComputeBackwardLoss(const std::vector >& x, const std::vector > &y, 278 | const std::vector > &ly) { 279 | int seq_size = y.size(); 280 | assert(seq_size > 0); 281 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 282 | 283 | static long long featNum, featId, startPos; 284 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 285 | AllocSpace(&deri_yx); 286 | AllocSpace(&cly); 287 | 288 | for (int id = 0; id < seq_size; id++) { 289 | if (_funcType == 0) { 290 | deri_yx = F(y[id]); 291 | cly = ly[id] * deri_yx; 292 | } else if (_funcType == 1) { 293 | deri_yx = F(y[id]); 294 | cly = ly[id] * deri_yx; 295 | } else if (_funcType == 3) { 296 | cly = ly[id] * y[id]; 297 | } else { 298 | //cly = ly; 299 | Copy(cly, ly[id]); 300 | } 301 | //_gradW 302 | featNum = x[id].size(); 303 | for (int idx = 0; idx < featNum; idx++) { 304 | featId = x[id][idx]; 305 | _indexers.insert(featId); 306 | _gradW[featId] += cly[0]; 307 | } 308 | 309 | if (_bUseB) 310 | _gradb = _gradb + cly; 311 | } 312 | 313 | FreeSpace(&deri_yx); 314 | FreeSpace(&cly); 315 | } 316 | 317 | void randomprint(int num) { 318 | static int nOSize, nISize; 319 | nISize = _W.size(0); 320 | nOSize = _W.size(1); 321 | 322 | int count = 0; 323 | while (count < num) { 324 | int idx = rand() % nOSize; 325 | int idy = rand() % nISize; 326 | 327 | std::cout << "_W[" << idx << "," << idy << "]=" << _W[idy][idx] << " "; 328 | 329 | if (_bUseB) { 330 | int idz = rand() % nOSize; 331 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 332 | } 333 | count++; 334 | } 335 | 336 | std::cout << std::endl; 337 | } 338 | 339 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 340 | static int startPos; 341 | 342 | static hash_set::iterator it; 343 | 344 | _max_update++; 345 | 346 | Tensor sqrt_eg2W = NewTensor(Shape1(_W.size(1)), d_zero); 347 | 348 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 349 | int index = *it; 350 | _eg2W[index] = _eg2W[index] + _gradW[index] * _gradW[index]; 351 | sqrt_eg2W = F(_eg2W[index] + adaEps); 352 | _W[index] = (_W[index] * sqrt_eg2W - _gradW[index] * adaAlpha) / (adaAlpha * regularizationWeight + sqrt_eg2W); 353 | _ftW[index] = sqrt_eg2W / (adaAlpha * regularizationWeight + sqrt_eg2W); 354 | } 355 | 356 | FreeSpace(&sqrt_eg2W); 357 | 358 | if (_bUseB) { 359 | _gradb = _gradb + _b * regularizationWeight; 360 | _eg2b = _eg2b + _gradb * _gradb; 361 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 362 | } 363 | 364 | clearGrad(); 365 | } 366 | 367 | void clearGrad() { 368 | static hash_set::iterator it; 369 | 370 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 371 | int index = *it; 372 | _gradW[index] = 0.0; 373 | } 374 | 375 | _indexers.clear(); 376 | if (_bUseB) 377 | _gradb = 0.0; 378 | } 379 | 380 | void updateSparseWeight(long long featId) { 381 | if (_last_update[featId] < _max_update) { 382 | int times = _max_update - _last_update[featId]; 383 | _W[featId] = _W[featId] * F(times * F(_ftW[featId])); 384 | _last_update[featId] = _max_update; 385 | } 386 | } 387 | 388 | void writeModel(LStream &outf) { 389 | SaveBinary(outf, _W); 390 | SaveBinary(outf, _b); 391 | SaveBinary(outf, _gradW); 392 | SaveBinary(outf, _gradb); 393 | SaveBinary(outf, _eg2W); 394 | SaveBinary(outf, _eg2b); 395 | SaveBinary(outf, _ftW); 396 | 397 | 398 | 
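// ---- Editorial note (not part of the original source) ----
// updateAdaGrad() and updateSparseWeight() above implement lazy ("just in
// time") regularization for sparse rows. Each update stores a per-element
// shrink factor in _ftW; when a feature that has been idle since update t0
// fires again at update t, the stored factor is applied (t - t0) times in
// one step:
//     _W[featId] *= _ftW[featId]^(t - t0)
// which the code evaluates as exp((t - t0) * log(_ftW[featId])). Rows of _W
// belonging to features that never fire are therefore never touched.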
WriteBinary(outf, _bUseB); 399 | WriteBinary(outf, _funcType); 400 | WriteBinary(outf, _max_update); 401 | WriteVector(outf, _last_update); 402 | } 403 | 404 | void loadModel(LStream &inf) { 405 | LoadBinary(inf, &_W, false); 406 | LoadBinary(inf, &_b, false); 407 | LoadBinary(inf, &_gradW, false); 408 | LoadBinary(inf, &_gradb, false); 409 | LoadBinary(inf, &_eg2W, false); 410 | LoadBinary(inf, &_eg2b, false); 411 | LoadBinary(inf, &_ftW, false); 412 | 413 | ReadBinary(inf, _bUseB); 414 | ReadBinary(inf, _funcType); 415 | ReadBinary(inf, _max_update); 416 | ReadVector(inf, _last_update); 417 | } 418 | 419 | 420 | }; 421 | 422 | #endif /* SRC_SparseUniLayer_H_ */ 423 | -------------------------------------------------------------------------------- /SparseUniLayer1O.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SparseUniLayer1O.h 3 | * 4 | * Created on: Oct 22, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SPARSEUNILAYER1O_H_ 9 | #define SPARSEUNILAYER1O_H_ 10 | 11 | #include "tensor.h" 12 | #include "Utiltensor.h" 13 | #include "MyLib.h" 14 | 15 | using namespace mshadow; 16 | using namespace mshadow::expr; 17 | using namespace mshadow::utils; 18 | 19 | // Weight updating process implemented without theory support, 20 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 21 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 22 | 23 | template 24 | class SparseUniLayer1O { 25 | 26 | public: 27 | 28 | hash_set _indexers; 29 | 30 | Tensor _W; 31 | 32 | Tensor _gradW; 33 | 34 | Tensor _eg2W; 35 | 36 | Tensor _ftW; 37 | 38 | 39 | int _max_update; 40 | NRVec _last_update; 41 | 42 | 43 | public: 44 | 45 | SparseUniLayer1O() { 46 | _indexers.clear(); 47 | } 48 | 49 | inline void initial(int nISize, int seed = 0) { 50 | dtype bound = sqrt(6.0 / (nISize + 1)); 51 | //dtype bound = 0.01; 52 | 53 | _W = NewTensor(Shape1(nISize), d_zero); 54 | _gradW = NewTensor(Shape1(nISize), d_zero); 55 | _eg2W = NewTensor(Shape1(nISize), d_zero); 56 | _ftW = NewTensor(Shape1(nISize), d_one); 57 | 58 | 59 | random(_W, -1.0 * bound, 1.0 * bound, seed); 60 | 61 | _max_update = 0; 62 | _last_update.resize(nISize); 63 | _last_update = 0; 64 | } 65 | 66 | inline void initial(Tensor W) { 67 | static int nOSize, nISize; 68 | nISize = W.size(0); 69 | 70 | _W = NewTensor(Shape1(nISize), d_zero); 71 | _gradW = NewTensor(Shape1(nISize), d_zero); 72 | _eg2W = NewTensor(Shape1(nISize), d_zero); 73 | _ftW = NewTensor(Shape1(nISize), d_one); 74 | Copy(_W, W); 75 | 76 | 77 | _max_update = 0; 78 | _last_update.resize(nISize); 79 | _last_update = 0; 80 | } 81 | 82 | inline void release() { 83 | FreeSpace(&_W); 84 | FreeSpace(&_gradW); 85 | FreeSpace(&_eg2W); 86 | FreeSpace(&_ftW); 87 | _indexers.clear(); 88 | } 89 | 90 | virtual ~SparseUniLayer1O() { 91 | // TODO Auto-generated destructor stub 92 | } 93 | 94 | inline dtype squarenormAll() { 95 | dtype result = squarenorm(_gradW); 96 | 97 | return result; 98 | } 99 | 100 | inline void scaleGrad(dtype scale) { 101 | _gradW = _gradW * scale; 102 | } 103 | 104 | public: 105 | void ComputeForwardScore(const std::vector& x, dtype& y) { 106 | static long long featNum, featId; 107 | featNum = x.size(); 108 | y = 0.0; 109 | for (int idx = 0; idx < featNum; idx++) { 110 | featId = x[idx]; 111 | if(featId >= _W.size(0))continue; 112 | updateSparseWeight(featId); 113 | y += _W[featId]; 114 | } 115 | 116 | } 117 | 118 | // loss is stopped at this layer, 
since the input is one-hold alike 119 | void ComputeBackwardLoss(const std::vector& x, dtype ly) { 120 | //_gradW 121 | static long long featNum, featId; 122 | featNum = x.size(); 123 | for (int idx = 0; idx < featNum; idx++) { 124 | featId = x[idx]; 125 | if(featId >= _W.size(0))continue; 126 | _indexers.insert(featId); 127 | _gradW[featId] += ly; 128 | } 129 | } 130 | 131 | 132 | void randomprint(int num) { 133 | static int nISize; 134 | nISize = _W.size(0); 135 | 136 | int count = 0; 137 | while (count < num) { 138 | int idx = rand() % nISize; 139 | std::cout << "_W[" << idx << "]=" << _W[idx] << " "; 140 | count++; 141 | } 142 | 143 | std::cout << std::endl; 144 | } 145 | 146 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 147 | static int startPos; 148 | 149 | static hash_set::iterator it; 150 | 151 | _max_update++; 152 | 153 | dtype sqrt_eg2W = d_zero; 154 | 155 | 156 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 157 | int index = *it; 158 | _eg2W[index] = _eg2W[index] + _gradW[index] * _gradW[index]; 159 | sqrt_eg2W = sqrt(_eg2W[index] + adaEps); 160 | _W[index] = (_W[index] * sqrt_eg2W - _gradW[index] * adaAlpha) / (adaAlpha * regularizationWeight + sqrt_eg2W); 161 | _ftW[index] = sqrt_eg2W / (adaAlpha * regularizationWeight + sqrt_eg2W); 162 | } 163 | 164 | 165 | //for (it = _indexers.begin(); it != _indexers.end(); ++it) { 166 | // int index = *it; 167 | // _W[index] = _W[index] - _gradW[index]; 168 | //} 169 | 170 | clearGrad(); 171 | } 172 | 173 | void clearGrad() { 174 | static hash_set::iterator it; 175 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 176 | int index = *it; 177 | _gradW[index] = 0.0; 178 | } 179 | _indexers.clear(); 180 | 181 | } 182 | 183 | void updateSparseWeight(long long featId) { 184 | 185 | if (_last_update[featId] < _max_update) { 186 | int times = _max_update - _last_update[featId]; 187 | _W[featId] = _W[featId] * exp(times * log(_ftW[featId])); 188 | _last_update[featId] = _max_update; 189 | } 190 | } 191 | 192 | 193 | void writeModel(LStream &outf) { 194 | 195 | SaveBinary(outf, _W); 196 | SaveBinary(outf, _gradW); 197 | SaveBinary(outf, _eg2W); 198 | SaveBinary(outf, _ftW); 199 | 200 | WriteBinary(outf, _max_update); 201 | WriteVector(outf, _last_update); 202 | } 203 | 204 | void loadModel(LStream &inf) { 205 | LoadBinary(inf, &_W, false); 206 | LoadBinary(inf, &_gradW, false); 207 | LoadBinary(inf, &_eg2W, false); 208 | LoadBinary(inf, &_ftW, false); 209 | 210 | ReadBinary(inf, _max_update); 211 | ReadVector(inf, _last_update); 212 | } 213 | 214 | }; 215 | 216 | 217 | 218 | #endif /* SPARSEUNILAYER1O_H_ */ 219 | -------------------------------------------------------------------------------- /TensorLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TensorLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_TensorLayer_H_ 9 | #define SRC_TensorLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class TensorLayer { 20 | 21 | public: 22 | 23 | Tensor _W; 24 | Tensor _V; 25 | Tensor _b; 26 | 27 | Tensor _gradW; 28 | Tensor _gradV; 29 | Tensor _gradb; 30 | 31 | Tensor _eg2W; 32 | Tensor _eg2V; 33 | Tensor _eg2b; 34 | 35 | int _mode; // 1: x1 W x2; 2: x1 W x2 + V x2; 3: x1 W x2 + V x2 + b 36 | 37 | int _funcType; // 0: tanh, 1: sigmod, 
2: f(x)=x, 3: exp 38 | 39 | public: 40 | TensorLayer() { 41 | } 42 | 43 | inline void initial(int nOSize, int nISize, int mode = 1, int seed = 0, int funcType = 0) { 44 | dtype bound = sqrt(6.0 / (nOSize + nISize + 1)); 45 | //dtype bound = 0.01; 46 | 47 | _W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 48 | _gradW = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 49 | _eg2W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 50 | 51 | _V = NewTensor(Shape2(nOSize, nOSize), d_zero); 52 | _gradV = NewTensor(Shape2(nOSize, nOSize), d_zero); 53 | _eg2V = NewTensor(Shape2(nOSize, nOSize), d_zero); 54 | 55 | _b = NewTensor(Shape2(1, nOSize), d_zero); 56 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 57 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 58 | 59 | random(_W, -1.0 * bound, 1.0 * bound, seed); 60 | random(_V, -1.0 * bound, 1.0 * bound, seed + 1); 61 | random(_b, -1.0 * bound, 1.0 * bound, seed + 2); 62 | 63 | _mode = mode; 64 | _funcType = funcType; 65 | } 66 | 67 | inline void initial(Tensor W, Tensor V, Tensor b, int mode = 1, int funcType = 0) { 68 | static int nOSize, nISize; 69 | nOSize = W.size(0); 70 | nISize = W.size(1); 71 | 72 | _W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 73 | _gradW = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 74 | _eg2W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 75 | Copy(_W, W); 76 | 77 | _V = NewTensor(Shape2(nOSize, nOSize), d_zero); 78 | _gradV = NewTensor(Shape2(nOSize, nOSize), d_zero); 79 | _eg2V = NewTensor(Shape2(nOSize, nOSize), d_zero); 80 | if (mode >= 2) 81 | Copy(_V, V); 82 | 83 | _b = NewTensor(Shape2(1, nOSize), d_zero); 84 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 85 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 86 | 87 | if (mode >= 3) 88 | Copy(_b, b); 89 | 90 | _mode = mode; 91 | _funcType = funcType; 92 | } 93 | 94 | inline void release() { 95 | FreeSpace(&_W); 96 | FreeSpace(&_gradW); 97 | FreeSpace(&_eg2W); 98 | FreeSpace(&_V); 99 | FreeSpace(&_gradV); 100 | FreeSpace(&_eg2V); 101 | FreeSpace(&_b); 102 | FreeSpace(&_gradb); 103 | FreeSpace(&_eg2b); 104 | } 105 | 106 | virtual ~TensorLayer() { 107 | // TODO Auto-generated destructor stub 108 | } 109 | 110 | inline dtype squarenormAll() { 111 | dtype result = squarenorm(_gradW); 112 | 113 | if (_mode >= 2) { 114 | result += squarenorm(_gradV); 115 | } 116 | 117 | if (_mode >= 3) { 118 | result += squarenorm(_gradb); 119 | } 120 | 121 | return result; 122 | } 123 | 124 | inline void scaleGrad(dtype scale) { 125 | _gradW = _gradW * scale; 126 | if (_mode >= 2) { 127 | _gradV = _gradV * scale; 128 | } 129 | if (_mode >= 3) { 130 | _gradb = _gradb * scale; 131 | } 132 | } 133 | 134 | public: 135 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor y) { 136 | Tensor midresult1 = NewTensor(Shape2(1, y.size(1)), d_zero); 137 | Tensor midresult2 = NewTensor(Shape2(1, 1), d_zero); 138 | for (int idy = 0; idy < y.size(1); idy++) { 139 | midresult1 = dot(x1, _W[idy]); 140 | midresult2 = dot(midresult1, x2.T()); 141 | y[0][idy] = midresult2[0][0]; 142 | } 143 | 144 | if (_mode >= 2) { 145 | midresult1 = dot(x2, _V.T()); 146 | y += midresult1; 147 | } 148 | 149 | if (_mode >= 3) 150 | y = y + _b; 151 | if (_funcType == 0) 152 | y = F(y); 153 | else if (_funcType == 1) 154 | y = F(y); 155 | else if (_funcType == 3) 156 | y = F(y); 157 | 158 | FreeSpace(&midresult1); 159 | FreeSpace(&midresult2); 160 | } 161 | 162 | 163 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor y) { 164 | int seq_size = y.size(0); 165 | Tensor 
midresult1 = NewTensor(Shape2(1, y.size(2)), d_zero); 166 | Tensor midresult2 = NewTensor(Shape2(1, 1), d_zero); 167 | for(int id = 0; id < seq_size; id++){ 168 | for (int idy = 0; idy < y.size(2); idy++) { 169 | midresult1 = dot(x1[id], _W[idy]); 170 | midresult2 = dot(midresult1, x2[id].T()); 171 | y[id][0][idy] = midresult2[0][0]; 172 | } 173 | 174 | if (_mode >= 2) { 175 | midresult1 = dot(x2[id], _V.T()); 176 | y[id] += midresult1; 177 | } 178 | 179 | if (_mode >= 3) 180 | y[id] = y[id] + _b; 181 | if (_funcType == 0) 182 | y[id] = F(y[id]); 183 | else if (_funcType == 1) 184 | y[id] = F(y[id]); 185 | else if (_funcType == 3) 186 | y[id] = F(y[id]); 187 | } 188 | 189 | FreeSpace(&midresult1); 190 | FreeSpace(&midresult2); 191 | } 192 | 193 | inline void ComputeForwardScore(const std::vector > &x1, const std::vector > &x2, 194 | std::vector > &y) { 195 | int seq_size = y.size(); 196 | assert(seq_size > 0); 197 | Tensor midresult1 = NewTensor(Shape2(1, y[0].size(1)), d_zero); 198 | Tensor midresult2 = NewTensor(Shape2(1, 1), d_zero); 199 | for(int id = 0; id < seq_size; id++){ 200 | for (int idy = 0; idy < y.size(2); idy++) { 201 | midresult1 = dot(x1[id], _W[idy]); 202 | midresult2 = dot(midresult1, x2[id].T()); 203 | y[id][0][idy] = midresult2[0][0]; 204 | } 205 | 206 | if (_mode >= 2) { 207 | midresult1 = dot(x2[id], _V.T()); 208 | y[id] += midresult1; 209 | } 210 | 211 | if (_mode >= 3) 212 | y[id] = y[id] + _b; 213 | if (_funcType == 0) 214 | y[id] = F(y[id]); 215 | else if (_funcType == 1) 216 | y[id] = F(y[id]); 217 | else if (_funcType == 3) 218 | y[id] = F(y[id]); 219 | } 220 | 221 | FreeSpace(&midresult1); 222 | FreeSpace(&midresult2); 223 | } 224 | 225 | //please allocate the memory outside here 226 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor y, Tensor ly, 227 | Tensor lx1, Tensor lx2, bool bclear = false) { 228 | //_gradW 229 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 230 | AllocSpace(&deri_yx); 231 | AllocSpace(&cly); 232 | 233 | if(bclear) { 234 | lx1 = 0.0; 235 | lx2 = 0.0; 236 | } 237 | if (_funcType == 0) { 238 | deri_yx = F(y); 239 | cly = ly * deri_yx; 240 | } else if (_funcType == 1) { 241 | deri_yx = F(y); 242 | cly = ly * deri_yx; 243 | } else if (_funcType == 3) { 244 | cly = ly * y; 245 | } else { 246 | //cly = ly; 247 | Copy(cly, ly); 248 | } 249 | 250 | Tensor midresult1 = NewTensor(Shape2(1, y.size(1)), d_zero); 251 | Tensor midresult2 = NewTensor(Shape2(1, y.size(1)), d_zero); 252 | //_gradW 253 | for (int idy = 0; idy < y.size(1); idy++) { 254 | midresult1 = dot(x1, _W[idy]); 255 | lx2 += cly[0][idy] * midresult1; 256 | midresult2 = cly[0][idy] * x2; 257 | _gradW[idy] += dot(x1.T(), midresult2); 258 | lx1 += dot(midresult2, _W[idy].T()); 259 | } 260 | 261 | //_gradV 262 | if (_mode >= 2) { 263 | _gradV += dot(cly.T(), x2); 264 | //lx 265 | lx2 += dot(cly, _V); 266 | } 267 | 268 | //_gradb 269 | if (_mode >= 3) 270 | _gradb += cly; 271 | 272 | FreeSpace(&deri_yx); 273 | FreeSpace(&cly); 274 | FreeSpace(&midresult1); 275 | FreeSpace(&midresult2); 276 | } 277 | 278 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor y, Tensor ly, 279 | Tensor lx1, Tensor lx2, bool bclear = false) { 280 | int seq_size = y.size(0); 281 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 282 | assert(y_dim1 == 1); 283 | //_gradW 284 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 285 | Tensor midresult1 = NewTensor(Shape2(y_dim1, y_dim2), d_zero); 286 | Tensor midresult2 = 
NewTensor(Shape2(y_dim1, y_dim2), d_zero); 287 | AllocSpace(&deri_yx); 288 | AllocSpace(&cly); 289 | 290 | if(bclear) { 291 | lx1 = 0.0; 292 | lx2 = 0.0; 293 | } 294 | for (int id = 0; id < seq_size; id++) { 295 | if (_funcType == 0) { 296 | deri_yx = F(y[id]); 297 | cly = ly[id] * deri_yx; 298 | } else if (_funcType == 1) { 299 | deri_yx = F(y); 300 | cly = ly[id] * deri_yx; 301 | } else if (_funcType == 3) { 302 | cly = ly[id] * y[id]; 303 | } else { 304 | //cly = ly; 305 | Copy(cly, ly[id]); 306 | } 307 | 308 | //_gradW 309 | for (int idy = 0; idy < y.size(2); idy++) { 310 | midresult1 = dot(x1[id], _W[idy]); 311 | lx2[id] += cly[0][idy] * midresult1; 312 | midresult2 = cly[0][idy] * x2[id]; 313 | _gradW[idy] += dot(x1[id].T(), midresult2); 314 | lx1[id] += dot(midresult2, _W[idy].T()); 315 | } 316 | 317 | //_gradV 318 | if (_mode >= 2) { 319 | _gradV += dot(cly.T(), x2[id]); 320 | //lx 321 | lx2[id] += dot(cly, _V); 322 | } 323 | 324 | //_gradb 325 | if (_mode >= 3) 326 | _gradb += cly; 327 | } 328 | 329 | FreeSpace(&deri_yx); 330 | FreeSpace(&cly); 331 | FreeSpace(&midresult1); 332 | FreeSpace(&midresult2); 333 | } 334 | 335 | inline void ComputeBackwardLoss(const std::vector > &x1, const std::vector > &x2, 336 | const std::vector > &y, const std::vector > &ly, 337 | std::vector > &lx1, std::vector > &lx2, bool bclear = false) { 338 | int seq_size = y.size(); 339 | assert(seq_size > 0); 340 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 341 | assert(y_dim1 == 1); 342 | //_gradW 343 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 344 | Tensor midresult1 = NewTensor(Shape2(y_dim1, y_dim2), d_zero); 345 | Tensor midresult2 = NewTensor(Shape2(y_dim1, y_dim2), d_zero); 346 | AllocSpace(&deri_yx); 347 | AllocSpace(&cly); 348 | 349 | if(bclear) { 350 | for (int id = 0; id < seq_size; id++) { 351 | lx1[id] = 0.0; 352 | lx2[id] = 0.0; 353 | } 354 | } 355 | for (int id = 0; id < seq_size; id++) { 356 | if (_funcType == 0) { 357 | deri_yx = F(y[id]); 358 | cly = ly[id] * deri_yx; 359 | } else if (_funcType == 1) { 360 | deri_yx = F(y); 361 | cly = ly[id] * deri_yx; 362 | } else if (_funcType == 3) { 363 | cly = ly[id] * y[id]; 364 | } else { 365 | //cly = ly; 366 | Copy(cly, ly[id]); 367 | } 368 | 369 | //_gradW 370 | for (int idy = 0; idy < y.size(2); idy++) { 371 | midresult1 = dot(x1[id], _W[idy]); 372 | lx2[id] += cly[0][idy] * midresult1; 373 | midresult2 = cly[0][idy] * x2[id]; 374 | _gradW[idy] += dot(x1[id].T(), midresult2); 375 | lx1[id] += dot(midresult2, _W[idy].T()); 376 | } 377 | 378 | //_gradV 379 | if (_mode >= 2) { 380 | _gradV += dot(cly.T(), x2[id]); 381 | //lx 382 | lx2[id] += dot(cly, _V); 383 | } 384 | 385 | //_gradb 386 | if (_mode >= 3) 387 | _gradb += cly; 388 | } 389 | 390 | FreeSpace(&deri_yx); 391 | FreeSpace(&cly); 392 | FreeSpace(&midresult1); 393 | FreeSpace(&midresult2); 394 | } 395 | 396 | inline void randomprint(int num) { 397 | static int nOSize, nISize; 398 | nOSize = _W.size(0); 399 | nISize = _W.size(1); 400 | int count = 0; 401 | while (count < num) { 402 | int idx = rand() % nOSize; 403 | int idy = rand() % nISize; 404 | int idz = rand() % nOSize; 405 | std::cout << "_W[" << idx << "," << idy << "," << idz << "]=" << _W[idx][idy][idz] << " "; 406 | 407 | if (_mode >= 2) { 408 | int idy = rand() % nOSize; 409 | int idz = rand() % nOSize; 410 | std::cout << "_V[" << idy << "," << idz << "]=" << _V[idy][idz] << " "; 411 | } 412 | 413 | if (_mode >= 3) { 414 | int idz = rand() % nOSize; 415 | std::cout << "_b[0][" << idz << "]=" << 
_b[0][idz] << " "; 416 | } 417 | count++; 418 | } 419 | 420 | std::cout << std::endl; 421 | } 422 | 423 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 424 | _gradW = _gradW + _W * regularizationWeight; 425 | _eg2W = _eg2W + _gradW * _gradW; 426 | _W = _W - _gradW * adaAlpha / F(_eg2W + adaEps); 427 | 428 | if (_mode >= 2) { 429 | _gradV = _gradV + _V * regularizationWeight; 430 | _eg2V = _eg2V + _gradV * _gradV; 431 | _V = _V - _gradV * adaAlpha / F(_eg2V + adaEps); 432 | } 433 | 434 | if (_mode >= 3) { 435 | _gradb = _gradb + _b * regularizationWeight; 436 | _eg2b = _eg2b + _gradb * _gradb; 437 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 438 | } 439 | 440 | clearGrad(); 441 | } 442 | 443 | inline void clearGrad() { 444 | _gradW = 0; 445 | if (_mode >= 2) 446 | _gradV = 0; 447 | if (_mode >= 3) 448 | _gradb = 0; 449 | } 450 | 451 | void writeModel(LStream &outf) { 452 | SaveBinary(outf, _W); 453 | SaveBinary(outf, _V); 454 | SaveBinary(outf, _b); 455 | 456 | SaveBinary(outf, _gradW); 457 | SaveBinary(outf, _gradV); 458 | SaveBinary(outf, _gradb); 459 | 460 | SaveBinary(outf, _eg2W); 461 | SaveBinary(outf, _eg2V); 462 | SaveBinary(outf, _eg2b); 463 | 464 | WriteBinary(outf, _mode); 465 | WriteBinary(outf, _funcType); 466 | } 467 | 468 | void loadModel(LStream &inf) { 469 | LoadBinary(inf, &_W, false); 470 | LoadBinary(inf, &_V, false); 471 | LoadBinary(inf, &_b, false); 472 | 473 | LoadBinary(inf, &_gradW, false); 474 | LoadBinary(inf, &_gradV, false); 475 | LoadBinary(inf, &_gradb, false); 476 | 477 | LoadBinary(inf, &_eg2W, false); 478 | LoadBinary(inf, &_eg2V, false); 479 | LoadBinary(inf, &_eg2b, false); 480 | 481 | ReadBinary(inf, _mode); 482 | ReadBinary(inf, _funcType); 483 | } 484 | 485 | }; 486 | 487 | #endif /* SRC_TensorLayer_H_ */ 488 | -------------------------------------------------------------------------------- /TriLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TriLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_TriLayer_H_ 9 | #define SRC_TriLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class TriLayer { 20 | 21 | public: 22 | 23 | Tensor _W1; 24 | Tensor _W2; 25 | Tensor _W3; 26 | Tensor _b; 27 | 28 | Tensor _gradW1; 29 | Tensor _gradW2; 30 | Tensor _gradW3; 31 | Tensor _gradb; 32 | 33 | Tensor _eg2W1; 34 | Tensor _eg2W2; 35 | Tensor _eg2W3; 36 | Tensor _eg2b; 37 | 38 | bool _bUseB; 39 | 40 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 41 | 42 | public: 43 | TriLayer() { 44 | } 45 | 46 | inline void initial(int nOSize, int nISize1, int nISize2, int nISize3, bool bUseB = true, int seed = 0, int funcType = 0) { 47 | dtype bound = sqrt(6.0 / (nOSize + nISize1 + nISize2 + nISize3 + 1)); 48 | //dtype bound = 0.01; 49 | 50 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 51 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 52 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 53 | 54 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 55 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 56 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 57 | 58 | _W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 59 | _gradW3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 60 | _eg2W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 61 | 62 | _b = 
NewTensor(Shape2(1, nOSize), d_zero); 63 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 64 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 65 | 66 | random(_W1, -1.0 * bound, 1.0 * bound, seed); 67 | random(_W2, -1.0 * bound, 1.0 * bound, seed+1); 68 | random(_W3, -1.0 * bound, 1.0 * bound, seed+2); 69 | random(_b, -1.0 * bound, 1.0 * bound, seed+3); 70 | 71 | _bUseB = bUseB; 72 | _funcType = funcType; 73 | } 74 | 75 | inline void initial(Tensor W1, Tensor W2, Tensor W3, Tensor b, bool bUseB = true, 76 | int funcType = 0) { 77 | static int nOSize, nISize1, nISize2, nISize3; 78 | nOSize = W1.size(0); 79 | nISize1 = W1.size(1); 80 | nISize2 = W2.size(1); 81 | nISize3 = W3.size(1); 82 | 83 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 84 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 85 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 86 | Copy(_W1, W1); 87 | 88 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 89 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 90 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 91 | Copy(_W2, W2); 92 | 93 | _W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 94 | _gradW3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 95 | _eg2W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 96 | Copy(_W3, W3); 97 | 98 | _b = NewTensor(Shape2(1, nOSize), d_zero); 99 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 100 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 101 | 102 | if (bUseB) 103 | Copy(_b, b); 104 | 105 | _bUseB = bUseB; 106 | _funcType = funcType; 107 | } 108 | 109 | inline void release() { 110 | FreeSpace(&_W1); 111 | FreeSpace(&_gradW1); 112 | FreeSpace(&_eg2W1); 113 | FreeSpace(&_W2); 114 | FreeSpace(&_gradW2); 115 | FreeSpace(&_eg2W2); 116 | FreeSpace(&_W3); 117 | FreeSpace(&_gradW3); 118 | FreeSpace(&_eg2W3); 119 | FreeSpace(&_b); 120 | FreeSpace(&_gradb); 121 | FreeSpace(&_eg2b); 122 | } 123 | 124 | virtual ~TriLayer() { 125 | // TODO Auto-generated destructor stub 126 | } 127 | 128 | inline dtype squarenormAll() { 129 | dtype result = squarenorm(_gradW1); 130 | result += squarenorm(_gradW2); 131 | result += squarenorm(_gradW3); 132 | if (_bUseB) { 133 | result += squarenorm(_gradb); 134 | } 135 | 136 | return result; 137 | } 138 | 139 | inline void scaleGrad(dtype scale) { 140 | _gradW1 = _gradW1 * scale; 141 | _gradW2 = _gradW2 * scale; 142 | _gradW3 = _gradW3 * scale; 143 | if (_bUseB) { 144 | _gradb = _gradb * scale; 145 | } 146 | } 147 | 148 | public: 149 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 150 | y = dot(x1, _W1.T()); 151 | y += dot(x2, _W2.T()); 152 | y += dot(x3, _W3.T()); 153 | if (_bUseB) 154 | y = y + _b; 155 | if (_funcType == 0) 156 | y = F(y); 157 | else if (_funcType == 1) 158 | y = F(y); 159 | else if (_funcType == 3) 160 | y = F(y); 161 | } 162 | 163 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 164 | int seq_size = y.size(0); 165 | 166 | for (int id = 0; id < seq_size; id++) { 167 | y[id] = dot(x1[id], _W1.T()); 168 | y[id] += dot(x2[id], _W2.T()); 169 | y[id] += dot(x3[id], _W3.T()); 170 | if (_bUseB) 171 | y[id] = y[id] + _b; 172 | if (_funcType == 0) 173 | y[id] = F(y[id]); 174 | else if (_funcType == 1) 175 | y[id] = F(y[id]); 176 | else if (_funcType == 3) 177 | y[id] = F(y[id]); 178 | } 179 | } 180 | 181 | inline void ComputeForwardScore(const std::vector > &x1, const std::vector > &x2, 182 | const std::vector > &x3, std::vector > &y) { 183 | int seq_size = y.size(); 184 | 185 | for (int id = 0; id < seq_size; id++) { 186 | 
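// ---- Editorial note (illustrative sketch, not part of the original source) ----
// TriLayer's forward pass above is a three-input affine layer,
//     y = f( x1 * W1^T + x2 * W2^T + x3 * W3^T + b ),
// with f selected by _funcType (0: tanh, 1: sigmoid, 2: identity, 3: exp).
// A minimal single-example usage sketch; `layer`, `x1`, `x2`, `x3`, `y` and
// the sizes are assumptions made only for illustration:
#if 0
  TriLayer<cpu> layer;
  layer.initial(/*nOSize=*/20, /*nISize1=*/10, /*nISize2=*/10, /*nISize3=*/5);
  Tensor<cpu, 2, dtype> x1 = NewTensor<cpu>(Shape2(1, 10), d_zero);
  Tensor<cpu, 2, dtype> x2 = NewTensor<cpu>(Shape2(1, 10), d_zero);
  Tensor<cpu, 2, dtype> x3 = NewTensor<cpu>(Shape2(1, 5), d_zero);
  Tensor<cpu, 2, dtype> y  = NewTensor<cpu>(Shape2(1, 20), d_zero);
  layer.ComputeForwardScore(x1, x2, x3, y);       // y = tanh(x1 W1^T + x2 W2^T + x3 W3^T + b)
  FreeSpace(&x1); FreeSpace(&x2); FreeSpace(&x3); FreeSpace(&y);
#endif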
y[id] = dot(x1[id], _W1.T()); 187 | y[id] += dot(x2[id], _W2.T()); 188 | y[id] += dot(x3[id], _W3.T()); 189 | if (_bUseB) 190 | y[id] = y[id] + _b; 191 | if (_funcType == 0) 192 | y[id] = F(y[id]); 193 | else if (_funcType == 1) 194 | y[id] = F(y[id]); 195 | else if (_funcType == 3) 196 | y[id] = F(y[id]); 197 | } 198 | } 199 | 200 | //please allocate the memory outside here 201 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 202 | Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 203 | //_gradW 204 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 205 | AllocSpace(&deri_yx); 206 | AllocSpace(&cly); 207 | 208 | if(bclear) { 209 | lx1 = 0.0; 210 | lx2 = 0.0; 211 | lx3 = 0.0; 212 | } 213 | if (_funcType == 0) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 1) { 217 | deri_yx = F(y); 218 | cly = ly * deri_yx; 219 | } else if (_funcType == 3) { 220 | cly = ly * y; 221 | } else { 222 | //cly = ly; 223 | Copy(cly, ly); 224 | } 225 | //_gradW 226 | _gradW1 += dot(cly.T(), x1); 227 | _gradW2 += dot(cly.T(), x2); 228 | _gradW3 += dot(cly.T(), x3); 229 | 230 | //_gradb 231 | if (_bUseB) 232 | _gradb += cly; 233 | 234 | //lx 235 | lx1 += dot(cly, _W1); 236 | lx2 += dot(cly, _W2); 237 | lx3 += dot(cly, _W3); 238 | 239 | FreeSpace(&deri_yx); 240 | FreeSpace(&cly); 241 | } 242 | 243 | 244 | //please allocate the memory outside here 245 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 246 | Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 247 | int seq_size = y.size(0); 248 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 249 | //_gradW 250 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 251 | AllocSpace(&deri_yx); 252 | AllocSpace(&cly); 253 | if(bclear) { 254 | lx1 = 0.0; 255 | lx2 = 0.0; 256 | lx3 = 0.0; 257 | } 258 | for (int id = 0; id < seq_size; id++) { 259 | if (_funcType == 0) { 260 | deri_yx = F(y[id]); 261 | cly = ly[id] * deri_yx; 262 | } else if (_funcType == 1) { 263 | deri_yx = F(y[id]); 264 | cly = ly[id] * deri_yx; 265 | } else if (_funcType == 3) { 266 | cly = ly[id] * y[id]; 267 | } else { 268 | //cly = ly; 269 | Copy(cly, ly[id]); 270 | } 271 | //_gradW 272 | _gradW1 += dot(cly.T(), x1[id]); 273 | _gradW2 += dot(cly.T(), x2[id]); 274 | _gradW3 += dot(cly.T(), x3[id]); 275 | 276 | //_gradb 277 | if (_bUseB) 278 | _gradb += cly; 279 | 280 | //lx 281 | lx1[id] += dot(cly, _W1); 282 | lx2[id] += dot(cly, _W2); 283 | lx3[id] += dot(cly, _W3); 284 | } 285 | 286 | FreeSpace(&deri_yx); 287 | FreeSpace(&cly); 288 | } 289 | 290 | 291 | //please allocate the memory outside here 292 | inline void ComputeBackwardLoss(const std::vector > &x1, const std::vector > &x2, 293 | const std::vector > &x3, const std::vector > &y, 294 | const std::vector > &ly, std::vector > &lx1, 295 | std::vector > &lx2, std::vector > &lx3, bool bclear = false) { 296 | int seq_size = y.size(); 297 | assert(seq_size > 0); 298 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 299 | //_gradW 300 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 301 | AllocSpace(&deri_yx); 302 | AllocSpace(&cly); 303 | if(bclear) { 304 | for (int id = 0; id < seq_size; id++) { 305 | lx1[id] = 0.0; 306 | lx2[id] = 0.0; 307 | lx3[id] = 0.0; 308 | } 309 | } 310 | for (int id = 0; id < seq_size; id++) { 311 | if (_funcType == 0) { 312 | deri_yx = F(y[id]); 313 | cly = ly[id] * deri_yx; 314 | } else if (_funcType == 1) { 315 | deri_yx = F(y[id]); 
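// ---- Editorial note (not part of the original source) ----
// The backward passes here use the usual trick of computing the activation
// derivative from the output y instead of the pre-activation (the concrete
// functor names inside F<...> were lost in extraction):
//     tanh:     dy/dz = 1 - y^2
//     sigmoid:  dy/dz = y * (1 - y)
//     exp:      dy/dz = y          (hence cly = ly * y with no extra functor)
// After cly is formed, each input contributes _gradWk += cly^T * xk and
// receives lxk += cly * Wk, and the bias accumulates _gradb += cly.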
316 | cly = ly[id] * deri_yx; 317 | } else if (_funcType == 3) { 318 | cly = ly[id] * y[id]; 319 | } else { 320 | //cly = ly; 321 | Copy(cly, ly[id]); 322 | } 323 | //_gradW 324 | _gradW1 += dot(cly.T(), x1[id]); 325 | _gradW2 += dot(cly.T(), x2[id]); 326 | _gradW3 += dot(cly.T(), x3[id]); 327 | 328 | //_gradb 329 | if (_bUseB) 330 | _gradb += cly; 331 | 332 | //lx 333 | lx1[id] += dot(cly, _W1); 334 | lx2[id] += dot(cly, _W2); 335 | lx3[id] += dot(cly, _W3); 336 | } 337 | 338 | FreeSpace(&deri_yx); 339 | FreeSpace(&cly); 340 | } 341 | 342 | inline void randomprint(int num) { 343 | static int nOSize, nISize1, nISize2, nISize3; 344 | nOSize = _W1.size(0); 345 | nISize1 = _W1.size(1); 346 | nISize2 = _W2.size(1); 347 | nISize3 = _W3.size(1); 348 | int count = 0; 349 | while (count < num) { 350 | int idx1 = rand() % nOSize; 351 | int idy1 = rand() % nISize1; 352 | int idx2 = rand() % nOSize; 353 | int idy2 = rand() % nISize2; 354 | int idx3 = rand() % nOSize; 355 | int idy3 = rand() % nISize3; 356 | 357 | std::cout << "_W1[" << idx1 << "," << idy1 << "]=" << _W1[idx1][idy1] << " "; 358 | std::cout << "_W2[" << idx2 << "," << idy2 << "]=" << _W2[idx2][idy2] << " "; 359 | std::cout << "_W3[" << idx3 << "," << idy3 << "]=" << _W3[idx3][idy3] << " "; 360 | 361 | if (_bUseB) { 362 | int idz = rand() % nOSize; 363 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 364 | } 365 | count++; 366 | } 367 | 368 | std::cout << std::endl; 369 | } 370 | 371 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 372 | _gradW1 = _gradW1 + _W1 * regularizationWeight; 373 | _eg2W1 = _eg2W1 + _gradW1 * _gradW1; 374 | _W1 = _W1 - _gradW1 * adaAlpha / F(_eg2W1 + adaEps); 375 | 376 | _gradW2 = _gradW2 + _W2 * regularizationWeight; 377 | _eg2W2 = _eg2W2 + _gradW2 * _gradW2; 378 | _W2 = _W2 - _gradW2 * adaAlpha / F(_eg2W2 + adaEps); 379 | 380 | _gradW3 = _gradW3 + _W3 * regularizationWeight; 381 | _eg2W3 = _eg2W3 + _gradW3 * _gradW3; 382 | _W3 = _W3 - _gradW3 * adaAlpha / F(_eg2W3 + adaEps); 383 | 384 | if (_bUseB) { 385 | _gradb = _gradb + _b * regularizationWeight; 386 | _eg2b = _eg2b + _gradb * _gradb; 387 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 388 | } 389 | 390 | clearGrad(); 391 | } 392 | 393 | inline void clearGrad() { 394 | _gradW1 = 0; 395 | _gradW2 = 0; 396 | _gradW3 = 0; 397 | if (_bUseB) 398 | _gradb = 0; 399 | } 400 | 401 | void writeModel(LStream &outf) { 402 | SaveBinary(outf, _W1); 403 | SaveBinary(outf, _W2); 404 | SaveBinary(outf, _W3); 405 | SaveBinary(outf, _b); 406 | 407 | SaveBinary(outf, _gradW1); 408 | SaveBinary(outf, _gradW2); 409 | SaveBinary(outf, _gradW3); 410 | SaveBinary(outf, _gradb); 411 | 412 | SaveBinary(outf, _eg2W1); 413 | SaveBinary(outf, _eg2W2); 414 | SaveBinary(outf, _eg2W3); 415 | SaveBinary(outf, _eg2b); 416 | 417 | WriteBinary(outf, _bUseB); 418 | WriteBinary(outf, _funcType); 419 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 420 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 421 | 422 | 423 | } 424 | 425 | void loadModel(LStream &inf) { 426 | LoadBinary(inf, &_W1, false); 427 | LoadBinary(inf, &_W2, false); 428 | LoadBinary(inf, &_W3, false); 429 | LoadBinary(inf, &_b, false); 430 | 431 | LoadBinary(inf, &_gradW1, false); 432 | LoadBinary(inf, &_gradW2, false); 433 | LoadBinary(inf, &_gradW3, false); 434 | LoadBinary(inf, &_gradb, false); 435 | 436 | LoadBinary(inf, &_eg2W1, false); 437 | LoadBinary(inf, &_eg2W2, false); 438 | LoadBinary(inf, 
&_eg2W3, false); 439 | LoadBinary(inf, &_eg2b, false); 440 | 441 | ReadBinary(inf, _bUseB); 442 | ReadBinary(inf, _funcType); 443 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 444 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 445 | } 446 | 447 | }; 448 | 449 | #endif /* SRC_TriLayer_H_ */ 450 | -------------------------------------------------------------------------------- /TriLayerLSTM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TriLayerLSTM.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_TriLayerLSTM_H_ 9 | #define SRC_TriLayerLSTM_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class TriLayerLSTM { 20 | 21 | public: 22 | 23 | Tensor _W1; 24 | Tensor _W2; 25 | Tensor _W3; 26 | Tensor _b; 27 | 28 | Tensor _gradW1; 29 | Tensor _gradW2; 30 | Tensor _gradW3; 31 | Tensor _gradb; 32 | 33 | Tensor _eg2W1; 34 | Tensor _eg2W2; 35 | Tensor _eg2W3; 36 | Tensor _eg2b; 37 | 38 | bool _bUseB; 39 | 40 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 41 | 42 | public: 43 | TriLayerLSTM() { 44 | } 45 | 46 | inline void initial(int nOSize, int nISize1, int nISize2, bool bUseB = true, int seed = 0, int funcType = 0) { 47 | dtype bound = sqrt(6.0 / (nOSize + nISize1 + nISize2 + 1)); 48 | //dtype bound = 0.01; 49 | 50 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 51 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 52 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 53 | 54 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 55 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 56 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 57 | 58 | _W3 = NewTensor(Shape2(1, nOSize), d_zero); 59 | _gradW3 = NewTensor(Shape2(1, nOSize), d_zero); 60 | _eg2W3 = NewTensor(Shape2(1, nOSize), d_zero); 61 | 62 | _b = NewTensor(Shape2(1, nOSize), d_zero); 63 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 64 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 65 | 66 | random(_W1, -1.0 * bound, 1.0 * bound, seed); 67 | random(_W2, -1.0 * bound, 1.0 * bound, seed+1); 68 | random(_W3, -1.0 * bound, 1.0 * bound, seed+2); 69 | random(_b, -1.0 * bound, 1.0 * bound, seed+3); 70 | 71 | _bUseB = bUseB; 72 | _funcType = funcType; 73 | } 74 | 75 | inline void initial(Tensor W1, Tensor W2, Tensor W3, Tensor b, bool bUseB = true, 76 | int funcType = 0) { 77 | static int nOSize, nISize1, nISize2; 78 | nOSize = W1.size(0); 79 | nISize1 = W1.size(1); 80 | nISize2 = W2.size(1); 81 | 82 | 83 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 84 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 85 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 86 | Copy(_W1, W1); 87 | 88 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 89 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 90 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 91 | Copy(_W2, W2); 92 | 93 | _W3 = NewTensor(Shape2(1, nOSize), d_zero); 94 | _gradW3 = NewTensor(Shape2(1, nOSize), d_zero); 95 | _eg2W3 = NewTensor(Shape2(1, nOSize), d_zero); 96 | Copy(_W3, W3); 97 | 98 | _b = NewTensor(Shape2(1, nOSize), d_zero); 99 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 100 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 101 | 102 | if (bUseB) 103 | Copy(_b, b); 104 | 105 | _bUseB = bUseB; 106 | 
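// ---- Editorial note (not part of the original source) ----
// Unlike TriLayer, the third input here is combined element-wise: _W3 is a
// 1 x nOSize vector, and the forward pass below uses  y += x3 * _W3  rather
// than a matrix product. This looks like a peephole-style coupling, intended
// for the case where x3 (e.g. an LSTM cell state) already has the same width
// as the output; that reading is an editorial interpretation, not a claim
// from the original author.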
_funcType = funcType; 107 | } 108 | 109 | inline void release() { 110 | FreeSpace(&_W1); 111 | FreeSpace(&_gradW1); 112 | FreeSpace(&_eg2W1); 113 | FreeSpace(&_W2); 114 | FreeSpace(&_gradW2); 115 | FreeSpace(&_eg2W2); 116 | FreeSpace(&_W3); 117 | FreeSpace(&_gradW3); 118 | FreeSpace(&_eg2W3); 119 | FreeSpace(&_b); 120 | FreeSpace(&_gradb); 121 | FreeSpace(&_eg2b); 122 | } 123 | 124 | virtual ~TriLayerLSTM() { 125 | // TODO Auto-generated destructor stub 126 | } 127 | 128 | inline dtype squarenormAll() { 129 | dtype result = squarenorm(_gradW1); 130 | result += squarenorm(_gradW2); 131 | result += squarenorm(_gradW3); 132 | if (_bUseB) { 133 | result += squarenorm(_gradb); 134 | } 135 | 136 | return result; 137 | } 138 | 139 | inline void scaleGrad(dtype scale) { 140 | _gradW1 = _gradW1 * scale; 141 | _gradW2 = _gradW2 * scale; 142 | _gradW3 = _gradW3 * scale; 143 | if (_bUseB) { 144 | _gradb = _gradb * scale; 145 | } 146 | } 147 | 148 | public: 149 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 150 | y = dot(x1, _W1.T()); 151 | y += dot(x2, _W2.T()); 152 | y += x3 * _W3; 153 | if (_bUseB) 154 | y = y + _b; 155 | if (_funcType == 0) 156 | y = F(y); 157 | else if (_funcType == 1) 158 | y = F(y); 159 | else if (_funcType == 3) 160 | y = F(y); 161 | } 162 | 163 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 164 | int seq_size = y.size(0); 165 | 166 | for (int id = 0; id < seq_size; id++) { 167 | y[id] = dot(x1[id], _W1.T()); 168 | y[id] += dot(x2[id], _W2.T()); 169 | y[id] += x3[id] * _W3; 170 | if (_bUseB) 171 | y[id] = y[id] + _b; 172 | if (_funcType == 0) 173 | y[id] = F(y[id]); 174 | else if (_funcType == 1) 175 | y[id] = F(y[id]); 176 | else if (_funcType == 3) 177 | y[id] = F(y[id]); 178 | } 179 | } 180 | 181 | inline void ComputeForwardScore(const std::vector > &x1, const std::vector > &x2, 182 | const std::vector > &x3, std::vector > &y) { 183 | int seq_size = y.size(); 184 | 185 | for (int id = 0; id < seq_size; id++) { 186 | y[id] = dot(x1[id], _W1.T()); 187 | y[id] += dot(x2[id], _W2.T()); 188 | y[id] += x3[id] * _W3; 189 | if (_bUseB) 190 | y[id] = y[id] + _b; 191 | if (_funcType == 0) 192 | y[id] = F(y[id]); 193 | else if (_funcType == 1) 194 | y[id] = F(y[id]); 195 | else if (_funcType == 3) 196 | y[id] = F(y[id]); 197 | } 198 | } 199 | 200 | //please allocate the memory outside here 201 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 202 | Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 203 | //_gradW 204 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 205 | AllocSpace(&deri_yx); 206 | AllocSpace(&cly); 207 | 208 | if(bclear) { 209 | lx1 = 0.0; 210 | lx2 = 0.0; 211 | lx3 = 0.0; 212 | } 213 | if (_funcType == 0) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 1) { 217 | deri_yx = F(y); 218 | cly = ly * deri_yx; 219 | } else if (_funcType == 3) { 220 | cly = ly * y; 221 | } else { 222 | //cly = ly; 223 | Copy(cly, ly); 224 | } 225 | //_gradW 226 | _gradW1 += dot(cly.T(), x1); 227 | _gradW2 += dot(cly.T(), x2); 228 | _gradW3 += cly * x3; 229 | 230 | //_gradb 231 | if (_bUseB) 232 | _gradb += cly; 233 | 234 | //lx 235 | lx1 += dot(cly, _W1); 236 | lx2 += dot(cly, _W2); 237 | lx3 += cly * _W3; 238 | 239 | FreeSpace(&deri_yx); 240 | FreeSpace(&cly); 241 | } 242 | 243 | 244 | //please allocate the memory outside here 245 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 246 | 
Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 247 | int seq_size = y.size(0); 248 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 249 | //_gradW 250 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 251 | AllocSpace(&deri_yx); 252 | AllocSpace(&cly); 253 | if(bclear) { 254 | lx1 = 0.0; 255 | lx2 = 0.0; 256 | lx3 = 0.0; 257 | } 258 | for (int id = 0; id < seq_size; id++) { 259 | if (_funcType == 0) { 260 | deri_yx = F(y[id]); 261 | cly = ly[id] * deri_yx; 262 | } else if (_funcType == 1) { 263 | deri_yx = F(y[id]); 264 | cly = ly[id] * deri_yx; 265 | } else if (_funcType == 3) { 266 | cly = ly[id] * y[id]; 267 | } else { 268 | //cly = ly; 269 | Copy(cly, ly[id]); 270 | } 271 | //_gradW 272 | _gradW1 += dot(cly.T(), x1[id]); 273 | _gradW2 += dot(cly.T(), x2[id]); 274 | _gradW3 += cly * x3[id]; 275 | 276 | //_gradb 277 | if (_bUseB) 278 | _gradb += cly; 279 | 280 | //lx 281 | lx1[id] += dot(cly, _W1); 282 | lx2[id] += dot(cly, _W2); 283 | lx3[id] += cly * _W3; 284 | } 285 | 286 | FreeSpace(&deri_yx); 287 | FreeSpace(&cly); 288 | } 289 | 290 | 291 | //please allocate the memory outside here 292 | inline void ComputeBackwardLoss(const std::vector > &x1, const std::vector > &x2, 293 | const std::vector > &x3, const std::vector > &y, 294 | const std::vector > &ly, std::vector > &lx1, 295 | std::vector > &lx2, std::vector > &lx3, bool bclear = false) { 296 | int seq_size = y.size(); 297 | assert(seq_size > 0); 298 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 299 | //_gradW 300 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 301 | AllocSpace(&deri_yx); 302 | AllocSpace(&cly); 303 | if(bclear) { 304 | for (int id = 0; id < seq_size; id++) { 305 | lx1[id] = 0.0; 306 | lx2[id] = 0.0; 307 | lx3[id] = 0.0; 308 | } 309 | } 310 | for (int id = 0; id < seq_size; id++) { 311 | if (_funcType == 0) { 312 | deri_yx = F(y[id]); 313 | cly = ly[id] * deri_yx; 314 | } else if (_funcType == 1) { 315 | deri_yx = F(y[id]); 316 | cly = ly[id] * deri_yx; 317 | } else if (_funcType == 3) { 318 | cly = ly[id] * y[id]; 319 | } else { 320 | //cly = ly; 321 | Copy(cly, ly[id]); 322 | } 323 | //_gradW 324 | _gradW1 += dot(cly.T(), x1[id]); 325 | _gradW2 += dot(cly.T(), x2[id]); 326 | _gradW3 += cly * x3[id]; 327 | 328 | //_gradb 329 | if (_bUseB) 330 | _gradb += cly; 331 | 332 | //lx 333 | lx1[id] += dot(cly, _W1); 334 | lx2[id] += dot(cly, _W2); 335 | lx3[id] += cly * _W3; 336 | } 337 | 338 | FreeSpace(&deri_yx); 339 | FreeSpace(&cly); 340 | } 341 | 342 | inline void randomprint(int num) { 343 | static int nOSize, nISize1, nISize2, nISize3; 344 | nOSize = _W1.size(0); 345 | nISize1 = _W1.size(1); 346 | nISize2 = _W2.size(1); 347 | nISize3 = _W3.size(1); 348 | int count = 0; 349 | while (count < num) { 350 | int idx1 = rand() % nOSize; 351 | int idy1 = rand() % nISize1; 352 | int idx2 = rand() % nOSize; 353 | int idy2 = rand() % nISize2; 354 | int idx3 = rand() % nOSize; 355 | int idy3 = rand() % nISize3; 356 | 357 | std::cout << "_W1[" << idx1 << "," << idy1 << "]=" << _W1[idx1][idy1] << " "; 358 | std::cout << "_W2[" << idx2 << "," << idy2 << "]=" << _W2[idx2][idy2] << " "; 359 | std::cout << "_W3[" << idx3 << "," << idy3 << "]=" << _W3[idx3][idy3] << " "; 360 | 361 | if (_bUseB) { 362 | int idz = rand() % nOSize; 363 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 364 | } 365 | count++; 366 | } 367 | 368 | std::cout << std::endl; 369 | } 370 | 371 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 372 
| _gradW1 = _gradW1 + _W1 * regularizationWeight; 373 | _eg2W1 = _eg2W1 + _gradW1 * _gradW1; 374 | _W1 = _W1 - _gradW1 * adaAlpha / F(_eg2W1 + adaEps); 375 | 376 | _gradW2 = _gradW2 + _W2 * regularizationWeight; 377 | _eg2W2 = _eg2W2 + _gradW2 * _gradW2; 378 | _W2 = _W2 - _gradW2 * adaAlpha / F(_eg2W2 + adaEps); 379 | 380 | _gradW3 = _gradW3 + _W3 * regularizationWeight; 381 | _eg2W3 = _eg2W3 + _gradW3 * _gradW3; 382 | _W3 = _W3 - _gradW3 * adaAlpha / F(_eg2W3 + adaEps); 383 | 384 | if (_bUseB) { 385 | _gradb = _gradb + _b * regularizationWeight; 386 | _eg2b = _eg2b + _gradb * _gradb; 387 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 388 | } 389 | 390 | clearGrad(); 391 | } 392 | 393 | inline void clearGrad() { 394 | _gradW1 = 0; 395 | _gradW2 = 0; 396 | _gradW3 = 0; 397 | if (_bUseB) 398 | _gradb = 0; 399 | } 400 | 401 | void writeModel(LStream &outf) { 402 | SaveBinary(outf, _W1); 403 | SaveBinary(outf, _W2); 404 | SaveBinary(outf, _W3); 405 | SaveBinary(outf, _b); 406 | 407 | SaveBinary(outf, _gradW1); 408 | SaveBinary(outf, _gradW2); 409 | SaveBinary(outf, _gradW3); 410 | SaveBinary(outf, _gradb); 411 | 412 | SaveBinary(outf, _eg2W1); 413 | SaveBinary(outf, _eg2W2); 414 | SaveBinary(outf, _eg2W3); 415 | SaveBinary(outf, _eg2b); 416 | 417 | WriteBinary(outf, _bUseB); 418 | WriteBinary(outf, _funcType); 419 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 420 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 421 | 422 | 423 | } 424 | 425 | void loadModel(LStream &inf) { 426 | LoadBinary(inf, &_W1, false); 427 | LoadBinary(inf, &_W2, false); 428 | LoadBinary(inf, &_W3, false); 429 | LoadBinary(inf, &_b, false); 430 | 431 | LoadBinary(inf, &_gradW1, false); 432 | LoadBinary(inf, &_gradW2, false); 433 | LoadBinary(inf, &_gradW3, false); 434 | LoadBinary(inf, &_gradb, false); 435 | 436 | LoadBinary(inf, &_eg2W1, false); 437 | LoadBinary(inf, &_eg2W2, false); 438 | LoadBinary(inf, &_eg2W3, false); 439 | LoadBinary(inf, &_eg2b, false); 440 | 441 | ReadBinary(inf, _bUseB); 442 | ReadBinary(inf, _funcType); 443 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 444 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 445 | } 446 | 447 | }; 448 | 449 | #endif /* SRC_TriLayerLSTM_H_ */ 450 | -------------------------------------------------------------------------------- /UniLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * UniLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_UniLayer_H_ 9 | #define SRC_UniLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class UniLayer { 20 | 21 | public: 22 | 23 | Tensor _W; 24 | Tensor _b; 25 | 26 | Tensor _gradW; 27 | Tensor _gradb; 28 | 29 | Tensor _eg2W; 30 | Tensor _eg2b; 31 | 32 | bool _bUseB; 33 | 34 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 35 | 36 | public: 37 | UniLayer() { 38 | } 39 | 40 | inline void initial(int nOSize, int nISize, bool bUseB = true, int seed = 0, int funcType = 0) { 41 | dtype bound = sqrt(6.0 / (nOSize + nISize + 1)); 42 | //dtype bound = 0.01; 43 | 44 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 45 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 46 | _eg2W = NewTensor(Shape2(nOSize, nISize), 
d_zero); 47 | 48 | _b = NewTensor(Shape2(1, nOSize), d_zero); 49 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 50 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 51 | 52 | random(_W, -1.0 * bound, 1.0 * bound, seed); 53 | random(_b, -1.0 * bound, 1.0 * bound, seed + 1); 54 | 55 | _bUseB = bUseB; 56 | _funcType = funcType; 57 | } 58 | 59 | inline void initial(Tensor W, Tensor b, bool bUseB = true, int funcType = 0) { 60 | static int nOSize, nISize; 61 | nOSize = W.size(0); 62 | nISize = W.size(1); 63 | 64 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 65 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 66 | _eg2W = NewTensor(Shape2(nOSize, nISize), d_zero); 67 | Copy(_W, W); 68 | 69 | _b = NewTensor(Shape2(1, nOSize), d_zero); 70 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 71 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 72 | 73 | if (bUseB) 74 | Copy(_b, b); 75 | 76 | _bUseB = bUseB; 77 | _funcType = funcType; 78 | } 79 | 80 | inline void initial(Tensor W, int funcType = 0) { 81 | static int nOSize, nISize; 82 | nOSize = W.size(0); 83 | nISize = W.size(1); 84 | 85 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 86 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 87 | _eg2W = NewTensor(Shape2(nOSize, nISize), d_zero); 88 | Copy(_W, W); 89 | 90 | _b = NewTensor(Shape2(1, nOSize), d_zero); 91 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 92 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 93 | 94 | 95 | _bUseB = false; 96 | _funcType = funcType; 97 | } 98 | inline void release() { 99 | FreeSpace(&_W); 100 | FreeSpace(&_gradW); 101 | FreeSpace(&_eg2W); 102 | FreeSpace(&_b); 103 | FreeSpace(&_gradb); 104 | FreeSpace(&_eg2b); 105 | } 106 | 107 | virtual ~UniLayer() { 108 | // TODO Auto-generated destructor stub 109 | } 110 | 111 | inline dtype squarenormAll() { 112 | dtype result = squarenorm(_gradW); 113 | 114 | if (_bUseB) { 115 | result += squarenorm(_gradb); 116 | } 117 | 118 | return result; 119 | } 120 | 121 | inline void scaleGrad(dtype scale) { 122 | _gradW = _gradW * scale; 123 | if (_bUseB) { 124 | _gradb = _gradb * scale; 125 | } 126 | } 127 | 128 | public: 129 | inline void ComputeForwardScore(Tensor x, Tensor y) { 130 | y = dot(x, _W.T()); 131 | if (_bUseB) 132 | y = y + _b; 133 | if (_funcType == 0) 134 | y = F(y); 135 | else if (_funcType == 1) 136 | y = F(y); 137 | else if (_funcType == 3) 138 | y = F(y); 139 | } 140 | 141 | inline void ComputeForwardScore(Tensor x, Tensor y) { 142 | int seq_size = y.size(0); 143 | for (int id = 0; id < seq_size; id++) { 144 | y[id] = dot(x[id], _W.T()); 145 | if (_bUseB) 146 | y[id] = y[id] + _b; 147 | if (_funcType == 0) 148 | y[id] = F(y[id]); 149 | else if (_funcType == 1) 150 | y[id] = F(y[id]); 151 | else if (_funcType == 3) 152 | y[id] = F(y[id]); 153 | } 154 | } 155 | 156 | inline void ComputeForwardScore(const std::vector > &x, std::vector > &y) { 157 | int seq_size = y.size(); 158 | for (int id = 0; id < seq_size; id++) { 159 | y[id] = dot(x[id], _W.T()); 160 | if (_bUseB) 161 | y[id] = y[id] + _b; 162 | if (_funcType == 0) 163 | y[id] = F(y[id]); 164 | else if (_funcType == 1) 165 | y[id] = F(y[id]); 166 | else if (_funcType == 3) 167 | y[id] = F(y[id]); 168 | } 169 | } 170 | 171 | //please allocate the memory outside here 172 | inline void ComputeBackwardLoss(Tensor x, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 173 | //_gradW 174 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 175 | AllocSpace(&deri_yx); 176 | AllocSpace(&cly); 177 | 178 | if (bclear) 179 | lx = 
0.0; 180 | if (_funcType == 0) { 181 | deri_yx = F(y); 182 | cly = ly * deri_yx; 183 | } else if (_funcType == 1) { 184 | deri_yx = F(y); 185 | cly = ly * deri_yx; 186 | } else if (_funcType == 3) { 187 | cly = ly * y; 188 | } else { 189 | //cly = ly; 190 | Copy(cly, ly); 191 | } 192 | //_gradW 193 | _gradW += dot(cly.T(), x); 194 | 195 | //_gradb 196 | if (_bUseB) 197 | _gradb += cly; 198 | 199 | //lx 200 | lx += dot(cly, _W); 201 | 202 | FreeSpace(&deri_yx); 203 | FreeSpace(&cly); 204 | } 205 | 206 | //please allocate the memory outside here 207 | inline void ComputeBackwardLoss(Tensor x, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 208 | //_gradW 209 | int seq_size = y.size(0); 210 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 211 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 212 | AllocSpace(&deri_yx); 213 | AllocSpace(&cly); 214 | 215 | if (bclear) 216 | lx = 0.0; 217 | for (int id = 0; id < seq_size; id++) { 218 | if (_funcType == 0) { 219 | deri_yx = F(y[id]); 220 | cly = ly[id] * deri_yx; 221 | } else if (_funcType == 1) { 222 | deri_yx = F(y[id]); 223 | cly = ly[id] * deri_yx; 224 | } else if (_funcType == 3) { 225 | cly = ly[id] * y[id]; 226 | } else { 227 | //cly = ly; 228 | Copy(cly, ly[id]); 229 | } 230 | //_gradW 231 | _gradW += dot(cly.T(), x[id]); 232 | 233 | //_gradb 234 | if (_bUseB) 235 | _gradb += cly; 236 | 237 | //lx 238 | lx[id] += dot(cly, _W); 239 | } 240 | 241 | FreeSpace(&deri_yx); 242 | FreeSpace(&cly); 243 | } 244 | 245 | //please allocate the memory outside here 246 | inline void ComputeBackwardLoss(const std::vector > &x, const std::vector > &y, 247 | const std::vector > &ly, std::vector > &lx, bool bclear = false) { 248 | //_gradW 249 | int seq_size = y.size(); 250 | assert(seq_size > 0); 251 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 252 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 253 | AllocSpace(&deri_yx); 254 | AllocSpace(&cly); 255 | 256 | if(bclear) { 257 | for (int id = 0; id < seq_size; id++) { 258 | lx[id] = 0.0; 259 | } 260 | } 261 | for (int id = 0; id < seq_size; id++) { 262 | if (_funcType == 0) { 263 | deri_yx = F(y[id]); 264 | cly = ly[id] * deri_yx; 265 | } else if (_funcType == 1) { 266 | deri_yx = F(y[id]); 267 | cly = ly[id] * deri_yx; 268 | } else if (_funcType == 3) { 269 | cly = ly[id] * y[id]; 270 | } else { 271 | //cly = ly; 272 | Copy(cly, ly[id]); 273 | } 274 | //_gradW 275 | _gradW += dot(cly.T(), x[id]); 276 | 277 | //_gradb 278 | if (_bUseB) 279 | _gradb += cly; 280 | 281 | //lx 282 | lx[id] += dot(cly, _W); 283 | } 284 | 285 | FreeSpace(&deri_yx); 286 | FreeSpace(&cly); 287 | } 288 | 289 | inline void randomprint(int num) { 290 | static int nOSize, nISize; 291 | nOSize = _W.size(0); 292 | nISize = _W.size(1); 293 | int count = 0; 294 | while (count < num) { 295 | int idx = rand() % nOSize; 296 | int idy = rand() % nISize; 297 | 298 | std::cout << "_W[" << idx << "," << idy << "]=" << _W[idx][idy] << " "; 299 | 300 | if (_bUseB) { 301 | int idz = rand() % nOSize; 302 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 303 | } 304 | count++; 305 | } 306 | 307 | std::cout << std::endl; 308 | } 309 | 310 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 311 | _gradW = _gradW + _W * regularizationWeight; 312 | _eg2W = _eg2W + _gradW * _gradW; 313 | _W = _W - _gradW * adaAlpha / F(_eg2W + adaEps); 314 | 315 | if (_bUseB) { 316 | _gradb = _gradb + _b * regularizationWeight; 317 | _eg2b = _eg2b + _gradb * _gradb; 318 | 
_b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 319 | } 320 | 321 | clearGrad(); 322 | } 323 | 324 | inline void clearGrad() { 325 | _gradW = 0; 326 | if (_bUseB) 327 | _gradb = 0; 328 | } 329 | 330 | void writeModel(LStream &outf) { 331 | SaveBinary(outf, _W); 332 | SaveBinary(outf, _b); 333 | SaveBinary(outf, _gradW); 334 | SaveBinary(outf, _gradb); 335 | SaveBinary(outf, _eg2W); 336 | SaveBinary(outf, _eg2b); 337 | WriteBinary(outf, _bUseB); 338 | WriteBinary(outf, _funcType); 339 | // cout << "Unilayer " << _bUseB << _funcType << endl; 340 | 341 | } 342 | 343 | void loadModel(LStream &inf) { 344 | LoadBinary(inf, &_W, false); 345 | LoadBinary(inf, &_b, false); 346 | LoadBinary(inf, &_gradW, false); 347 | LoadBinary(inf, &_gradb, false); 348 | LoadBinary(inf, &_eg2W, false); 349 | LoadBinary(inf, &_eg2b, false); 350 | ReadBinary(inf, _bUseB); 351 | ReadBinary(inf, _funcType); 352 | // cout << "Unilayer " << _bUseB << _funcType << endl; 353 | } 354 | 355 | }; 356 | 357 | #endif /* SRC_UniLayer_H_ */ 358 | -------------------------------------------------------------------------------- /UniLayer1O.h: -------------------------------------------------------------------------------- 1 | /* 2 | * UniLayer1O.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | /* 8 | * use it only for output layer 9 | */ 10 | #ifndef SRC_UniLayer1O_H_ 11 | #define SRC_UniLayer1O_H_ 12 | #include "tensor.h" 13 | #include "MyLib.h" 14 | #include "Utiltensor.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | template 21 | class UniLayer1O { 22 | 23 | public: 24 | 25 | Tensor _W; 26 | 27 | Tensor _gradW; 28 | 29 | Tensor _eg2W; 30 | 31 | public: 32 | UniLayer1O() { 33 | } 34 | 35 | inline void initial(int nISize, int seed = 0) { 36 | dtype bound = sqrt(6.0 / (1 + nISize + 1)); 37 | //dtype bound = 0.01; 38 | 39 | _W = NewTensor(Shape2(1, nISize), d_zero); 40 | _gradW = NewTensor(Shape2(1, nISize), d_zero); 41 | _eg2W = NewTensor(Shape2(1, nISize), d_zero); 42 | 43 | random(_W, -1.0 * bound, 1.0 * bound, seed); 44 | 45 | } 46 | 47 | inline void initial(Tensor W) { 48 | static int nISize; 49 | nISize = W.size(1); 50 | 51 | _W = NewTensor(Shape2(1, nISize), d_zero); 52 | _gradW = NewTensor(Shape2(1, nISize), d_zero); 53 | _eg2W = NewTensor(Shape2(1, nISize), d_zero); 54 | Copy(_W, W); 55 | 56 | } 57 | 58 | inline void release() { 59 | FreeSpace(&_W); 60 | FreeSpace(&_gradW); 61 | FreeSpace(&_eg2W); 62 | } 63 | 64 | virtual ~UniLayer1O() { 65 | // TODO Auto-generated destructor stub 66 | } 67 | 68 | inline dtype squarenormAll() { 69 | dtype result = squarenorm(_gradW); 70 | 71 | return result; 72 | } 73 | 74 | inline void scaleGrad(dtype scale) { 75 | _gradW = _gradW * scale; 76 | } 77 | 78 | public: 79 | inline void ComputeForwardScore(Tensor x, dtype& y) { 80 | static int nISize; 81 | nISize = _W.size(1); 82 | y = 0.0; 83 | for(int idx = 0; idx < nISize; idx++){ 84 | y += x[0][idx] * _W[0][idx]; 85 | } 86 | } 87 | 88 | 89 | //please allocate the memory outside here 90 | inline void ComputeBackwardLoss(Tensor x, dtype ly, Tensor lx, bool bclear = false) { 91 | //_gradW 92 | _gradW += ly * x; 93 | 94 | if (bclear) 95 | lx = 0.0; 96 | //lx 97 | lx += ly * _W; 98 | 99 | } 100 | 101 | 102 | inline void randomprint(int num) { 103 | static int nISize; 104 | nISize = _W.size(1); 105 | int count = 0; 106 | while (count < num) { 107 | int idy = rand() % nISize; 108 | std::cout << "_W[" << 0 << "," << idy << "]=" << _W[0][idy] << " "; 109 
| count++; 110 | } 111 | 112 | std::cout << std::endl; 113 | } 114 | 115 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 116 | _gradW = _gradW + _W * regularizationWeight; 117 | _eg2W = _eg2W + _gradW * _gradW; 118 | _W = _W - _gradW * adaAlpha / F(_eg2W + adaEps); 119 | 120 | 121 | clearGrad(); 122 | } 123 | 124 | inline void clearGrad() { 125 | _gradW = 0; 126 | } 127 | 128 | void writeModel(LStream &outf) { 129 | SaveBinary(outf, _W); 130 | SaveBinary(outf, _gradW); 131 | SaveBinary(outf, _eg2W); 132 | 133 | } 134 | 135 | void loadModel(LStream &inf) { 136 | LoadBinary(inf, &_W, false); 137 | LoadBinary(inf, &_gradW, false); 138 | LoadBinary(inf, &_eg2W, false); 139 | } 140 | }; 141 | 142 | #endif /* SRC_UniLayer1O_H_ */ 143 | -------------------------------------------------------------------------------- /Utils.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2014 by Contributors 3 | * \file Utils.h 4 | * \brief simple utils for error and checkings 5 | * \author Tianqi Chen 6 | */ 7 | #ifndef MSHADOW_UTILS_H_ 8 | #define MSHADOW_UTILS_H_ 9 | #define _CRT_SECURE_NO_WARNINGS 10 | #include 11 | #include 12 | #include 13 | #include 14 | namespace mshadow { 15 | /*! \brief namespace for helper utils of the project */ 16 | namespace utils { 17 | /*! \brief error message buffer length */ 18 | const int kPrintBuffer = 1 << 12; 19 | 20 | #ifndef MSHADOW_CUSTOMIZE_ASSERT_ 21 | /*! 22 | * \brief handling of Assert error, caused by in-apropriate input 23 | * \param msg error message 24 | */ 25 | inline void HandleAssertError(const char *msg) { 26 | fprintf(stderr, "AssertError:%s\n", msg); 27 | exit(-1); 28 | } 29 | /*! 30 | * \brief handling of Check error, caused by in-apropriate input 31 | * \param msg error message 32 | */ 33 | inline void HandleCheckError(const char *msg) { 34 | fprintf(stderr, "%s\n", msg); 35 | exit(-1); 36 | } 37 | #else 38 | // include declarations, some one must implement this 39 | void HandleAssertError(const char *msg); 40 | void HandleCheckError(const char *msg); 41 | void HandlePrint(const char *msg); 42 | #endif 43 | 44 | /*! \brief assert an condition is true, use this to handle debug information */ 45 | inline void Assert(bool exp, const char *fmt, ...) { 46 | if (!exp) { 47 | std::string msg(kPrintBuffer, '\0'); 48 | va_list args; 49 | va_start(args, fmt); 50 | vsnprintf(&msg[0], kPrintBuffer, fmt, args); 51 | va_end(args); 52 | HandleAssertError(msg.c_str()); 53 | } 54 | } 55 | 56 | /*!\brief same as assert, but this is intended to be used as message for user*/ 57 | inline void Check(bool exp, const char *fmt, ...) { 58 | if (!exp) { 59 | std::string msg(kPrintBuffer, '\0'); 60 | va_list args; 61 | va_start(args, fmt); 62 | vsnprintf(&msg[0], kPrintBuffer, fmt, args); 63 | va_end(args); 64 | HandleCheckError(msg.c_str()); 65 | } 66 | } 67 | 68 | /*! \brief report error message, same as check */ 69 | inline void Error(const char *fmt, ...) 
{ 70 | { 71 | std::string msg(kPrintBuffer, '\0'); 72 | va_list args; 73 | va_start(args, fmt); 74 | vsnprintf(&msg[0], kPrintBuffer, fmt, args); 75 | va_end(args); 76 | HandleCheckError(msg.c_str()); 77 | } 78 | } 79 | } // namespace utils 80 | } // namespace mshadow 81 | #endif // MSHADOW_UTILS_H_ 82 | -------------------------------------------------------------------------------- /Utiltensor.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILTENSOR 2 | #define UTILTENSOR 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | 7 | using namespace std; 8 | using namespace mshadow; 9 | using namespace mshadow::expr; 10 | using namespace mshadow::utils; 11 | using namespace nr; 12 | 13 | // define tanh operation 14 | struct nl_tanh { 15 | MSHADOW_XINLINE static dtype Map(dtype a) { 16 | // return a>0?a:0; 17 | return tanh(a); 18 | } 19 | }; 20 | struct nl_dtanh { 21 | MSHADOW_XINLINE static dtype Map(dtype a) { 22 | // return a>0?1:0; 23 | return (1.0 - a) * (1.0 + a); 24 | } 25 | }; 26 | struct nl_sigmoid { 27 | MSHADOW_XINLINE static dtype Map(dtype a) { 28 | // return a>0?a:0; 29 | return 1.0 / (1.0 + exp(-a)); 30 | } 31 | }; 32 | struct nl_dsigmoid { 33 | MSHADOW_XINLINE static dtype Map(dtype a) { 34 | // return a>0?1:0; 35 | return (1.0 - a) * a; 36 | } 37 | }; 38 | struct nl_relu { 39 | MSHADOW_XINLINE static dtype Map(dtype a) { 40 | return a > 0 ? a : 0; 41 | } 42 | }; 43 | struct nl_drelu { 44 | MSHADOW_XINLINE static dtype Map(dtype a) { 45 | return a > 0 ? 1 : 0; 46 | } 47 | }; 48 | struct nl_exp { 49 | MSHADOW_XINLINE static dtype Map(dtype a) { 50 | // return a>0?a:0; 51 | return exp(a); 52 | } 53 | }; 54 | struct nl_log { 55 | MSHADOW_XINLINE static dtype Map(dtype a) { 56 | // return a>0?a:0; 57 | return log(a); 58 | } 59 | }; 60 | struct xe_dx { 61 | MSHADOW_XINLINE static dtype Map(dtype a, dtype b) { 62 | return (b - a) / (a * (1.0 - a) + 1e-6); 63 | } 64 | }; 65 | struct xe_ll { 66 | MSHADOW_XINLINE static dtype Map(dtype a, dtype b) { 67 | return b > 0.5f ? log(a + 1e-10) : log(1.0 - a + 1e-10); 68 | } 69 | }; 70 | struct square { 71 | MSHADOW_XINLINE static dtype Map(dtype a) { 72 | return a * a; 73 | 74 | } 75 | }; 76 | struct clip { 77 | MSHADOW_XINLINE static dtype Map(dtype a) { 78 | return a > 10.0 ? 10.0 : (a < -10.0 ? 
-10.0 : a); 79 | 80 | } 81 | }; 82 | struct inv_sqrt { 83 | MSHADOW_XINLINE static dtype Map(dtype a, dtype b) { 84 | return a / (sqrt(b) + 0.0001); 85 | } 86 | }; 87 | 88 | struct nl_sqrt { 89 | MSHADOW_XINLINE static dtype Map(dtype a) { 90 | return sqrt(a); 91 | } 92 | }; 93 | 94 | struct dropout { 95 | // p: prob to dropout 96 | MSHADOW_XINLINE static dtype Map(dtype p, dtype r) { 97 | if (p > r) 98 | return 0.0; 99 | else 100 | return 1.0 / (1.0 - p); 101 | } 102 | }; 103 | 104 | // \sum x_{ijk}^2 105 | template 106 | inline dtype squarenorm(Tensor w) { 107 | dtype result = 0; 108 | for (int idx = 0; idx < w.size(0); idx++) { 109 | result += w[idx] * w[idx]; 110 | } 111 | return result; 112 | } 113 | 114 | template 115 | inline dtype squarenorm(Tensor w) { 116 | dtype result = 0; 117 | for (int idx = 0; idx < w.size(0); idx++) { 118 | for (int idy = 0; idy < w.size(1); idy++) { 119 | result += w[idx][idy] * w[idx][idy]; 120 | } 121 | } 122 | return result; 123 | } 124 | 125 | template 126 | inline dtype squarenorm(Tensor w) { 127 | dtype result = 0; 128 | for (int idx = 0; idx < w.size(0); idx++) { 129 | for (int idy = 0; idy < w.size(1); idy++) { 130 | for (int idz = 0; idz < w.size(2); idz++) { 131 | result += w[idx][idy][idz] * w[idx][idy][idz]; 132 | } 133 | } 134 | } 135 | return result; 136 | } 137 | 138 | template 139 | inline void assign(Tensor w, const NRVec& wnr) { 140 | int dim = wnr.size(); 141 | for (int idx = 0; idx < dim; idx++) { 142 | w[idx] = wnr[idx]; 143 | } 144 | } 145 | 146 | template 147 | inline void assign(Tensor w, const NRMat& wnr) { 148 | int dim1 = wnr.nrows(); 149 | int dim2 = wnr.ncols(); 150 | for (int idx = 0; idx < dim1; idx++) { 151 | for (int idy = 0; idy < dim2; idy++) { 152 | w[idx][idy] = wnr[idx][idy]; 153 | } 154 | } 155 | } 156 | 157 | template 158 | inline void assign(Tensor w, const NRMat3d& wnr) { 159 | int dim1 = wnr.dim1(); 160 | int dim2 = wnr.dim2(); 161 | int dim3 = wnr.dim3(); 162 | for (int idx = 0; idx < dim1; idx++) { 163 | for (int idy = 0; idy < dim2; idy++) { 164 | for (int idz = 0; idz < dim3; idz++) { 165 | w[idx][idy][idz] = wnr[idx][idy][idz]; 166 | } 167 | } 168 | } 169 | } 170 | 171 | template 172 | inline void assign(vector > &w, dtype value) { 173 | int dim = w.size(); 174 | for (int idx = 0; idx < dim; idx++) { 175 | w[idx] = value; 176 | } 177 | } 178 | 179 | template 180 | inline void assign(vector > &w, dtype value) { 181 | int dim = w.size(); 182 | for (int idx = 0; idx < dim; idx++) { 183 | w[idx] = value; 184 | } 185 | } 186 | 187 | template 188 | inline void assign(vector > &w, dtype value) { 189 | int dim = w.size(); 190 | for (int idx = 0; idx < dim; idx++) { 191 | w[idx] = value; 192 | } 193 | } 194 | 195 | template 196 | inline void norm2one(Tensor w, int idx) { 197 | dtype sum = 0.000001; 198 | for (int idy = 0; idy < w.size(1); idy++) { 199 | sum += w[idx][idy] * w[idx][idy]; 200 | } 201 | dtype scale = sqrt(sum); 202 | for (int idy = 0; idy < w.size(1); idy++) 203 | w[idx][idy] = w[idx][idy] / scale; 204 | } 205 | 206 | template 207 | inline void random(Tensor w, dtype min = 0.0, dtype max = 1.0, int seed = 0) { 208 | srand(seed); 209 | int dim = w.size(0); 210 | for (int idx = 0; idx < dim; idx++) { 211 | w[idx] = min + (max - min) * (1.0 * rand() / RAND_MAX); 212 | } 213 | } 214 | 215 | template 216 | inline void random(Tensor w, dtype min = 0.0, dtype max = 1.0, int seed = 0) { 217 | srand(seed); 218 | int dim1 = w.size(0); 219 | int dim2 = w.size(1); 220 | for (int idx = 0; idx < dim1; idx++) { 221 | 
for (int idy = 0; idy < dim2; idy++) { 222 | w[idx][idy] = min + (max - min) * (1.0 * rand() / RAND_MAX); 223 | } 224 | } 225 | } 226 | 227 | template 228 | inline void random(Tensor w, dtype min = 0.0, dtype max = 1.0, int seed = 0) { 229 | srand(seed); 230 | int dim1 = w.size(0); 231 | int dim2 = w.size(1); 232 | int dim3 = w.size(2); 233 | for (int idx = 0; idx < dim1; idx++) { 234 | for (int idy = 0; idy < dim2; idy++) { 235 | for (int idz = 0; idz < dim3; idz++) { 236 | w[idx][idy][idz] = min + (max - min) * (1.0 * rand() / RAND_MAX); 237 | } 238 | } 239 | } 240 | } 241 | 242 | /* 243 | template 244 | inline void tcopy(const Tensor& from, Tensor& to, bool bAllocated = true) { 245 | if (bAllocated) { 246 | if (to.size(0) != from.size(0) || to.size(1) != from.size(1) || to.size(2) != from.size(2)) { 247 | FreeSpace(&to); 248 | to = NewTensor(Shape3(from.size(0), from.size(1), from.size(2)), d_zero); 249 | } 250 | } else { 251 | to = NewTensor(Shape3(from.size(0), from.size(1), from.size(2)), d_zero); 252 | } 253 | 254 | Copy(to, from); 255 | } 256 | 257 | template 258 | inline void tcopy(const Tensor& from, Tensor& to, bool bAllocated = true) { 259 | if (bAllocated) { 260 | if (to.size(0) != from.size(0) || to.size(1) != from.size(1)) { 261 | FreeSpace(&to); 262 | to = NewTensor(Shape2(from.size(0), from.size(1)), d_zero); 263 | } 264 | } else { 265 | to = NewTensor(Shape2(from.size(0), from.size(1)), d_zero); 266 | } 267 | Copy(to, from); 268 | } 269 | 270 | template 271 | inline void tcopy(const Tensor&from, Tensor& to, bool bAllocated = true) { 272 | if (bAllocated) { 273 | if (to.size(0) != from.size(0)) { 274 | FreeSpace(&to); 275 | to = NewTensor(Shape1(from.size(0)), d_zero); 276 | } 277 | } else { 278 | to = NewTensor(Shape1(from.size(0)), d_zero); 279 | } 280 | Copy(to, from); 281 | } 282 | */ 283 | #endif 284 | -------------------------------------------------------------------------------- /Windowlized.h: -------------------------------------------------------------------------------- 1 | #ifndef WINDOWLIZED 2 | #define WINDOWLIZED 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | 7 | 8 | using namespace std; 9 | using namespace mshadow; 10 | using namespace mshadow::expr; 11 | using namespace mshadow::utils; 12 | 13 | 14 | template 15 | inline void windowlized(const vector > &wi, vector > &wo, int context) 16 | { 17 | int seqsize = wo.size(); 18 | if (wi.size() != seqsize || seqsize == 0 || context < 0) { 19 | std::cerr << "windowlized error: vector size or context size invalid" << std::endl; 20 | } 21 | 22 | int dim1 = wi[0].size(0), dim2 = wi[0].size(1); 23 | int odim1 = wo[0].size(0), odim2 = wo[0].size(1); 24 | int computeddim2 = (2 * context + 1) * dim2; 25 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 26 | std::cerr << "windowlized error: dim size invalid" << std::endl; 27 | } 28 | 29 | static int offset; 30 | for (int idx = 0; idx < seqsize; idx++) { 31 | wo[idx] = 0.0; 32 | offset = 0; 33 | for (int idp = idx - context; idp <= idx + context; idp++) { 34 | if (idp < 0 || idp >= seqsize) { 35 | offset += dim2; 36 | } else { 37 | for (int idy = 0; idy < dim2; idy++) { 38 | wo[idx][0][offset] = wi[idp][0][idy]; 39 | offset++; 40 | } 41 | } 42 | } 43 | assert(offset == odim2); 44 | } 45 | 46 | } 47 | 48 | 49 | template 50 | inline void windowlized(Tensor wi, Tensor wo, int context) 51 | { 52 | int seqsize = wo.size(0); 53 | if (wi.size(0) != seqsize || seqsize == 0 || context < 0) { 54 | std::cerr << "windowlized error: vector size or context size invalid" 
<< std::endl; 55 | } 56 | 57 | int dim1 = wi.size(1), dim2 = wi.size(2); 58 | int odim1 = wo.size(1), odim2 = wo.size(2); 59 | int computeddim2 = (2 * context + 1) * dim2; 60 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 61 | std::cerr << "windowlized error: dim size invalid" << std::endl; 62 | } 63 | 64 | wo = 0.0; 65 | static int offset; 66 | for (int idx = 0; idx < seqsize; idx++) { 67 | offset = 0; 68 | for (int idp = idx - context; idp <= idx + context; idp++) { 69 | if (idp < 0 || idp >= seqsize) { 70 | offset += dim2; 71 | } else { 72 | for (int idy = 0; idy < dim2; idy++) { 73 | wo[idx][0][offset] = wi[idp][0][idy]; 74 | offset++; 75 | } 76 | } 77 | } 78 | assert(offset == odim2); 79 | } 80 | 81 | } 82 | 83 | 84 | template 85 | inline void windowlized_backward(vector > &lwi, const vector > &lwo, int context, bool bclear = false) 86 | { 87 | int seqsize = lwo.size(); 88 | if (lwi.size() != seqsize || seqsize == 0 || context < 0) { 89 | std::cerr << "windowlized error: vector size or context size invalid" << std::endl; 90 | } 91 | 92 | int dim1 = lwi[0].size(0), dim2 = lwi[0].size(1); 93 | int odim1 = lwo[0].size(0), odim2 = lwo[0].size(1); 94 | int computeddim2 = (2 * context + 1) * dim2; 95 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 96 | std::cerr << "windowlized error: dim size invalid" << std::endl; 97 | } 98 | 99 | if(bclear){ 100 | for (int idx = 0; idx < seqsize; idx++) { 101 | lwi[idx] = 0.0; 102 | } 103 | } 104 | static int offset; 105 | for (int idx = 0; idx < seqsize; idx++) { 106 | offset = 0; 107 | for (int idp = idx - context; idp <= idx + context; idp++) { 108 | if (idp < 0 || idp >= seqsize) { 109 | offset += dim2; 110 | } else { 111 | for (int idy = 0; idy < dim2; idy++) { 112 | lwi[idp][0][idy] += lwo[idx][0][offset]; 113 | offset++; 114 | } 115 | } 116 | } 117 | assert(offset == odim2); 118 | } 119 | 120 | } 121 | 122 | 123 | template 124 | inline void windowlized_backward(Tensor lwi, Tensor lwo, int context, bool bclear = false) 125 | { 126 | int seqsize = lwo.size(0); 127 | if (lwi.size(0) != seqsize || seqsize == 0 || context < 0) { 128 | std::cerr << "windowlized error: vector size or context size invalid" << std::endl; 129 | } 130 | 131 | int dim1 = lwi.size(1), dim2 = lwi.size(2); 132 | int odim1 = lwo.size(1), odim2 = lwo.size(2); 133 | int computeddim2 = (2 * context + 1) * dim2; 134 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 135 | std::cerr << "windowlized error: dim size invalid" << std::endl; 136 | } 137 | 138 | if(bclear) lwi = 0.0; 139 | static int offset; 140 | for (int idx = 0; idx < seqsize; idx++) { 141 | offset = 0; 142 | for (int idp = idx - context; idp <= idx + context; idp++) { 143 | if (idp < 0 || idp >= seqsize) { 144 | offset += dim2; 145 | } else { 146 | for (int idy = 0; idy < dim2; idy++) { 147 | lwi[idp][0][idy] += lwo[idx][0][offset]; 148 | offset++; 149 | } 150 | } 151 | } 152 | assert(offset == odim2); 153 | } 154 | 155 | } 156 | 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /description(expect for lrec2016).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUTDNLP/LibN3L/da49c8ccf715170a60f6b5ce1930df1e691dc280/description(expect for lrec2016).pdf --------------------------------------------------------------------------------
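
The UniLayer listing above shows the full life cycle of a dense layer in this library: initial() allocates _W/_b plus their gradient and AdaGrad accumulators, ComputeForwardScore() computes dot(x, _W.T()) plus bias and an activation selected by _funcType, ComputeBackwardLoss() accumulates _gradW/_gradb and propagates the loss into lx, and updateAdaGrad() applies the AdaGrad rule before clearing gradients. A minimal sketch of one such step follows. Hedges: the angle-bracket template arguments (e.g. Tensor<cpu, 2, dtype>, UniLayer<cpu>) appear to have been stripped from the listing and are assumed here, as are the initial(outSize, inSize, useBias, seed, funcType) argument order and the reading of funcType 0 as tanh; this is an illustrative sketch under those assumptions, not code taken from the repository.

// Minimal single-step usage sketch (assumptions as noted above; not part of the library).
#include "UniLayer.h"   // pulls in tensor.h, MyLib.h, Utiltensor.h

void toy_unilayer_step() {
  const int inDim = 4, outDim = 3;

  UniLayer<cpu> layer;                           // template argument assumed
  layer.initial(outDim, inDim, true, 0, 0);      // assumed order: out, in, useBias, seed, funcType (0 ~ tanh)

  Tensor<cpu, 2, dtype> x  = NewTensor<cpu>(Shape2(1, inDim),  d_zero);
  Tensor<cpu, 2, dtype> y  = NewTensor<cpu>(Shape2(1, outDim), d_zero);
  Tensor<cpu, 2, dtype> ly = NewTensor<cpu>(Shape2(1, outDim), d_zero);
  Tensor<cpu, 2, dtype> lx = NewTensor<cpu>(Shape2(1, inDim),  d_zero);

  random(x, -0.1, 0.1, 1);                       // fill a fake input (random() from Utiltensor.h)

  layer.ComputeForwardScore(x, y);               // y = activation(dot(x, _W.T()) + _b)

  Copy(ly, y);                                   // stand-in loss gradient w.r.t. y
  layer.ComputeBackwardLoss(x, y, ly, lx, true); // accumulate _gradW/_gradb, write lx (bclear = true)

  layer.updateAdaGrad(1e-8, 0.01, 1e-6);         // regularization weight, learning rate, epsilon; calls clearGrad()

  FreeSpace(&x); FreeSpace(&y); FreeSpace(&ly); FreeSpace(&lx);
  layer.release();
}

Depending on how mshadow is built, InitTensorEngine<cpu>() / ShutdownTensorEngine<cpu>() may need to wrap code like this, and gradient checking (presumably what CheckGrad.h in this repository provides) is the natural way to validate a layer wired up this way.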