├── Alphabet.h ├── AttRecursiveGatedNN.h ├── AttentionPooling.h ├── AvgPerceptron1O.h ├── BiLayer.h ├── CheckGrad.h ├── Concat.h ├── Dropout.h ├── GRNN.h ├── GatedPooling.h ├── Hash_map.hpp ├── IO.h ├── LSTM.h ├── LSTM_CHD.h ├── LSTM_KER.h ├── LSTM_STD.h ├── LookupTable.h ├── MLCRFLoss.h ├── MMCRFLoss.h ├── Metric.h ├── MyLib.h ├── N3L.h ├── NRMat.h ├── Pooling.h ├── README.md ├── RNN.h ├── RecursiveGatedNN.h ├── RecursiveNN.h ├── SoftMaxLoss.h ├── SparseUniLayer.h ├── SparseUniLayer1O.h ├── TensorLayer.h ├── TriLayer.h ├── TriLayerLSTM.h ├── UniLayer.h ├── UniLayer1O.h ├── Utils.h ├── Utiltensor.h ├── Windowlized.h └── description(expect for lrec2016).pdf /Alphabet.h: -------------------------------------------------------------------------------- 1 | #ifndef _ALPHABET_ 2 | #define _ALPHABET_ 3 | 4 | #include "MyLib.h" 5 | #include "Hash_map.hpp" 6 | #include "IO.h" 7 | 8 | /* 9 | This class serializes feature from string to int. 10 | Index starts from 0. 11 | */ 12 | 13 | /** 14 | * The basic class of quark class. 15 | * @param std::string String class name to be used. 16 | * @param int ID class name to be used. 17 | * @author Naoaki Okazaki 18 | */ 19 | class basic_quark { 20 | protected: 21 | typedef hash_map StringToId; 22 | typedef std::vector IdToString; 23 | 24 | StringToId m_string_to_id; 25 | IdToString m_id_to_string; 26 | bool m_b_fixed; 27 | int m_size; 28 | 29 | public: 30 | /** 31 | * Construct. 32 | */ 33 | basic_quark() 34 | { 35 | clear(); 36 | } 37 | 38 | /** 39 | * Destruct. 40 | */ 41 | virtual ~basic_quark() 42 | { 43 | } 44 | 45 | /** 46 | * Map a string to its associated ID. 47 | * If string-to-integer association does not exist, allocate a new ID. 48 | * @param str String value. 49 | * @return Associated ID for the string value. 50 | */ 51 | int operator[](const std::string& str) 52 | { 53 | typename StringToId::const_iterator it = m_string_to_id.find(str); 54 | if (it != m_string_to_id.end()) { 55 | return it->second; 56 | } else if (!m_b_fixed){ 57 | int newid = m_size; 58 | m_id_to_string.push_back(str); 59 | m_string_to_id.insert(std::pair(str, newid)); 60 | m_size++; 61 | return newid; 62 | } 63 | else 64 | { 65 | return -1; 66 | } 67 | } 68 | 69 | 70 | /** 71 | * Convert ID value into the associated string value. 72 | * @param qid ID. 73 | * @param def Default value if the ID was out of range. 74 | * @return String value associated with the ID. 75 | */ 76 | const std::string& from_id(const int& qid, const std::string& def = "") const 77 | { 78 | if (qid < 0 || m_size <= qid) { 79 | return def; 80 | } else { 81 | return m_id_to_string[qid]; 82 | } 83 | } 84 | 85 | 86 | 87 | /** 88 | * Convert string value into the associated ID value. 89 | * @param str String value. 90 | * @return ID if any, otherwise -1. 91 | */ 92 | int from_string(const std::string& str) 93 | { 94 | typename StringToId::const_iterator it = m_string_to_id.find(str); 95 | if (it != m_string_to_id.end()) { 96 | return it->second; 97 | } else if (!m_b_fixed){ 98 | int newid = m_size; 99 | m_id_to_string.push_back(str); 100 | m_string_to_id.insert(std::pair(str, newid)); 101 | m_size++; 102 | return newid; 103 | } 104 | else 105 | { 106 | return -1; 107 | } 108 | } 109 | 110 | void clear() 111 | { 112 | m_string_to_id.clear(); 113 | m_id_to_string.clear(); 114 | m_b_fixed = false; 115 | m_size = 0; 116 | } 117 | 118 | void set_fixed_flag(bool bfixed) 119 | { 120 | m_b_fixed = bfixed; 121 | } 122 | 123 | /** 124 | * Get the number of string-to-id associations. 
125 | * @return The number of association. 126 | */ 127 | size_t size() const 128 | { 129 | return m_size; 130 | } 131 | 132 | 133 | void read(std::ifstream &inf) 134 | { 135 | clear(); 136 | static string tmp; 137 | my_getline(inf, tmp); 138 | chomp(tmp); 139 | m_size = atoi(tmp.c_str()); 140 | std::vector featids; 141 | for (int i = 0; i < m_size; ++i) { 142 | 143 | my_getline(inf, tmp); 144 | split_bychars(tmp, featids); 145 | m_string_to_id[featids[0]] = i; 146 | assert(atoi(featids[1].c_str()) == i); 147 | } 148 | } 149 | 150 | void write(std::ofstream &outf) const 151 | { 152 | outf << m_size << std::endl; 153 | for (int i=0; i 22 | class AttRecursiveGatedNN { 23 | public: 24 | BiLayer _reset_left; 25 | BiLayer _reset_right; 26 | BiLayer _update_left; 27 | BiLayer _update_right; 28 | BiLayer _update_tilde; 29 | BiLayer _recursive_tilde; 30 | 31 | 32 | Tensor nxl; 33 | Tensor nxr; 34 | Tensor sum; 35 | 36 | Tensor pxl; 37 | Tensor pxr; 38 | Tensor pmy; 39 | 40 | 41 | Tensor lrxl; 42 | Tensor lrxr; 43 | Tensor lmy; 44 | Tensor luxl; 45 | Tensor luxr; 46 | Tensor lumy; 47 | 48 | Tensor lnxl; 49 | Tensor lnxr; 50 | Tensor lsum; 51 | 52 | Tensor lpxl; 53 | Tensor lpxr; 54 | Tensor lpmy; 55 | 56 | 57 | public: 58 | AttRecursiveGatedNN() { 59 | } 60 | 61 | inline void initial(int dimension, int attDim, int seed = 0) { 62 | _reset_left.initial(dimension, dimension, attDim, false, seed, 1); 63 | _reset_right.initial(dimension, dimension, attDim, false, seed + 10, 1); 64 | _update_left.initial(dimension, dimension, attDim, false, seed + 20, 3); 65 | _update_right.initial(dimension, dimension, attDim, false, seed + 30, 3); 66 | _update_tilde.initial(dimension, dimension, attDim, false, seed + 40, 3); 67 | _recursive_tilde.initial(dimension, dimension, dimension, false, seed + 50, 0); 68 | 69 | nxl = NewTensor(Shape2(1, dimension), d_zero); 70 | nxr = NewTensor(Shape2(1, dimension), d_zero); 71 | sum = NewTensor(Shape2(1, dimension), d_zero); 72 | 73 | pxl = NewTensor(Shape2(1, dimension), d_zero); 74 | pxr = NewTensor(Shape2(1, dimension), d_zero); 75 | pmy = NewTensor(Shape2(1, dimension), d_zero); 76 | 77 | 78 | lrxl = NewTensor(Shape2(1, dimension), d_zero); 79 | lrxr = NewTensor(Shape2(1, dimension), d_zero); 80 | lmy = NewTensor(Shape2(1, dimension), d_zero); 81 | luxl = NewTensor(Shape2(1, dimension), d_zero); 82 | luxr = NewTensor(Shape2(1, dimension), d_zero); 83 | lumy = NewTensor(Shape2(1, dimension), d_zero); 84 | 85 | lnxl = NewTensor(Shape2(1, dimension), d_zero); 86 | lnxr = NewTensor(Shape2(1, dimension), d_zero); 87 | lsum = NewTensor(Shape2(1, dimension), d_zero); 88 | 89 | lpxl = NewTensor(Shape2(1, dimension), d_zero); 90 | lpxr = NewTensor(Shape2(1, dimension), d_zero); 91 | lpmy = NewTensor(Shape2(1, dimension), d_zero); 92 | } 93 | 94 | 95 | inline void initial(Tensor rW1, Tensor rU1, 96 | Tensor rW2, Tensor rU2, 97 | Tensor uW1, Tensor uU1, 98 | Tensor uW2, Tensor uU2, 99 | Tensor uW3, Tensor uU3, 100 | Tensor W1, Tensor W2, Tensor W3,Tensor b) { 101 | _reset_left.initial(rW1, rU1, 1); 102 | _reset_right.initial(rW2, rU2, 1); 103 | 104 | _update_left.initial(uW1, uU1, 3); 105 | _update_right.initial(uW2, uU2, 3); 106 | _update_tilde.initial(uW3, uU3, 3); 107 | 108 | _recursive_tilde.initial(W1, W2, W3, b, 0); 109 | } 110 | 111 | inline void release() { 112 | _reset_left.release(); 113 | _reset_right.release(); 114 | 115 | _update_left.release(); 116 | _update_right.release(); 117 | _update_tilde.release(); 118 | 119 | _recursive_tilde.release(); 120 | 121 | 
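// Free the forward/backward scratch tensors (nxl ... lpmy) that initial() allocated with NewTensor.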
FreeSpace(&nxl); 122 | FreeSpace(&nxr); 123 | FreeSpace(&sum); 124 | FreeSpace(&pxl); 125 | FreeSpace(&pxr); 126 | FreeSpace(&pmy); 127 | FreeSpace(&lnxl); 128 | FreeSpace(&lnxr); 129 | FreeSpace(&lsum); 130 | FreeSpace(&lpxl); 131 | FreeSpace(&lpxr); 132 | FreeSpace(&lpmy); 133 | FreeSpace(&lrxl); 134 | FreeSpace(&lrxr); 135 | FreeSpace(&lmy); 136 | FreeSpace(&luxl); 137 | FreeSpace(&luxr); 138 | FreeSpace(&lumy); 139 | } 140 | 141 | virtual ~AttRecursiveGatedNN() { 142 | // TODO Auto-generated destructor stub 143 | } 144 | 145 | inline dtype squarenormAll() { 146 | dtype norm = _reset_left.squarenormAll(); 147 | norm += _reset_right.squarenormAll(); 148 | norm += _update_left.squarenormAll(); 149 | norm += _update_right.squarenormAll(); 150 | norm += _update_tilde.squarenormAll(); 151 | norm += _recursive_tilde.squarenormAll(); 152 | 153 | return norm; 154 | } 155 | 156 | inline void scaleGrad(dtype scale) { 157 | _reset_left.scaleGrad(scale); 158 | _reset_right.scaleGrad(scale); 159 | 160 | _update_left.scaleGrad(scale); 161 | _update_right.scaleGrad(scale); 162 | _update_tilde.scaleGrad(scale); 163 | 164 | _recursive_tilde.scaleGrad(scale); 165 | } 166 | 167 | public: 168 | 169 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor a, 170 | Tensor rxl, Tensor rxr, Tensor my, 171 | Tensor uxl, Tensor uxr, Tensor umy, 172 | Tensor y) { 173 | 174 | nxl = 0.0; 175 | nxr = 0.0; 176 | sum = 0.0; 177 | 178 | pxl = 0.0; 179 | pxr = 0.0; 180 | pmy = 0.0; 181 | 182 | _reset_left.ComputeForwardScore(xl, a, rxl); 183 | _reset_right.ComputeForwardScore(xr, a, rxr); 184 | 185 | 186 | nxl = rxl * xl; 187 | nxr = rxr * xr; 188 | 189 | _recursive_tilde.ComputeForwardScore(nxl, nxr, my); 190 | 191 | 192 | _update_left.ComputeForwardScore(xl, a, uxl); 193 | _update_right.ComputeForwardScore(xr, a, uxr); 194 | _update_tilde.ComputeForwardScore(my, a, umy); 195 | 196 | sum = uxl + uxr + umy; 197 | 198 | pxl = uxl / sum; 199 | pxr = uxr / sum; 200 | pmy = umy / sum; 201 | 202 | y = pxl * xl + pxr * xr + pmy * my; 203 | 204 | } 205 | 206 | //please allocate the memory outside here 207 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor a, 208 | Tensor rxl, Tensor rxr, Tensor my, 209 | Tensor uxl, Tensor uxr, Tensor umy, 210 | Tensor y, Tensor ly, 211 | Tensor lxl, Tensor lxr, Tensor la, 212 | bool bclear = false) { 213 | if (bclear){ 214 | lxl = 0.0; lxr = 0.0; la = 0.0; 215 | } 216 | 217 | nxl = 0.0; 218 | nxr = 0.0; 219 | sum = 0.0; 220 | 221 | pxl = 0.0; 222 | pxr = 0.0; 223 | pmy = 0.0; 224 | 225 | 226 | lrxl = 0.0; 227 | lrxr = 0.0; 228 | lmy = 0.0; 229 | luxl = 0.0; 230 | luxr = 0.0; 231 | lumy = 0.0; 232 | 233 | lnxl = 0.0; 234 | lnxr = 0.0; 235 | lsum = 0.0; 236 | 237 | lpxl = 0.0; 238 | lpxr = 0.0; 239 | lpmy = 0.0; 240 | 241 | nxl = rxl * xl; 242 | nxr = rxr * xr; 243 | 244 | sum = uxl + uxr + umy; 245 | 246 | pxl = uxl / sum; 247 | pxr = uxr / sum; 248 | pmy = umy / sum; 249 | 250 | 251 | lpxl += ly * xl; 252 | lxl += ly * pxl; 253 | 254 | lpxr += ly * xr; 255 | lxr += ly * pxr; 256 | 257 | lpmy += ly * my; 258 | lmy += ly * pmy; 259 | 260 | 261 | 262 | luxl += lpxl / sum; 263 | luxr += lpxr / sum; 264 | lumy += lpmy / sum; 265 | 266 | lsum -= lpxl * pxl / sum; 267 | lsum -= lpxr * pxr / sum; 268 | lsum -= lpmy * pmy / sum; 269 | 270 | 271 | luxl += lsum; 272 | luxr += lsum; 273 | lumy += lsum; 274 | 275 | _update_left.ComputeBackwardLoss(xl, a, uxl, luxl, lxl, la); 276 | _update_right.ComputeBackwardLoss(xr, a, uxr, luxr, lxr, la); 277 | _update_tilde.ComputeBackwardLoss(my, a, 
umy, lumy, lmy, la); 278 | 279 | _recursive_tilde.ComputeBackwardLoss(nxl, nxr, my, lmy, lnxl, lnxr); 280 | 281 | lrxl += lnxl * xl; 282 | lxl += lnxl * rxl; 283 | 284 | lrxr += lnxr * xr; 285 | lxr += lnxr * rxr; 286 | 287 | _reset_left.ComputeBackwardLoss(xl, a, rxl, lrxl, lxl, la); 288 | _reset_right.ComputeBackwardLoss(xr, a, rxr, lrxr, lxr, la); 289 | 290 | } 291 | 292 | 293 | inline void randomprint(int num) { 294 | _reset_left.randomprint(num); 295 | _reset_right.randomprint(num); 296 | 297 | _update_left.randomprint(num); 298 | _update_right.randomprint(num); 299 | _update_tilde.randomprint(num); 300 | 301 | _recursive_tilde.randomprint(num); 302 | } 303 | 304 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 305 | _reset_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 306 | _reset_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 307 | 308 | _update_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 309 | _update_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 310 | _update_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 311 | 312 | _recursive_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 313 | } 314 | 315 | void writeModel(LStream &outf) { 316 | _reset_left.writeModel(outf); 317 | _reset_right.writeModel(outf); 318 | _update_left.writeModel(outf); 319 | _update_right.writeModel(outf); 320 | _update_tilde.writeModel(outf); 321 | _recursive_tilde.writeModel(outf); 322 | 323 | SaveBinary(outf, nxl); 324 | SaveBinary(outf, nxr); 325 | SaveBinary(outf, sum); 326 | 327 | SaveBinary(outf, pxl); 328 | SaveBinary(outf, pxr); 329 | SaveBinary(outf, pmy); 330 | 331 | SaveBinary(outf, lrxl); 332 | SaveBinary(outf, lrxr); 333 | SaveBinary(outf, lmy); 334 | SaveBinary(outf, luxl); 335 | SaveBinary(outf, luxr); 336 | SaveBinary(outf, lumy); 337 | 338 | SaveBinary(outf, lnxl); 339 | SaveBinary(outf, lnxr); 340 | SaveBinary(outf, lsum); 341 | 342 | SaveBinary(outf, lpxl); 343 | SaveBinary(outf, lpxr); 344 | SaveBinary(outf, lpmy); 345 | 346 | } 347 | 348 | void loadModel(LStream &inf) { 349 | 350 | _reset_left.loadModel(inf); 351 | _reset_right.loadModel(inf); 352 | _update_left.loadModel(inf); 353 | _update_right.loadModel(inf); 354 | _update_tilde.loadModel(inf); 355 | _recursive_tilde.loadModel(inf); 356 | 357 | 358 | LoadBinary(inf, &nxl, false); 359 | LoadBinary(inf, &nxr, false); 360 | LoadBinary(inf, &sum, false); 361 | 362 | LoadBinary(inf, &pxl, false); 363 | LoadBinary(inf, &pxr, false); 364 | LoadBinary(inf, &pmy, false); 365 | 366 | LoadBinary(inf, &lrxl, false); 367 | LoadBinary(inf, &lrxr, false); 368 | LoadBinary(inf, &lmy, false); 369 | LoadBinary(inf, &luxl, false); 370 | LoadBinary(inf, &luxr, false); 371 | LoadBinary(inf, &lumy, false); 372 | 373 | LoadBinary(inf, &lnxl, false); 374 | LoadBinary(inf, &lnxr, false); 375 | LoadBinary(inf, &lsum, false); 376 | 377 | LoadBinary(inf, &lpxl, false); 378 | LoadBinary(inf, &lpxr, false); 379 | LoadBinary(inf, &lpmy, false); 380 | 381 | } 382 | }; 383 | 384 | 385 | 386 | #endif /* SRC_AttRecursiveGatedNN_H_ */ 387 | -------------------------------------------------------------------------------- /AttentionPooling.h: -------------------------------------------------------------------------------- 1 | /* 2 | * AttentionPooling.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_AttentionPooling_H_ 9 | #define SRC_AttentionPooling_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | 
#include "MyLib.h" 14 | #include "Utiltensor.h" 15 | #include "Pooling.h" 16 | #include "UniLayer.h" 17 | 18 | using namespace mshadow; 19 | using namespace mshadow::expr; 20 | using namespace mshadow::utils; 21 | 22 | // For simpleness, we do not provide pooling on specified words, 23 | // which has been implemented in Pooling.h 24 | 25 | 26 | template 27 | class AttentionPooling { 28 | 29 | public: 30 | BiLayer _bi_gates; 31 | UniLayer _uni_gates; 32 | 33 | public: 34 | AttentionPooling() { 35 | } 36 | 37 | inline void initial(int hiddenSize, int attentionSize, bool bUseB = true, int seed = 0) { 38 | _bi_gates.initial(hiddenSize, hiddenSize, attentionSize, bUseB, seed); 39 | _uni_gates.initial(hiddenSize, hiddenSize, false, seed + 10, 3); 40 | } 41 | 42 | inline void initial(Tensor W1, Tensor W2, Tensor W3, Tensor b, bool bUseB = true) { 43 | _bi_gates.initial(W1, W2); 44 | _uni_gates.initial(W3, b, false, 3); 45 | 46 | } 47 | 48 | 49 | inline void release() { 50 | _bi_gates.release(); 51 | _uni_gates.release(); 52 | } 53 | 54 | virtual ~AttentionPooling() { 55 | // TODO Auto-generated destructor stub 56 | } 57 | 58 | inline dtype squarenormAll() { 59 | return _bi_gates.squarenormAll() + _uni_gates.squarenormAll(); 60 | } 61 | 62 | inline void scaleGrad(dtype scale) { 63 | _bi_gates.scaleGrad(scale); 64 | _uni_gates.scaleGrad(scale); 65 | } 66 | 67 | public: 68 | // xExp, xSumIndex, xSum ad xPoolIndex are temporal variables, which reduce computation in back-propagation 69 | inline void ComputeForwardScore(Tensor x, Tensor xAtt, 70 | Tensor xMExp, Tensor xExp, 71 | Tensor xSum, Tensor xPoolIndex, Tensor y) { 72 | y = 0.0; 73 | int seq_size = x.size(0); 74 | if(seq_size == 0) return; 75 | int dim1 = x.size(1), dim2 = x.size(2); 76 | int odim1 = y.size(0), odim2 = y.size(1); 77 | 78 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 79 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 80 | } 81 | 82 | _bi_gates.ComputeForwardScore(x, xAtt, xMExp); 83 | _uni_gates.ComputeForwardScore(xMExp, xExp); 84 | 85 | sumpool_forward(xExp, xSum); 86 | for (int idx = 0; idx < seq_size; idx++) { 87 | xPoolIndex[idx] = xExp[idx] / xSum; 88 | } 89 | for (int idx = 0; idx < seq_size; idx++) { 90 | y += x[idx] * xPoolIndex[idx]; 91 | } 92 | } 93 | 94 | inline void ComputeForwardScore(const std::vector >& x, const std::vector >& xAtt, 95 | std::vector >& xMExp, std::vector >& xExp, Tensor xSum, 96 | std::vector >& xPoolIndex, Tensor y) { 97 | y = 0.0; 98 | int seq_size = x.size(); 99 | if(seq_size == 0) return; 100 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 101 | int odim1 = y.size(0), odim2 = y.size(1); 102 | 103 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 104 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 105 | } 106 | 107 | _bi_gates.ComputeForwardScore(x, xAtt, xMExp); 108 | _uni_gates.ComputeForwardScore(xMExp, xExp); 109 | 110 | sumpool_forward(xExp, xSum); 111 | for (int idx = 0; idx < seq_size; idx++) { 112 | xPoolIndex[idx] = xExp[idx] / xSum; 113 | } 114 | for (int idx = 0; idx < seq_size; idx++) { 115 | y += x[idx] * xPoolIndex[idx]; 116 | } 117 | } 118 | 119 | 120 | // xExp, xSumIndex, xSum ad xPoolIndex are temporal variables, which reduce computation in back-propagation 121 | inline void ComputeForwardScore(Tensor x, Tensor xAtt, 122 | Tensor xMExp, Tensor xExp, 123 | Tensor xSum, Tensor xPoolIndex, Tensor y) { 124 | y = 0.0; 125 | int seq_size = x.size(0); 126 | if(seq_size == 0) return; 127 | int dim1 = x.size(1), dim2 = 
x.size(2); 128 | int odim1 = y.size(0), odim2 = y.size(1); 129 | 130 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 131 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 132 | } 133 | 134 | for (int idx = 0; idx < seq_size; idx++) { 135 | _bi_gates.ComputeForwardScore(x[idx], xAtt, xMExp[idx]); 136 | } 137 | _uni_gates.ComputeForwardScore(xMExp, xExp); 138 | 139 | sumpool_forward(xExp, xSum); 140 | for (int idx = 0; idx < seq_size; idx++) { 141 | xPoolIndex[idx] = xExp[idx] / xSum; 142 | } 143 | for (int idx = 0; idx < seq_size; idx++) { 144 | y += x[idx] * xPoolIndex[idx]; 145 | } 146 | } 147 | 148 | inline void ComputeForwardScore(const std::vector >& x, Tensor xAtt, 149 | std::vector >& xMExp, std::vector >& xExp, Tensor xSum, 150 | std::vector >& xPoolIndex, Tensor y) { 151 | y = 0.0; 152 | int seq_size = x.size(); 153 | if(seq_size == 0) return; 154 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 155 | int odim1 = y.size(0), odim2 = y.size(1); 156 | 157 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 158 | std::cerr << "AttentionPooling Forward error: dim invalid" << std::endl; 159 | } 160 | 161 | for (int idx = 0; idx < seq_size; idx++) { 162 | _bi_gates.ComputeForwardScore(x[idx], xAtt, xMExp[idx]); 163 | } 164 | _uni_gates.ComputeForwardScore(xMExp, xExp); 165 | 166 | sumpool_forward(xExp, xSum); 167 | for (int idx = 0; idx < seq_size; idx++) { 168 | xPoolIndex[idx] = xExp[idx] / xSum; 169 | } 170 | for (int idx = 0; idx < seq_size; idx++) { 171 | y += x[idx] * xPoolIndex[idx]; 172 | } 173 | } 174 | 175 | 176 | //please allocate the memory outside here 177 | inline void ComputeBackwardLoss(Tensor x, Tensor xAtt, 178 | Tensor xMExp, Tensor xExp, 179 | Tensor xSum, Tensor xPoolIndex, Tensor y, 180 | Tensor ly, Tensor lx, Tensor lxAtt, bool bclear = false) { 181 | int seq_size = x.size(0); 182 | if(seq_size == 0) return; 183 | int dim1 = x.size(1), dim2 = x.size(2); 184 | int odim1 = y.size(0), odim2 = y.size(1); 185 | 186 | if(bclear) lx = 0.0; 187 | if(bclear) lxAtt = 0.0; 188 | 189 | Tensor xMExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 190 | Tensor xExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 191 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 192 | Tensor xPoolIndexLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 193 | 194 | for (int idx = 0; idx < seq_size; idx++) { 195 | xPoolIndexLoss[idx] = ly * x[idx]; 196 | lx[idx] += ly * xPoolIndex[idx]; 197 | } 198 | 199 | for (int idx = 0; idx < seq_size; idx++) { 200 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 201 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 202 | } 203 | 204 | sumpool_backward(xSumLoss, xExpLoss); 205 | 206 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 207 | _bi_gates.ComputeBackwardLoss(x, xAtt, xMExp, xMExpLoss, lx, lxAtt); 208 | 209 | FreeSpace(&xMExpLoss); 210 | FreeSpace(&xExpLoss); 211 | FreeSpace(&xSumLoss); 212 | FreeSpace(&xPoolIndexLoss); 213 | } 214 | 215 | inline void ComputeBackwardLoss(const std::vector >& x, std::vector >& xAtt, 216 | std::vector >& xMExp, std::vector >& xExp, 217 | Tensor xSum, std::vector >& xPoolIndex, Tensor y, 218 | Tensor ly, std::vector >& lx, std::vector >& lxAtt, bool bclear = false) { 219 | int seq_size = x.size(); 220 | if(seq_size == 0) return; 221 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 222 | int odim1 = y.size(0), odim2 = y.size(1); 223 | 224 | 225 | if(bclear){ 226 | for (int idx = 0; idx < seq_size; idx++) { 227 | lx[idx] = 0.0; 228 | 
lxAtt[idx] = 0.0; 229 | } 230 | } 231 | 232 | vector > xMExpLoss(seq_size), xExpLoss(seq_size), xPoolIndexLoss(seq_size); 233 | for (int idx = 0; idx < seq_size; idx++) { 234 | xMExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 235 | xExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 236 | xPoolIndexLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 237 | } 238 | 239 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 240 | 241 | for (int idx = 0; idx < seq_size; idx++) { 242 | xPoolIndexLoss[idx] = ly * x[idx]; 243 | lx[idx] += ly * xPoolIndex[idx]; 244 | } 245 | 246 | for (int idx = 0; idx < seq_size; idx++) { 247 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 248 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 249 | } 250 | 251 | sumpool_backward(xSumLoss, xExpLoss); 252 | 253 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 254 | _bi_gates.ComputeBackwardLoss(x, xAtt, xMExp, xMExpLoss, lx, lxAtt); 255 | 256 | FreeSpace(&xSumLoss); 257 | for (int idx = 0; idx < seq_size; idx++) { 258 | FreeSpace(&(xMExpLoss[idx])); 259 | FreeSpace(&(xExpLoss[idx])); 260 | FreeSpace(&(xPoolIndexLoss[idx])); 261 | } 262 | } 263 | 264 | //please allocate the memory outside here 265 | inline void ComputeBackwardLoss(Tensor x, Tensor xAtt, 266 | Tensor xMExp, Tensor xExp, 267 | Tensor xSum, Tensor xPoolIndex, Tensor y, 268 | Tensor ly, Tensor lx, Tensor lxAtt, bool bclear = false) { 269 | int seq_size = x.size(0); 270 | if(seq_size == 0) return; 271 | int dim1 = x.size(1), dim2 = x.size(2); 272 | int odim1 = y.size(0), odim2 = y.size(1); 273 | 274 | if(bclear) lx = 0.0; 275 | if(bclear) lxAtt = 0.0; 276 | 277 | Tensor xMExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 278 | Tensor xExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 279 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 280 | Tensor xPoolIndexLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 281 | 282 | for (int idx = 0; idx < seq_size; idx++) { 283 | xPoolIndexLoss[idx] = ly * x[idx]; 284 | lx[idx] += ly * xPoolIndex[idx]; 285 | } 286 | 287 | for (int idx = 0; idx < seq_size; idx++) { 288 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 289 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 290 | } 291 | 292 | sumpool_backward(xSumLoss, xExpLoss); 293 | 294 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 295 | for (int idx = 0; idx < seq_size; idx++) { 296 | _bi_gates.ComputeBackwardLoss(x[idx], xAtt, xMExp[idx], xMExpLoss[idx], lx[idx], lxAtt); 297 | } 298 | 299 | FreeSpace(&xMExpLoss); 300 | FreeSpace(&xExpLoss); 301 | FreeSpace(&xSumLoss); 302 | FreeSpace(&xPoolIndexLoss); 303 | } 304 | 305 | inline void ComputeBackwardLoss(const std::vector >& x, Tensor xAtt, 306 | std::vector >& xMExp, std::vector >& xExp, 307 | Tensor xSum, std::vector >& xPoolIndex, Tensor y, 308 | Tensor ly, std::vector >& lx, Tensor lxAtt, bool bclear = false) { 309 | int seq_size = x.size(); 310 | if(seq_size == 0) return; 311 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 312 | int odim1 = y.size(0), odim2 = y.size(1); 313 | 314 | 315 | if(bclear){ 316 | for (int idx = 0; idx < seq_size; idx++) { 317 | lx[idx] = 0.0; 318 | lxAtt[idx] = 0.0; 319 | } 320 | } 321 | 322 | vector > xMExpLoss(seq_size), xExpLoss(seq_size), xPoolIndexLoss(seq_size); 323 | for (int idx = 0; idx < seq_size; idx++) { 324 | xMExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 325 | xExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 326 | xPoolIndexLoss[idx] = 
NewTensor(Shape2(dim1, dim2), d_zero); 327 | } 328 | 329 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 330 | 331 | for (int idx = 0; idx < seq_size; idx++) { 332 | xPoolIndexLoss[idx] = ly * x[idx]; 333 | lx[idx] += ly * xPoolIndex[idx]; 334 | } 335 | 336 | for (int idx = 0; idx < seq_size; idx++) { 337 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 338 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 339 | } 340 | 341 | sumpool_backward(xSumLoss, xExpLoss); 342 | 343 | _uni_gates.ComputeBackwardLoss(xMExp, xExp, xExpLoss, xMExpLoss); 344 | for (int idx = 0; idx < seq_size; idx++) { 345 | _bi_gates.ComputeBackwardLoss(x[idx], xAtt, xMExp[idx], xMExpLoss[idx], lx[idx], lxAtt); 346 | } 347 | 348 | FreeSpace(&xSumLoss); 349 | for (int idx = 0; idx < seq_size; idx++) { 350 | FreeSpace(&(xExpLoss[idx])); 351 | FreeSpace(&(xPoolIndexLoss[idx])); 352 | } 353 | } 354 | 355 | inline void randomprint(int num) { 356 | _bi_gates.randomprint(num); 357 | _uni_gates.randomprint(num); 358 | } 359 | 360 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 361 | _bi_gates.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 362 | _uni_gates.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 363 | } 364 | 365 | void writeModel(LStream &outf) { 366 | _bi_gates.writeModel(outf); 367 | _uni_gates.writeModel(outf); 368 | 369 | } 370 | 371 | void loadModel(LStream &inf) { 372 | _bi_gates.loadModel(inf); 373 | _uni_gates.loadModel(inf); 374 | 375 | } 376 | }; 377 | 378 | #endif /* SRC_AttentionPooling_H_ */ 379 | -------------------------------------------------------------------------------- /AvgPerceptron1O.h: -------------------------------------------------------------------------------- 1 | /* 2 | * AvgPerceptron1O.h 3 | * 4 | * Created on: Oct 22, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef AVGPERCEPTRON1O_H_ 9 | #define AVGPERCEPTRON1O_H_ 10 | 11 | #include "tensor.h" 12 | #include "Utiltensor.h" 13 | #include "MyLib.h" 14 | 15 | using namespace mshadow; 16 | using namespace mshadow::expr; 17 | using namespace mshadow::utils; 18 | 19 | // Weight updating process implemented without theory support, 20 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 21 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 
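// A minimal, self-contained sketch (plain std::vector, hypothetical names; not part of
// the original header) of the lazy-averaging trick the class below relies on: rather
// than adding the whole weight vector to the running sum after every update, each
// feature folds in its pending contribution only when it is next updated or queried.
#include <vector>
#include <cstddef>

struct LazyAveragedWeights {
  std::vector<double> w;     // current weights
  std::vector<double> wsum;  // accumulated weights, refreshed lazily
  std::vector<int> last;     // step at which wsum[i] was last refreshed
  int step;                  // global update counter

  explicit LazyAveragedWeights(std::size_t n)
      : w(n, 0.0), wsum(n, 0.0), last(n, 0), step(0) {}

  // Fold the contribution of w[i] for every step since last[i] into wsum[i].
  void refresh(std::size_t i) {
    wsum[i] += (step - last[i]) * w[i];
    last[i] = step;
  }

  // Perceptron-style update of one (sparse) feature; each call counts as one step here.
  void update(std::size_t i, double delta) {
    ++step;
    refresh(i);
    w[i] += delta;
  }

  // Averaged weight for test time; the class below keeps only the raw sum (see sumWeight()).
  double averaged(std::size_t i) {
    refresh(i);
    return step > 0 ? wsum[i] / step : 0.0;
  }
};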
22 | 23 | template 24 | class AvgPerceptron1O { 25 | 26 | public: 27 | 28 | hash_set _indexers; 29 | 30 | Tensor _W; 31 | 32 | Tensor _gradW; 33 | 34 | Tensor _sumW; 35 | 36 | int _max_update; 37 | NRVec _last_update; 38 | 39 | public: 40 | 41 | AvgPerceptron1O() { 42 | _indexers.clear(); 43 | } 44 | 45 | inline void initial(int nISize, int seed = 0) { 46 | dtype bound = sqrt(6.0 / (nISize + 1)); 47 | //dtype bound = 0.01; 48 | 49 | _W = NewTensor(Shape1(nISize), d_zero); 50 | _gradW = NewTensor(Shape1(nISize), d_zero); 51 | _sumW = NewTensor(Shape1(nISize), d_one); 52 | 53 | _max_update = 0; 54 | _last_update.resize(nISize); 55 | _last_update = 0; 56 | } 57 | 58 | inline void initial(Tensor W) { 59 | static int nOSize, nISize; 60 | nISize = W.size(0); 61 | 62 | _W = NewTensor(Shape1(nISize), d_zero); 63 | _gradW = NewTensor(Shape1(nISize), d_zero); 64 | _sumW = NewTensor(Shape1(nISize), d_one); 65 | Copy(_W, W); 66 | 67 | _max_update = 0; 68 | _last_update.resize(nISize); 69 | _last_update = 0; 70 | } 71 | 72 | inline void release() { 73 | FreeSpace(&_W); 74 | FreeSpace(&_gradW); 75 | FreeSpace(&_sumW); 76 | _indexers.clear(); 77 | } 78 | 79 | virtual ~AvgPerceptron1O() { 80 | // TODO Auto-generated destructor stub 81 | } 82 | 83 | inline dtype squarenormAll() { 84 | dtype result = squarenorm(_gradW); 85 | 86 | return result; 87 | } 88 | 89 | inline void scaleGrad(dtype scale) { 90 | _gradW = _gradW * scale; 91 | } 92 | 93 | public: 94 | void ComputeForwardScore(const std::vector& x, dtype& y, bool bTrain = false) { 95 | static long long featNum, featId; 96 | featNum = x.size(); 97 | y = 0.0; 98 | for (int idx = 0; idx < featNum; idx++) { 99 | featId = x[idx]; 100 | if (featId >= _W.size(0)) 101 | continue; 102 | if (bTrain) 103 | y += _W[featId]; 104 | else 105 | y += sumWeight(featId); 106 | //y += _W[featId]; 107 | } 108 | } 109 | 110 | // loss is stopped at this layer, since the input is one-hold alike 111 | void ComputeBackwardLoss(const std::vector& x, dtype ly) { 112 | //_gradW 113 | static long long featNum, featId; 114 | featNum = x.size(); 115 | for (int idx = 0; idx < featNum; idx++) { 116 | featId = x[idx]; 117 | if (featId >= _W.size(0)) 118 | continue; 119 | _indexers.insert(featId); 120 | _gradW[featId] += ly; 121 | } 122 | } 123 | 124 | void randomprint(int num) { 125 | static int nISize; 126 | nISize = _W.size(0); 127 | 128 | int count = 0; 129 | while (count < num) { 130 | int idx = rand() % nISize; 131 | std::cout << "_W[" << idx << "]=" << _W[idx] << " "; 132 | count++; 133 | } 134 | 135 | std::cout << std::endl; 136 | } 137 | 138 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 139 | static int startPos; 140 | 141 | static hash_set::iterator it; 142 | 143 | _max_update++; 144 | 145 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 146 | int index = *it; 147 | _sumW[index] += (_max_update - _last_update[index]) * _W[index] - _gradW[index]; 148 | _W[index] = _W[index] - _gradW[index]; 149 | _last_update[index] = _max_update; 150 | } 151 | 152 | clearGrad(); 153 | } 154 | 155 | void clearGrad() { 156 | static hash_set::iterator it; 157 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 158 | int index = *it; 159 | _gradW[index] = 0.0; 160 | } 161 | _indexers.clear(); 162 | 163 | } 164 | 165 | dtype sumWeight(int featId) { 166 | if (_last_update[featId] < _max_update) { 167 | int times = _max_update - _last_update[featId]; 168 | _sumW[featId] += _W[featId] * times; 169 | _last_update[featId] = _max_update; 170 | } 171 
| 172 | return _sumW[featId]; 173 | } 174 | 175 | void writeModel(LStream &outf) { 176 | SaveBinary(outf, _W); 177 | SaveBinary(outf, _gradW); 178 | SaveBinary(outf, _sumW); 179 | WriteBinary(outf, _max_update); 180 | WriteVector(outf, _last_update); 181 | 182 | } 183 | 184 | void loadModel(LStream &inf) { 185 | LoadBinary(inf, &_W, false); 186 | LoadBinary(inf, &_gradW, false); 187 | LoadBinary(inf, &_sumW, false); 188 | ReadBinary(inf, _max_update); 189 | ReadVector(inf, _last_update); 190 | } 191 | }; 192 | 193 | #endif /* AVGPERCEPTRON1O_H_ */ 194 | -------------------------------------------------------------------------------- /BiLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * BiLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_BiLayer_H_ 9 | #define SRC_BiLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class BiLayer { 20 | 21 | public: 22 | 23 | Tensor _WL; 24 | Tensor _WR; 25 | Tensor _b; 26 | 27 | Tensor _gradWL; 28 | Tensor _gradWR; 29 | Tensor _gradb; 30 | 31 | Tensor _eg2WL; 32 | Tensor _eg2WR; 33 | Tensor _eg2b; 34 | 35 | bool _bUseB; 36 | 37 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 38 | 39 | public: 40 | BiLayer() { 41 | } 42 | 43 | inline void initial(int nOSize, int nLISize, int nRISize, bool bUseB = true, int seed = 0, int funcType = 0) { 44 | dtype bound = sqrt(6.0 / (nOSize + nLISize + nRISize + 1)); 45 | //dtype bound = 0.01; 46 | 47 | _WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 48 | _gradWL = NewTensor(Shape2(nOSize, nLISize), d_zero); 49 | _eg2WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 50 | 51 | _WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 52 | _gradWR = NewTensor(Shape2(nOSize, nRISize), d_zero); 53 | _eg2WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 54 | 55 | _b = NewTensor(Shape2(1, nOSize), d_zero); 56 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 57 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 58 | 59 | random(_WL, -1.0 * bound, 1.0 * bound, seed); 60 | random(_WR, -1.0 * bound, 1.0 * bound, seed+1); 61 | random(_b, -1.0 * bound, 1.0 * bound, seed+2); 62 | 63 | _bUseB = bUseB; 64 | _funcType = funcType; 65 | } 66 | 67 | inline void initial(Tensor WL, Tensor WR, Tensor b, bool bUseB = true, int funcType = 0) { 68 | static int nOSize, nLISize, nRISize; 69 | nOSize = WL.size(0); 70 | nLISize = WL.size(1); 71 | nRISize = WR.size(1); 72 | 73 | _WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 74 | _gradWL = NewTensor(Shape2(nOSize, nLISize), d_zero); 75 | _eg2WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 76 | Copy(_WL, WL); 77 | 78 | _WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 79 | _gradWR = NewTensor(Shape2(nOSize, nRISize), d_zero); 80 | _eg2WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 81 | Copy(_WR, WR); 82 | 83 | _b = NewTensor(Shape2(1, nOSize), d_zero); 84 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 85 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 86 | 87 | if (bUseB) 88 | Copy(_b, b); 89 | 90 | _bUseB = bUseB; 91 | _funcType = funcType; 92 | } 93 | 94 | 95 | inline void initial(Tensor WL, Tensor WR, int funcType = 0) { 96 | static int nOSize, nLISize, nRISize; 97 | nOSize = WL.size(0); 98 | nLISize = WL.size(1); 99 | nRISize = WR.size(1); 100 | 101 | _WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 102 | _gradWL = 
NewTensor(Shape2(nOSize, nLISize), d_zero); 103 | _eg2WL = NewTensor(Shape2(nOSize, nLISize), d_zero); 104 | Copy(_WL, WL); 105 | 106 | _WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 107 | _gradWR = NewTensor(Shape2(nOSize, nRISize), d_zero); 108 | _eg2WR = NewTensor(Shape2(nOSize, nRISize), d_zero); 109 | Copy(_WR, WR); 110 | 111 | _b = NewTensor(Shape2(1, nOSize), d_zero); 112 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 113 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 114 | 115 | 116 | _bUseB = false; 117 | _funcType = funcType; 118 | } 119 | 120 | inline void release() { 121 | FreeSpace(&_WL); 122 | FreeSpace(&_gradWL); 123 | FreeSpace(&_eg2WL); 124 | FreeSpace(&_WR); 125 | FreeSpace(&_gradWR); 126 | FreeSpace(&_eg2WR); 127 | FreeSpace(&_b); 128 | FreeSpace(&_gradb); 129 | FreeSpace(&_eg2b); 130 | } 131 | 132 | virtual ~BiLayer() { 133 | // TODO Auto-generated destructor stub 134 | } 135 | 136 | inline dtype squarenormAll() { 137 | dtype result = squarenorm(_gradWL); 138 | result += squarenorm(_gradWR); 139 | if (_bUseB) { 140 | result += squarenorm(_gradb); 141 | } 142 | 143 | return result; 144 | } 145 | 146 | inline void scaleGrad(dtype scale) { 147 | _gradWL = _gradWL * scale; 148 | _gradWR = _gradWR * scale; 149 | if (_bUseB) { 150 | _gradb = _gradb * scale; 151 | } 152 | } 153 | 154 | public: 155 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor y) { 156 | y = dot(xl, _WL.T()); 157 | y += dot(xr, _WR.T()); 158 | if (_bUseB) 159 | y = y + _b; 160 | if (_funcType == 0) 161 | y = F(y); 162 | else if (_funcType == 1) 163 | y = F(y); 164 | else if (_funcType == 3) 165 | y = F(y); 166 | } 167 | 168 | 169 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor y) { 170 | int seq_size = y.size(0); 171 | for(int id = 0; id < seq_size; id++){ 172 | y[id] = dot(xl[id], _WL.T()); 173 | y[id] += dot(xr[id], _WR.T()); 174 | if (_bUseB) 175 | y[id] = y[id] + _b; 176 | if (_funcType == 0) 177 | y[id] = F(y[id]); 178 | else if (_funcType == 1) 179 | y[id] = F(y[id]); 180 | else if (_funcType == 3) 181 | y[id] = F(y[id]); 182 | } 183 | } 184 | 185 | inline void ComputeForwardScore(const std::vector >& xl, const std::vector >& xr, 186 | std::vector > &y) { 187 | int seq_size = y.size(); 188 | for(int id = 0; id < seq_size; id++){ 189 | y[id] = dot(xl[id], _WL.T()); 190 | y[id] += dot(xr[id], _WR.T()); 191 | if (_bUseB) 192 | y[id] = y[id] + _b; 193 | if (_funcType == 0) 194 | y[id] = F(y[id]); 195 | else if (_funcType == 1) 196 | y[id] = F(y[id]); 197 | else if (_funcType == 3) 198 | y[id] = F(y[id]); 199 | } 200 | } 201 | 202 | //please allocate the memory outside here 203 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor y, Tensor ly, 204 | Tensor lxl, Tensor lxr, bool bclear = false) { 205 | //_gradW 206 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 207 | AllocSpace(&deri_yx); 208 | AllocSpace(&cly); 209 | if(bclear){ 210 | lxl = 0.0; 211 | lxr = 0.0; 212 | } 213 | if (_funcType == 0) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 1) { 217 | deri_yx = F(y); 218 | cly = ly * deri_yx; 219 | } else if (_funcType == 3) { 220 | cly = ly * y; 221 | } else { 222 | //cly = ly; 223 | Copy(cly, ly); 224 | } 225 | //_gradW 226 | _gradWL += dot(cly.T(), xl); 227 | _gradWR += dot(cly.T(), xr); 228 | 229 | //_gradb 230 | if (_bUseB) 231 | _gradb += cly; 232 | 233 | //lx 234 | lxl += dot(cly, _WL); 235 | lxr += dot(cly, _WR); 236 | 237 | FreeSpace(&deri_yx); 238 | FreeSpace(&cly); 239 | } 240 | 241 
| 242 | //please allocate the memory outside here 243 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor y, Tensor ly, 244 | Tensor lxl, Tensor lxr, bool bclear = false) { 245 | int seq_size = y.size(0); 246 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 247 | //_gradW 248 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 249 | AllocSpace(&deri_yx); 250 | AllocSpace(&cly); 251 | 252 | if(bclear){ 253 | lxl = 0.0; 254 | lxr = 0.0; 255 | } 256 | for (int id = 0; id < seq_size; id++) { 257 | if (_funcType == 0) { 258 | deri_yx = F(y[id]); 259 | cly = ly[id] * deri_yx; 260 | } else if (_funcType == 1) { 261 | deri_yx = F(y[id]); 262 | cly = ly[id] * deri_yx; 263 | } else if (_funcType == 3) { 264 | cly = ly[id] * y[id]; 265 | } else { 266 | //cly = ly; 267 | Copy(cly, ly[id]); 268 | } 269 | //_gradW 270 | _gradWL += dot(cly.T(), xl[id]); 271 | _gradWR += dot(cly.T(), xr[id]); 272 | 273 | //_gradb 274 | if (_bUseB) 275 | _gradb += cly; 276 | 277 | //lx 278 | lxl[id] += dot(cly, _WL); 279 | lxr[id] += dot(cly, _WR); 280 | } 281 | 282 | FreeSpace(&deri_yx); 283 | FreeSpace(&cly); 284 | } 285 | 286 | inline void ComputeBackwardLoss(const std::vector > &xl, const std::vector > &xr, 287 | const std::vector > &y, const std::vector > &ly, 288 | std::vector > &lxl, std::vector > &lxr, bool bclear = false) { 289 | int seq_size = y.size(); 290 | assert(seq_size > 0); 291 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 292 | //_gradW 293 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 294 | AllocSpace(&deri_yx); 295 | AllocSpace(&cly); 296 | 297 | if(bclear){ 298 | for (int id = 0; id < seq_size; id++) { 299 | lxl[id] = 0.0; 300 | lxr[id] = 0.0; 301 | } 302 | } 303 | for (int id = 0; id < seq_size; id++) { 304 | if (_funcType == 0) { 305 | deri_yx = F(y[id]); 306 | cly = ly[id] * deri_yx; 307 | } else if (_funcType == 1) { 308 | deri_yx = F(y[id]); 309 | cly = ly[id] * deri_yx; 310 | } else if (_funcType == 3) { 311 | cly = ly[id] * y[id]; 312 | } else { 313 | //cly = ly; 314 | Copy(cly, ly[id]); 315 | } 316 | //_gradW 317 | _gradWL += dot(cly.T(), xl[id]); 318 | _gradWR += dot(cly.T(), xr[id]); 319 | 320 | //_gradb 321 | if (_bUseB) 322 | _gradb += cly; 323 | 324 | //lx 325 | lxl[id] += dot(cly, _WL); 326 | lxr[id] += dot(cly, _WR); 327 | } 328 | 329 | FreeSpace(&deri_yx); 330 | FreeSpace(&cly); 331 | } 332 | 333 | inline void randomprint(int num) { 334 | static int nOSize, nLISize, nRISize; 335 | nOSize = _WL.size(0); 336 | nLISize = _WL.size(1); 337 | nRISize = _WR.size(1); 338 | int count = 0; 339 | while (count < num) { 340 | int idxl = rand() % nOSize; 341 | int idyl = rand() % nLISize; 342 | int idxr = rand() % nOSize; 343 | int idyr = rand() % nRISize; 344 | 345 | std::cout << "_WL[" << idxl << "," << idyl << "]=" << _WL[idxl][idyl] << " "; 346 | std::cout << "_WR[" << idxr << "," << idyr << "]=" << _WR[idxr][idyr] << " "; 347 | 348 | if (_bUseB) { 349 | int idz = rand() % nOSize; 350 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 351 | } 352 | count++; 353 | } 354 | 355 | std::cout << std::endl; 356 | } 357 | 358 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 359 | _gradWL = _gradWL + _WL * regularizationWeight; 360 | _eg2WL = _eg2WL + _gradWL * _gradWL; 361 | _WL = _WL - _gradWL * adaAlpha / F(_eg2WL + adaEps); 362 | 363 | _gradWR = _gradWR + _WR * regularizationWeight; 364 | _eg2WR = _eg2WR + _gradWR * _gradWR; 365 | _WR = _WR - _gradWR * adaAlpha / F(_eg2WR + adaEps); 366 | 367 | 
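// Each parameter block follows the same AdaGrad-with-L2 recipe (applied to _WL and _WR
// above, and to _b below when _bUseB is set):
//   g <- g + lambda * w        add the L2 regularization term to the gradient
//   G <- G + g * g             accumulate squared gradients (_eg2WL / _eg2WR / _eg2b)
//   w <- w - alpha * g / sqrt(G + eps)
// where the F(...) call is assumed to be an element-wise square root whose template
// argument was lost in this dump.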
if (_bUseB) { 368 | _gradb = _gradb + _b * regularizationWeight; 369 | _eg2b = _eg2b + _gradb * _gradb; 370 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 371 | } 372 | 373 | clearGrad(); 374 | } 375 | 376 | inline void clearGrad() { 377 | _gradWL = 0; 378 | _gradWR = 0; 379 | if (_bUseB) 380 | _gradb = 0; 381 | } 382 | 383 | void writeModel(LStream &outf) { 384 | SaveBinary(outf, _WL); 385 | SaveBinary(outf, _WR); 386 | SaveBinary(outf, _b); 387 | SaveBinary(outf, _gradWL); 388 | SaveBinary(outf, _gradWR); 389 | SaveBinary(outf, _gradb); 390 | SaveBinary(outf, _eg2WL); 391 | SaveBinary(outf, _eg2WR); 392 | SaveBinary(outf, _eg2b); 393 | 394 | WriteBinary(outf, _bUseB); 395 | WriteBinary(outf, _funcType); 396 | // cout << "Bilayer " << _bUseB << _funcType << endl; 397 | // cout << "Bilayer value: " << _WR[1][1] << endl; 398 | 399 | } 400 | 401 | void loadModel(LStream &inf) { 402 | LoadBinary(inf, &_WL, false); 403 | LoadBinary(inf, &_WR, false); 404 | LoadBinary(inf, &_b, false); 405 | LoadBinary(inf, &_gradWL, false); 406 | LoadBinary(inf, &_gradWR, false); 407 | LoadBinary(inf, &_gradb, false); 408 | LoadBinary(inf, &_eg2WL, false); 409 | LoadBinary(inf, &_eg2WR, false); 410 | LoadBinary(inf, &_eg2b, false); 411 | 412 | ReadBinary(inf, _bUseB); 413 | ReadBinary(inf, _funcType); 414 | // cout << "Bilayer " << _bUseB << _funcType << endl; 415 | // cout << "Bilayer value: " << _WR[1][1] << endl; 416 | } 417 | 418 | }; 419 | 420 | #endif /* SRC_BiLayer_H_ */ 421 | -------------------------------------------------------------------------------- /CheckGrad.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CheckGrad.h 3 | * 4 | * Created on: Dec 4, 2015 5 | * Author: mason 6 | */ 7 | 8 | #ifndef BASIC_CHECKGRAD_H_ 9 | #define BASIC_CHECKGRAD_H_ 10 | 11 | #include 12 | #include "tensor.h" 13 | #include "MyLib.h" 14 | 15 | using namespace nr; 16 | using namespace std; 17 | using namespace mshadow; 18 | using namespace mshadow::expr; 19 | using namespace mshadow::utils; 20 | 21 | template 22 | void checkgrad(Classifier* classifier, const vector& examples, Tensor& Wd, 23 | const Tensor& gradWd, const string& mark, int iter) { 24 | int charseed = mark.length(); 25 | for (int i = 0; i < mark.length(); i++) { 26 | charseed = (int) (mark[i]) * 5 + charseed; 27 | } 28 | srand(iter + charseed); 29 | std::vector idRows, idCols; 30 | idRows.clear(); 31 | idCols.clear(); 32 | for (int i = 0; i < Wd.size(0); ++i) 33 | idRows.push_back(i); 34 | for (int idx = 0; idx < Wd.size(1); idx++) 35 | idCols.push_back(idx); 36 | 37 | random_shuffle(idRows.begin(), idRows.end()); 38 | random_shuffle(idCols.begin(), idCols.end()); 39 | 40 | int check_i = idRows[0], check_j = idCols[0]; 41 | 42 | dtype orginValue = Wd[check_i][check_j]; 43 | 44 | Wd[check_i][check_j] = orginValue + 0.001; 45 | dtype lossAdd = 0.0; 46 | for (int i = 0; i < examples.size(); i++) { 47 | Example oneExam = examples[i]; 48 | lossAdd += classifier->computeScore(oneExam); 49 | } 50 | 51 | Wd[check_i][check_j] = orginValue - 0.001; 52 | dtype lossPlus = 0.0; 53 | for (int i = 0; i < examples.size(); i++) { 54 | Example oneExam = examples[i]; 55 | lossPlus += classifier->computeScore(oneExam); 56 | } 57 | 58 | dtype mockGrad = (lossAdd - lossPlus) / 0.002; 59 | mockGrad = mockGrad / examples.size(); 60 | dtype computeGrad = gradWd[check_i][check_j]; 61 | 62 | printf("Iteration %d, Checking gradient for %s[%d][%d]:\t", iter, 63 | mark.c_str(), check_i, check_j); 64 | printf("mock grad = %.18f, 
computed grad = %.18f\n", mockGrad, computeGrad); 65 | 66 | Wd[check_i][check_j] = orginValue; 67 | } 68 | 69 | template 70 | void checkgrad(Classifier* classifier, const vector& examples, Tensor& Wd, 71 | const Tensor& gradWd, const string& mark, int iter, 72 | const hash_set& indexes, bool bRow = true) { 73 | if (indexes.size() == 0) 74 | return; 75 | int charseed = mark.length(); 76 | for (int i = 0; i < mark.length(); i++) { 77 | charseed = (int) (mark[i]) * 5 + charseed; 78 | } 79 | srand(iter + charseed); 80 | std::vector idRows, idCols; 81 | idRows.clear(); 82 | idCols.clear(); 83 | static hash_set::iterator it; 84 | if (bRow) { 85 | for (it = indexes.begin(); it != indexes.end(); ++it) 86 | idRows.push_back(*it); 87 | for (int idx = 0; idx < Wd.size(1); idx++) 88 | idCols.push_back(idx); 89 | } else { 90 | for (it = indexes.begin(); it != indexes.end(); ++it) 91 | idCols.push_back(*it); 92 | for (int idx = 0; idx < Wd.size(0); idx++) 93 | idRows.push_back(idx); 94 | } 95 | 96 | random_shuffle(idRows.begin(), idRows.end()); 97 | random_shuffle(idCols.begin(), idCols.end()); 98 | 99 | int check_i = idRows[0], check_j = idCols[0]; 100 | 101 | dtype orginValue = Wd[check_i][check_j]; 102 | 103 | Wd[check_i][check_j] = orginValue + 0.001; 104 | dtype lossAdd = 0.0; 105 | for (int i = 0; i < examples.size(); i++) { 106 | Example oneExam = examples[i]; 107 | lossAdd += classifier->computeScore(oneExam); 108 | } 109 | 110 | Wd[check_i][check_j] = orginValue - 0.001; 111 | dtype lossPlus = 0.0; 112 | for (int i = 0; i < examples.size(); i++) { 113 | Example oneExam = examples[i]; 114 | lossPlus += classifier->computeScore(oneExam); 115 | } 116 | 117 | dtype mockGrad = (lossAdd - lossPlus) / 0.002; 118 | mockGrad = mockGrad / examples.size(); 119 | dtype computeGrad = gradWd[check_i][check_j]; 120 | 121 | printf("Iteration %d, Checking gradient for %s[%d][%d]:\t", iter, 122 | mark.c_str(), check_i, check_j); 123 | printf("mock grad = %.18f, computed grad = %.18f\n", mockGrad, computeGrad); 124 | 125 | Wd[check_i][check_j] = orginValue; 126 | 127 | } 128 | 129 | 130 | #endif /* BASIC_CHECKGRAD_H_ */ 131 | -------------------------------------------------------------------------------- /Dropout.h: -------------------------------------------------------------------------------- 1 | #ifndef DROPOUT 2 | #define DROPOUT 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | 7 | 8 | using namespace std; 9 | using namespace mshadow; 10 | using namespace mshadow::expr; 11 | using namespace mshadow::utils; 12 | 13 | 14 | template 15 | inline void dropoutcol(Tensor w, dtype dropOut) 16 | { 17 | w = 1.0; 18 | std::vector indexes; 19 | for (int i = 0; i < w.size(1); ++i) 20 | indexes.push_back(i); 21 | int dropNum = (int) (w.size(1) * dropOut); 22 | 23 | for(int idx = 0; idx < w.size(0); idx++) 24 | { 25 | random_shuffle(indexes.begin(), indexes.end()); 26 | for(int idy = 0; idy < dropNum; idy++) 27 | { 28 | w[idx][indexes[idy]] = 0.0; 29 | } 30 | } 31 | } 32 | 33 | 34 | template 35 | inline void dropoutrow(Tensor w, dtype dropOut) 36 | { 37 | w = 1.0; 38 | std::vector indexes; 39 | for (int i = 0; i < w.size(0); ++i) 40 | indexes.push_back(i); 41 | int dropNum = (int) (w.size(0) * dropOut); 42 | 43 | for(int idx = 0; idx < w.size(1); idx++) 44 | { 45 | random_shuffle(indexes.begin(), indexes.end()); 46 | for(int idy = 0; idy < dropNum; idy++) 47 | { 48 | w[indexes[idy]][idx] = 0.0; 49 | } 50 | } 51 | } 52 | 53 | 54 | #endif 55 | 
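// Usage sketch for the dropout helpers above (not part of the original header).
// Assumptions: the angle-bracketed template arguments (e.g. Tensor<xpu, 2, dtype>,
// NewTensor<xpu>) were stripped from this dump and are restored here by analogy with
// mshadow; applyColDropout and the 0.2 rate are illustrative names/values only.
template<typename xpu>
inline void applyColDropout(Tensor<xpu, 2, dtype> rep, dtype dropRate) {
  Tensor<xpu, 2, dtype> mask = NewTensor<xpu>(Shape2(rep.size(0), rep.size(1)), d_one);
  dropoutcol(mask, dropRate);   // zero a random dropRate fraction of the columns in each row
  rep = rep * mask;             // element-wise product, as used throughout this library
  FreeSpace(&mask);
}
// e.g. applyColDropout(wordRepresentation, 0.2), during training only.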
-------------------------------------------------------------------------------- /GRNN.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GRNN.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_GRNN_H_ 9 | #define SRC_GRNN_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | #include "MyLib.h" 14 | #include "Utiltensor.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | 21 | template 22 | class GRNN { 23 | public: 24 | BiLayer _rnn_update; 25 | BiLayer _rnn_reset; 26 | BiLayer _rnn; 27 | bool _left2right; 28 | 29 | Tensor _null, _nullLoss; 30 | 31 | public: 32 | GRNN() { 33 | } 34 | 35 | inline void initial(int outputsize, int inputsize, int seed = 0) { 36 | _left2right = true; 37 | 38 | _rnn_update.initial(outputsize, outputsize, inputsize, true, seed, 1); 39 | _rnn_reset.initial(outputsize, outputsize, inputsize, true, seed + 10, 1); 40 | _rnn.initial(outputsize, outputsize, inputsize, true, seed + 20, 0); 41 | 42 | _null = NewTensor(Shape2(1, outputsize), d_zero); 43 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 44 | 45 | } 46 | 47 | inline void initial(int outputsize, int inputsize, bool left2right, int seed = 0) { 48 | _left2right = left2right; 49 | 50 | _rnn_update.initial(outputsize, outputsize, inputsize, true, seed, 1); 51 | _rnn_reset.initial(outputsize, outputsize, inputsize, true, seed + 10, 1); 52 | _rnn.initial(outputsize, outputsize, inputsize, true, seed + 20, 0); 53 | 54 | _null = NewTensor(Shape2(1, outputsize), d_zero); 55 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 56 | 57 | } 58 | 59 | inline void initial(Tensor WL, Tensor WR, Tensor b, Tensor uWL, Tensor uWR, 60 | Tensor ub, Tensor rWL, Tensor rWR, Tensor rb, bool left2right = true) { 61 | _left2right = left2right; 62 | 63 | _rnn_update.initial(uWL, uWR, ub, true, 1); 64 | _rnn_reset.initial(rWL, rWR, rb, true, 1); 65 | _rnn.initial(WL, WR, b, true); 66 | 67 | _null = NewTensor(Shape2(1, b.size(1)), d_zero); 68 | _nullLoss = NewTensor(Shape2(1, b.size(1)), d_zero); 69 | } 70 | 71 | inline void release() { 72 | _rnn_update.release(); 73 | _rnn_reset.release(); 74 | _rnn.release(); 75 | 76 | FreeSpace(&_null); 77 | FreeSpace(&_nullLoss); 78 | } 79 | 80 | virtual ~GRNN() { 81 | // TODO Auto-generated destructor stub 82 | } 83 | 84 | inline dtype squarenormAll() { 85 | dtype norm = _rnn_update.squarenormAll(); 86 | norm += _rnn_reset.squarenormAll(); 87 | norm += _rnn.squarenormAll(); 88 | 89 | return norm; 90 | } 91 | 92 | inline void scaleGrad(dtype scale) { 93 | _rnn_update.scaleGrad(scale); 94 | _rnn_reset.scaleGrad(scale); 95 | _rnn.scaleGrad(scale); 96 | } 97 | 98 | public: 99 | 100 | inline void ComputeForwardScore(Tensor x, Tensor mry, Tensor ry, Tensor uy, 101 | Tensor cy, Tensor y) { 102 | mry = 0.0; 103 | ry = 0.0; 104 | uy = 0.0; 105 | cy = 0.0; 106 | y = 0.0; 107 | int seq_size = x.size(0); 108 | if (seq_size == 0) 109 | return; 110 | 111 | if (_left2right) { 112 | for (int idx = 0; idx < seq_size; idx++) { 113 | if (idx == 0) { 114 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 115 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 116 | y[idx] = uy[idx] * cy[idx]; 117 | } else { 118 | _rnn_reset.ComputeForwardScore(y[idx - 1], x[idx], mry[idx]); 119 | ry[idx] = mry[idx] * y[idx - 1]; 120 | _rnn_update.ComputeForwardScore(y[idx - 1], x[idx], uy[idx]); 121 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 122 | 
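// GRU-style interpolation: with update gate uy[idx] (z_t) and candidate state cy[idx]
// (computed from the reset-gated history ry[idx] = mry[idx] * y[idx-1]), the new
// hidden state is y_t = (1 - z_t) * y_{t-1} + z_t * c_t, element-wise: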
y[idx] = (1.0 - uy[idx]) * y[idx - 1] + uy[idx] * cy[idx]; 123 | } 124 | } 125 | } else { 126 | for (int idx = seq_size - 1; idx >= 0; idx--) { 127 | if (idx == seq_size - 1) { 128 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 129 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 130 | y[idx] = uy[idx] * cy[idx]; 131 | } else { 132 | _rnn_reset.ComputeForwardScore(y[idx + 1], x[idx], mry[idx]); 133 | ry[idx] = mry[idx] * y[idx + 1]; 134 | _rnn_update.ComputeForwardScore(y[idx + 1], x[idx], uy[idx]); 135 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 136 | y[idx] = (1.0 - uy[idx]) * y[idx + 1] + uy[idx] * cy[idx]; 137 | } 138 | } 139 | } 140 | } 141 | 142 | inline void ComputeForwardScore(const vector > &x, vector > &mry, vector > &ry, 143 | vector > &uy, vector > &cy, vector > &y) { 144 | assign(mry, 0.0); 145 | assign(ry, 0.0); 146 | assign(uy, 0.0); 147 | assign(cy, 0.0); 148 | assign(y, 0.0); 149 | int seq_size = x.size(); 150 | if (seq_size == 0) 151 | return; 152 | 153 | if (_left2right) { 154 | for (int idx = 0; idx < seq_size; idx++) { 155 | if (idx == 0) { 156 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 157 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 158 | y[idx] = uy[idx] * cy[idx]; 159 | } else { 160 | _rnn_reset.ComputeForwardScore(y[idx - 1], x[idx], mry[idx]); 161 | ry[idx] = mry[idx] * y[idx - 1]; 162 | _rnn_update.ComputeForwardScore(y[idx - 1], x[idx], uy[idx]); 163 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 164 | y[idx] = (1.0 - uy[idx]) * y[idx - 1] + uy[idx] * cy[idx]; 165 | } 166 | } 167 | } else { 168 | for (int idx = seq_size - 1; idx >= 0; idx--) { 169 | if (idx == seq_size - 1) { 170 | _rnn_update.ComputeForwardScore(_null, x[idx], uy[idx]); 171 | _rnn.ComputeForwardScore(_null, x[idx], cy[idx]); 172 | y[idx] = uy[idx] * cy[idx]; 173 | } else { 174 | _rnn_reset.ComputeForwardScore(y[idx + 1], x[idx], mry[idx]); 175 | ry[idx] = mry[idx] * y[idx + 1]; 176 | _rnn_update.ComputeForwardScore(y[idx + 1], x[idx], uy[idx]); 177 | _rnn.ComputeForwardScore(ry[idx], x[idx], cy[idx]); 178 | y[idx] = (1.0 - uy[idx]) * y[idx + 1] + uy[idx] * cy[idx]; 179 | } 180 | } 181 | } 182 | } 183 | 184 | 185 | // This function is used for computing hidden values incrementally at the start position 186 | // It is applied only when the sequential inputs are not fixed in advance, 187 | // which can vary during decoding. 188 | // We need not provide a backward function, since during backward, inputs will be given. 189 | inline void ComputeForwardScoreIncremental(Tensor x, Tensor mry, Tensor ry, 190 | Tensor uy, Tensor cy, Tensor y) { 191 | assert(_left2right); 192 | _rnn_update.ComputeForwardScore(_null, x, uy); 193 | _rnn.ComputeForwardScore(_null, x, cy); 194 | y = uy * cy; 195 | } 196 | 197 | 198 | // This function is used for computing hidden values incrementally at the non-start position 199 | // It is applied only when the sequential inputs are not fixed in advance, 200 | // which can vary during decoding. 201 | // We need not provide a backward function, since during backward, inputs will be given. 
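// A decoding loop would typically call the start-position overload once for the first
// input and then the overload below for each following input, feeding the previous
// hidden state back in; an illustrative (non-original) sketch:
//   rnn.ComputeForwardScoreIncremental(x[0], mry[0], ry[0], uy[0], cy[0], y[0]);
//   for (int t = 1; t < n; t++)
//     rnn.ComputeForwardScoreIncremental(y[t-1], x[t], mry[t], ry[t], uy[t], cy[t], y[t]);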
202 | inline void ComputeForwardScoreIncremental(Tensor py, Tensor x, Tensor mry, Tensor ry, 203 | Tensor uy, Tensor cy, Tensor y) { 204 | assert(_left2right); 205 | _rnn_reset.ComputeForwardScore(py, x, mry); 206 | ry = mry * py; 207 | _rnn_update.ComputeForwardScore(py, x, uy); 208 | _rnn.ComputeForwardScore(ry, x, cy); 209 | y = (1.0 - uy) * py + uy * cy; 210 | } 211 | 212 | //please allocate the memory outside here 213 | inline void ComputeBackwardLoss(Tensor x, Tensor mry, Tensor ry, Tensor uy, 214 | Tensor cy, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 215 | int seq_size = x.size(0); 216 | if (seq_size == 0) 217 | return; 218 | 219 | if (bclear) 220 | lx = 0.0; 221 | //left rnn 222 | Tensor lfy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 223 | Tensor luy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 224 | Tensor lcy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 225 | Tensor lry = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 226 | Tensor lmry = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 227 | 228 | if (_left2right) { 229 | for (int idx = seq_size - 1; idx >= 0; idx--) { 230 | if (idx < seq_size - 1) 231 | ly[idx] = ly[idx] + lfy[idx]; 232 | 233 | if (idx == 0) { 234 | luy[idx] = ly[idx] * cy[idx]; 235 | lcy[idx] = ly[idx] * uy[idx]; 236 | 237 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 238 | 239 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 240 | } else { 241 | luy[idx] = ly[idx] * (cy[idx] - y[idx - 1]); 242 | lfy[idx - 1] = ly[idx] * (1.0 - uy[idx]); 243 | lcy[idx] = ly[idx] * uy[idx]; 244 | 245 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 246 | _rnn_update.ComputeBackwardLoss(y[idx - 1], x[idx], uy[idx], luy[idx], lfy[idx - 1], lx[idx]); 247 | 248 | lmry[idx] = lry[idx] * y[idx - 1]; 249 | lfy[idx - 1] += lry[idx] * mry[idx]; 250 | 251 | _rnn_reset.ComputeBackwardLoss(y[idx - 1], x[idx], mry[idx], lmry[idx], lfy[idx - 1], lx[idx]); 252 | } 253 | } 254 | } else { 255 | // right rnn 256 | for (int idx = 0; idx < seq_size; idx++) { 257 | if (idx > 0) 258 | ly[idx] = ly[idx] + lfy[idx]; 259 | 260 | if (idx == seq_size - 1) { 261 | luy[idx] = ly[idx] * cy[idx]; 262 | lcy[idx] = ly[idx] * uy[idx]; 263 | 264 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 265 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 266 | } else { 267 | luy[idx] = ly[idx] * (cy[idx] - y[idx + 1]); 268 | lfy[idx + 1] = ly[idx] * (1.0 - uy[idx]); 269 | lcy[idx] = ly[idx] * uy[idx]; 270 | 271 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 272 | _rnn_update.ComputeBackwardLoss(y[idx + 1], x[idx], uy[idx], luy[idx], lfy[idx + 1], lx[idx]); 273 | 274 | lmry[idx] = lry[idx] * y[idx + 1]; 275 | lfy[idx + 1] += lry[idx] * mry[idx]; 276 | 277 | _rnn_reset.ComputeBackwardLoss(y[idx + 1], x[idx], mry[idx], lmry[idx], lfy[idx + 1], lx[idx]); 278 | } 279 | } 280 | } 281 | 282 | FreeSpace(&lfy); 283 | FreeSpace(&luy); 284 | FreeSpace(&lcy); 285 | FreeSpace(&lry); 286 | FreeSpace(&lmry); 287 | } 288 | 289 | //please allocate the memory outside here 290 | inline void ComputeBackwardLoss(const vector > &x, const vector > &mry, const vector > &ry, 291 | const vector > &uy, const vector > &cy, const vector > &y, 292 | vector > &ly, vector > &lx, bool bclear = false) { 293 | int seq_size = x.size(); 294 | if (seq_size 
== 0) 295 | return; 296 | 297 | if (bclear) 298 | assign(lx, 0.0); 299 | 300 | vector > lfy(seq_size), lcy(seq_size), luy(seq_size), lry(seq_size), lmry(seq_size); 301 | for (int idx = 0; idx < seq_size; idx++) { 302 | lfy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 303 | lcy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 304 | luy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 305 | lry[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 306 | lmry[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 307 | } 308 | 309 | if (_left2right) { 310 | for (int idx = seq_size - 1; idx >= 0; idx--) { 311 | if (idx < seq_size - 1) 312 | ly[idx] = ly[idx] + lfy[idx]; 313 | 314 | if (idx == 0) { 315 | luy[idx] = ly[idx] * cy[idx]; 316 | lcy[idx] = ly[idx] * uy[idx]; 317 | 318 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 319 | 320 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 321 | } else { 322 | luy[idx] = ly[idx] * (cy[idx] - y[idx - 1]); 323 | lfy[idx - 1] = ly[idx] * (1.0 - uy[idx]); 324 | lcy[idx] = ly[idx] * uy[idx]; 325 | 326 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 327 | _rnn_update.ComputeBackwardLoss(y[idx - 1], x[idx], uy[idx], luy[idx], lfy[idx - 1], lx[idx]); 328 | 329 | lmry[idx] = lry[idx] * y[idx - 1]; 330 | lfy[idx - 1] += lry[idx] * mry[idx]; 331 | 332 | _rnn_reset.ComputeBackwardLoss(y[idx - 1], x[idx], mry[idx], lmry[idx], lfy[idx - 1], lx[idx]); 333 | } 334 | } 335 | } else { 336 | // right rnn 337 | for (int idx = 0; idx < seq_size; idx++) { 338 | if (idx > 0) 339 | ly[idx] = ly[idx] + lfy[idx]; 340 | 341 | if (idx == seq_size - 1) { 342 | luy[idx] = ly[idx] * cy[idx]; 343 | lcy[idx] = ly[idx] * uy[idx]; 344 | 345 | _rnn.ComputeBackwardLoss(_null, x[idx], cy[idx], lcy[idx], _nullLoss, lx[idx]); 346 | _rnn_update.ComputeBackwardLoss(_null, x[idx], uy[idx], luy[idx], _nullLoss, lx[idx]); 347 | } else { 348 | luy[idx] = ly[idx] * (cy[idx] - y[idx + 1]); 349 | lfy[idx + 1] = ly[idx] * (1.0 - uy[idx]); 350 | lcy[idx] = ly[idx] * uy[idx]; 351 | 352 | _rnn.ComputeBackwardLoss(ry[idx], x[idx], cy[idx], lcy[idx], lry[idx], lx[idx]); 353 | _rnn_update.ComputeBackwardLoss(y[idx + 1], x[idx], uy[idx], luy[idx], lfy[idx + 1], lx[idx]); 354 | 355 | lmry[idx] = lry[idx] * y[idx + 1]; 356 | lfy[idx + 1] += lry[idx] * mry[idx]; 357 | 358 | _rnn_reset.ComputeBackwardLoss(y[idx + 1], x[idx], mry[idx], lmry[idx], lfy[idx + 1], lx[idx]); 359 | } 360 | } 361 | } 362 | 363 | for (int idx = 0; idx < seq_size; idx++) { 364 | FreeSpace(&(lfy[idx])); 365 | FreeSpace(&(lcy[idx])); 366 | FreeSpace(&(luy[idx])); 367 | FreeSpace(&(lry[idx])); 368 | FreeSpace(&(lmry[idx])); 369 | } 370 | } 371 | 372 | inline void randomprint(int num) { 373 | _rnn_update.randomprint(num); 374 | _rnn_reset.randomprint(num); 375 | _rnn.randomprint(num); 376 | } 377 | 378 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 379 | _rnn_update.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 380 | _rnn_reset.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 381 | _rnn.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 382 | } 383 | 384 | void writeModel(LStream &outf) { 385 | _rnn_update.writeModel(outf); 386 | _rnn_reset.writeModel(outf); 387 | _rnn.writeModel(outf); 388 | 389 | WriteBinary(outf, _left2right); 390 | 391 | SaveBinary(outf, _null); 392 | SaveBinary(outf, _nullLoss); 393 | } 
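// Rough usage sketch for this gated recurrent layer (variable names here are
// illustrative and not part of this file; all tensors are assumed to be allocated
// by the caller, per the "please allocate the memory outside here" convention used above):
//   grnn.ComputeForwardScore(x, mry, ry, uy, cy, y);          // full-sequence forward defined earlier in this file (presumably mirroring the backward argument list)
//   // ... compute the output loss ly, e.g. with softmax_loss() from SoftMaxLoss.h ...
//   grnn.ComputeBackwardLoss(x, mry, ry, uy, cy, y, ly, lx);  // gradients w.r.t. the inputs; parameter gradients accumulate inside the sub-layers
//   grnn.updateAdaGrad(lambda, alpha, eps);                   // AdaGrad step on _rnn, _rnn_update and _rnn_reset
// writeModel/loadModel serialize the three sub-layers together with _left2right, _null and _nullLoss.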
394 | 395 | void loadModel(LStream &inf) { 396 | _rnn_update.loadModel(inf); 397 | _rnn_reset.loadModel(inf); 398 | _rnn.loadModel(inf); 399 | 400 | ReadBinary(inf, _left2right); 401 | 402 | LoadBinary(inf, &_null, false); 403 | LoadBinary(inf, &_nullLoss, false); 404 | } 405 | }; 406 | 407 | #endif /* SRC_GRNN_H_ */ 408 | -------------------------------------------------------------------------------- /GatedPooling.h: -------------------------------------------------------------------------------- 1 | /* 2 | * GatedPooling.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_GatedPooling_H_ 9 | #define SRC_GatedPooling_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | #include "Pooling.h" 14 | #include "UniLayer.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | // For simpleness, we do not provide pooling on specified words, 21 | // which has been implemented in Pooling.h 22 | 23 | 24 | template 25 | class GatedPooling { 26 | 27 | public: 28 | UniLayer _uni_gates; 29 | 30 | public: 31 | GatedPooling() { 32 | } 33 | 34 | inline void initial(int hiddenSize, int seed = 0) { 35 | _uni_gates.initial(hiddenSize, hiddenSize, false, seed, 3); 36 | } 37 | 38 | inline void initial(Tensor W) { 39 | _uni_gates.initial(W, 3); 40 | } 41 | 42 | 43 | inline void release() { 44 | _uni_gates.release(); 45 | } 46 | 47 | virtual ~GatedPooling() { 48 | // TODO Auto-generated destructor stub 49 | } 50 | 51 | inline dtype squarenormAll() { 52 | return _uni_gates.squarenormAll(); 53 | } 54 | 55 | inline void scaleGrad(dtype scale) { 56 | _uni_gates.scaleGrad(scale); 57 | } 58 | 59 | public: 60 | // xExp, xSumIndex, xSum ad xPoolIndex are temporal variables, which reduce computation in back-propagation 61 | inline void ComputeForwardScore(Tensor x, Tensor xExp, 62 | Tensor xSum, Tensor xPoolIndex, Tensor y) { 63 | y = 0.0; 64 | int seq_size = x.size(0); 65 | if(seq_size == 0) return; 66 | int dim1 = x.size(1), dim2 = x.size(2); 67 | int odim1 = y.size(0), odim2 = y.size(1); 68 | 69 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 70 | std::cerr << "GatedPooling Forward error: dim invalid" << std::endl; 71 | } 72 | 73 | _uni_gates.ComputeForwardScore(x, xExp); 74 | 75 | sumpool_forward(xExp, xSum); 76 | for (int idx = 0; idx < seq_size; idx++) { 77 | xPoolIndex[idx] = xExp[idx] / xSum; 78 | } 79 | for (int idx = 0; idx < seq_size; idx++) { 80 | y += x[idx] * xPoolIndex[idx]; 81 | } 82 | } 83 | 84 | inline void ComputeForwardScore(const std::vector >& x, std::vector >& xExp, 85 | Tensor xSum, std::vector >& xPoolIndex, Tensor y) { 86 | y = 0.0; 87 | int seq_size = x.size(); 88 | if(seq_size == 0) return; 89 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 90 | int odim1 = y.size(0), odim2 = y.size(1); 91 | 92 | if (dim1 != odim1 || dim2 != odim2 || dim1 != 1) { 93 | std::cerr << "GatedPooling Forward error: dim invalid" << std::endl; 94 | } 95 | 96 | _uni_gates.ComputeForwardScore(x, xExp); 97 | 98 | sumpool_forward(xExp, xSum); 99 | for (int idx = 0; idx < seq_size; idx++) { 100 | xPoolIndex[idx] = xExp[idx] / xSum; 101 | } 102 | for (int idx = 0; idx < seq_size; idx++) { 103 | y += x[idx] * xPoolIndex[idx]; 104 | } 105 | } 106 | 107 | 108 | //please allocate the memory outside here 109 | inline void ComputeBackwardLoss(Tensor x, Tensor xExp, 110 | Tensor xSum, Tensor xPoolIndex, Tensor y, 111 | Tensor ly, Tensor lx, bool bclear = false) { 112 | int seq_size = x.size(0); 
113 | if(seq_size == 0) return; 114 | int dim1 = x.size(1), dim2 = x.size(2); 115 | int odim1 = y.size(0), odim2 = y.size(1); 116 | 117 | if(bclear) lx = 0.0; 118 | 119 | Tensor xExpLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 120 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 121 | Tensor xPoolIndexLoss = NewTensor(Shape3(seq_size, dim1, dim2), d_zero); 122 | 123 | for (int idx = 0; idx < seq_size; idx++) { 124 | xPoolIndexLoss[idx] = ly * x[idx]; 125 | lx[idx] += ly * xPoolIndex[idx]; 126 | } 127 | 128 | for (int idx = 0; idx < seq_size; idx++) { 129 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 130 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 131 | } 132 | 133 | sumpool_backward(xSumLoss, xExpLoss); 134 | 135 | _uni_gates.ComputeBackwardLoss(x, xExp, xExpLoss, lx); 136 | 137 | FreeSpace(&xExpLoss); 138 | FreeSpace(&xSumLoss); 139 | FreeSpace(&xPoolIndexLoss); 140 | } 141 | 142 | inline void ComputeBackwardLoss(const std::vector >& x, std::vector >& xExp, 143 | Tensor xSum, std::vector >& xPoolIndex, Tensor y, 144 | Tensor ly, std::vector >& lx, bool bclear = false) { 145 | int seq_size = x.size(); 146 | if(seq_size == 0) return; 147 | int dim1 = x[0].size(0), dim2 = x[0].size(1); 148 | int odim1 = y.size(0), odim2 = y.size(1); 149 | 150 | 151 | if(bclear){ 152 | for (int idx = 0; idx < seq_size; idx++) { 153 | lx[idx] = 0.0; 154 | } 155 | } 156 | 157 | vector > xExpLoss(seq_size), xPoolIndexLoss(seq_size); 158 | for (int idx = 0; idx < seq_size; idx++) { 159 | xExpLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 160 | xPoolIndexLoss[idx] = NewTensor(Shape2(dim1, dim2), d_zero); 161 | } 162 | 163 | Tensor xSumLoss = NewTensor(Shape2(dim1, dim2), d_zero); 164 | 165 | for (int idx = 0; idx < seq_size; idx++) { 166 | xPoolIndexLoss[idx] = ly * x[idx]; 167 | lx[idx] += ly * xPoolIndex[idx]; 168 | } 169 | 170 | for (int idx = 0; idx < seq_size; idx++) { 171 | xExpLoss[idx] += xPoolIndexLoss[idx] / xSum; 172 | xSumLoss -= xPoolIndexLoss[idx] * xExp[idx] / xSum / xSum; 173 | } 174 | 175 | sumpool_backward(xSumLoss, xExpLoss); 176 | 177 | _uni_gates.ComputeBackwardLoss(x, xExp, xExpLoss, lx); 178 | 179 | FreeSpace(&xSumLoss); 180 | for (int idx = 0; idx < seq_size; idx++) { 181 | FreeSpace(&(xExpLoss[idx])); 182 | FreeSpace(&(xPoolIndexLoss[idx])); 183 | } 184 | } 185 | 186 | inline void randomprint(int num) { 187 | _uni_gates.randomprint(num); 188 | } 189 | 190 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 191 | _uni_gates.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 192 | } 193 | 194 | void writeModel(LStream &outf) { 195 | _uni_gates.writeModel(outf); 196 | 197 | } 198 | 199 | void loadModel(LStream &inf) { 200 | _uni_gates.loadModel(inf); 201 | } 202 | 203 | }; 204 | 205 | #endif /* SRC_GatedPooling_H_ */ 206 | -------------------------------------------------------------------------------- /Hash_map.hpp: -------------------------------------------------------------------------------- 1 | //========================================================= 2 | // @Modify: Chen Xin (xchen@ir.hit.edu.cn) 3 | // @Date: 2011/03/11 4 | // @Brief: Change default hash func to BKDR. 5 | //========================================================== 6 | 7 | 8 | /************************************************************ 9 | unsigned int BKDRHash(const std::string& str) 10 | { 11 | unsigned int seed = 131; // 31 131 1313 13131 131313 etc.. 
12 | unsigned int hash = 0; 13 | for(std::size_t i = 0; i < str.length(); i++) 14 | { 15 | hash = (hash * seed) + str[i]; 16 | } 17 | return hash; 18 | } 19 | *************************************************************/ 20 | 21 | /* 22 | * vi:ts=4:tw=78:shiftwidth=4:expandtab 23 | * vim600:fdm=marker 24 | * 25 | * hash_map.hpp - wrapper header as a workaround for several different ways 26 | * of using hash_map/hash_set since this is not ISO standard. 27 | * 28 | * After inclusion of this file hash and hash_map are exported into the global 29 | * namespace. 30 | * 31 | * Copyright (C) 2004 by Zhang Le 32 | * Begin : 26-Jun-2004 33 | * Last Change : 25-Dec-2004. 34 | * 35 | * Permission is hereby granted, free of charge, to any person obtaining a 36 | * copy of this software and associated documentation files (the "Software"), 37 | * to deal in the Software without restriction, including without limitation 38 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 39 | * and/or sell copies of the Software, and to permit persons to whom the 40 | * Software is furnished to do so, subject to the following conditions: 41 | * 42 | * The above copyright notice and this permission notice shall be included in 43 | * all copies or substantial portions of the Software. 44 | * 45 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 46 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 47 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 48 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 49 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 50 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 51 | * DEALINGS IN THE SOFTWARE. 52 | */ 53 | 54 | #ifndef HASH_MAP_HPP 55 | #define HASH_MAP_HPP 56 | 57 | #ifdef HAVE_CONFIG_H 58 | #include 59 | #endif 60 | 61 | #include 62 | 63 | #if defined(_STLPORT_VERSION) 64 | #include 65 | #include 66 | using std::hash; 67 | using std::hash_map; 68 | using hash_set; 69 | #else // not using STLPORT 70 | 71 | #ifdef __GNUC__ 72 | #if __GNUC__ >= 3 73 | #include 74 | #include 75 | namespace __gnu_cxx { 76 | template <> 77 | struct hash { 78 | size_t operator()(const std::string& s) const { 79 | unsigned int _seed = 131; // 31 131 1313 13131 131313 etc.. 80 | unsigned int _hash = 0; 81 | for(std::size_t i = 0; i < s.size(); i++) 82 | { 83 | _hash = (_hash * _seed) + s[i]; 84 | } 85 | return size_t(_hash); 86 | } 87 | }; 88 | }; 89 | using __gnu_cxx::hash_map; 90 | using __gnu_cxx::hash_set; 91 | using __gnu_cxx::hash; 92 | #else // GCC 2.x 93 | #include 94 | #include 95 | namespace std { 96 | struct hash { 97 | size_t operator()(const std::string& s) const { 98 | unsigned int _seed = 131; // 31 131 1313 13131 131313 etc.. 99 | unsigned int _hash = 0; 100 | for(std::size_t i = 0; i < s.size(); i++) 101 | { 102 | _hash = (_hash * _seed) + s[i]; 103 | } 104 | return size_t(_hash); 105 | } 106 | }; 107 | }; 108 | using std::hash_map; 109 | using hash_set; 110 | using std::hash; 111 | #endif // end GCC >= 3 112 | #elif defined(_MSC_VER) && ((_MSC_VER >= 1300) || defined(__INTEL_COMPILER)) 113 | // we only support MSVC7+ and Intel C++ 8.0 114 | #include 115 | #include 116 | namespace stdext { 117 | inline size_t hash_value(const std::string& s) { 118 | unsigned int _seed = 131; // 31 131 1313 13131 131313 etc.. 
119 | unsigned int _hash = 0; 120 | for(std::size_t i = 0; i < s.size(); i++) 121 | { 122 | _hash = (_hash * _seed) + s[i]; 123 | } 124 | return size_t(_hash); 125 | } 126 | } 127 | using stdext::hash_map; // _MSC_EXTENSIONS, though DEPRECATED 128 | using stdext::hash_set; 129 | #else 130 | #error unknown compiler 131 | #endif //GCC or MSVC7+ 132 | #endif // end STLPORT 133 | 134 | #endif /* ifndef HASH_MAP_HPP */ 135 | 136 | -------------------------------------------------------------------------------- /IO.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file IO.h 3 | * \brief definitions of I/O functions for LibN3L 4 | * \author Jie 5 | */ 6 | #ifndef LIBN3L_IO_H_ 7 | #define LIBN3L_IO_H_ 8 | #include 9 | #include "tensor.h" 10 | #include "io.h" 11 | #include "Utiltensor.h" 12 | #include "Utils.h" 13 | 14 | 15 | class LStream : public IStream { 16 | public: 17 | FILE *fp_; 18 | size_t sz_; 19 | 20 | public: 21 | LStream(const string &fname, const char *mode) { 22 | const char *newname = &fname[0]; 23 | Open(newname, mode); 24 | } 25 | void Open(const char *fname, const char *mode) { 26 | fp_ = FopenCheck(fname, mode); 27 | fseek(fp_, 0L, SEEK_END); 28 | sz_ = ftell(fp_); 29 | fseek(fp_, 0L, SEEK_SET); 30 | } 31 | size_t Read(void *ptr, size_t size) { 32 | return fread(ptr, size, 1, fp_); 33 | } 34 | void Write(const void *ptr, size_t size) { 35 | fwrite(ptr, size, 1, fp_); 36 | } 37 | 38 | // size_t StringRead(string &sentence) { 39 | // char buff[100]; 40 | // return fread(ptr, size, 1, fp_); 41 | // } 42 | // void StringWrite(const string &sentence) { 43 | // fputs(sentence,fp_); 44 | // } 45 | 46 | inline void Close(void) { 47 | if (fp_ != NULL){ 48 | fclose(fp_); fp_ = NULL; 49 | } 50 | } 51 | inline size_t Size() { 52 | return sz_; 53 | } 54 | virtual ~LStream(void) { 55 | this->Close(); 56 | } 57 | 58 | inline std::FILE *FopenCheck(const char *fname, const char *flag) { 59 | std::FILE *fp = fopen(fname, flag); 60 | Check(fp != NULL, "can not open file \"%s\"\n", fname); 61 | return fp; 62 | } 63 | 64 | 65 | }; 66 | 67 | 68 | template 69 | inline void WriteBinary(TStream &fo, const DType &target) { 70 | fo.Write(&target, sizeof(target)); 71 | } 72 | 73 | template 74 | inline void ReadBinary(TStream &fo, DType &target) { 75 | fo.Read(&target, sizeof(DType)); 76 | } 77 | 78 | 79 | 80 | template 81 | inline void WriteString(TStream &fo, const string &target) { 82 | int string_size = target.size(); 83 | fo.Write(&string_size, sizeof(string_size)); 84 | if (string_size > 0) { 85 | int char_size = sizeof(target[0]); 86 | fo.Write(&char_size, sizeof(char_size)); 87 | for (int idx = 0; idx < string_size; idx++) { 88 | fo.Write(&target[idx], sizeof(target[idx])); 89 | } 90 | } 91 | } 92 | 93 | template 94 | inline void ReadString(TStream &fo, string &target) { 95 | int string_size; 96 | fo.Read(&string_size, sizeof(int)); 97 | if (string_size > 0) { 98 | int char_size; 99 | fo.Read(&char_size, sizeof(int)); 100 | char character[string_size]; 101 | for (int idx = 0; idx < string_size; idx++) { 102 | fo.Read(&character[idx], char_size); 103 | } 104 | target = string(character, string_size); 105 | assert(target.size()==string_size); 106 | } 107 | } 108 | 109 | 110 | template 111 | inline void WriteVector(TStream &fo, vector &target) { 112 | int vector_size = target.size(); 113 | fo.Write(&vector_size, sizeof(vector_size)); 114 | if (vector_size > 0) { 115 | int element_size = sizeof(target[0]); 116 | fo.Write(&element_size, 
sizeof(element_size)); 117 | for (int idx = 0; idx < vector_size; idx++) { 118 | fo.Write(&target[idx], sizeof(target[idx])); 119 | // cout << target[idx] << endl; 120 | } 121 | } 122 | } 123 | 124 | template 125 | inline void ReadVector(TStream &fo, vector &target) { 126 | int vector_size; 127 | fo.Read(&vector_size, sizeof(int)); 128 | if (vector_size > 0) { 129 | int element_size; 130 | fo.Read(&element_size, sizeof(int)); 131 | target.resize(vector_size); 132 | for (int idx = 0; idx < vector_size; idx++) { 133 | fo.Read(&target[idx], element_size); 134 | // cout << target[idx] << endl; 135 | } 136 | assert(target.size()== vector_size); 137 | } 138 | } 139 | 140 | template 141 | inline void WriteVector(TStream &fo, vector &target) { 142 | int vector_size = target.size(); 143 | fo.Write(&vector_size, sizeof(vector_size)); 144 | if (vector_size > 0) { 145 | for (int idx = 0; idx < vector_size; idx++) { 146 | WriteString(fo, target[idx]); 147 | // cout << target[idx] << endl; 148 | } 149 | } 150 | } 151 | 152 | template 153 | inline void ReadVector(TStream &fo, vector &target) { 154 | target.clear(); 155 | int vector_size; 156 | string tmp_target; 157 | fo.Read(&vector_size, sizeof(int)); 158 | // cout << "vector_size " << vector_size << endl; 159 | if (vector_size > 0) { 160 | for (int idx = 0; idx < vector_size; idx++) { 161 | ReadString(fo, tmp_target); 162 | target.push_back(tmp_target); 163 | // cout << target[idx] << endl; 164 | } 165 | } 166 | assert(target.size()== vector_size); 167 | } 168 | 169 | 170 | template 171 | inline void WriteVector(TStream &fo, NRVec &target) { 172 | int vector_size = target.size(); 173 | WriteBinary(fo, vector_size); 174 | if (vector_size > 0) { 175 | for (int idx = 0; idx < vector_size; idx++) { 176 | WriteBinary(fo, target[idx]); 177 | } 178 | } 179 | } 180 | 181 | template 182 | inline void ReadVector(TStream &fo, NRVec &target) { 183 | int vector_size; 184 | ReadBinary(fo, vector_size); 185 | if (vector_size > 0) { 186 | target.resize(vector_size); 187 | for (int idx = 0; idx < vector_size; idx++) { 188 | ReadBinary(fo, target[idx]); 189 | } 190 | assert(target.size()== vector_size); 191 | } 192 | } 193 | 194 | 195 | 196 | #endif // LIBN3L_IO_H_ 197 | -------------------------------------------------------------------------------- /LookupTable.h: -------------------------------------------------------------------------------- 1 | /* 2 | * LookupTable.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_LookupTable_H_ 9 | #define SRC_LookupTable_H_ 10 | #include "tensor.h" 11 | #include "Utiltensor.h" 12 | #include "MyLib.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | // Weight updating process implemented without theory support, 19 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 20 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 
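// In equation form, the per-row update implemented in updateAdaGrad() below is, for each
// touched embedding row w with accumulated gradient g_w (lambda = regularization weight,
// alpha = learning rate, eps = adaEps; all operations element-wise):
//   eg2_w <- eg2_w + g_w * g_w
//   s_w    = sqrt(eg2_w + eps)
//   E_w   <- (E_w * s_w - alpha * g_w) / (alpha * lambda + s_w)
//   ft_w  <- s_w / (alpha * lambda + s_w)
// Rows that receive no gradient are left untouched at update time; updateSparseWeight()
// later applies the skipped shrinkage lazily, scaling E_w by ft_w once for every update
// step the row missed.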
21 | 22 | template 23 | class LookupTable { 24 | 25 | public: 26 | 27 | hash_set _indexers; 28 | 29 | Tensor _E; 30 | Tensor _gradE; 31 | Tensor _eg2E; 32 | 33 | Tensor _ftE; 34 | 35 | bool _bFineTune; 36 | int _nDim; 37 | int _nVSize; 38 | 39 | int _max_update; 40 | NRVec _last_update; 41 | 42 | public: 43 | 44 | LookupTable() { 45 | _indexers.clear(); 46 | } 47 | 48 | 49 | inline void initial(const NRMat& wordEmb) { 50 | _nVSize = wordEmb.nrows(); 51 | _nDim = wordEmb.ncols(); 52 | 53 | _E = NewTensor(Shape2(_nVSize, _nDim), d_zero); 54 | _gradE = NewTensor(Shape2(_nVSize, _nDim), d_zero); 55 | _eg2E = NewTensor(Shape2(_nVSize, _nDim), d_zero); 56 | _ftE = NewTensor(Shape2(_nVSize, _nDim), d_one); 57 | assign(_E, wordEmb); 58 | for (int idx = 0; idx < _nVSize; idx++) { 59 | norm2one(_E, idx); 60 | } 61 | 62 | _bFineTune = true; 63 | 64 | _max_update = 0; 65 | _last_update.resize(_nVSize); 66 | _last_update = 0; 67 | } 68 | 69 | inline void setEmbFineTune(bool bFineTune) { 70 | _bFineTune = bFineTune; 71 | } 72 | 73 | inline void release() { 74 | FreeSpace(&_E); 75 | FreeSpace(&_gradE); 76 | FreeSpace(&_eg2E); 77 | FreeSpace(&_ftE); 78 | _indexers.clear(); 79 | } 80 | 81 | virtual ~LookupTable() { 82 | // TODO Auto-generated destructor stub 83 | } 84 | 85 | inline dtype squarenormAll() { 86 | dtype result = 0; 87 | static hash_set::iterator it; 88 | for (int idx = 0; idx < _nDim; idx++) { 89 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 90 | result += _gradE[*it][idx] * _gradE[*it][idx]; 91 | } 92 | } 93 | 94 | 95 | return result; 96 | } 97 | 98 | inline void scaleGrad(dtype scale) { 99 | static hash_set::iterator it; 100 | for (int idx = 0; idx < _nDim; idx++) { 101 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 102 | _gradE[*it][idx] = _gradE[*it][idx] * scale; 103 | } 104 | } 105 | 106 | } 107 | 108 | inline bool bEmbFineTune() 109 | { 110 | return _bFineTune; 111 | } 112 | 113 | public: 114 | void GetEmb(int id, Tensor y) { 115 | updateSparseWeight(id); 116 | assert(y.size(0) == 1); 117 | y = 0.0; 118 | y[0] += _E[id]; 119 | } 120 | 121 | // loss is stopped at this layer, since the input is one-hold alike 122 | void EmbLoss(int id, Tensor ly) { 123 | if(!_bFineTune) return; 124 | //_gradE 125 | assert(ly.size(0) == 1); 126 | _gradE[id] += ly[0]; 127 | _indexers.insert(id); 128 | 129 | } 130 | 131 | 132 | void randomprint(int num) { 133 | static int _nVSize, _nDim; 134 | _nVSize = _E.size(0); 135 | _nDim = _E.size(1); 136 | int count = 0; 137 | while (count < num) { 138 | int idx = rand() % _nVSize; 139 | int idy = rand() % _nDim; 140 | 141 | std::cout << "_E[" << idx << "," << idy << "]=" << _E[idx][idy] << " "; 142 | 143 | count++; 144 | } 145 | 146 | std::cout << std::endl; 147 | } 148 | 149 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 150 | 151 | if(!_bFineTune) return; 152 | static hash_set::iterator it; 153 | _max_update++; 154 | 155 | Tensor sqrt_eg2E = NewTensor(Shape1(_E.size(1)), d_zero); 156 | 157 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 158 | int index = *it; 159 | _eg2E[index] = _eg2E[index] + _gradE[index] * _gradE[index]; 160 | sqrt_eg2E = F(_eg2E[index] + adaEps); 161 | _E[index] = (_E[index] * sqrt_eg2E - _gradE[index] * adaAlpha) / (adaAlpha * regularizationWeight + sqrt_eg2E); 162 | _ftE[index] = sqrt_eg2E / (adaAlpha * regularizationWeight + sqrt_eg2E); 163 | } 164 | 165 | FreeSpace(&sqrt_eg2E); 166 | 167 | clearGrad(); 168 | } 169 | 170 | void clearGrad() { 171 | static 
hash_set::iterator it; 172 | 173 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 174 | int index = *it; 175 | _gradE[index] = 0.0; 176 | } 177 | 178 | _indexers.clear(); 179 | 180 | } 181 | 182 | void updateSparseWeight(int wordId) { 183 | if(!_bFineTune) return; 184 | if (_last_update[wordId] < _max_update) { 185 | int times = _max_update - _last_update[wordId]; 186 | _E[wordId] = _E[wordId] * F(times * F(_ftE[wordId])); 187 | _last_update[wordId] = _max_update; 188 | } 189 | } 190 | 191 | void writeModel(LStream &outf) { 192 | SaveBinary(outf, _E); 193 | SaveBinary(outf, _gradE); 194 | SaveBinary(outf, _eg2E); 195 | SaveBinary(outf, _ftE); 196 | 197 | WriteBinary(outf, _bFineTune); 198 | WriteBinary(outf, _nDim); 199 | WriteBinary(outf, _nVSize); 200 | WriteBinary(outf, _max_update); 201 | WriteVector(outf, _last_update); 202 | } 203 | void loadModel(LStream &inf) { 204 | LoadBinary(inf, &_E, false); 205 | LoadBinary(inf, &_gradE, false); 206 | LoadBinary(inf, &_eg2E, false); 207 | LoadBinary(inf, &_ftE, false); 208 | 209 | ReadBinary(inf, _bFineTune); 210 | ReadBinary(inf, _nDim); 211 | ReadBinary(inf, _nVSize); 212 | ReadBinary(inf, _max_update); 213 | 214 | ReadVector(inf, _last_update); 215 | } 216 | 217 | }; 218 | 219 | #endif /* SRC_LookupTable_H_ */ 220 | -------------------------------------------------------------------------------- /Metric.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Metric.h 3 | * 4 | * Created on: Mar 17, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_METRIC_H_ 9 | #define SRC_METRIC_H_ 10 | #include "IO.h" 11 | 12 | using namespace std; 13 | 14 | class Metric { 15 | 16 | public: 17 | int overall_label_count; 18 | int correct_label_count; 19 | int predicated_label_count; 20 | 21 | public: 22 | Metric() 23 | { 24 | overall_label_count = 0; 25 | correct_label_count = 0; 26 | predicated_label_count = 0; 27 | } 28 | 29 | ~Metric(){} 30 | 31 | void reset() 32 | { 33 | overall_label_count = 0; 34 | correct_label_count = 0; 35 | predicated_label_count = 0; 36 | } 37 | 38 | bool bIdentical() 39 | { 40 | if(predicated_label_count == 0) 41 | { 42 | if(overall_label_count == correct_label_count) 43 | { 44 | return true; 45 | } 46 | return false; 47 | } 48 | else 49 | { 50 | if(overall_label_count == correct_label_count && predicated_label_count == correct_label_count) 51 | { 52 | return true; 53 | } 54 | return false; 55 | } 56 | } 57 | 58 | double getAccuracy() 59 | { 60 | if(predicated_label_count == 0) 61 | { 62 | return correct_label_count*1.0/overall_label_count; 63 | } 64 | else 65 | { 66 | return correct_label_count*2.0/(overall_label_count + predicated_label_count); 67 | } 68 | } 69 | 70 | 71 | void print() 72 | { 73 | if(predicated_label_count == 0) 74 | { 75 | std::cout << "Accuracy:\tP=" << correct_label_count << "/" << overall_label_count 76 | << "=" << correct_label_count*1.0/overall_label_count << std::endl; 77 | } 78 | else 79 | { 80 | std::cout << "Recall:\tP=" << correct_label_count << "/" << overall_label_count << "=" << correct_label_count*1.0/overall_label_count 81 | << ", " << "Accuracy:\tP=" << correct_label_count << "/" << predicated_label_count << "=" << correct_label_count*1.0/predicated_label_count 82 | << ", " << "Fmeasure:\t" << correct_label_count*2.0/(overall_label_count + predicated_label_count) << std::endl; 83 | } 84 | } 85 | 86 | void loadModel(LStream &inf) { 87 | ReadBinary(inf, overall_label_count); 88 | ReadBinary(inf, correct_label_count); 89 | ReadBinary(inf, 
predicated_label_count); 90 | // cout << overall_label_count << correct_label_count << predicated_label_count < 21 | class RNN { 22 | public: 23 | BiLayer _rnn; 24 | bool _left2right; 25 | 26 | Tensor _null, _nullLoss; 27 | 28 | public: 29 | RNN() { 30 | } 31 | 32 | inline void initial(int outputsize, int inputsize, int seed = 0) { 33 | _left2right = true; 34 | _rnn.initial(outputsize, outputsize, inputsize, true, seed, 0); 35 | 36 | _null = NewTensor(Shape2(1, outputsize), d_zero); 37 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 38 | 39 | } 40 | 41 | inline void initial(int outputsize, int inputsize, bool left2right, int seed = 0) { 42 | _left2right = left2right; 43 | _rnn.initial(outputsize, outputsize, inputsize, true, seed, 0); 44 | 45 | _null = NewTensor(Shape2(1, outputsize), d_zero); 46 | _nullLoss = NewTensor(Shape2(1, outputsize), d_zero); 47 | 48 | } 49 | 50 | inline void initial(Tensor WL, Tensor WR, Tensor b, bool left2right = true) { 51 | _left2right = left2right; 52 | _rnn.initial(WL, WR, b, true); 53 | 54 | _null = NewTensor(Shape2(1, b.size(1)), d_zero); 55 | _nullLoss = NewTensor(Shape2(1, b.size(1)), d_zero); 56 | } 57 | 58 | inline void release() { 59 | _rnn.release(); 60 | 61 | FreeSpace(&_null); 62 | FreeSpace(&_nullLoss); 63 | } 64 | 65 | virtual ~RNN() { 66 | // TODO Auto-generated destructor stub 67 | } 68 | 69 | inline dtype squarenormAll() { 70 | dtype norm = _rnn.squarenormAll(); 71 | 72 | return norm; 73 | } 74 | 75 | inline void scaleGrad(dtype scale) { 76 | _rnn.scaleGrad(scale); 77 | } 78 | 79 | public: 80 | 81 | inline void ComputeForwardScore(Tensor x, Tensor y) { 82 | y = 0.0; 83 | int seq_size = x.size(0); 84 | if (seq_size == 0) 85 | return; 86 | 87 | if (_left2right) { 88 | for (int idx = 0; idx < seq_size; idx++) { 89 | if (idx == 0) { 90 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 91 | } else 92 | _rnn.ComputeForwardScore(y[idx - 1], x[idx], y[idx]); 93 | } 94 | } else { 95 | for (int idx = seq_size - 1; idx >= 0; idx--) { 96 | if (idx == seq_size - 1) 97 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 98 | else 99 | _rnn.ComputeForwardScore(y[idx + 1], x[idx], y[idx]); 100 | } 101 | } 102 | } 103 | 104 | inline void ComputeForwardScore(const vector > &x, vector > &y) { 105 | assign(y, 0.0); 106 | int seq_size = x.size(); 107 | if (seq_size == 0) 108 | return; 109 | 110 | if (_left2right) { 111 | for (int idx = 0; idx < seq_size; idx++) { 112 | if (idx == 0) { 113 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 114 | } else 115 | _rnn.ComputeForwardScore(y[idx - 1], x[idx], y[idx]); 116 | } 117 | } else { 118 | for (int idx = seq_size - 1; idx >= 0; idx--) { 119 | if (idx == seq_size - 1) 120 | _rnn.ComputeForwardScore(_null, x[idx], y[idx]); 121 | else 122 | _rnn.ComputeForwardScore(y[idx + 1], x[idx], y[idx]); 123 | } 124 | } 125 | } 126 | 127 | // This function is used for computing hidden values incrementally at the start position 128 | // It is applied only when the sequential inputs are not fixed in advance, 129 | // which can vary during decoding. 130 | // We need not provide a backward function, since during backward, inputs will be given. 
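// For example, when decoding left to right with inputs that arrive one step at a time
// (the names below are illustrative only):
//   rnn.ComputeForwardScoreIncremental(x0, h0);      // first position: the previous hidden state is _null
//   rnn.ComputeForwardScoreIncremental(h0, x1, h1);  // later positions: pass the previously computed hidden state
// The caller allocates each h_t; once the whole input sequence is fixed, the regular
// ComputeForwardScore / ComputeBackwardLoss pair in this file is used for training.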
131 | inline void ComputeForwardScoreIncremental(Tensor x, Tensor y) { 132 | assert(_left2right); 133 | y = 0.0; 134 | _rnn.ComputeForwardScore(_null, x, y); 135 | } 136 | 137 | 138 | // This function is used for computing hidden values incrementally at the non-start position 139 | // It is applied only when the sequential inputs are not fixed in advance, 140 | // which can vary during decoding. 141 | // We need not provide a backward function, since during backward, inputs will be given. 142 | inline void ComputeForwardScoreIncremental(Tensor py, Tensor x, Tensor y) { 143 | assert(_left2right); 144 | y = 0.0; 145 | _rnn.ComputeForwardScore(py, x, y); 146 | } 147 | 148 | //please allocate the memory outside here 149 | inline void ComputeBackwardLoss(Tensor x, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 150 | int seq_size = x.size(0); 151 | if (seq_size == 0) 152 | return; 153 | 154 | if (bclear) 155 | lx = 0.0; 156 | //left rnn 157 | Tensor lfy = NewTensor(Shape3(y.size(0), y.size(1), y.size(2)), d_zero); 158 | if (_left2right) { 159 | for (int idx = seq_size - 1; idx >= 0; idx--) { 160 | if (idx < seq_size - 1) 161 | ly[idx] = ly[idx] + lfy[idx]; 162 | 163 | if (idx == 0) 164 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 165 | else 166 | _rnn.ComputeBackwardLoss(y[idx - 1], x[idx], y[idx], ly[idx], lfy[idx - 1], lx[idx]); 167 | } 168 | } else { 169 | // right rnn 170 | for (int idx = 0; idx < seq_size; idx++) { 171 | if (idx > 0) 172 | ly[idx] = ly[idx] + lfy[idx]; 173 | 174 | if (idx == seq_size - 1) 175 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 176 | else 177 | _rnn.ComputeBackwardLoss(y[idx + 1], x[idx], y[idx], ly[idx], lfy[idx + 1], lx[idx]); 178 | } 179 | } 180 | 181 | FreeSpace(&lfy); 182 | } 183 | 184 | //please allocate the memory outside here 185 | inline void ComputeBackwardLoss(const vector > &x, const vector > &y, 186 | vector > &ly, vector > &lx, bool bclear = false) { 187 | int seq_size = x.size(); 188 | if (seq_size == 0) 189 | return; 190 | 191 | if (bclear) 192 | assign(lx, 0.0); 193 | 194 | vector > lfy(seq_size); 195 | for (int idx = 0; idx < seq_size; idx++) { 196 | lfy[idx] = NewTensor(Shape2(ly[0].size(0), ly[0].size(1)), d_zero); 197 | } 198 | 199 | if (_left2right) { 200 | //left rnn 201 | for (int idx = seq_size - 1; idx >= 0; idx--) { 202 | if (idx < seq_size - 1) 203 | ly[idx] = ly[idx] + lfy[idx]; 204 | 205 | if (idx == 0) 206 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 207 | else 208 | _rnn.ComputeBackwardLoss(y[idx - 1], x[idx], y[idx], ly[idx], lfy[idx - 1], lx[idx]); 209 | } 210 | } else { 211 | // right rnn 212 | for (int idx = 0; idx < seq_size; idx++) { 213 | if (idx > 0) 214 | ly[idx] = ly[idx] + lfy[idx]; 215 | 216 | if (idx == seq_size - 1) 217 | _rnn.ComputeBackwardLoss(_null, x[idx], y[idx], ly[idx], _nullLoss, lx[idx]); 218 | else 219 | _rnn.ComputeBackwardLoss(y[idx + 1], x[idx], y[idx], ly[idx], lfy[idx + 1], lx[idx]); 220 | } 221 | } 222 | 223 | for (int idx = 0; idx < seq_size; idx++) { 224 | FreeSpace(&(lfy[idx])); 225 | } 226 | } 227 | 228 | inline void randomprint(int num) { 229 | _rnn.randomprint(num); 230 | } 231 | 232 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 233 | _rnn.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 234 | } 235 | 236 | void writeModel(LStream &outf) { 237 | _rnn.writeModel(outf); 238 | 239 | SaveBinary(outf, _null); 240 | SaveBinary(outf, _nullLoss); 
241 | 242 | WriteBinary(outf, _left2right); 243 | } 244 | 245 | void loadModel(LStream &inf) { 246 | _rnn.loadModel(inf); 247 | LoadBinary(inf, &_null, false); 248 | LoadBinary(inf, &_nullLoss, false); 249 | 250 | ReadBinary(inf, _left2right); 251 | } 252 | 253 | }; 254 | 255 | #endif /* SRC_RNN_H_ */ 256 | -------------------------------------------------------------------------------- /RecursiveGatedNN.h: -------------------------------------------------------------------------------- 1 | /* 2 | * RecursiveGatedNN.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_RecursiveGatedNN_H_ 9 | #define SRC_RecursiveGatedNN_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | #include "UniLayer.h" 14 | #include "MyLib.h" 15 | #include "Utiltensor.h" 16 | 17 | using namespace mshadow; 18 | using namespace mshadow::expr; 19 | using namespace mshadow::utils; 20 | 21 | template 22 | class RecursiveGatedNN { 23 | public: 24 | UniLayer _reset_left; 25 | UniLayer _reset_right; 26 | UniLayer _update_left; 27 | UniLayer _update_right; 28 | UniLayer _update_tilde; 29 | BiLayer _recursive_tilde; 30 | 31 | 32 | Tensor nxl; 33 | Tensor nxr; 34 | Tensor sum; 35 | 36 | Tensor pxl; 37 | Tensor pxr; 38 | Tensor pmy; 39 | 40 | 41 | Tensor lrxl; 42 | Tensor lrxr; 43 | Tensor lmy; 44 | Tensor luxl; 45 | Tensor luxr; 46 | Tensor lumy; 47 | 48 | Tensor lnxl; 49 | Tensor lnxr; 50 | Tensor lsum; 51 | 52 | Tensor lpxl; 53 | Tensor lpxr; 54 | Tensor lpmy; 55 | 56 | 57 | public: 58 | RecursiveGatedNN() { 59 | } 60 | 61 | inline void initial(int dimension, int seed = 0) { 62 | _reset_left.initial(dimension, dimension, false, seed, 1); 63 | _reset_right.initial(dimension, dimension, false, seed + 10, 1); 64 | _update_left.initial(dimension, dimension, false, seed + 20, 3); 65 | _update_right.initial(dimension, dimension, false, seed + 30, 3); 66 | _update_tilde.initial(dimension, dimension, false, seed + 40, 3); 67 | _recursive_tilde.initial(dimension, dimension, dimension, false, seed + 50, 0); 68 | 69 | nxl = NewTensor(Shape2(1, dimension), d_zero); 70 | nxr = NewTensor(Shape2(1, dimension), d_zero); 71 | sum = NewTensor(Shape2(1, dimension), d_zero); 72 | 73 | pxl = NewTensor(Shape2(1, dimension), d_zero); 74 | pxr = NewTensor(Shape2(1, dimension), d_zero); 75 | pmy = NewTensor(Shape2(1, dimension), d_zero); 76 | 77 | 78 | lrxl = NewTensor(Shape2(1, dimension), d_zero); 79 | lrxr = NewTensor(Shape2(1, dimension), d_zero); 80 | lmy = NewTensor(Shape2(1, dimension), d_zero); 81 | luxl = NewTensor(Shape2(1, dimension), d_zero); 82 | luxr = NewTensor(Shape2(1, dimension), d_zero); 83 | lumy = NewTensor(Shape2(1, dimension), d_zero); 84 | 85 | lnxl = NewTensor(Shape2(1, dimension), d_zero); 86 | lnxr = NewTensor(Shape2(1, dimension), d_zero); 87 | lsum = NewTensor(Shape2(1, dimension), d_zero); 88 | 89 | lpxl = NewTensor(Shape2(1, dimension), d_zero); 90 | lpxr = NewTensor(Shape2(1, dimension), d_zero); 91 | lpmy = NewTensor(Shape2(1, dimension), d_zero); 92 | } 93 | 94 | 95 | inline void initial(Tensor rW1, Tensor rW2, 96 | Tensor uW1, Tensor uW2, Tensor uW3, 97 | Tensor W1, Tensor W2, Tensor W3,Tensor b) { 98 | _reset_left.initial(rW1, 1); 99 | _reset_right.initial(rW2, 1); 100 | 101 | _update_left.initial(uW1, 3); 102 | _update_right.initial(uW2, 3); 103 | _update_tilde.initial(uW3, 3); 104 | 105 | _recursive_tilde.initial(W1, W2, W3, b, 0); 106 | } 107 | 108 | inline void release() { 109 | _reset_left.release(); 110 | _reset_right.release(); 111 | 112 | 
_update_left.release(); 113 | _update_right.release(); 114 | _update_tilde.release(); 115 | 116 | _recursive_tilde.release(); 117 | 118 | FreeSpace(&nxl); 119 | FreeSpace(&nxr); 120 | FreeSpace(&sum); 121 | FreeSpace(&pxl); 122 | FreeSpace(&pxr); 123 | FreeSpace(&pmy); 124 | FreeSpace(&lnxl); 125 | FreeSpace(&lnxr); 126 | FreeSpace(&lsum); 127 | FreeSpace(&lpxl); 128 | FreeSpace(&lpxr); 129 | FreeSpace(&lpmy); 130 | FreeSpace(&lrxl); 131 | FreeSpace(&lrxr); 132 | FreeSpace(&lmy); 133 | FreeSpace(&luxl); 134 | FreeSpace(&luxr); 135 | FreeSpace(&lumy); 136 | } 137 | 138 | virtual ~RecursiveGatedNN() { 139 | // TODO Auto-generated destructor stub 140 | } 141 | 142 | inline dtype squarenormAll() { 143 | dtype norm = _reset_left.squarenormAll(); 144 | norm += _reset_right.squarenormAll(); 145 | norm += _update_left.squarenormAll(); 146 | norm += _update_right.squarenormAll(); 147 | norm += _update_tilde.squarenormAll(); 148 | norm += _recursive_tilde.squarenormAll(); 149 | 150 | return norm; 151 | } 152 | 153 | inline void scaleGrad(dtype scale) { 154 | _reset_left.scaleGrad(scale); 155 | _reset_right.scaleGrad(scale); 156 | 157 | _update_left.scaleGrad(scale); 158 | _update_right.scaleGrad(scale); 159 | _update_tilde.scaleGrad(scale); 160 | 161 | _recursive_tilde.scaleGrad(scale); 162 | } 163 | 164 | public: 165 | 166 | inline void ComputeForwardScore(Tensor xl, Tensor xr, 167 | Tensor rxl, Tensor rxr, Tensor my, 168 | Tensor uxl, Tensor uxr, Tensor umy, 169 | Tensor y) { 170 | 171 | nxl = 0.0; 172 | nxr = 0.0; 173 | sum = 0.0; 174 | 175 | pxl = 0.0; 176 | pxr = 0.0; 177 | pmy = 0.0; 178 | 179 | _reset_left.ComputeForwardScore(xl, rxl); 180 | _reset_right.ComputeForwardScore(xr, rxr); 181 | 182 | 183 | nxl = rxl * xl; 184 | nxr = rxr * xr; 185 | 186 | _recursive_tilde.ComputeForwardScore(nxl, nxr, my); 187 | 188 | 189 | _update_left.ComputeForwardScore(xl, uxl); 190 | _update_right.ComputeForwardScore(xr, uxr); 191 | _update_tilde.ComputeForwardScore(my, umy); 192 | 193 | sum = uxl + uxr + umy; 194 | 195 | pxl = uxl / sum; 196 | pxr = uxr / sum; 197 | pmy = umy / sum; 198 | 199 | y = pxl * xl + pxr * xr + pmy * my; 200 | 201 | } 202 | 203 | //please allocate the memory outside here 204 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, 205 | Tensor rxl, Tensor rxr, Tensor my, 206 | Tensor uxl, Tensor uxr, Tensor umy, 207 | Tensor y, Tensor ly, 208 | Tensor lxl, Tensor lxr, 209 | bool bclear = false) { 210 | if (bclear){ 211 | lxl = 0.0; lxr = 0.0; 212 | } 213 | 214 | nxl = 0.0; 215 | nxr = 0.0; 216 | sum = 0.0; 217 | 218 | pxl = 0.0; 219 | pxr = 0.0; 220 | pmy = 0.0; 221 | 222 | 223 | lrxl = 0.0; 224 | lrxr = 0.0; 225 | lmy = 0.0; 226 | luxl = 0.0; 227 | luxr = 0.0; 228 | lumy = 0.0; 229 | 230 | lnxl = 0.0; 231 | lnxr = 0.0; 232 | lsum = 0.0; 233 | 234 | lpxl = 0.0; 235 | lpxr = 0.0; 236 | lpmy = 0.0; 237 | 238 | nxl = rxl * xl; 239 | nxr = rxr * xr; 240 | 241 | sum = uxl + uxr + umy; 242 | 243 | pxl = uxl / sum; 244 | pxr = uxr / sum; 245 | pmy = umy / sum; 246 | 247 | 248 | lpxl += ly * xl; 249 | lxl += ly * pxl; 250 | 251 | lpxr += ly * xr; 252 | lxr += ly * pxr; 253 | 254 | lpmy += ly * my; 255 | lmy += ly * pmy; 256 | 257 | 258 | 259 | luxl += lpxl / sum; 260 | luxr += lpxr / sum; 261 | lumy += lpmy / sum; 262 | 263 | lsum -= lpxl * pxl / sum; 264 | lsum -= lpxr * pxr / sum; 265 | lsum -= lpmy * pmy / sum; 266 | 267 | 268 | luxl += lsum; 269 | luxr += lsum; 270 | lumy += lsum; 271 | 272 | _update_left.ComputeBackwardLoss(xl, uxl, luxl, lxl); 273 | 
_update_right.ComputeBackwardLoss(xr, uxr, luxr, lxr); 274 | _update_tilde.ComputeBackwardLoss(my, umy, lumy, lmy); 275 | 276 | _recursive_tilde.ComputeBackwardLoss(nxl, nxr, my, lmy, lnxl, lnxr); 277 | 278 | lrxl += lnxl * xl; 279 | lxl += lnxl * rxl; 280 | 281 | lrxr += lnxr * xr; 282 | lxr += lnxr * rxr; 283 | 284 | _reset_left.ComputeBackwardLoss(xl, rxl, lrxl, lxl); 285 | _reset_right.ComputeBackwardLoss(xr, rxr, lrxr, lxr); 286 | 287 | } 288 | 289 | 290 | inline void randomprint(int num) { 291 | _reset_left.randomprint(num); 292 | _reset_right.randomprint(num); 293 | 294 | _update_left.randomprint(num); 295 | _update_right.randomprint(num); 296 | _update_tilde.randomprint(num); 297 | 298 | _recursive_tilde.randomprint(num); 299 | } 300 | 301 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 302 | _reset_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 303 | _reset_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 304 | 305 | _update_left.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 306 | _update_right.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 307 | _update_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 308 | 309 | _recursive_tilde.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 310 | } 311 | 312 | 313 | void writeModel(LStream &outf) { 314 | 315 | _reset_left.writeModel(outf); 316 | _reset_right.writeModel(outf); 317 | _update_left.writeModel(outf); 318 | _update_right.writeModel(outf); 319 | _update_tilde.writeModel(outf); 320 | _recursive_tilde.writeModel(outf); 321 | 322 | SaveBinary(outf, nxl); 323 | SaveBinary(outf, nxr); 324 | SaveBinary(outf, sum); 325 | 326 | SaveBinary(outf, pxl); 327 | SaveBinary(outf, pxr); 328 | SaveBinary(outf, pmy); 329 | 330 | SaveBinary(outf, lrxl); 331 | SaveBinary(outf, lrxr); 332 | SaveBinary(outf, lmy); 333 | SaveBinary(outf, luxl); 334 | SaveBinary(outf, luxr); 335 | SaveBinary(outf, lumy); 336 | 337 | SaveBinary(outf, lnxl); 338 | SaveBinary(outf, lnxr); 339 | SaveBinary(outf, lsum); 340 | 341 | SaveBinary(outf, lpxl); 342 | SaveBinary(outf, lpxr); 343 | SaveBinary(outf, lpmy); 344 | 345 | } 346 | 347 | void loadModel(LStream &inf) { 348 | 349 | _reset_left.loadModel(inf); 350 | _reset_right.loadModel(inf); 351 | _update_left.loadModel(inf); 352 | _update_right.loadModel(inf); 353 | _update_tilde.loadModel(inf); 354 | _recursive_tilde.loadModel(inf); 355 | 356 | 357 | LoadBinary(inf, &nxl, false); 358 | LoadBinary(inf, &nxr, false); 359 | LoadBinary(inf, &sum, false); 360 | 361 | LoadBinary(inf, &pxl, false); 362 | LoadBinary(inf, &pxr, false); 363 | LoadBinary(inf, &pmy, false); 364 | 365 | LoadBinary(inf, &lrxl, false); 366 | LoadBinary(inf, &lrxr, false); 367 | LoadBinary(inf, &lmy, false); 368 | LoadBinary(inf, &luxl, false); 369 | LoadBinary(inf, &luxr, false); 370 | LoadBinary(inf, &lumy, false); 371 | 372 | LoadBinary(inf, &lnxl, false); 373 | LoadBinary(inf, &lnxr, false); 374 | LoadBinary(inf, &lsum, false); 375 | 376 | LoadBinary(inf, &lpxl, false); 377 | LoadBinary(inf, &lpxr, false); 378 | LoadBinary(inf, &lpmy, false); 379 | 380 | } 381 | 382 | 383 | }; 384 | 385 | #endif /* SRC_RecursiveGatedNN_H_ */ 386 | -------------------------------------------------------------------------------- /RecursiveNN.h: -------------------------------------------------------------------------------- 1 | /* 2 | * RecursiveNN.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_RecursiveNN_H_ 9 | #define 
SRC_RecursiveNN_H_ 10 | #include "tensor.h" 11 | 12 | #include "BiLayer.h" 13 | #include "MyLib.h" 14 | #include "Utiltensor.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | // Actually, we do not need such a class, BiLayer satisfies it 21 | 22 | template 23 | class RecursiveNN { 24 | public: 25 | BiLayer _rnn; 26 | 27 | public: 28 | RecursiveNN() { 29 | } 30 | 31 | inline void initial(int dimension, int seed = 0) { 32 | _rnn.initial(dimension, dimension, dimension, true, seed, 0); 33 | } 34 | 35 | 36 | inline void initial(Tensor WL, Tensor WR, Tensor b) { 37 | _rnn.initial(WL, WR, b, true); 38 | } 39 | 40 | inline void release() { 41 | _rnn.release(); 42 | } 43 | 44 | virtual ~RecursiveNN() { 45 | // TODO Auto-generated destructor stub 46 | } 47 | 48 | inline dtype squarenormAll() { 49 | dtype norm = _rnn.squarenormAll(); 50 | 51 | return norm; 52 | } 53 | 54 | inline void scaleGrad(dtype scale) { 55 | _rnn.scaleGrad(scale); 56 | } 57 | 58 | public: 59 | 60 | inline void ComputeForwardScore(Tensor xl, Tensor xr, Tensor y) { 61 | y = 0.0; 62 | _rnn.ComputeForwardScore(xl, xr, y); 63 | 64 | } 65 | 66 | //please allocate the memory outside here 67 | inline void ComputeBackwardLoss(Tensor xl, Tensor xr, Tensor y, Tensor ly, 68 | Tensor lxl, Tensor lxr, bool bclear = false) { 69 | if (bclear){ 70 | lxl = 0.0; lxr = 0.0; 71 | } 72 | _rnn.ComputeBackwardLoss(xl, xr, y, ly, lxl, lxr); 73 | } 74 | 75 | 76 | inline void randomprint(int num) { 77 | _rnn.randomprint(num); 78 | } 79 | 80 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 81 | _rnn.updateAdaGrad(regularizationWeight, adaAlpha, adaEps); 82 | } 83 | 84 | void writeModel(LStream &outf) { 85 | _rnn.writeModel(outf); 86 | } 87 | 88 | void loadModel(LStream &inf) { 89 | _rnn.loadModel(inf); 90 | } 91 | 92 | }; 93 | 94 | #endif /* SRC_RecursiveNN_H_ */ 95 | -------------------------------------------------------------------------------- /SoftMaxLoss.h: -------------------------------------------------------------------------------- 1 | #ifndef SOFTMAXLOSS 2 | #define SOFTMAXLOSS 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | #include "Metric.h" 7 | 8 | using namespace std; 9 | using namespace mshadow; 10 | using namespace mshadow::expr; 11 | using namespace mshadow::utils; 12 | 13 | template 14 | inline dtype softmax_loss(const vector > &output, const vector > &answers, vector > &loutput, 15 | Metric & eval, int batchsize = 1) { 16 | int seqsize = output.size(); 17 | if (answers.size() != seqsize || seqsize == 0) { 18 | std::cerr << "softmax_loss error: vector size or context size invalid" << std::endl; 19 | } 20 | 21 | int dim1 = output[0].size(0), dim2 = output[0].size(1); 22 | int odim1 = loutput[0].size(0), odim2 = loutput[0].size(1); 23 | int labelsize = answers[0].size(); 24 | 25 | if (labelsize != odim2 || dim2 != odim2 || dim1 != 1 || odim1 != 1) { 26 | std::cerr << "softmax_loss error: dim size invalid" << std::endl; 27 | } 28 | 29 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 30 | 31 | for (int idx = 0; idx < seqsize; idx++) { 32 | loutput[idx] = 0.0; 33 | } 34 | 35 | dtype cost = 0.0; 36 | static int optLabel; 37 | for (int idx = 0; idx < seqsize; idx++) { 38 | optLabel = -1; 39 | for (int i = 0; i < dim2; ++i) { 40 | if (answers[idx][i] >= 0) { 41 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 42 | optLabel = i; 43 | } 44 | } 45 | 46 | dtype sum1 = 0.0; 47 | dtype sum2 = 0.0; 48 
| dtype maxScore = output[idx][0][optLabel]; 49 | for (int i = 0; i < dim2; ++i) { 50 | scores[idx][0][i] = -1e10; 51 | if (answers[idx][i] >= 0) { 52 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 53 | if (answers[idx][i] == 1) 54 | sum1 += scores[idx][0][i]; 55 | sum2 += scores[idx][0][i]; 56 | } 57 | } 58 | cost += (log(sum2) - log(sum1)) / (batchsize * seqsize); 59 | if (answers[idx][optLabel] == 1) 60 | eval.correct_label_count++; 61 | eval.overall_label_count++; 62 | 63 | for (int i = 0; i < dim2; ++i) { 64 | if (answers[idx][i] >= 0) { 65 | loutput[idx][0][i] = (scores[idx][0][i] / sum2 - answers[idx][i]) / (batchsize * seqsize); 66 | } 67 | } 68 | 69 | } 70 | 71 | FreeSpace(&scores); 72 | return cost; 73 | } 74 | 75 | template 76 | inline dtype softmax_cost(const vector > &output, const vector > &answers) { 77 | int seqsize = output.size(); 78 | if (answers.size() != seqsize || seqsize == 0) { 79 | std::cerr << "softmax_cost error: vector size or context size invalid" << std::endl; 80 | } 81 | 82 | int dim1 = output[0].size(0), dim2 = output[0].size(1); 83 | int labelsize = answers[0].size(); 84 | 85 | if (labelsize != dim2 || dim1 != 1) { 86 | std::cerr << "softmax_cost error: dim size invalid" << std::endl; 87 | } 88 | 89 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 90 | 91 | dtype cost = 0.0; 92 | static int optLabel; 93 | for (int idx = 0; idx < seqsize; idx++) { 94 | optLabel = -1; 95 | for (int i = 0; i < dim2; ++i) { 96 | if (answers[idx][i] >= 0) { 97 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 98 | optLabel = i; 99 | } 100 | } 101 | 102 | dtype sum1 = 0.0; 103 | dtype sum2 = 0.0; 104 | dtype maxScore = output[idx][0][optLabel]; 105 | for (int i = 0; i < dim2; ++i) { 106 | scores[idx][0][i] = -1e10; 107 | if (answers[idx][i] >= 0) { 108 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 109 | if (answers[idx][i] == 1) 110 | sum1 += scores[idx][0][i]; 111 | sum2 += scores[idx][0][i]; 112 | } 113 | } 114 | cost += (log(sum2) - log(sum1)) / seqsize; 115 | } 116 | 117 | FreeSpace(&scores); 118 | return cost; 119 | } 120 | 121 | template 122 | inline void softmax_predict(const vector > &output, vector& results) { 123 | int seqsize = output.size(); 124 | if (seqsize == 0) { 125 | std::cerr << "softmax_predict error: vector size or context size invalid" << std::endl; 126 | } 127 | 128 | int dim1 = output[0].size(0), dim2 = output[0].size(1); 129 | if (dim1 != 1) { 130 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 131 | } 132 | 133 | results.resize(seqsize); 134 | 135 | static int optLabel; 136 | for (int idx = 0; idx < seqsize; idx++) { 137 | optLabel = -1; 138 | for (int i = 0; i < dim2; ++i) { 139 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 140 | optLabel = i; 141 | } 142 | results[idx] = optLabel; 143 | } 144 | 145 | } 146 | 147 | template 148 | inline dtype softmax_loss(Tensor output, const vector > &answers, Tensor loutput, Metric & eval, int batchsize = 1) { 149 | int seqsize = output.size(0); 150 | if (answers.size() != seqsize || seqsize == 0) { 151 | std::cerr << "softmax_loss error: vector size or context size invalid" << std::endl; 152 | } 153 | 154 | int dim1 = output.size(1), dim2 = output.size(2); 155 | int odim1 = loutput.size(1), odim2 = loutput.size(2); 156 | int labelsize = answers[0].size(); 157 | 158 | if (labelsize != odim2 || dim2 != odim2 || dim1 != 1 || odim1 != 1) { 159 | std::cerr << "softmax_loss error: dim size invalid" << std::endl; 160 | } 161 | 
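// What the loop below computes, per position idx, is a max-shifted softmax restricted to
// the labels with answers[idx][i] >= 0 (entries < 0 are excluded from the normalization):
//   p_i       = exp(o_i - max_o) / sum_{valid j} exp(o_j - max_o)
//   cost     += -log( sum_{i: answers[idx][i]==1} p_i ) / (batchsize * seqsize)
//   loutput_i = (p_i - answers[idx][i]) / (batchsize * seqsize)   for each valid i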
162 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 163 | 164 | loutput = 0.0; 165 | dtype cost = 0.0; 166 | static int optLabel; 167 | for (int idx = 0; idx < seqsize; idx++) { 168 | optLabel = -1; 169 | for (int i = 0; i < dim2; ++i) { 170 | if (answers[idx][i] >= 0) { 171 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 172 | optLabel = i; 173 | } 174 | } 175 | 176 | dtype sum1 = 0.0; 177 | dtype sum2 = 0.0; 178 | dtype maxScore = output[idx][0][optLabel]; 179 | for (int i = 0; i < dim2; ++i) { 180 | scores[idx][0][i] = -1e10; 181 | if (answers[idx][i] >= 0) { 182 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 183 | if (answers[idx][i] == 1) 184 | sum1 += scores[idx][0][i]; 185 | sum2 += scores[idx][0][i]; 186 | } 187 | } 188 | cost += (log(sum2) - log(sum1)) / (batchsize * seqsize); 189 | if (answers[idx][optLabel] == 1) 190 | eval.correct_label_count++; 191 | eval.overall_label_count++; 192 | 193 | for (int i = 0; i < dim2; ++i) { 194 | if (answers[idx][i] >= 0) { 195 | loutput[idx][0][i] = (scores[idx][0][i] / sum2 - answers[idx][i]) / (batchsize * seqsize); 196 | } 197 | } 198 | 199 | } 200 | 201 | FreeSpace(&scores); 202 | return cost; 203 | } 204 | 205 | template 206 | inline dtype softmax_cost(Tensor output, const vector > &answers, int batchsize = 1) { 207 | int seqsize = output.size(0); 208 | if (answers.size() != seqsize || seqsize == 0) { 209 | std::cerr << "softmax_cost error: vector size or context size invalid" << std::endl; 210 | } 211 | 212 | int dim1 = output.size(1), dim2 = output.size(2); 213 | int labelsize = answers[0].size(); 214 | 215 | if (labelsize != dim2 || dim1 != 1) { 216 | std::cerr << "softmax_cost error: dim size invalid" << std::endl; 217 | } 218 | 219 | Tensor scores = NewTensor(Shape3(seqsize, 1, dim2), d_zero); 220 | 221 | dtype cost = 0.0; 222 | static int optLabel; 223 | for (int idx = 0; idx < seqsize; idx++) { 224 | optLabel = -1; 225 | for (int i = 0; i < dim2; ++i) { 226 | if (answers[idx][i] >= 0) { 227 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 228 | optLabel = i; 229 | } 230 | } 231 | 232 | dtype sum1 = 0.0; 233 | dtype sum2 = 0.0; 234 | dtype maxScore = output[idx][0][optLabel]; 235 | for (int i = 0; i < dim2; ++i) { 236 | scores[idx][0][i] = -1e10; 237 | if (answers[idx][i] >= 0) { 238 | scores[idx][0][i] = exp(output[idx][0][i] - maxScore); 239 | if (answers[idx][i] == 1) 240 | sum1 += scores[idx][0][i]; 241 | sum2 += scores[idx][0][i]; 242 | } 243 | } 244 | cost += (log(sum2) - log(sum1)) / (batchsize * seqsize); 245 | } 246 | 247 | FreeSpace(&scores); 248 | return cost; 249 | } 250 | 251 | template 252 | inline void softmax_predict(Tensor output, vector& results) { 253 | int seqsize = output.size(0); 254 | if (seqsize == 0) { 255 | std::cerr << "softmax_predict error: vector size or context size invalid" << std::endl; 256 | } 257 | 258 | int dim1 = output.size(1), dim2 = output.size(2); 259 | if (dim1 != 1) { 260 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 261 | } 262 | 263 | results.resize(seqsize); 264 | 265 | static int optLabel; 266 | for (int idx = 0; idx < seqsize; idx++) { 267 | optLabel = -1; 268 | for (int i = 0; i < dim2; ++i) { 269 | if (optLabel < 0 || output[idx][0][i] > output[idx][0][optLabel]) 270 | optLabel = i; 271 | } 272 | results[idx] = optLabel; 273 | } 274 | 275 | } 276 | 277 | template 278 | inline dtype softmax_loss(Tensor output, const vector &answer, Tensor loutput, Metric & eval, int batchsize = 1) { 279 | int dim1 
= output.size(0), dim2 = output.size(1); 280 | int odim1 = loutput.size(0), odim2 = loutput.size(1); 281 | int labelsize = answer.size(); 282 | 283 | if (labelsize != odim2 || dim2 != odim2 || dim1 != 1 || odim1 != 1) { 284 | std::cerr << "softmax_loss error: dim size invalid" << std::endl; 285 | } 286 | 287 | Tensor scores = NewTensor(Shape2(1, dim2), d_zero); 288 | 289 | loutput = 0.0; 290 | dtype cost = 0.0; 291 | 292 | int optLabel = -1; 293 | for (int i = 0; i < dim2; ++i) { 294 | if (answer[i] >= 0) { 295 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 296 | optLabel = i; 297 | } 298 | } 299 | 300 | dtype sum1 = 0.0; 301 | dtype sum2 = 0.0; 302 | dtype maxScore = output[0][optLabel]; 303 | for (int i = 0; i < dim2; ++i) { 304 | scores[0][i] = -1e10; 305 | if (answer[i] >= 0) { 306 | scores[0][i] = exp(output[0][i] - maxScore); 307 | if (answer[i] == 1) 308 | sum1 += scores[0][i]; 309 | sum2 += scores[0][i]; 310 | } 311 | } 312 | cost += (log(sum2) - log(sum1)) / batchsize; 313 | if (answer[optLabel] == 1) 314 | eval.correct_label_count++; 315 | eval.overall_label_count++; 316 | 317 | for (int i = 0; i < dim2; ++i) { 318 | if (answer[i] >= 0) { 319 | loutput[0][i] = (scores[0][i] / sum2 - answer[i]) / batchsize; 320 | } 321 | } 322 | 323 | FreeSpace(&scores); 324 | return cost; 325 | } 326 | 327 | template 328 | inline dtype softmax_cost(Tensor output, const vector &answer, int batchsize = 1) { 329 | int dim1 = output.size(0), dim2 = output.size(1); 330 | int labelsize = answer.size(); 331 | 332 | if (labelsize != dim2 || dim1 != 1) { 333 | std::cerr << "softmax_cost error: dim size invalid" << std::endl; 334 | } 335 | 336 | Tensor scores = NewTensor(Shape2(1, dim2), d_zero); 337 | 338 | dtype cost = 0.0; 339 | 340 | int optLabel = -1; 341 | for (int i = 0; i < dim2; ++i) { 342 | if (answer[i] >= 0) { 343 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 344 | optLabel = i; 345 | } 346 | } 347 | 348 | dtype sum1 = 0.0; 349 | dtype sum2 = 0.0; 350 | dtype maxScore = output[0][optLabel]; 351 | for (int i = 0; i < dim2; ++i) { 352 | scores[0][i] = -1e10; 353 | if (answer[i] >= 0) { 354 | scores[0][i] = exp(output[0][i] - maxScore); 355 | if (answer[i] == 1) 356 | sum1 += scores[0][i]; 357 | sum2 += scores[0][i]; 358 | } 359 | } 360 | cost += (log(sum2) - log(sum1)) / batchsize; 361 | 362 | FreeSpace(&scores); 363 | return cost; 364 | } 365 | 366 | template 367 | inline void softmax_predict(Tensor output, int& result) { 368 | int dim1 = output.size(0), dim2 = output.size(1); 369 | if (dim1 != 1) { 370 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 371 | } 372 | 373 | int optLabel = -1; 374 | for (int i = 0; i < dim2; ++i) { 375 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 376 | optLabel = i; 377 | } 378 | result = optLabel; 379 | 380 | } 381 | 382 | template 383 | inline int softmax_predict(Tensor output, vector& results) { 384 | int dim1 = output.size(0), dim2 = output.size(1); 385 | if (dim1 != 1) { 386 | std::cerr << "softmax_predict error: dim size invalid" << std::endl; 387 | } 388 | 389 | int optLabel = -1; 390 | for (int i = 0; i < dim2; ++i) { 391 | if (optLabel < 0 || output[0][i] > output[0][optLabel]) 392 | optLabel = i; 393 | } 394 | 395 | dtype maxScore = output[0][optLabel]; 396 | results.resize(dim2); 397 | 398 | dtype sum = 0.0; 399 | for (int i = 0; i < dim2; ++i) { 400 | results[i] = exp(output[0][i] - maxScore); 401 | sum += results[i]; 402 | } 403 | 404 | for (int i = 0; i < dim2; ++i) { 405 | results[i] = 
results[i]/sum; 406 | } 407 | 408 | return optLabel; 409 | 410 | } 411 | 412 | #endif 413 | -------------------------------------------------------------------------------- /SparseUniLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SparseUniLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_SparseUniLayer_H_ 9 | #define SRC_SparseUniLayer_H_ 10 | #include "tensor.h" 11 | #include "Utiltensor.h" 12 | #include "MyLib.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | // Weight updating process implemented without theory support, 19 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 20 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 21 | 22 | template 23 | class SparseUniLayer { 24 | 25 | public: 26 | 27 | hash_set _indexers; 28 | 29 | Tensor _W; 30 | Tensor _b; 31 | 32 | Tensor _gradW; 33 | Tensor _gradb; 34 | 35 | Tensor _eg2W; 36 | Tensor _eg2b; 37 | 38 | Tensor _ftW; 39 | 40 | bool _bUseB; 41 | 42 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 43 | 44 | int _max_update; 45 | NRVec _last_update; 46 | 47 | 48 | public: 49 | 50 | SparseUniLayer() { 51 | _indexers.clear(); 52 | } 53 | 54 | inline void initial(int nOSize, int nISize, bool bUseB = true, int seed = 0, int funcType = 0) { 55 | dtype bound = sqrt(6.0 / (nOSize + nISize + 1)); 56 | //dtype bound = 0.01; 57 | 58 | _W = NewTensor(Shape2(nISize, nOSize), d_zero); 59 | _gradW = NewTensor(Shape2(nISize, nOSize), d_zero); 60 | _eg2W = NewTensor(Shape2(nISize, nOSize), d_zero); 61 | _ftW = NewTensor(Shape2(nISize, nOSize), d_one); 62 | 63 | _b = NewTensor(Shape2(1, nOSize), d_zero); 64 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 65 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 66 | 67 | random(_W, -1.0 * bound, 1.0 * bound, seed); 68 | random(_b, -1.0 * bound, 1.0 * bound, seed + 1); 69 | 70 | _bUseB = bUseB; 71 | _funcType = funcType; 72 | 73 | _max_update = 0; 74 | _last_update.resize(nISize); 75 | _last_update = 0; 76 | } 77 | 78 | inline void initial(Tensor W, Tensor b, bool bUseB = true, int funcType = 0) { 79 | static int nOSize, nISize; 80 | nISize = W.size(0); 81 | nOSize = W.size(1); 82 | 83 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 84 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 85 | _eg2W = NewTensor(Shape2(nOSize, nISize), d_zero); 86 | _ftW = NewTensor(Shape2(nOSize, nISize), d_one); 87 | Copy(_W, W); 88 | 89 | _b = NewTensor(Shape2(1, nOSize), d_zero); 90 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 91 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 92 | 93 | if (bUseB) 94 | Copy(_b, b); 95 | 96 | _bUseB = bUseB; 97 | _funcType = funcType; 98 | 99 | _max_update = 0; 100 | _last_update.resize(nISize); 101 | _last_update = 0; 102 | } 103 | 104 | inline void release() { 105 | FreeSpace(&_W); 106 | FreeSpace(&_gradW); 107 | FreeSpace(&_eg2W); 108 | FreeSpace(&_ftW); 109 | FreeSpace(&_b); 110 | FreeSpace(&_gradb); 111 | FreeSpace(&_eg2b); 112 | _indexers.clear(); 113 | } 114 | 115 | virtual ~SparseUniLayer() { 116 | // TODO Auto-generated destructor stub 117 | } 118 | 119 | inline dtype squarenormAll() { 120 | dtype result = squarenorm(_gradW); 121 | 122 | if (_bUseB) { 123 | result += squarenorm(_gradb); 124 | } 125 | 126 | return result; 127 | } 128 | 129 | inline void scaleGrad(dtype scale) { 130 | _gradW = _gradW * scale; 131 | 
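// ---- Editorial note (illustrative sketch, not part of the original source) ----
// squarenormAll() and scaleGrad() are the two pieces a trainer needs for
// gradient clipping by global norm. A minimal sketch of how a caller might
// combine them with the AdaGrad update defined further below in this class;
// `layer`, `maxNorm`, `regWeight`, `alpha` and `eps` are assumed names:
#if 0
  dtype norm = sqrt(layer.squarenormAll());      // L2 norm of all stored gradients
  if (norm > maxNorm)
    layer.scaleGrad(maxNorm / norm);             // shrink every gradient in place
  layer.updateAdaGrad(regWeight, alpha, eps);    // then apply the (clipped) update
#endif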
if (_bUseB) { 132 | _gradb = _gradb * scale; 133 | } 134 | } 135 | 136 | public: 137 | void ComputeForwardScore(const std::vector& x, Tensor y) { 138 | static long long featNum, featId; 139 | featNum = x.size(); 140 | y = 0.0; 141 | for (int idx = 0; idx < featNum; idx++) { 142 | featId = x[idx]; 143 | updateSparseWeight(featId); 144 | y[0] += _W[featId]; 145 | } 146 | 147 | if (_bUseB) 148 | y = y + _b; 149 | if (_funcType == 0) 150 | y = F(y); 151 | else if (_funcType == 1) 152 | y = F(y); 153 | else if (_funcType == 3) 154 | y = F(y); 155 | } 156 | 157 | void ComputeForwardScore(const std::vector >& x, Tensor y) { 158 | static long long featNum, featId; 159 | 160 | int seq_size = y.size(0); 161 | 162 | for (int id = 0; id < seq_size; id++) { 163 | featNum = x[id].size(); 164 | y[id] = 0.0; 165 | for (int idx = 0; idx < featNum; idx++) { 166 | featId = x[id][idx]; 167 | updateSparseWeight(featId); 168 | y[id][0] += _W[featId]; 169 | } 170 | 171 | if (_bUseB) 172 | y[id] = y[id] + _b; 173 | if (_funcType == 0) 174 | y[id] = F(y[id]); 175 | else if (_funcType == 1) 176 | y[id] = F(y[id]); 177 | else if (_funcType == 3) 178 | y[id] = F(y[id]); 179 | } 180 | } 181 | 182 | void ComputeForwardScore(const std::vector >& x, std::vector > &y) { 183 | static long long featNum, featId; 184 | int seq_size = y.size(); 185 | 186 | for (int id = 0; id < seq_size; id++) { 187 | featNum = x[id].size(); 188 | y[id] = 0.0; 189 | for (int idx = 0; idx < featNum; idx++) { 190 | featId = x[id][idx]; 191 | updateSparseWeight(featId); 192 | y[id][0] += _W[featId]; 193 | } 194 | 195 | if (_bUseB) 196 | y[id] = y[id] + _b; 197 | if (_funcType == 0) 198 | y[id] = F(y[id]); 199 | else if (_funcType == 1) 200 | y[id] = F(y[id]); 201 | else if (_funcType == 3) 202 | y[id] = F(y[id]); 203 | } 204 | } 205 | // loss is stopped at this layer, since the input is one-hold alike 206 | void ComputeBackwardLoss(const std::vector& x, Tensor y, Tensor ly) { 207 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 208 | AllocSpace(&deri_yx); 209 | AllocSpace(&cly); 210 | if (_funcType == 0) { 211 | deri_yx = F(y); 212 | cly = ly * deri_yx; 213 | } else if (_funcType == 1) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 3) { 217 | cly = ly * y; 218 | } else { 219 | //cly = ly; 220 | Copy(cly, ly); 221 | } 222 | 223 | //_gradW 224 | static long long featNum, featId; 225 | featNum = x.size(); 226 | for (int idx = 0; idx < featNum; idx++) { 227 | featId = x[idx]; 228 | _indexers.insert(featId); 229 | _gradW[featId] += cly[0]; 230 | } 231 | 232 | if (_bUseB) 233 | _gradb = _gradb + cly; 234 | 235 | FreeSpace(&deri_yx); 236 | FreeSpace(&cly); 237 | } 238 | 239 | void ComputeBackwardLoss(const std::vector >& x, Tensor y, Tensor ly) { 240 | int seq_size = y.size(0); 241 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 242 | 243 | static long long featNum, featId; 244 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 245 | AllocSpace(&deri_yx); 246 | AllocSpace(&cly); 247 | 248 | for (int id = 0; id < seq_size; id++) { 249 | if (_funcType == 0) { 250 | deri_yx = F(y[id]); 251 | cly = ly[id] * deri_yx; 252 | } else if (_funcType == 1) { 253 | deri_yx = F(y[id]); 254 | cly = ly[id] * deri_yx; 255 | } else if (_funcType == 3) { 256 | cly = ly[id] * y[id]; 257 | } else { 258 | //cly = ly; 259 | Copy(cly, ly[id]); 260 | } 261 | //_gradW 262 | featNum = x[id].size(); 263 | for (int idx = 0; idx < featNum; idx++) { 264 | featId = x[id][idx]; 265 | 
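// ---- Editorial note (illustrative sketch, not part of the original source) ----
// The sparse forward pass above computes y = f( sum_{i in x} W[i] + b ): each
// active feature id selects one row of _W, the selected rows are summed, the
// bias is added, and the activation chosen by _funcType is applied. A minimal
// usage sketch; `layer`, `featIds`, `out` and the sizes are assumptions:
#if 0
  SparseUniLayer<cpu> layer;
  layer.initial(/*nOSize=*/50, /*nISize=*/100000, /*bUseB=*/true, /*seed=*/0, /*funcType=*/0);
  std::vector<int> featIds;                       // active (one-hot style) feature ids
  featIds.push_back(3);
  featIds.push_back(42);
  Tensor<cpu, 2, dtype> out = NewTensor<cpu>(Shape2(1, 50), d_zero);
  layer.ComputeForwardScore(featIds, out);        // out = tanh(W[3] + W[42] + b)
  FreeSpace(&out);
#endif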
_indexers.insert(featId); 266 | _gradW[featId] += cly[0]; 267 | } 268 | 269 | if (_bUseB) 270 | _gradb = _gradb + cly; 271 | } 272 | 273 | FreeSpace(&deri_yx); 274 | FreeSpace(&cly); 275 | } 276 | 277 | void ComputeBackwardLoss(const std::vector >& x, const std::vector > &y, 278 | const std::vector > &ly) { 279 | int seq_size = y.size(); 280 | assert(seq_size > 0); 281 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 282 | 283 | static long long featNum, featId, startPos; 284 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 285 | AllocSpace(&deri_yx); 286 | AllocSpace(&cly); 287 | 288 | for (int id = 0; id < seq_size; id++) { 289 | if (_funcType == 0) { 290 | deri_yx = F(y[id]); 291 | cly = ly[id] * deri_yx; 292 | } else if (_funcType == 1) { 293 | deri_yx = F(y[id]); 294 | cly = ly[id] * deri_yx; 295 | } else if (_funcType == 3) { 296 | cly = ly[id] * y[id]; 297 | } else { 298 | //cly = ly; 299 | Copy(cly, ly[id]); 300 | } 301 | //_gradW 302 | featNum = x[id].size(); 303 | for (int idx = 0; idx < featNum; idx++) { 304 | featId = x[id][idx]; 305 | _indexers.insert(featId); 306 | _gradW[featId] += cly[0]; 307 | } 308 | 309 | if (_bUseB) 310 | _gradb = _gradb + cly; 311 | } 312 | 313 | FreeSpace(&deri_yx); 314 | FreeSpace(&cly); 315 | } 316 | 317 | void randomprint(int num) { 318 | static int nOSize, nISize; 319 | nISize = _W.size(0); 320 | nOSize = _W.size(1); 321 | 322 | int count = 0; 323 | while (count < num) { 324 | int idx = rand() % nOSize; 325 | int idy = rand() % nISize; 326 | 327 | std::cout << "_W[" << idx << "," << idy << "]=" << _W[idy][idx] << " "; 328 | 329 | if (_bUseB) { 330 | int idz = rand() % nOSize; 331 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 332 | } 333 | count++; 334 | } 335 | 336 | std::cout << std::endl; 337 | } 338 | 339 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 340 | static int startPos; 341 | 342 | static hash_set::iterator it; 343 | 344 | _max_update++; 345 | 346 | Tensor sqrt_eg2W = NewTensor(Shape1(_W.size(1)), d_zero); 347 | 348 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 349 | int index = *it; 350 | _eg2W[index] = _eg2W[index] + _gradW[index] * _gradW[index]; 351 | sqrt_eg2W = F(_eg2W[index] + adaEps); 352 | _W[index] = (_W[index] * sqrt_eg2W - _gradW[index] * adaAlpha) / (adaAlpha * regularizationWeight + sqrt_eg2W); 353 | _ftW[index] = sqrt_eg2W / (adaAlpha * regularizationWeight + sqrt_eg2W); 354 | } 355 | 356 | FreeSpace(&sqrt_eg2W); 357 | 358 | if (_bUseB) { 359 | _gradb = _gradb + _b * regularizationWeight; 360 | _eg2b = _eg2b + _gradb * _gradb; 361 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 362 | } 363 | 364 | clearGrad(); 365 | } 366 | 367 | void clearGrad() { 368 | static hash_set::iterator it; 369 | 370 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 371 | int index = *it; 372 | _gradW[index] = 0.0; 373 | } 374 | 375 | _indexers.clear(); 376 | if (_bUseB) 377 | _gradb = 0.0; 378 | } 379 | 380 | void updateSparseWeight(long long featId) { 381 | if (_last_update[featId] < _max_update) { 382 | int times = _max_update - _last_update[featId]; 383 | _W[featId] = _W[featId] * F(times * F(_ftW[featId])); 384 | _last_update[featId] = _max_update; 385 | } 386 | } 387 | 388 | void writeModel(LStream &outf) { 389 | SaveBinary(outf, _W); 390 | SaveBinary(outf, _b); 391 | SaveBinary(outf, _gradW); 392 | SaveBinary(outf, _gradb); 393 | SaveBinary(outf, _eg2W); 394 | SaveBinary(outf, _eg2b); 395 | SaveBinary(outf, _ftW); 396 | 397 | 398 | 
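// ---- Editorial note (not part of the original source) ----
// updateAdaGrad() and updateSparseWeight() above implement lazy ("just in
// time") regularization for sparse rows. Each update stores a per-element
// shrink factor in _ftW; when a feature that has been idle since update t0
// fires again at update t, the stored factor is applied (t - t0) times in
// one step:
//     _W[featId] *= _ftW[featId]^(t - t0)
// which the code evaluates as exp((t - t0) * log(_ftW[featId])). Rows of _W
// belonging to features that never fire are therefore never touched.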
WriteBinary(outf, _bUseB); 399 | WriteBinary(outf, _funcType); 400 | WriteBinary(outf, _max_update); 401 | WriteVector(outf, _last_update); 402 | } 403 | 404 | void loadModel(LStream &inf) { 405 | LoadBinary(inf, &_W, false); 406 | LoadBinary(inf, &_b, false); 407 | LoadBinary(inf, &_gradW, false); 408 | LoadBinary(inf, &_gradb, false); 409 | LoadBinary(inf, &_eg2W, false); 410 | LoadBinary(inf, &_eg2b, false); 411 | LoadBinary(inf, &_ftW, false); 412 | 413 | ReadBinary(inf, _bUseB); 414 | ReadBinary(inf, _funcType); 415 | ReadBinary(inf, _max_update); 416 | ReadVector(inf, _last_update); 417 | } 418 | 419 | 420 | }; 421 | 422 | #endif /* SRC_SparseUniLayer_H_ */ 423 | -------------------------------------------------------------------------------- /SparseUniLayer1O.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SparseUniLayer1O.h 3 | * 4 | * Created on: Oct 22, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SPARSEUNILAYER1O_H_ 9 | #define SPARSEUNILAYER1O_H_ 10 | 11 | #include "tensor.h" 12 | #include "Utiltensor.h" 13 | #include "MyLib.h" 14 | 15 | using namespace mshadow; 16 | using namespace mshadow::expr; 17 | using namespace mshadow::utils; 18 | 19 | // Weight updating process implemented without theory support, 20 | // but recently find an EMNLP 2015 paper "An Empirical Analysis of Optimization for Max-Margin NLP" 21 | // In all my papers that use adagrad for sparse features, I use it for parameter updating. 22 | 23 | template 24 | class SparseUniLayer1O { 25 | 26 | public: 27 | 28 | hash_set _indexers; 29 | 30 | Tensor _W; 31 | 32 | Tensor _gradW; 33 | 34 | Tensor _eg2W; 35 | 36 | Tensor _ftW; 37 | 38 | 39 | int _max_update; 40 | NRVec _last_update; 41 | 42 | 43 | public: 44 | 45 | SparseUniLayer1O() { 46 | _indexers.clear(); 47 | } 48 | 49 | inline void initial(int nISize, int seed = 0) { 50 | dtype bound = sqrt(6.0 / (nISize + 1)); 51 | //dtype bound = 0.01; 52 | 53 | _W = NewTensor(Shape1(nISize), d_zero); 54 | _gradW = NewTensor(Shape1(nISize), d_zero); 55 | _eg2W = NewTensor(Shape1(nISize), d_zero); 56 | _ftW = NewTensor(Shape1(nISize), d_one); 57 | 58 | 59 | random(_W, -1.0 * bound, 1.0 * bound, seed); 60 | 61 | _max_update = 0; 62 | _last_update.resize(nISize); 63 | _last_update = 0; 64 | } 65 | 66 | inline void initial(Tensor W) { 67 | static int nOSize, nISize; 68 | nISize = W.size(0); 69 | 70 | _W = NewTensor(Shape1(nISize), d_zero); 71 | _gradW = NewTensor(Shape1(nISize), d_zero); 72 | _eg2W = NewTensor(Shape1(nISize), d_zero); 73 | _ftW = NewTensor(Shape1(nISize), d_one); 74 | Copy(_W, W); 75 | 76 | 77 | _max_update = 0; 78 | _last_update.resize(nISize); 79 | _last_update = 0; 80 | } 81 | 82 | inline void release() { 83 | FreeSpace(&_W); 84 | FreeSpace(&_gradW); 85 | FreeSpace(&_eg2W); 86 | FreeSpace(&_ftW); 87 | _indexers.clear(); 88 | } 89 | 90 | virtual ~SparseUniLayer1O() { 91 | // TODO Auto-generated destructor stub 92 | } 93 | 94 | inline dtype squarenormAll() { 95 | dtype result = squarenorm(_gradW); 96 | 97 | return result; 98 | } 99 | 100 | inline void scaleGrad(dtype scale) { 101 | _gradW = _gradW * scale; 102 | } 103 | 104 | public: 105 | void ComputeForwardScore(const std::vector& x, dtype& y) { 106 | static long long featNum, featId; 107 | featNum = x.size(); 108 | y = 0.0; 109 | for (int idx = 0; idx < featNum; idx++) { 110 | featId = x[idx]; 111 | if(featId >= _W.size(0))continue; 112 | updateSparseWeight(featId); 113 | y += _W[featId]; 114 | } 115 | 116 | } 117 | 118 | // loss is stopped at this layer, 
since the input is one-hold alike 119 | void ComputeBackwardLoss(const std::vector& x, dtype ly) { 120 | //_gradW 121 | static long long featNum, featId; 122 | featNum = x.size(); 123 | for (int idx = 0; idx < featNum; idx++) { 124 | featId = x[idx]; 125 | if(featId >= _W.size(0))continue; 126 | _indexers.insert(featId); 127 | _gradW[featId] += ly; 128 | } 129 | } 130 | 131 | 132 | void randomprint(int num) { 133 | static int nISize; 134 | nISize = _W.size(0); 135 | 136 | int count = 0; 137 | while (count < num) { 138 | int idx = rand() % nISize; 139 | std::cout << "_W[" << idx << "]=" << _W[idx] << " "; 140 | count++; 141 | } 142 | 143 | std::cout << std::endl; 144 | } 145 | 146 | void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 147 | static int startPos; 148 | 149 | static hash_set::iterator it; 150 | 151 | _max_update++; 152 | 153 | dtype sqrt_eg2W = d_zero; 154 | 155 | 156 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 157 | int index = *it; 158 | _eg2W[index] = _eg2W[index] + _gradW[index] * _gradW[index]; 159 | sqrt_eg2W = sqrt(_eg2W[index] + adaEps); 160 | _W[index] = (_W[index] * sqrt_eg2W - _gradW[index] * adaAlpha) / (adaAlpha * regularizationWeight + sqrt_eg2W); 161 | _ftW[index] = sqrt_eg2W / (adaAlpha * regularizationWeight + sqrt_eg2W); 162 | } 163 | 164 | 165 | //for (it = _indexers.begin(); it != _indexers.end(); ++it) { 166 | // int index = *it; 167 | // _W[index] = _W[index] - _gradW[index]; 168 | //} 169 | 170 | clearGrad(); 171 | } 172 | 173 | void clearGrad() { 174 | static hash_set::iterator it; 175 | for (it = _indexers.begin(); it != _indexers.end(); ++it) { 176 | int index = *it; 177 | _gradW[index] = 0.0; 178 | } 179 | _indexers.clear(); 180 | 181 | } 182 | 183 | void updateSparseWeight(long long featId) { 184 | 185 | if (_last_update[featId] < _max_update) { 186 | int times = _max_update - _last_update[featId]; 187 | _W[featId] = _W[featId] * exp(times * log(_ftW[featId])); 188 | _last_update[featId] = _max_update; 189 | } 190 | } 191 | 192 | 193 | void writeModel(LStream &outf) { 194 | 195 | SaveBinary(outf, _W); 196 | SaveBinary(outf, _gradW); 197 | SaveBinary(outf, _eg2W); 198 | SaveBinary(outf, _ftW); 199 | 200 | WriteBinary(outf, _max_update); 201 | WriteVector(outf, _last_update); 202 | } 203 | 204 | void loadModel(LStream &inf) { 205 | LoadBinary(inf, &_W, false); 206 | LoadBinary(inf, &_gradW, false); 207 | LoadBinary(inf, &_eg2W, false); 208 | LoadBinary(inf, &_ftW, false); 209 | 210 | ReadBinary(inf, _max_update); 211 | ReadVector(inf, _last_update); 212 | } 213 | 214 | }; 215 | 216 | 217 | 218 | #endif /* SPARSEUNILAYER1O_H_ */ 219 | -------------------------------------------------------------------------------- /TensorLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TensorLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_TensorLayer_H_ 9 | #define SRC_TensorLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class TensorLayer { 20 | 21 | public: 22 | 23 | Tensor _W; 24 | Tensor _V; 25 | Tensor _b; 26 | 27 | Tensor _gradW; 28 | Tensor _gradV; 29 | Tensor _gradb; 30 | 31 | Tensor _eg2W; 32 | Tensor _eg2V; 33 | Tensor _eg2b; 34 | 35 | int _mode; // 1: x1 W x2; 2: x1 W x2 + V x2; 3: x1 W x2 + V x2 + b 36 | 37 | int _funcType; // 0: tanh, 1: sigmod, 
2: f(x)=x, 3: exp 38 | 39 | public: 40 | TensorLayer() { 41 | } 42 | 43 | inline void initial(int nOSize, int nISize, int mode = 1, int seed = 0, int funcType = 0) { 44 | dtype bound = sqrt(6.0 / (nOSize + nISize + 1)); 45 | //dtype bound = 0.01; 46 | 47 | _W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 48 | _gradW = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 49 | _eg2W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 50 | 51 | _V = NewTensor(Shape2(nOSize, nOSize), d_zero); 52 | _gradV = NewTensor(Shape2(nOSize, nOSize), d_zero); 53 | _eg2V = NewTensor(Shape2(nOSize, nOSize), d_zero); 54 | 55 | _b = NewTensor(Shape2(1, nOSize), d_zero); 56 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 57 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 58 | 59 | random(_W, -1.0 * bound, 1.0 * bound, seed); 60 | random(_V, -1.0 * bound, 1.0 * bound, seed + 1); 61 | random(_b, -1.0 * bound, 1.0 * bound, seed + 2); 62 | 63 | _mode = mode; 64 | _funcType = funcType; 65 | } 66 | 67 | inline void initial(Tensor W, Tensor V, Tensor b, int mode = 1, int funcType = 0) { 68 | static int nOSize, nISize; 69 | nOSize = W.size(0); 70 | nISize = W.size(1); 71 | 72 | _W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 73 | _gradW = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 74 | _eg2W = NewTensor(Shape3(nOSize, nISize, nOSize), d_zero); 75 | Copy(_W, W); 76 | 77 | _V = NewTensor(Shape2(nOSize, nOSize), d_zero); 78 | _gradV = NewTensor(Shape2(nOSize, nOSize), d_zero); 79 | _eg2V = NewTensor(Shape2(nOSize, nOSize), d_zero); 80 | if (mode >= 2) 81 | Copy(_V, V); 82 | 83 | _b = NewTensor(Shape2(1, nOSize), d_zero); 84 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 85 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 86 | 87 | if (mode >= 3) 88 | Copy(_b, b); 89 | 90 | _mode = mode; 91 | _funcType = funcType; 92 | } 93 | 94 | inline void release() { 95 | FreeSpace(&_W); 96 | FreeSpace(&_gradW); 97 | FreeSpace(&_eg2W); 98 | FreeSpace(&_V); 99 | FreeSpace(&_gradV); 100 | FreeSpace(&_eg2V); 101 | FreeSpace(&_b); 102 | FreeSpace(&_gradb); 103 | FreeSpace(&_eg2b); 104 | } 105 | 106 | virtual ~TensorLayer() { 107 | // TODO Auto-generated destructor stub 108 | } 109 | 110 | inline dtype squarenormAll() { 111 | dtype result = squarenorm(_gradW); 112 | 113 | if (_mode >= 2) { 114 | result += squarenorm(_gradV); 115 | } 116 | 117 | if (_mode >= 3) { 118 | result += squarenorm(_gradb); 119 | } 120 | 121 | return result; 122 | } 123 | 124 | inline void scaleGrad(dtype scale) { 125 | _gradW = _gradW * scale; 126 | if (_mode >= 2) { 127 | _gradV = _gradV * scale; 128 | } 129 | if (_mode >= 3) { 130 | _gradb = _gradb * scale; 131 | } 132 | } 133 | 134 | public: 135 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor y) { 136 | Tensor midresult1 = NewTensor(Shape2(1, y.size(1)), d_zero); 137 | Tensor midresult2 = NewTensor(Shape2(1, 1), d_zero); 138 | for (int idy = 0; idy < y.size(1); idy++) { 139 | midresult1 = dot(x1, _W[idy]); 140 | midresult2 = dot(midresult1, x2.T()); 141 | y[0][idy] = midresult2[0][0]; 142 | } 143 | 144 | if (_mode >= 2) { 145 | midresult1 = dot(x2, _V.T()); 146 | y += midresult1; 147 | } 148 | 149 | if (_mode >= 3) 150 | y = y + _b; 151 | if (_funcType == 0) 152 | y = F(y); 153 | else if (_funcType == 1) 154 | y = F(y); 155 | else if (_funcType == 3) 156 | y = F(y); 157 | 158 | FreeSpace(&midresult1); 159 | FreeSpace(&midresult2); 160 | } 161 | 162 | 163 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor y) { 164 | int seq_size = y.size(0); 165 | Tensor 
midresult1 = NewTensor(Shape2(1, y.size(2)), d_zero); 166 | Tensor midresult2 = NewTensor(Shape2(1, 1), d_zero); 167 | for(int id = 0; id < seq_size; id++){ 168 | for (int idy = 0; idy < y.size(2); idy++) { 169 | midresult1 = dot(x1[id], _W[idy]); 170 | midresult2 = dot(midresult1, x2[id].T()); 171 | y[id][0][idy] = midresult2[0][0]; 172 | } 173 | 174 | if (_mode >= 2) { 175 | midresult1 = dot(x2[id], _V.T()); 176 | y[id] += midresult1; 177 | } 178 | 179 | if (_mode >= 3) 180 | y[id] = y[id] + _b; 181 | if (_funcType == 0) 182 | y[id] = F(y[id]); 183 | else if (_funcType == 1) 184 | y[id] = F(y[id]); 185 | else if (_funcType == 3) 186 | y[id] = F(y[id]); 187 | } 188 | 189 | FreeSpace(&midresult1); 190 | FreeSpace(&midresult2); 191 | } 192 | 193 | inline void ComputeForwardScore(const std::vector > &x1, const std::vector > &x2, 194 | std::vector > &y) { 195 | int seq_size = y.size(); 196 | assert(seq_size > 0); 197 | Tensor midresult1 = NewTensor(Shape2(1, y[0].size(1)), d_zero); 198 | Tensor midresult2 = NewTensor(Shape2(1, 1), d_zero); 199 | for(int id = 0; id < seq_size; id++){ 200 | for (int idy = 0; idy < y.size(2); idy++) { 201 | midresult1 = dot(x1[id], _W[idy]); 202 | midresult2 = dot(midresult1, x2[id].T()); 203 | y[id][0][idy] = midresult2[0][0]; 204 | } 205 | 206 | if (_mode >= 2) { 207 | midresult1 = dot(x2[id], _V.T()); 208 | y[id] += midresult1; 209 | } 210 | 211 | if (_mode >= 3) 212 | y[id] = y[id] + _b; 213 | if (_funcType == 0) 214 | y[id] = F(y[id]); 215 | else if (_funcType == 1) 216 | y[id] = F(y[id]); 217 | else if (_funcType == 3) 218 | y[id] = F(y[id]); 219 | } 220 | 221 | FreeSpace(&midresult1); 222 | FreeSpace(&midresult2); 223 | } 224 | 225 | //please allocate the memory outside here 226 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor y, Tensor ly, 227 | Tensor lx1, Tensor lx2, bool bclear = false) { 228 | //_gradW 229 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 230 | AllocSpace(&deri_yx); 231 | AllocSpace(&cly); 232 | 233 | if(bclear) { 234 | lx1 = 0.0; 235 | lx2 = 0.0; 236 | } 237 | if (_funcType == 0) { 238 | deri_yx = F(y); 239 | cly = ly * deri_yx; 240 | } else if (_funcType == 1) { 241 | deri_yx = F(y); 242 | cly = ly * deri_yx; 243 | } else if (_funcType == 3) { 244 | cly = ly * y; 245 | } else { 246 | //cly = ly; 247 | Copy(cly, ly); 248 | } 249 | 250 | Tensor midresult1 = NewTensor(Shape2(1, y.size(1)), d_zero); 251 | Tensor midresult2 = NewTensor(Shape2(1, y.size(1)), d_zero); 252 | //_gradW 253 | for (int idy = 0; idy < y.size(1); idy++) { 254 | midresult1 = dot(x1, _W[idy]); 255 | lx2 += cly[0][idy] * midresult1; 256 | midresult2 = cly[0][idy] * x2; 257 | _gradW[idy] += dot(x1.T(), midresult2); 258 | lx1 += dot(midresult2, _W[idy].T()); 259 | } 260 | 261 | //_gradV 262 | if (_mode >= 2) { 263 | _gradV += dot(cly.T(), x2); 264 | //lx 265 | lx2 += dot(cly, _V); 266 | } 267 | 268 | //_gradb 269 | if (_mode >= 3) 270 | _gradb += cly; 271 | 272 | FreeSpace(&deri_yx); 273 | FreeSpace(&cly); 274 | FreeSpace(&midresult1); 275 | FreeSpace(&midresult2); 276 | } 277 | 278 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor y, Tensor ly, 279 | Tensor lx1, Tensor lx2, bool bclear = false) { 280 | int seq_size = y.size(0); 281 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 282 | assert(y_dim1 == 1); 283 | //_gradW 284 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 285 | Tensor midresult1 = NewTensor(Shape2(y_dim1, y_dim2), d_zero); 286 | Tensor midresult2 = 
NewTensor(Shape2(y_dim1, y_dim2), d_zero); 287 | AllocSpace(&deri_yx); 288 | AllocSpace(&cly); 289 | 290 | if(bclear) { 291 | lx1 = 0.0; 292 | lx2 = 0.0; 293 | } 294 | for (int id = 0; id < seq_size; id++) { 295 | if (_funcType == 0) { 296 | deri_yx = F(y[id]); 297 | cly = ly[id] * deri_yx; 298 | } else if (_funcType == 1) { 299 | deri_yx = F(y); 300 | cly = ly[id] * deri_yx; 301 | } else if (_funcType == 3) { 302 | cly = ly[id] * y[id]; 303 | } else { 304 | //cly = ly; 305 | Copy(cly, ly[id]); 306 | } 307 | 308 | //_gradW 309 | for (int idy = 0; idy < y.size(2); idy++) { 310 | midresult1 = dot(x1[id], _W[idy]); 311 | lx2[id] += cly[0][idy] * midresult1; 312 | midresult2 = cly[0][idy] * x2[id]; 313 | _gradW[idy] += dot(x1[id].T(), midresult2); 314 | lx1[id] += dot(midresult2, _W[idy].T()); 315 | } 316 | 317 | //_gradV 318 | if (_mode >= 2) { 319 | _gradV += dot(cly.T(), x2[id]); 320 | //lx 321 | lx2[id] += dot(cly, _V); 322 | } 323 | 324 | //_gradb 325 | if (_mode >= 3) 326 | _gradb += cly; 327 | } 328 | 329 | FreeSpace(&deri_yx); 330 | FreeSpace(&cly); 331 | FreeSpace(&midresult1); 332 | FreeSpace(&midresult2); 333 | } 334 | 335 | inline void ComputeBackwardLoss(const std::vector > &x1, const std::vector > &x2, 336 | const std::vector > &y, const std::vector > &ly, 337 | std::vector > &lx1, std::vector > &lx2, bool bclear = false) { 338 | int seq_size = y.size(); 339 | assert(seq_size > 0); 340 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 341 | assert(y_dim1 == 1); 342 | //_gradW 343 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 344 | Tensor midresult1 = NewTensor(Shape2(y_dim1, y_dim2), d_zero); 345 | Tensor midresult2 = NewTensor(Shape2(y_dim1, y_dim2), d_zero); 346 | AllocSpace(&deri_yx); 347 | AllocSpace(&cly); 348 | 349 | if(bclear) { 350 | for (int id = 0; id < seq_size; id++) { 351 | lx1[id] = 0.0; 352 | lx2[id] = 0.0; 353 | } 354 | } 355 | for (int id = 0; id < seq_size; id++) { 356 | if (_funcType == 0) { 357 | deri_yx = F(y[id]); 358 | cly = ly[id] * deri_yx; 359 | } else if (_funcType == 1) { 360 | deri_yx = F(y); 361 | cly = ly[id] * deri_yx; 362 | } else if (_funcType == 3) { 363 | cly = ly[id] * y[id]; 364 | } else { 365 | //cly = ly; 366 | Copy(cly, ly[id]); 367 | } 368 | 369 | //_gradW 370 | for (int idy = 0; idy < y.size(2); idy++) { 371 | midresult1 = dot(x1[id], _W[idy]); 372 | lx2[id] += cly[0][idy] * midresult1; 373 | midresult2 = cly[0][idy] * x2[id]; 374 | _gradW[idy] += dot(x1[id].T(), midresult2); 375 | lx1[id] += dot(midresult2, _W[idy].T()); 376 | } 377 | 378 | //_gradV 379 | if (_mode >= 2) { 380 | _gradV += dot(cly.T(), x2[id]); 381 | //lx 382 | lx2[id] += dot(cly, _V); 383 | } 384 | 385 | //_gradb 386 | if (_mode >= 3) 387 | _gradb += cly; 388 | } 389 | 390 | FreeSpace(&deri_yx); 391 | FreeSpace(&cly); 392 | FreeSpace(&midresult1); 393 | FreeSpace(&midresult2); 394 | } 395 | 396 | inline void randomprint(int num) { 397 | static int nOSize, nISize; 398 | nOSize = _W.size(0); 399 | nISize = _W.size(1); 400 | int count = 0; 401 | while (count < num) { 402 | int idx = rand() % nOSize; 403 | int idy = rand() % nISize; 404 | int idz = rand() % nOSize; 405 | std::cout << "_W[" << idx << "," << idy << "," << idz << "]=" << _W[idx][idy][idz] << " "; 406 | 407 | if (_mode >= 2) { 408 | int idy = rand() % nOSize; 409 | int idz = rand() % nOSize; 410 | std::cout << "_V[" << idy << "," << idz << "]=" << _V[idy][idz] << " "; 411 | } 412 | 413 | if (_mode >= 3) { 414 | int idz = rand() % nOSize; 415 | std::cout << "_b[0][" << idz << "]=" << 
_b[0][idz] << " "; 416 | } 417 | count++; 418 | } 419 | 420 | std::cout << std::endl; 421 | } 422 | 423 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 424 | _gradW = _gradW + _W * regularizationWeight; 425 | _eg2W = _eg2W + _gradW * _gradW; 426 | _W = _W - _gradW * adaAlpha / F(_eg2W + adaEps); 427 | 428 | if (_mode >= 2) { 429 | _gradV = _gradV + _V * regularizationWeight; 430 | _eg2V = _eg2V + _gradV * _gradV; 431 | _V = _V - _gradV * adaAlpha / F(_eg2V + adaEps); 432 | } 433 | 434 | if (_mode >= 3) { 435 | _gradb = _gradb + _b * regularizationWeight; 436 | _eg2b = _eg2b + _gradb * _gradb; 437 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 438 | } 439 | 440 | clearGrad(); 441 | } 442 | 443 | inline void clearGrad() { 444 | _gradW = 0; 445 | if (_mode >= 2) 446 | _gradV = 0; 447 | if (_mode >= 3) 448 | _gradb = 0; 449 | } 450 | 451 | void writeModel(LStream &outf) { 452 | SaveBinary(outf, _W); 453 | SaveBinary(outf, _V); 454 | SaveBinary(outf, _b); 455 | 456 | SaveBinary(outf, _gradW); 457 | SaveBinary(outf, _gradV); 458 | SaveBinary(outf, _gradb); 459 | 460 | SaveBinary(outf, _eg2W); 461 | SaveBinary(outf, _eg2V); 462 | SaveBinary(outf, _eg2b); 463 | 464 | WriteBinary(outf, _mode); 465 | WriteBinary(outf, _funcType); 466 | } 467 | 468 | void loadModel(LStream &inf) { 469 | LoadBinary(inf, &_W, false); 470 | LoadBinary(inf, &_V, false); 471 | LoadBinary(inf, &_b, false); 472 | 473 | LoadBinary(inf, &_gradW, false); 474 | LoadBinary(inf, &_gradV, false); 475 | LoadBinary(inf, &_gradb, false); 476 | 477 | LoadBinary(inf, &_eg2W, false); 478 | LoadBinary(inf, &_eg2V, false); 479 | LoadBinary(inf, &_eg2b, false); 480 | 481 | ReadBinary(inf, _mode); 482 | ReadBinary(inf, _funcType); 483 | } 484 | 485 | }; 486 | 487 | #endif /* SRC_TensorLayer_H_ */ 488 | -------------------------------------------------------------------------------- /TriLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TriLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_TriLayer_H_ 9 | #define SRC_TriLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class TriLayer { 20 | 21 | public: 22 | 23 | Tensor _W1; 24 | Tensor _W2; 25 | Tensor _W3; 26 | Tensor _b; 27 | 28 | Tensor _gradW1; 29 | Tensor _gradW2; 30 | Tensor _gradW3; 31 | Tensor _gradb; 32 | 33 | Tensor _eg2W1; 34 | Tensor _eg2W2; 35 | Tensor _eg2W3; 36 | Tensor _eg2b; 37 | 38 | bool _bUseB; 39 | 40 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 41 | 42 | public: 43 | TriLayer() { 44 | } 45 | 46 | inline void initial(int nOSize, int nISize1, int nISize2, int nISize3, bool bUseB = true, int seed = 0, int funcType = 0) { 47 | dtype bound = sqrt(6.0 / (nOSize + nISize1 + nISize2 + nISize3 + 1)); 48 | //dtype bound = 0.01; 49 | 50 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 51 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 52 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 53 | 54 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 55 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 56 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 57 | 58 | _W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 59 | _gradW3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 60 | _eg2W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 61 | 62 | _b = 
NewTensor(Shape2(1, nOSize), d_zero); 63 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 64 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 65 | 66 | random(_W1, -1.0 * bound, 1.0 * bound, seed); 67 | random(_W2, -1.0 * bound, 1.0 * bound, seed+1); 68 | random(_W3, -1.0 * bound, 1.0 * bound, seed+2); 69 | random(_b, -1.0 * bound, 1.0 * bound, seed+3); 70 | 71 | _bUseB = bUseB; 72 | _funcType = funcType; 73 | } 74 | 75 | inline void initial(Tensor W1, Tensor W2, Tensor W3, Tensor b, bool bUseB = true, 76 | int funcType = 0) { 77 | static int nOSize, nISize1, nISize2, nISize3; 78 | nOSize = W1.size(0); 79 | nISize1 = W1.size(1); 80 | nISize2 = W2.size(1); 81 | nISize3 = W3.size(1); 82 | 83 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 84 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 85 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 86 | Copy(_W1, W1); 87 | 88 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 89 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 90 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 91 | Copy(_W2, W2); 92 | 93 | _W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 94 | _gradW3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 95 | _eg2W3 = NewTensor(Shape2(nOSize, nISize3), d_zero); 96 | Copy(_W3, W3); 97 | 98 | _b = NewTensor(Shape2(1, nOSize), d_zero); 99 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 100 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 101 | 102 | if (bUseB) 103 | Copy(_b, b); 104 | 105 | _bUseB = bUseB; 106 | _funcType = funcType; 107 | } 108 | 109 | inline void release() { 110 | FreeSpace(&_W1); 111 | FreeSpace(&_gradW1); 112 | FreeSpace(&_eg2W1); 113 | FreeSpace(&_W2); 114 | FreeSpace(&_gradW2); 115 | FreeSpace(&_eg2W2); 116 | FreeSpace(&_W3); 117 | FreeSpace(&_gradW3); 118 | FreeSpace(&_eg2W3); 119 | FreeSpace(&_b); 120 | FreeSpace(&_gradb); 121 | FreeSpace(&_eg2b); 122 | } 123 | 124 | virtual ~TriLayer() { 125 | // TODO Auto-generated destructor stub 126 | } 127 | 128 | inline dtype squarenormAll() { 129 | dtype result = squarenorm(_gradW1); 130 | result += squarenorm(_gradW2); 131 | result += squarenorm(_gradW3); 132 | if (_bUseB) { 133 | result += squarenorm(_gradb); 134 | } 135 | 136 | return result; 137 | } 138 | 139 | inline void scaleGrad(dtype scale) { 140 | _gradW1 = _gradW1 * scale; 141 | _gradW2 = _gradW2 * scale; 142 | _gradW3 = _gradW3 * scale; 143 | if (_bUseB) { 144 | _gradb = _gradb * scale; 145 | } 146 | } 147 | 148 | public: 149 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 150 | y = dot(x1, _W1.T()); 151 | y += dot(x2, _W2.T()); 152 | y += dot(x3, _W3.T()); 153 | if (_bUseB) 154 | y = y + _b; 155 | if (_funcType == 0) 156 | y = F(y); 157 | else if (_funcType == 1) 158 | y = F(y); 159 | else if (_funcType == 3) 160 | y = F(y); 161 | } 162 | 163 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 164 | int seq_size = y.size(0); 165 | 166 | for (int id = 0; id < seq_size; id++) { 167 | y[id] = dot(x1[id], _W1.T()); 168 | y[id] += dot(x2[id], _W2.T()); 169 | y[id] += dot(x3[id], _W3.T()); 170 | if (_bUseB) 171 | y[id] = y[id] + _b; 172 | if (_funcType == 0) 173 | y[id] = F(y[id]); 174 | else if (_funcType == 1) 175 | y[id] = F(y[id]); 176 | else if (_funcType == 3) 177 | y[id] = F(y[id]); 178 | } 179 | } 180 | 181 | inline void ComputeForwardScore(const std::vector > &x1, const std::vector > &x2, 182 | const std::vector > &x3, std::vector > &y) { 183 | int seq_size = y.size(); 184 | 185 | for (int id = 0; id < seq_size; id++) { 186 | 
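// ---- Editorial note (illustrative sketch, not part of the original source) ----
// TriLayer's forward pass above is a three-input affine layer,
//     y = f( x1 * W1^T + x2 * W2^T + x3 * W3^T + b ),
// with f selected by _funcType (0: tanh, 1: sigmoid, 2: identity, 3: exp).
// A minimal single-example usage sketch; `layer`, `x1`, `x2`, `x3`, `y` and
// the sizes are assumptions made only for illustration:
#if 0
  TriLayer<cpu> layer;
  layer.initial(/*nOSize=*/20, /*nISize1=*/10, /*nISize2=*/10, /*nISize3=*/5);
  Tensor<cpu, 2, dtype> x1 = NewTensor<cpu>(Shape2(1, 10), d_zero);
  Tensor<cpu, 2, dtype> x2 = NewTensor<cpu>(Shape2(1, 10), d_zero);
  Tensor<cpu, 2, dtype> x3 = NewTensor<cpu>(Shape2(1, 5), d_zero);
  Tensor<cpu, 2, dtype> y  = NewTensor<cpu>(Shape2(1, 20), d_zero);
  layer.ComputeForwardScore(x1, x2, x3, y);       // y = tanh(x1 W1^T + x2 W2^T + x3 W3^T + b)
  FreeSpace(&x1); FreeSpace(&x2); FreeSpace(&x3); FreeSpace(&y);
#endif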
y[id] = dot(x1[id], _W1.T()); 187 | y[id] += dot(x2[id], _W2.T()); 188 | y[id] += dot(x3[id], _W3.T()); 189 | if (_bUseB) 190 | y[id] = y[id] + _b; 191 | if (_funcType == 0) 192 | y[id] = F(y[id]); 193 | else if (_funcType == 1) 194 | y[id] = F(y[id]); 195 | else if (_funcType == 3) 196 | y[id] = F(y[id]); 197 | } 198 | } 199 | 200 | //please allocate the memory outside here 201 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 202 | Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 203 | //_gradW 204 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 205 | AllocSpace(&deri_yx); 206 | AllocSpace(&cly); 207 | 208 | if(bclear) { 209 | lx1 = 0.0; 210 | lx2 = 0.0; 211 | lx3 = 0.0; 212 | } 213 | if (_funcType == 0) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 1) { 217 | deri_yx = F(y); 218 | cly = ly * deri_yx; 219 | } else if (_funcType == 3) { 220 | cly = ly * y; 221 | } else { 222 | //cly = ly; 223 | Copy(cly, ly); 224 | } 225 | //_gradW 226 | _gradW1 += dot(cly.T(), x1); 227 | _gradW2 += dot(cly.T(), x2); 228 | _gradW3 += dot(cly.T(), x3); 229 | 230 | //_gradb 231 | if (_bUseB) 232 | _gradb += cly; 233 | 234 | //lx 235 | lx1 += dot(cly, _W1); 236 | lx2 += dot(cly, _W2); 237 | lx3 += dot(cly, _W3); 238 | 239 | FreeSpace(&deri_yx); 240 | FreeSpace(&cly); 241 | } 242 | 243 | 244 | //please allocate the memory outside here 245 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 246 | Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 247 | int seq_size = y.size(0); 248 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 249 | //_gradW 250 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 251 | AllocSpace(&deri_yx); 252 | AllocSpace(&cly); 253 | if(bclear) { 254 | lx1 = 0.0; 255 | lx2 = 0.0; 256 | lx3 = 0.0; 257 | } 258 | for (int id = 0; id < seq_size; id++) { 259 | if (_funcType == 0) { 260 | deri_yx = F(y[id]); 261 | cly = ly[id] * deri_yx; 262 | } else if (_funcType == 1) { 263 | deri_yx = F(y[id]); 264 | cly = ly[id] * deri_yx; 265 | } else if (_funcType == 3) { 266 | cly = ly[id] * y[id]; 267 | } else { 268 | //cly = ly; 269 | Copy(cly, ly[id]); 270 | } 271 | //_gradW 272 | _gradW1 += dot(cly.T(), x1[id]); 273 | _gradW2 += dot(cly.T(), x2[id]); 274 | _gradW3 += dot(cly.T(), x3[id]); 275 | 276 | //_gradb 277 | if (_bUseB) 278 | _gradb += cly; 279 | 280 | //lx 281 | lx1[id] += dot(cly, _W1); 282 | lx2[id] += dot(cly, _W2); 283 | lx3[id] += dot(cly, _W3); 284 | } 285 | 286 | FreeSpace(&deri_yx); 287 | FreeSpace(&cly); 288 | } 289 | 290 | 291 | //please allocate the memory outside here 292 | inline void ComputeBackwardLoss(const std::vector > &x1, const std::vector > &x2, 293 | const std::vector > &x3, const std::vector > &y, 294 | const std::vector > &ly, std::vector > &lx1, 295 | std::vector > &lx2, std::vector > &lx3, bool bclear = false) { 296 | int seq_size = y.size(); 297 | assert(seq_size > 0); 298 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 299 | //_gradW 300 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 301 | AllocSpace(&deri_yx); 302 | AllocSpace(&cly); 303 | if(bclear) { 304 | for (int id = 0; id < seq_size; id++) { 305 | lx1[id] = 0.0; 306 | lx2[id] = 0.0; 307 | lx3[id] = 0.0; 308 | } 309 | } 310 | for (int id = 0; id < seq_size; id++) { 311 | if (_funcType == 0) { 312 | deri_yx = F(y[id]); 313 | cly = ly[id] * deri_yx; 314 | } else if (_funcType == 1) { 315 | deri_yx = F(y[id]); 
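// ---- Editorial note (not part of the original source) ----
// The backward passes here use the usual trick of computing the activation
// derivative from the output y instead of the pre-activation (the concrete
// functor names inside F<...> were lost in extraction):
//     tanh:     dy/dz = 1 - y^2
//     sigmoid:  dy/dz = y * (1 - y)
//     exp:      dy/dz = y          (hence cly = ly * y with no extra functor)
// After cly is formed, each input contributes _gradWk += cly^T * xk and
// receives lxk += cly * Wk, and the bias accumulates _gradb += cly.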
316 | cly = ly[id] * deri_yx; 317 | } else if (_funcType == 3) { 318 | cly = ly[id] * y[id]; 319 | } else { 320 | //cly = ly; 321 | Copy(cly, ly[id]); 322 | } 323 | //_gradW 324 | _gradW1 += dot(cly.T(), x1[id]); 325 | _gradW2 += dot(cly.T(), x2[id]); 326 | _gradW3 += dot(cly.T(), x3[id]); 327 | 328 | //_gradb 329 | if (_bUseB) 330 | _gradb += cly; 331 | 332 | //lx 333 | lx1[id] += dot(cly, _W1); 334 | lx2[id] += dot(cly, _W2); 335 | lx3[id] += dot(cly, _W3); 336 | } 337 | 338 | FreeSpace(&deri_yx); 339 | FreeSpace(&cly); 340 | } 341 | 342 | inline void randomprint(int num) { 343 | static int nOSize, nISize1, nISize2, nISize3; 344 | nOSize = _W1.size(0); 345 | nISize1 = _W1.size(1); 346 | nISize2 = _W2.size(1); 347 | nISize3 = _W3.size(1); 348 | int count = 0; 349 | while (count < num) { 350 | int idx1 = rand() % nOSize; 351 | int idy1 = rand() % nISize1; 352 | int idx2 = rand() % nOSize; 353 | int idy2 = rand() % nISize2; 354 | int idx3 = rand() % nOSize; 355 | int idy3 = rand() % nISize3; 356 | 357 | std::cout << "_W1[" << idx1 << "," << idy1 << "]=" << _W1[idx1][idy1] << " "; 358 | std::cout << "_W2[" << idx2 << "," << idy2 << "]=" << _W2[idx2][idy2] << " "; 359 | std::cout << "_W3[" << idx3 << "," << idy3 << "]=" << _W3[idx3][idy3] << " "; 360 | 361 | if (_bUseB) { 362 | int idz = rand() % nOSize; 363 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 364 | } 365 | count++; 366 | } 367 | 368 | std::cout << std::endl; 369 | } 370 | 371 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 372 | _gradW1 = _gradW1 + _W1 * regularizationWeight; 373 | _eg2W1 = _eg2W1 + _gradW1 * _gradW1; 374 | _W1 = _W1 - _gradW1 * adaAlpha / F(_eg2W1 + adaEps); 375 | 376 | _gradW2 = _gradW2 + _W2 * regularizationWeight; 377 | _eg2W2 = _eg2W2 + _gradW2 * _gradW2; 378 | _W2 = _W2 - _gradW2 * adaAlpha / F(_eg2W2 + adaEps); 379 | 380 | _gradW3 = _gradW3 + _W3 * regularizationWeight; 381 | _eg2W3 = _eg2W3 + _gradW3 * _gradW3; 382 | _W3 = _W3 - _gradW3 * adaAlpha / F(_eg2W3 + adaEps); 383 | 384 | if (_bUseB) { 385 | _gradb = _gradb + _b * regularizationWeight; 386 | _eg2b = _eg2b + _gradb * _gradb; 387 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 388 | } 389 | 390 | clearGrad(); 391 | } 392 | 393 | inline void clearGrad() { 394 | _gradW1 = 0; 395 | _gradW2 = 0; 396 | _gradW3 = 0; 397 | if (_bUseB) 398 | _gradb = 0; 399 | } 400 | 401 | void writeModel(LStream &outf) { 402 | SaveBinary(outf, _W1); 403 | SaveBinary(outf, _W2); 404 | SaveBinary(outf, _W3); 405 | SaveBinary(outf, _b); 406 | 407 | SaveBinary(outf, _gradW1); 408 | SaveBinary(outf, _gradW2); 409 | SaveBinary(outf, _gradW3); 410 | SaveBinary(outf, _gradb); 411 | 412 | SaveBinary(outf, _eg2W1); 413 | SaveBinary(outf, _eg2W2); 414 | SaveBinary(outf, _eg2W3); 415 | SaveBinary(outf, _eg2b); 416 | 417 | WriteBinary(outf, _bUseB); 418 | WriteBinary(outf, _funcType); 419 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 420 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 421 | 422 | 423 | } 424 | 425 | void loadModel(LStream &inf) { 426 | LoadBinary(inf, &_W1, false); 427 | LoadBinary(inf, &_W2, false); 428 | LoadBinary(inf, &_W3, false); 429 | LoadBinary(inf, &_b, false); 430 | 431 | LoadBinary(inf, &_gradW1, false); 432 | LoadBinary(inf, &_gradW2, false); 433 | LoadBinary(inf, &_gradW3, false); 434 | LoadBinary(inf, &_gradb, false); 435 | 436 | LoadBinary(inf, &_eg2W1, false); 437 | LoadBinary(inf, &_eg2W2, false); 438 | LoadBinary(inf, 
&_eg2W3, false); 439 | LoadBinary(inf, &_eg2b, false); 440 | 441 | ReadBinary(inf, _bUseB); 442 | ReadBinary(inf, _funcType); 443 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 444 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 445 | } 446 | 447 | }; 448 | 449 | #endif /* SRC_TriLayer_H_ */ 450 | -------------------------------------------------------------------------------- /TriLayerLSTM.h: -------------------------------------------------------------------------------- 1 | /* 2 | * TriLayerLSTM.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_TriLayerLSTM_H_ 9 | #define SRC_TriLayerLSTM_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class TriLayerLSTM { 20 | 21 | public: 22 | 23 | Tensor _W1; 24 | Tensor _W2; 25 | Tensor _W3; 26 | Tensor _b; 27 | 28 | Tensor _gradW1; 29 | Tensor _gradW2; 30 | Tensor _gradW3; 31 | Tensor _gradb; 32 | 33 | Tensor _eg2W1; 34 | Tensor _eg2W2; 35 | Tensor _eg2W3; 36 | Tensor _eg2b; 37 | 38 | bool _bUseB; 39 | 40 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 41 | 42 | public: 43 | TriLayerLSTM() { 44 | } 45 | 46 | inline void initial(int nOSize, int nISize1, int nISize2, bool bUseB = true, int seed = 0, int funcType = 0) { 47 | dtype bound = sqrt(6.0 / (nOSize + nISize1 + nISize2 + 1)); 48 | //dtype bound = 0.01; 49 | 50 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 51 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 52 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 53 | 54 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 55 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 56 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 57 | 58 | _W3 = NewTensor(Shape2(1, nOSize), d_zero); 59 | _gradW3 = NewTensor(Shape2(1, nOSize), d_zero); 60 | _eg2W3 = NewTensor(Shape2(1, nOSize), d_zero); 61 | 62 | _b = NewTensor(Shape2(1, nOSize), d_zero); 63 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 64 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 65 | 66 | random(_W1, -1.0 * bound, 1.0 * bound, seed); 67 | random(_W2, -1.0 * bound, 1.0 * bound, seed+1); 68 | random(_W3, -1.0 * bound, 1.0 * bound, seed+2); 69 | random(_b, -1.0 * bound, 1.0 * bound, seed+3); 70 | 71 | _bUseB = bUseB; 72 | _funcType = funcType; 73 | } 74 | 75 | inline void initial(Tensor W1, Tensor W2, Tensor W3, Tensor b, bool bUseB = true, 76 | int funcType = 0) { 77 | static int nOSize, nISize1, nISize2; 78 | nOSize = W1.size(0); 79 | nISize1 = W1.size(1); 80 | nISize2 = W2.size(1); 81 | 82 | 83 | _W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 84 | _gradW1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 85 | _eg2W1 = NewTensor(Shape2(nOSize, nISize1), d_zero); 86 | Copy(_W1, W1); 87 | 88 | _W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 89 | _gradW2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 90 | _eg2W2 = NewTensor(Shape2(nOSize, nISize2), d_zero); 91 | Copy(_W2, W2); 92 | 93 | _W3 = NewTensor(Shape2(1, nOSize), d_zero); 94 | _gradW3 = NewTensor(Shape2(1, nOSize), d_zero); 95 | _eg2W3 = NewTensor(Shape2(1, nOSize), d_zero); 96 | Copy(_W3, W3); 97 | 98 | _b = NewTensor(Shape2(1, nOSize), d_zero); 99 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 100 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 101 | 102 | if (bUseB) 103 | Copy(_b, b); 104 | 105 | _bUseB = bUseB; 106 | 
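// ---- Editorial note (not part of the original source) ----
// Unlike TriLayer, the third input here is combined element-wise: _W3 is a
// 1 x nOSize vector, and the forward pass below uses  y += x3 * _W3  rather
// than a matrix product. This looks like a peephole-style coupling, intended
// for the case where x3 (e.g. an LSTM cell state) already has the same width
// as the output; that reading is an editorial interpretation, not a claim
// from the original author.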
_funcType = funcType; 107 | } 108 | 109 | inline void release() { 110 | FreeSpace(&_W1); 111 | FreeSpace(&_gradW1); 112 | FreeSpace(&_eg2W1); 113 | FreeSpace(&_W2); 114 | FreeSpace(&_gradW2); 115 | FreeSpace(&_eg2W2); 116 | FreeSpace(&_W3); 117 | FreeSpace(&_gradW3); 118 | FreeSpace(&_eg2W3); 119 | FreeSpace(&_b); 120 | FreeSpace(&_gradb); 121 | FreeSpace(&_eg2b); 122 | } 123 | 124 | virtual ~TriLayerLSTM() { 125 | // TODO Auto-generated destructor stub 126 | } 127 | 128 | inline dtype squarenormAll() { 129 | dtype result = squarenorm(_gradW1); 130 | result += squarenorm(_gradW2); 131 | result += squarenorm(_gradW3); 132 | if (_bUseB) { 133 | result += squarenorm(_gradb); 134 | } 135 | 136 | return result; 137 | } 138 | 139 | inline void scaleGrad(dtype scale) { 140 | _gradW1 = _gradW1 * scale; 141 | _gradW2 = _gradW2 * scale; 142 | _gradW3 = _gradW3 * scale; 143 | if (_bUseB) { 144 | _gradb = _gradb * scale; 145 | } 146 | } 147 | 148 | public: 149 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 150 | y = dot(x1, _W1.T()); 151 | y += dot(x2, _W2.T()); 152 | y += x3 * _W3; 153 | if (_bUseB) 154 | y = y + _b; 155 | if (_funcType == 0) 156 | y = F(y); 157 | else if (_funcType == 1) 158 | y = F(y); 159 | else if (_funcType == 3) 160 | y = F(y); 161 | } 162 | 163 | inline void ComputeForwardScore(Tensor x1, Tensor x2, Tensor x3, Tensor y) { 164 | int seq_size = y.size(0); 165 | 166 | for (int id = 0; id < seq_size; id++) { 167 | y[id] = dot(x1[id], _W1.T()); 168 | y[id] += dot(x2[id], _W2.T()); 169 | y[id] += x3[id] * _W3; 170 | if (_bUseB) 171 | y[id] = y[id] + _b; 172 | if (_funcType == 0) 173 | y[id] = F(y[id]); 174 | else if (_funcType == 1) 175 | y[id] = F(y[id]); 176 | else if (_funcType == 3) 177 | y[id] = F(y[id]); 178 | } 179 | } 180 | 181 | inline void ComputeForwardScore(const std::vector > &x1, const std::vector > &x2, 182 | const std::vector > &x3, std::vector > &y) { 183 | int seq_size = y.size(); 184 | 185 | for (int id = 0; id < seq_size; id++) { 186 | y[id] = dot(x1[id], _W1.T()); 187 | y[id] += dot(x2[id], _W2.T()); 188 | y[id] += x3[id] * _W3; 189 | if (_bUseB) 190 | y[id] = y[id] + _b; 191 | if (_funcType == 0) 192 | y[id] = F(y[id]); 193 | else if (_funcType == 1) 194 | y[id] = F(y[id]); 195 | else if (_funcType == 3) 196 | y[id] = F(y[id]); 197 | } 198 | } 199 | 200 | //please allocate the memory outside here 201 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 202 | Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 203 | //_gradW 204 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 205 | AllocSpace(&deri_yx); 206 | AllocSpace(&cly); 207 | 208 | if(bclear) { 209 | lx1 = 0.0; 210 | lx2 = 0.0; 211 | lx3 = 0.0; 212 | } 213 | if (_funcType == 0) { 214 | deri_yx = F(y); 215 | cly = ly * deri_yx; 216 | } else if (_funcType == 1) { 217 | deri_yx = F(y); 218 | cly = ly * deri_yx; 219 | } else if (_funcType == 3) { 220 | cly = ly * y; 221 | } else { 222 | //cly = ly; 223 | Copy(cly, ly); 224 | } 225 | //_gradW 226 | _gradW1 += dot(cly.T(), x1); 227 | _gradW2 += dot(cly.T(), x2); 228 | _gradW3 += cly * x3; 229 | 230 | //_gradb 231 | if (_bUseB) 232 | _gradb += cly; 233 | 234 | //lx 235 | lx1 += dot(cly, _W1); 236 | lx2 += dot(cly, _W2); 237 | lx3 += cly * _W3; 238 | 239 | FreeSpace(&deri_yx); 240 | FreeSpace(&cly); 241 | } 242 | 243 | 244 | //please allocate the memory outside here 245 | inline void ComputeBackwardLoss(Tensor x1, Tensor x2, Tensor x3, Tensor y, 246 | 
Tensor ly, Tensor lx1, Tensor lx2, Tensor lx3, bool bclear = false) { 247 | int seq_size = y.size(0); 248 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 249 | //_gradW 250 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 251 | AllocSpace(&deri_yx); 252 | AllocSpace(&cly); 253 | if(bclear) { 254 | lx1 = 0.0; 255 | lx2 = 0.0; 256 | lx3 = 0.0; 257 | } 258 | for (int id = 0; id < seq_size; id++) { 259 | if (_funcType == 0) { 260 | deri_yx = F(y[id]); 261 | cly = ly[id] * deri_yx; 262 | } else if (_funcType == 1) { 263 | deri_yx = F(y[id]); 264 | cly = ly[id] * deri_yx; 265 | } else if (_funcType == 3) { 266 | cly = ly[id] * y[id]; 267 | } else { 268 | //cly = ly; 269 | Copy(cly, ly[id]); 270 | } 271 | //_gradW 272 | _gradW1 += dot(cly.T(), x1[id]); 273 | _gradW2 += dot(cly.T(), x2[id]); 274 | _gradW3 += cly * x3[id]; 275 | 276 | //_gradb 277 | if (_bUseB) 278 | _gradb += cly; 279 | 280 | //lx 281 | lx1[id] += dot(cly, _W1); 282 | lx2[id] += dot(cly, _W2); 283 | lx3[id] += cly * _W3; 284 | } 285 | 286 | FreeSpace(&deri_yx); 287 | FreeSpace(&cly); 288 | } 289 | 290 | 291 | //please allocate the memory outside here 292 | inline void ComputeBackwardLoss(const std::vector > &x1, const std::vector > &x2, 293 | const std::vector > &x3, const std::vector > &y, 294 | const std::vector > &ly, std::vector > &lx1, 295 | std::vector > &lx2, std::vector > &lx3, bool bclear = false) { 296 | int seq_size = y.size(); 297 | assert(seq_size > 0); 298 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 299 | //_gradW 300 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 301 | AllocSpace(&deri_yx); 302 | AllocSpace(&cly); 303 | if(bclear) { 304 | for (int id = 0; id < seq_size; id++) { 305 | lx1[id] = 0.0; 306 | lx2[id] = 0.0; 307 | lx3[id] = 0.0; 308 | } 309 | } 310 | for (int id = 0; id < seq_size; id++) { 311 | if (_funcType == 0) { 312 | deri_yx = F(y[id]); 313 | cly = ly[id] * deri_yx; 314 | } else if (_funcType == 1) { 315 | deri_yx = F(y[id]); 316 | cly = ly[id] * deri_yx; 317 | } else if (_funcType == 3) { 318 | cly = ly[id] * y[id]; 319 | } else { 320 | //cly = ly; 321 | Copy(cly, ly[id]); 322 | } 323 | //_gradW 324 | _gradW1 += dot(cly.T(), x1[id]); 325 | _gradW2 += dot(cly.T(), x2[id]); 326 | _gradW3 += cly * x3[id]; 327 | 328 | //_gradb 329 | if (_bUseB) 330 | _gradb += cly; 331 | 332 | //lx 333 | lx1[id] += dot(cly, _W1); 334 | lx2[id] += dot(cly, _W2); 335 | lx3[id] += cly * _W3; 336 | } 337 | 338 | FreeSpace(&deri_yx); 339 | FreeSpace(&cly); 340 | } 341 | 342 | inline void randomprint(int num) { 343 | static int nOSize, nISize1, nISize2, nISize3; 344 | nOSize = _W1.size(0); 345 | nISize1 = _W1.size(1); 346 | nISize2 = _W2.size(1); 347 | nISize3 = _W3.size(1); 348 | int count = 0; 349 | while (count < num) { 350 | int idx1 = rand() % nOSize; 351 | int idy1 = rand() % nISize1; 352 | int idx2 = rand() % nOSize; 353 | int idy2 = rand() % nISize2; 354 | int idx3 = rand() % nOSize; 355 | int idy3 = rand() % nISize3; 356 | 357 | std::cout << "_W1[" << idx1 << "," << idy1 << "]=" << _W1[idx1][idy1] << " "; 358 | std::cout << "_W2[" << idx2 << "," << idy2 << "]=" << _W2[idx2][idy2] << " "; 359 | std::cout << "_W3[" << idx3 << "," << idy3 << "]=" << _W3[idx3][idy3] << " "; 360 | 361 | if (_bUseB) { 362 | int idz = rand() % nOSize; 363 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 364 | } 365 | count++; 366 | } 367 | 368 | std::cout << std::endl; 369 | } 370 | 371 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 372 
| _gradW1 = _gradW1 + _W1 * regularizationWeight; 373 | _eg2W1 = _eg2W1 + _gradW1 * _gradW1; 374 | _W1 = _W1 - _gradW1 * adaAlpha / F(_eg2W1 + adaEps); 375 | 376 | _gradW2 = _gradW2 + _W2 * regularizationWeight; 377 | _eg2W2 = _eg2W2 + _gradW2 * _gradW2; 378 | _W2 = _W2 - _gradW2 * adaAlpha / F(_eg2W2 + adaEps); 379 | 380 | _gradW3 = _gradW3 + _W3 * regularizationWeight; 381 | _eg2W3 = _eg2W3 + _gradW3 * _gradW3; 382 | _W3 = _W3 - _gradW3 * adaAlpha / F(_eg2W3 + adaEps); 383 | 384 | if (_bUseB) { 385 | _gradb = _gradb + _b * regularizationWeight; 386 | _eg2b = _eg2b + _gradb * _gradb; 387 | _b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 388 | } 389 | 390 | clearGrad(); 391 | } 392 | 393 | inline void clearGrad() { 394 | _gradW1 = 0; 395 | _gradW2 = 0; 396 | _gradW3 = 0; 397 | if (_bUseB) 398 | _gradb = 0; 399 | } 400 | 401 | void writeModel(LStream &outf) { 402 | SaveBinary(outf, _W1); 403 | SaveBinary(outf, _W2); 404 | SaveBinary(outf, _W3); 405 | SaveBinary(outf, _b); 406 | 407 | SaveBinary(outf, _gradW1); 408 | SaveBinary(outf, _gradW2); 409 | SaveBinary(outf, _gradW3); 410 | SaveBinary(outf, _gradb); 411 | 412 | SaveBinary(outf, _eg2W1); 413 | SaveBinary(outf, _eg2W2); 414 | SaveBinary(outf, _eg2W3); 415 | SaveBinary(outf, _eg2b); 416 | 417 | WriteBinary(outf, _bUseB); 418 | WriteBinary(outf, _funcType); 419 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 420 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 421 | 422 | 423 | } 424 | 425 | void loadModel(LStream &inf) { 426 | LoadBinary(inf, &_W1, false); 427 | LoadBinary(inf, &_W2, false); 428 | LoadBinary(inf, &_W3, false); 429 | LoadBinary(inf, &_b, false); 430 | 431 | LoadBinary(inf, &_gradW1, false); 432 | LoadBinary(inf, &_gradW2, false); 433 | LoadBinary(inf, &_gradW3, false); 434 | LoadBinary(inf, &_gradb, false); 435 | 436 | LoadBinary(inf, &_eg2W1, false); 437 | LoadBinary(inf, &_eg2W2, false); 438 | LoadBinary(inf, &_eg2W3, false); 439 | LoadBinary(inf, &_eg2b, false); 440 | 441 | ReadBinary(inf, _bUseB); 442 | ReadBinary(inf, _funcType); 443 | // cout << "TrilayerLSTM " << _bUseB << _funcType << endl; 444 | // cout << "TrilayerLSTM value: " << _W3.size(0) << " and " << _W3.size(1) << " value " << _W3[0][1] << endl; 445 | } 446 | 447 | }; 448 | 449 | #endif /* SRC_TriLayerLSTM_H_ */ 450 | -------------------------------------------------------------------------------- /UniLayer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * UniLayer.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | 8 | #ifndef SRC_UniLayer_H_ 9 | #define SRC_UniLayer_H_ 10 | #include "tensor.h" 11 | #include "MyLib.h" 12 | #include "Utiltensor.h" 13 | 14 | using namespace mshadow; 15 | using namespace mshadow::expr; 16 | using namespace mshadow::utils; 17 | 18 | template 19 | class UniLayer { 20 | 21 | public: 22 | 23 | Tensor _W; 24 | Tensor _b; 25 | 26 | Tensor _gradW; 27 | Tensor _gradb; 28 | 29 | Tensor _eg2W; 30 | Tensor _eg2b; 31 | 32 | bool _bUseB; 33 | 34 | int _funcType; // 0: tanh, 1: sigmod, 2: f(x)=x, 3: exp 35 | 36 | public: 37 | UniLayer() { 38 | } 39 | 40 | inline void initial(int nOSize, int nISize, bool bUseB = true, int seed = 0, int funcType = 0) { 41 | dtype bound = sqrt(6.0 / (nOSize + nISize + 1)); 42 | //dtype bound = 0.01; 43 | 44 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 45 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 46 | _eg2W = NewTensor(Shape2(nOSize, nISize), 
d_zero); 47 | 48 | _b = NewTensor(Shape2(1, nOSize), d_zero); 49 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 50 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 51 | 52 | random(_W, -1.0 * bound, 1.0 * bound, seed); 53 | random(_b, -1.0 * bound, 1.0 * bound, seed + 1); 54 | 55 | _bUseB = bUseB; 56 | _funcType = funcType; 57 | } 58 | 59 | inline void initial(Tensor W, Tensor b, bool bUseB = true, int funcType = 0) { 60 | static int nOSize, nISize; 61 | nOSize = W.size(0); 62 | nISize = W.size(1); 63 | 64 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 65 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 66 | _eg2W = NewTensor(Shape2(nOSize, nISize), d_zero); 67 | Copy(_W, W); 68 | 69 | _b = NewTensor(Shape2(1, nOSize), d_zero); 70 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 71 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 72 | 73 | if (bUseB) 74 | Copy(_b, b); 75 | 76 | _bUseB = bUseB; 77 | _funcType = funcType; 78 | } 79 | 80 | inline void initial(Tensor W, int funcType = 0) { 81 | static int nOSize, nISize; 82 | nOSize = W.size(0); 83 | nISize = W.size(1); 84 | 85 | _W = NewTensor(Shape2(nOSize, nISize), d_zero); 86 | _gradW = NewTensor(Shape2(nOSize, nISize), d_zero); 87 | _eg2W = NewTensor(Shape2(nOSize, nISize), d_zero); 88 | Copy(_W, W); 89 | 90 | _b = NewTensor(Shape2(1, nOSize), d_zero); 91 | _gradb = NewTensor(Shape2(1, nOSize), d_zero); 92 | _eg2b = NewTensor(Shape2(1, nOSize), d_zero); 93 | 94 | 95 | _bUseB = false; 96 | _funcType = funcType; 97 | } 98 | inline void release() { 99 | FreeSpace(&_W); 100 | FreeSpace(&_gradW); 101 | FreeSpace(&_eg2W); 102 | FreeSpace(&_b); 103 | FreeSpace(&_gradb); 104 | FreeSpace(&_eg2b); 105 | } 106 | 107 | virtual ~UniLayer() { 108 | // TODO Auto-generated destructor stub 109 | } 110 | 111 | inline dtype squarenormAll() { 112 | dtype result = squarenorm(_gradW); 113 | 114 | if (_bUseB) { 115 | result += squarenorm(_gradb); 116 | } 117 | 118 | return result; 119 | } 120 | 121 | inline void scaleGrad(dtype scale) { 122 | _gradW = _gradW * scale; 123 | if (_bUseB) { 124 | _gradb = _gradb * scale; 125 | } 126 | } 127 | 128 | public: 129 | inline void ComputeForwardScore(Tensor x, Tensor y) { 130 | y = dot(x, _W.T()); 131 | if (_bUseB) 132 | y = y + _b; 133 | if (_funcType == 0) 134 | y = F(y); 135 | else if (_funcType == 1) 136 | y = F(y); 137 | else if (_funcType == 3) 138 | y = F(y); 139 | } 140 | 141 | inline void ComputeForwardScore(Tensor x, Tensor y) { 142 | int seq_size = y.size(0); 143 | for (int id = 0; id < seq_size; id++) { 144 | y[id] = dot(x[id], _W.T()); 145 | if (_bUseB) 146 | y[id] = y[id] + _b; 147 | if (_funcType == 0) 148 | y[id] = F(y[id]); 149 | else if (_funcType == 1) 150 | y[id] = F(y[id]); 151 | else if (_funcType == 3) 152 | y[id] = F(y[id]); 153 | } 154 | } 155 | 156 | inline void ComputeForwardScore(const std::vector > &x, std::vector > &y) { 157 | int seq_size = y.size(); 158 | for (int id = 0; id < seq_size; id++) { 159 | y[id] = dot(x[id], _W.T()); 160 | if (_bUseB) 161 | y[id] = y[id] + _b; 162 | if (_funcType == 0) 163 | y[id] = F(y[id]); 164 | else if (_funcType == 1) 165 | y[id] = F(y[id]); 166 | else if (_funcType == 3) 167 | y[id] = F(y[id]); 168 | } 169 | } 170 | 171 | //please allocate the memory outside here 172 | inline void ComputeBackwardLoss(Tensor x, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 173 | //_gradW 174 | Tensor deri_yx(Shape2(y.size(0), y.size(1))), cly(Shape2(y.size(0), y.size(1))); 175 | AllocSpace(&deri_yx); 176 | AllocSpace(&cly); 177 | 178 | if (bclear) 179 | lx = 
0.0; 180 | if (_funcType == 0) { 181 | deri_yx = F(y); 182 | cly = ly * deri_yx; 183 | } else if (_funcType == 1) { 184 | deri_yx = F(y); 185 | cly = ly * deri_yx; 186 | } else if (_funcType == 3) { 187 | cly = ly * y; 188 | } else { 189 | //cly = ly; 190 | Copy(cly, ly); 191 | } 192 | //_gradW 193 | _gradW += dot(cly.T(), x); 194 | 195 | //_gradb 196 | if (_bUseB) 197 | _gradb += cly; 198 | 199 | //lx 200 | lx += dot(cly, _W); 201 | 202 | FreeSpace(&deri_yx); 203 | FreeSpace(&cly); 204 | } 205 | 206 | //please allocate the memory outside here 207 | inline void ComputeBackwardLoss(Tensor x, Tensor y, Tensor ly, Tensor lx, bool bclear = false) { 208 | //_gradW 209 | int seq_size = y.size(0); 210 | int y_dim1 = y.size(1), y_dim2 = y.size(2); 211 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 212 | AllocSpace(&deri_yx); 213 | AllocSpace(&cly); 214 | 215 | if (bclear) 216 | lx = 0.0; 217 | for (int id = 0; id < seq_size; id++) { 218 | if (_funcType == 0) { 219 | deri_yx = F(y[id]); 220 | cly = ly[id] * deri_yx; 221 | } else if (_funcType == 1) { 222 | deri_yx = F(y[id]); 223 | cly = ly[id] * deri_yx; 224 | } else if (_funcType == 3) { 225 | cly = ly[id] * y[id]; 226 | } else { 227 | //cly = ly; 228 | Copy(cly, ly[id]); 229 | } 230 | //_gradW 231 | _gradW += dot(cly.T(), x[id]); 232 | 233 | //_gradb 234 | if (_bUseB) 235 | _gradb += cly; 236 | 237 | //lx 238 | lx[id] += dot(cly, _W); 239 | } 240 | 241 | FreeSpace(&deri_yx); 242 | FreeSpace(&cly); 243 | } 244 | 245 | //please allocate the memory outside here 246 | inline void ComputeBackwardLoss(const std::vector > &x, const std::vector > &y, 247 | const std::vector > &ly, std::vector > &lx, bool bclear = false) { 248 | //_gradW 249 | int seq_size = y.size(); 250 | assert(seq_size > 0); 251 | int y_dim1 = y[0].size(0), y_dim2 = y[0].size(1); 252 | Tensor deri_yx(Shape2(y_dim1, y_dim2)), cly(Shape2(y_dim1, y_dim2)); 253 | AllocSpace(&deri_yx); 254 | AllocSpace(&cly); 255 | 256 | if(bclear) { 257 | for (int id = 0; id < seq_size; id++) { 258 | lx[id] = 0.0; 259 | } 260 | } 261 | for (int id = 0; id < seq_size; id++) { 262 | if (_funcType == 0) { 263 | deri_yx = F(y[id]); 264 | cly = ly[id] * deri_yx; 265 | } else if (_funcType == 1) { 266 | deri_yx = F(y[id]); 267 | cly = ly[id] * deri_yx; 268 | } else if (_funcType == 3) { 269 | cly = ly[id] * y[id]; 270 | } else { 271 | //cly = ly; 272 | Copy(cly, ly[id]); 273 | } 274 | //_gradW 275 | _gradW += dot(cly.T(), x[id]); 276 | 277 | //_gradb 278 | if (_bUseB) 279 | _gradb += cly; 280 | 281 | //lx 282 | lx[id] += dot(cly, _W); 283 | } 284 | 285 | FreeSpace(&deri_yx); 286 | FreeSpace(&cly); 287 | } 288 | 289 | inline void randomprint(int num) { 290 | static int nOSize, nISize; 291 | nOSize = _W.size(0); 292 | nISize = _W.size(1); 293 | int count = 0; 294 | while (count < num) { 295 | int idx = rand() % nOSize; 296 | int idy = rand() % nISize; 297 | 298 | std::cout << "_W[" << idx << "," << idy << "]=" << _W[idx][idy] << " "; 299 | 300 | if (_bUseB) { 301 | int idz = rand() % nOSize; 302 | std::cout << "_b[0][" << idz << "]=" << _b[0][idz] << " "; 303 | } 304 | count++; 305 | } 306 | 307 | std::cout << std::endl; 308 | } 309 | 310 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 311 | _gradW = _gradW + _W * regularizationWeight; 312 | _eg2W = _eg2W + _gradW * _gradW; 313 | _W = _W - _gradW * adaAlpha / F(_eg2W + adaEps); 314 | 315 | if (_bUseB) { 316 | _gradb = _gradb + _b * regularizationWeight; 317 | _eg2b = _eg2b + _gradb * _gradb; 318 | 
_b = _b - _gradb * adaAlpha / F(_eg2b + adaEps); 319 | } 320 | 321 | clearGrad(); 322 | } 323 | 324 | inline void clearGrad() { 325 | _gradW = 0; 326 | if (_bUseB) 327 | _gradb = 0; 328 | } 329 | 330 | void writeModel(LStream &outf) { 331 | SaveBinary(outf, _W); 332 | SaveBinary(outf, _b); 333 | SaveBinary(outf, _gradW); 334 | SaveBinary(outf, _gradb); 335 | SaveBinary(outf, _eg2W); 336 | SaveBinary(outf, _eg2b); 337 | WriteBinary(outf, _bUseB); 338 | WriteBinary(outf, _funcType); 339 | // cout << "Unilayer " << _bUseB << _funcType << endl; 340 | 341 | } 342 | 343 | void loadModel(LStream &inf) { 344 | LoadBinary(inf, &_W, false); 345 | LoadBinary(inf, &_b, false); 346 | LoadBinary(inf, &_gradW, false); 347 | LoadBinary(inf, &_gradb, false); 348 | LoadBinary(inf, &_eg2W, false); 349 | LoadBinary(inf, &_eg2b, false); 350 | ReadBinary(inf, _bUseB); 351 | ReadBinary(inf, _funcType); 352 | // cout << "Unilayer " << _bUseB << _funcType << endl; 353 | } 354 | 355 | }; 356 | 357 | #endif /* SRC_UniLayer_H_ */ 358 | -------------------------------------------------------------------------------- /UniLayer1O.h: -------------------------------------------------------------------------------- 1 | /* 2 | * UniLayer1O.h 3 | * 4 | * Created on: Mar 18, 2015 5 | * Author: mszhang 6 | */ 7 | /* 8 | * use it only for output layer 9 | */ 10 | #ifndef SRC_UniLayer1O_H_ 11 | #define SRC_UniLayer1O_H_ 12 | #include "tensor.h" 13 | #include "MyLib.h" 14 | #include "Utiltensor.h" 15 | 16 | using namespace mshadow; 17 | using namespace mshadow::expr; 18 | using namespace mshadow::utils; 19 | 20 | template 21 | class UniLayer1O { 22 | 23 | public: 24 | 25 | Tensor _W; 26 | 27 | Tensor _gradW; 28 | 29 | Tensor _eg2W; 30 | 31 | public: 32 | UniLayer1O() { 33 | } 34 | 35 | inline void initial(int nISize, int seed = 0) { 36 | dtype bound = sqrt(6.0 / (1 + nISize + 1)); 37 | //dtype bound = 0.01; 38 | 39 | _W = NewTensor(Shape2(1, nISize), d_zero); 40 | _gradW = NewTensor(Shape2(1, nISize), d_zero); 41 | _eg2W = NewTensor(Shape2(1, nISize), d_zero); 42 | 43 | random(_W, -1.0 * bound, 1.0 * bound, seed); 44 | 45 | } 46 | 47 | inline void initial(Tensor W) { 48 | static int nISize; 49 | nISize = W.size(1); 50 | 51 | _W = NewTensor(Shape2(1, nISize), d_zero); 52 | _gradW = NewTensor(Shape2(1, nISize), d_zero); 53 | _eg2W = NewTensor(Shape2(1, nISize), d_zero); 54 | Copy(_W, W); 55 | 56 | } 57 | 58 | inline void release() { 59 | FreeSpace(&_W); 60 | FreeSpace(&_gradW); 61 | FreeSpace(&_eg2W); 62 | } 63 | 64 | virtual ~UniLayer1O() { 65 | // TODO Auto-generated destructor stub 66 | } 67 | 68 | inline dtype squarenormAll() { 69 | dtype result = squarenorm(_gradW); 70 | 71 | return result; 72 | } 73 | 74 | inline void scaleGrad(dtype scale) { 75 | _gradW = _gradW * scale; 76 | } 77 | 78 | public: 79 | inline void ComputeForwardScore(Tensor x, dtype& y) { 80 | static int nISize; 81 | nISize = _W.size(1); 82 | y = 0.0; 83 | for(int idx = 0; idx < nISize; idx++){ 84 | y += x[0][idx] * _W[0][idx]; 85 | } 86 | } 87 | 88 | 89 | //please allocate the memory outside here 90 | inline void ComputeBackwardLoss(Tensor x, dtype ly, Tensor lx, bool bclear = false) { 91 | //_gradW 92 | _gradW += ly * x; 93 | 94 | if (bclear) 95 | lx = 0.0; 96 | //lx 97 | lx += ly * _W; 98 | 99 | } 100 | 101 | 102 | inline void randomprint(int num) { 103 | static int nISize; 104 | nISize = _W.size(1); 105 | int count = 0; 106 | while (count < num) { 107 | int idy = rand() % nISize; 108 | std::cout << "_W[" << 0 << "," << idy << "]=" << _W[0][idy] << " "; 109 
| count++; 110 | } 111 | 112 | std::cout << std::endl; 113 | } 114 | 115 | inline void updateAdaGrad(dtype regularizationWeight, dtype adaAlpha, dtype adaEps) { 116 | _gradW = _gradW + _W * regularizationWeight; 117 | _eg2W = _eg2W + _gradW * _gradW; 118 | _W = _W - _gradW * adaAlpha / F(_eg2W + adaEps); 119 | 120 | 121 | clearGrad(); 122 | } 123 | 124 | inline void clearGrad() { 125 | _gradW = 0; 126 | } 127 | 128 | void writeModel(LStream &outf) { 129 | SaveBinary(outf, _W); 130 | SaveBinary(outf, _gradW); 131 | SaveBinary(outf, _eg2W); 132 | 133 | } 134 | 135 | void loadModel(LStream &inf) { 136 | LoadBinary(inf, &_W, false); 137 | LoadBinary(inf, &_gradW, false); 138 | LoadBinary(inf, &_eg2W, false); 139 | } 140 | }; 141 | 142 | #endif /* SRC_UniLayer1O_H_ */ 143 | -------------------------------------------------------------------------------- /Utils.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2014 by Contributors 3 | * \file Utils.h 4 | * \brief simple utils for error and checkings 5 | * \author Tianqi Chen 6 | */ 7 | #ifndef MSHADOW_UTILS_H_ 8 | #define MSHADOW_UTILS_H_ 9 | #define _CRT_SECURE_NO_WARNINGS 10 | #include 11 | #include 12 | #include 13 | #include 14 | namespace mshadow { 15 | /*! \brief namespace for helper utils of the project */ 16 | namespace utils { 17 | /*! \brief error message buffer length */ 18 | const int kPrintBuffer = 1 << 12; 19 | 20 | #ifndef MSHADOW_CUSTOMIZE_ASSERT_ 21 | /*! 22 | * \brief handling of Assert error, caused by in-apropriate input 23 | * \param msg error message 24 | */ 25 | inline void HandleAssertError(const char *msg) { 26 | fprintf(stderr, "AssertError:%s\n", msg); 27 | exit(-1); 28 | } 29 | /*! 30 | * \brief handling of Check error, caused by in-apropriate input 31 | * \param msg error message 32 | */ 33 | inline void HandleCheckError(const char *msg) { 34 | fprintf(stderr, "%s\n", msg); 35 | exit(-1); 36 | } 37 | #else 38 | // include declarations, some one must implement this 39 | void HandleAssertError(const char *msg); 40 | void HandleCheckError(const char *msg); 41 | void HandlePrint(const char *msg); 42 | #endif 43 | 44 | /*! \brief assert an condition is true, use this to handle debug information */ 45 | inline void Assert(bool exp, const char *fmt, ...) { 46 | if (!exp) { 47 | std::string msg(kPrintBuffer, '\0'); 48 | va_list args; 49 | va_start(args, fmt); 50 | vsnprintf(&msg[0], kPrintBuffer, fmt, args); 51 | va_end(args); 52 | HandleAssertError(msg.c_str()); 53 | } 54 | } 55 | 56 | /*!\brief same as assert, but this is intended to be used as message for user*/ 57 | inline void Check(bool exp, const char *fmt, ...) { 58 | if (!exp) { 59 | std::string msg(kPrintBuffer, '\0'); 60 | va_list args; 61 | va_start(args, fmt); 62 | vsnprintf(&msg[0], kPrintBuffer, fmt, args); 63 | va_end(args); 64 | HandleCheckError(msg.c_str()); 65 | } 66 | } 67 | 68 | /*! \brief report error message, same as check */ 69 | inline void Error(const char *fmt, ...) 
{ 70 | { 71 | std::string msg(kPrintBuffer, '\0'); 72 | va_list args; 73 | va_start(args, fmt); 74 | vsnprintf(&msg[0], kPrintBuffer, fmt, args); 75 | va_end(args); 76 | HandleCheckError(msg.c_str()); 77 | } 78 | } 79 | } // namespace utils 80 | } // namespace mshadow 81 | #endif // MSHADOW_UTILS_H_ 82 | -------------------------------------------------------------------------------- /Utiltensor.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILTENSOR 2 | #define UTILTENSOR 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | 7 | using namespace std; 8 | using namespace mshadow; 9 | using namespace mshadow::expr; 10 | using namespace mshadow::utils; 11 | using namespace nr; 12 | 13 | // define tanh operation 14 | struct nl_tanh { 15 | MSHADOW_XINLINE static dtype Map(dtype a) { 16 | // return a>0?a:0; 17 | return tanh(a); 18 | } 19 | }; 20 | struct nl_dtanh { 21 | MSHADOW_XINLINE static dtype Map(dtype a) { 22 | // return a>0?1:0; 23 | return (1.0 - a) * (1.0 + a); 24 | } 25 | }; 26 | struct nl_sigmoid { 27 | MSHADOW_XINLINE static dtype Map(dtype a) { 28 | // return a>0?a:0; 29 | return 1.0 / (1.0 + exp(-a)); 30 | } 31 | }; 32 | struct nl_dsigmoid { 33 | MSHADOW_XINLINE static dtype Map(dtype a) { 34 | // return a>0?1:0; 35 | return (1.0 - a) * a; 36 | } 37 | }; 38 | struct nl_relu { 39 | MSHADOW_XINLINE static dtype Map(dtype a) { 40 | return a > 0 ? a : 0; 41 | } 42 | }; 43 | struct nl_drelu { 44 | MSHADOW_XINLINE static dtype Map(dtype a) { 45 | return a > 0 ? 1 : 0; 46 | } 47 | }; 48 | struct nl_exp { 49 | MSHADOW_XINLINE static dtype Map(dtype a) { 50 | // return a>0?a:0; 51 | return exp(a); 52 | } 53 | }; 54 | struct nl_log { 55 | MSHADOW_XINLINE static dtype Map(dtype a) { 56 | // return a>0?a:0; 57 | return log(a); 58 | } 59 | }; 60 | struct xe_dx { 61 | MSHADOW_XINLINE static dtype Map(dtype a, dtype b) { 62 | return (b - a) / (a * (1.0 - a) + 1e-6); 63 | } 64 | }; 65 | struct xe_ll { 66 | MSHADOW_XINLINE static dtype Map(dtype a, dtype b) { 67 | return b > 0.5f ? log(a + 1e-10) : log(1.0 - a + 1e-10); 68 | } 69 | }; 70 | struct square { 71 | MSHADOW_XINLINE static dtype Map(dtype a) { 72 | return a * a; 73 | 74 | } 75 | }; 76 | struct clip { 77 | MSHADOW_XINLINE static dtype Map(dtype a) { 78 | return a > 10.0 ? 10.0 : (a < -10.0 ? 
-10.0 : a); 79 | 80 | } 81 | }; 82 | struct inv_sqrt { 83 | MSHADOW_XINLINE static dtype Map(dtype a, dtype b) { 84 | return a / (sqrt(b) + 0.0001); 85 | } 86 | }; 87 | 88 | struct nl_sqrt { 89 | MSHADOW_XINLINE static dtype Map(dtype a) { 90 | return sqrt(a); 91 | } 92 | }; 93 | 94 | struct dropout { 95 | // p: prob to dropout 96 | MSHADOW_XINLINE static dtype Map(dtype p, dtype r) { 97 | if (p > r) 98 | return 0.0; 99 | else 100 | return 1.0 / (1.0 - p); 101 | } 102 | }; 103 | 104 | // \sum x_{ijk}^2 105 | template 106 | inline dtype squarenorm(Tensor w) { 107 | dtype result = 0; 108 | for (int idx = 0; idx < w.size(0); idx++) { 109 | result += w[idx] * w[idx]; 110 | } 111 | return result; 112 | } 113 | 114 | template 115 | inline dtype squarenorm(Tensor w) { 116 | dtype result = 0; 117 | for (int idx = 0; idx < w.size(0); idx++) { 118 | for (int idy = 0; idy < w.size(1); idy++) { 119 | result += w[idx][idy] * w[idx][idy]; 120 | } 121 | } 122 | return result; 123 | } 124 | 125 | template 126 | inline dtype squarenorm(Tensor w) { 127 | dtype result = 0; 128 | for (int idx = 0; idx < w.size(0); idx++) { 129 | for (int idy = 0; idy < w.size(1); idy++) { 130 | for (int idz = 0; idz < w.size(2); idz++) { 131 | result += w[idx][idy][idz] * w[idx][idy][idz]; 132 | } 133 | } 134 | } 135 | return result; 136 | } 137 | 138 | template 139 | inline void assign(Tensor w, const NRVec& wnr) { 140 | int dim = wnr.size(); 141 | for (int idx = 0; idx < dim; idx++) { 142 | w[idx] = wnr[idx]; 143 | } 144 | } 145 | 146 | template 147 | inline void assign(Tensor w, const NRMat& wnr) { 148 | int dim1 = wnr.nrows(); 149 | int dim2 = wnr.ncols(); 150 | for (int idx = 0; idx < dim1; idx++) { 151 | for (int idy = 0; idy < dim2; idy++) { 152 | w[idx][idy] = wnr[idx][idy]; 153 | } 154 | } 155 | } 156 | 157 | template 158 | inline void assign(Tensor w, const NRMat3d& wnr) { 159 | int dim1 = wnr.dim1(); 160 | int dim2 = wnr.dim2(); 161 | int dim3 = wnr.dim3(); 162 | for (int idx = 0; idx < dim1; idx++) { 163 | for (int idy = 0; idy < dim2; idy++) { 164 | for (int idz = 0; idz < dim3; idz++) { 165 | w[idx][idy][idz] = wnr[idx][idy][idz]; 166 | } 167 | } 168 | } 169 | } 170 | 171 | template 172 | inline void assign(vector > &w, dtype value) { 173 | int dim = w.size(); 174 | for (int idx = 0; idx < dim; idx++) { 175 | w[idx] = value; 176 | } 177 | } 178 | 179 | template 180 | inline void assign(vector > &w, dtype value) { 181 | int dim = w.size(); 182 | for (int idx = 0; idx < dim; idx++) { 183 | w[idx] = value; 184 | } 185 | } 186 | 187 | template 188 | inline void assign(vector > &w, dtype value) { 189 | int dim = w.size(); 190 | for (int idx = 0; idx < dim; idx++) { 191 | w[idx] = value; 192 | } 193 | } 194 | 195 | template 196 | inline void norm2one(Tensor w, int idx) { 197 | dtype sum = 0.000001; 198 | for (int idy = 0; idy < w.size(1); idy++) { 199 | sum += w[idx][idy] * w[idx][idy]; 200 | } 201 | dtype scale = sqrt(sum); 202 | for (int idy = 0; idy < w.size(1); idy++) 203 | w[idx][idy] = w[idx][idy] / scale; 204 | } 205 | 206 | template 207 | inline void random(Tensor w, dtype min = 0.0, dtype max = 1.0, int seed = 0) { 208 | srand(seed); 209 | int dim = w.size(0); 210 | for (int idx = 0; idx < dim; idx++) { 211 | w[idx] = min + (max - min) * (1.0 * rand() / RAND_MAX); 212 | } 213 | } 214 | 215 | template 216 | inline void random(Tensor w, dtype min = 0.0, dtype max = 1.0, int seed = 0) { 217 | srand(seed); 218 | int dim1 = w.size(0); 219 | int dim2 = w.size(1); 220 | for (int idx = 0; idx < dim1; idx++) { 221 | 
for (int idy = 0; idy < dim2; idy++) { 222 | w[idx][idy] = min + (max - min) * (1.0 * rand() / RAND_MAX); 223 | } 224 | } 225 | } 226 | 227 | template 228 | inline void random(Tensor w, dtype min = 0.0, dtype max = 1.0, int seed = 0) { 229 | srand(seed); 230 | int dim1 = w.size(0); 231 | int dim2 = w.size(1); 232 | int dim3 = w.size(2); 233 | for (int idx = 0; idx < dim1; idx++) { 234 | for (int idy = 0; idy < dim2; idy++) { 235 | for (int idz = 0; idz < dim3; idz++) { 236 | w[idx][idy][idz] = min + (max - min) * (1.0 * rand() / RAND_MAX); 237 | } 238 | } 239 | } 240 | } 241 | 242 | /* 243 | template 244 | inline void tcopy(const Tensor& from, Tensor& to, bool bAllocated = true) { 245 | if (bAllocated) { 246 | if (to.size(0) != from.size(0) || to.size(1) != from.size(1) || to.size(2) != from.size(2)) { 247 | FreeSpace(&to); 248 | to = NewTensor(Shape3(from.size(0), from.size(1), from.size(2)), d_zero); 249 | } 250 | } else { 251 | to = NewTensor(Shape3(from.size(0), from.size(1), from.size(2)), d_zero); 252 | } 253 | 254 | Copy(to, from); 255 | } 256 | 257 | template 258 | inline void tcopy(const Tensor& from, Tensor& to, bool bAllocated = true) { 259 | if (bAllocated) { 260 | if (to.size(0) != from.size(0) || to.size(1) != from.size(1)) { 261 | FreeSpace(&to); 262 | to = NewTensor(Shape2(from.size(0), from.size(1)), d_zero); 263 | } 264 | } else { 265 | to = NewTensor(Shape2(from.size(0), from.size(1)), d_zero); 266 | } 267 | Copy(to, from); 268 | } 269 | 270 | template 271 | inline void tcopy(const Tensor&from, Tensor& to, bool bAllocated = true) { 272 | if (bAllocated) { 273 | if (to.size(0) != from.size(0)) { 274 | FreeSpace(&to); 275 | to = NewTensor(Shape1(from.size(0)), d_zero); 276 | } 277 | } else { 278 | to = NewTensor(Shape1(from.size(0)), d_zero); 279 | } 280 | Copy(to, from); 281 | } 282 | */ 283 | #endif 284 | -------------------------------------------------------------------------------- /Windowlized.h: -------------------------------------------------------------------------------- 1 | #ifndef WINDOWLIZED 2 | #define WINDOWLIZED 3 | 4 | #include "tensor.h" 5 | #include "MyLib.h" 6 | 7 | 8 | using namespace std; 9 | using namespace mshadow; 10 | using namespace mshadow::expr; 11 | using namespace mshadow::utils; 12 | 13 | 14 | template 15 | inline void windowlized(const vector > &wi, vector > &wo, int context) 16 | { 17 | int seqsize = wo.size(); 18 | if (wi.size() != seqsize || seqsize == 0 || context < 0) { 19 | std::cerr << "windowlized error: vector size or context size invalid" << std::endl; 20 | } 21 | 22 | int dim1 = wi[0].size(0), dim2 = wi[0].size(1); 23 | int odim1 = wo[0].size(0), odim2 = wo[0].size(1); 24 | int computeddim2 = (2 * context + 1) * dim2; 25 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 26 | std::cerr << "windowlized error: dim size invalid" << std::endl; 27 | } 28 | 29 | static int offset; 30 | for (int idx = 0; idx < seqsize; idx++) { 31 | wo[idx] = 0.0; 32 | offset = 0; 33 | for (int idp = idx - context; idp <= idx + context; idp++) { 34 | if (idp < 0 || idp >= seqsize) { 35 | offset += dim2; 36 | } else { 37 | for (int idy = 0; idy < dim2; idy++) { 38 | wo[idx][0][offset] = wi[idp][0][idy]; 39 | offset++; 40 | } 41 | } 42 | } 43 | assert(offset == odim2); 44 | } 45 | 46 | } 47 | 48 | 49 | template 50 | inline void windowlized(Tensor wi, Tensor wo, int context) 51 | { 52 | int seqsize = wo.size(0); 53 | if (wi.size(0) != seqsize || seqsize == 0 || context < 0) { 54 | std::cerr << "windowlized error: vector size or context size invalid" 
<< std::endl; 55 | } 56 | 57 | int dim1 = wi.size(1), dim2 = wi.size(2); 58 | int odim1 = wo.size(1), odim2 = wo.size(2); 59 | int computeddim2 = (2 * context + 1) * dim2; 60 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 61 | std::cerr << "windowlized error: dim size invalid" << std::endl; 62 | } 63 | 64 | wo = 0.0; 65 | static int offset; 66 | for (int idx = 0; idx < seqsize; idx++) { 67 | offset = 0; 68 | for (int idp = idx - context; idp <= idx + context; idp++) { 69 | if (idp < 0 || idp >= seqsize) { 70 | offset += dim2; 71 | } else { 72 | for (int idy = 0; idy < dim2; idy++) { 73 | wo[idx][0][offset] = wi[idp][0][idy]; 74 | offset++; 75 | } 76 | } 77 | } 78 | assert(offset == odim2); 79 | } 80 | 81 | } 82 | 83 | 84 | template 85 | inline void windowlized_backward(vector > &lwi, const vector > &lwo, int context, bool bclear = false) 86 | { 87 | int seqsize = lwo.size(); 88 | if (lwi.size() != seqsize || seqsize == 0 || context < 0) { 89 | std::cerr << "windowlized error: vector size or context size invalid" << std::endl; 90 | } 91 | 92 | int dim1 = lwi[0].size(0), dim2 = lwi[0].size(1); 93 | int odim1 = lwo[0].size(0), odim2 = lwo[0].size(1); 94 | int computeddim2 = (2 * context + 1) * dim2; 95 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 96 | std::cerr << "windowlized error: dim size invalid" << std::endl; 97 | } 98 | 99 | if(bclear){ 100 | for (int idx = 0; idx < seqsize; idx++) { 101 | lwi[idx] = 0.0; 102 | } 103 | } 104 | static int offset; 105 | for (int idx = 0; idx < seqsize; idx++) { 106 | offset = 0; 107 | for (int idp = idx - context; idp <= idx + context; idp++) { 108 | if (idp < 0 || idp >= seqsize) { 109 | offset += dim2; 110 | } else { 111 | for (int idy = 0; idy < dim2; idy++) { 112 | lwi[idp][0][idy] += lwo[idx][0][offset]; 113 | offset++; 114 | } 115 | } 116 | } 117 | assert(offset == odim2); 118 | } 119 | 120 | } 121 | 122 | 123 | template 124 | inline void windowlized_backward(Tensor lwi, Tensor lwo, int context, bool bclear = false) 125 | { 126 | int seqsize = lwo.size(0); 127 | if (lwi.size(0) != seqsize || seqsize == 0 || context < 0) { 128 | std::cerr << "windowlized error: vector size or context size invalid" << std::endl; 129 | } 130 | 131 | int dim1 = lwi.size(1), dim2 = lwi.size(2); 132 | int odim1 = lwo.size(1), odim2 = lwo.size(2); 133 | int computeddim2 = (2 * context + 1) * dim2; 134 | if(computeddim2 != odim2 || dim1 != 1 || odim1 != 1){ 135 | std::cerr << "windowlized error: dim size invalid" << std::endl; 136 | } 137 | 138 | if(bclear) lwi = 0.0; 139 | static int offset; 140 | for (int idx = 0; idx < seqsize; idx++) { 141 | offset = 0; 142 | for (int idp = idx - context; idp <= idx + context; idp++) { 143 | if (idp < 0 || idp >= seqsize) { 144 | offset += dim2; 145 | } else { 146 | for (int idy = 0; idy < dim2; idy++) { 147 | lwi[idp][0][idy] += lwo[idx][0][offset]; 148 | offset++; 149 | } 150 | } 151 | } 152 | assert(offset == odim2); 153 | } 154 | 155 | } 156 | 157 | 158 | #endif 159 | -------------------------------------------------------------------------------- /description(expect for lrec2016).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SUTDNLP/LibN3L/da49c8ccf715170a60f6b5ce1930df1e691dc280/description(expect for lrec2016).pdf --------------------------------------------------------------------------------
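
The UniLayer listing above shows the full life cycle of a dense layer in this library: initial() allocates _W/_b plus their gradient and AdaGrad accumulators, ComputeForwardScore() computes dot(x, _W.T()) plus bias and an activation selected by _funcType, ComputeBackwardLoss() accumulates _gradW/_gradb and propagates the loss into lx, and updateAdaGrad() applies the AdaGrad rule before clearing gradients. A minimal sketch of one such step follows. Hedges: the angle-bracket template arguments (e.g. Tensor<cpu, 2, dtype>, UniLayer<cpu>) appear to have been stripped from the listing and are assumed here, as are the initial(outSize, inSize, useBias, seed, funcType) argument order and the reading of funcType 0 as tanh; this is an illustrative sketch under those assumptions, not code taken from the repository.

// Minimal single-step usage sketch (assumptions as noted above; not part of the library).
#include "UniLayer.h"   // pulls in tensor.h, MyLib.h, Utiltensor.h

void toy_unilayer_step() {
  const int inDim = 4, outDim = 3;

  UniLayer<cpu> layer;                           // template argument assumed
  layer.initial(outDim, inDim, true, 0, 0);      // assumed order: out, in, useBias, seed, funcType (0 ~ tanh)

  Tensor<cpu, 2, dtype> x  = NewTensor<cpu>(Shape2(1, inDim),  d_zero);
  Tensor<cpu, 2, dtype> y  = NewTensor<cpu>(Shape2(1, outDim), d_zero);
  Tensor<cpu, 2, dtype> ly = NewTensor<cpu>(Shape2(1, outDim), d_zero);
  Tensor<cpu, 2, dtype> lx = NewTensor<cpu>(Shape2(1, inDim),  d_zero);

  random(x, -0.1, 0.1, 1);                       // fill a fake input (random() from Utiltensor.h)

  layer.ComputeForwardScore(x, y);               // y = activation(dot(x, _W.T()) + _b)

  Copy(ly, y);                                   // stand-in loss gradient w.r.t. y
  layer.ComputeBackwardLoss(x, y, ly, lx, true); // accumulate _gradW/_gradb, write lx (bclear = true)

  layer.updateAdaGrad(1e-8, 0.01, 1e-6);         // regularization weight, learning rate, epsilon; calls clearGrad()

  FreeSpace(&x); FreeSpace(&y); FreeSpace(&ly); FreeSpace(&lx);
  layer.release();
}

Depending on how mshadow is built, InitTensorEngine<cpu>() / ShutdownTensorEngine<cpu>() may need to wrap code like this, and gradient checking (presumably what CheckGrad.h in this repository provides) is the natural way to validate a layer wired up this way.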