├── LICENSE
├── README.md
├── drsv.cpp
├── drsv_main.cpp
├── run.sh
└── utils.cpp

/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Ozan Irsoy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Overview

Implementation of a deep recursive neural network for the task of fine-grained sentiment detection.

See the paper,

> "Deep Recursive Neural Networks for Compositionality in Language"
> Ozan Irsoy, Claire Cardie
> NIPS 2014

for details.

If you use my code, please cite:

    @InProceedings{irsoy-drsv,
      author    = {\.Irsoy, Ozan and Cardie, Claire},
      title     = {Deep Recursive Neural Networks for Compositionality in Language},
      booktitle = {Advances in Neural Information Processing Systems 27},
      editor    = {Z. Ghahramani and M. Welling and C. Cortes and N.D. Lawrence and K.Q. Weinberger},
      pages     = {2096--2104},
      year      = {2014},
      publisher = {Curran Associates, Inc.},
      url       = {http://papers.nips.cc/paper/5551-deep-recursive-neural-networks-for-compositionality-in-language.pdf},
      location  = {Montreal, Quebec}
    }

Feel free to ask questions: oirsoy [a] cs [o] cornell [o] edu.


## Getting Started

Assuming you have g++ and the code here, running the bash script as

    bash run.sh

should

1. download small word embeddings (50-dimensional GloVe),
2. download the Stanford Sentiment Treebank (in PTB form),
3. download the Eigen library, and
4. compile and run, training a small model that is saved to disk.

That's it! Once you have a working setup, you can play with the hyperparameters (see the note at the end of this file) or pick different word embeddings (300d word2vec is used in the experiments in the paper).

## License

Code is released under [the MIT license](http://opensource.org/licenses/MIT).
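## Hyperparameters

The network shape and optimizer choices live at the top of `drsv.cpp` (`layers`, `DROP`, `ADAGRAD`, `MINIBATCH`), and the dimensions and rates are set in `drsv_main.cpp`. As a sketch, the defaults set there are:

    Node::lr = 0.01;   // learning rate (0.002 if momentum is used instead of adagrad)
    Node::la = 0.0001; // L2 regularization strength
    Node::mr = 0.9;    // momentum rate
    Node::nx = 50;     // word vector dimensionality
    Node::nh = 50;     // hidden layer width
    Node::ny = 5;      // number of sentiment classes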
--------------------------------------------------------------------------------
/drsv.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <cctype>
#include "Eigen/Dense"
#include "utils.cpp"

#define uint unsigned int

#define layers 3
#define MAXEPOCH 200
#define MINIBATCH 20
#define NORMALIZE true // relevant only with momentum
#define ADAGRAD true   // adagrad or momentum

#define WI true  // initialize W as 0.5*I
#define WIr true // regularize W to 0.5*I instead of 0

using namespace std;
using namespace Eigen;

double DROP = 0.1;

// zero each entry of x independently with probability p
Matrix<double, Dynamic, Dynamic> dropout(Matrix<double, Dynamic, Dynamic> x,
                                         double p=DROP) {
  for (uint i=0; i<x.rows(); i++)
    for (uint j=0; j<x.cols(); j++)
      if ((double)rand()/RAND_MAX < p)
        x(i,j) = 0;
  return x;
}

// activation function and its derivative (the derivative is computed
// from the activation values themselves, see utils.cpp)
VectorXd (*f)(const VectorXd&) = relu;
VectorXd (*fp)(const VectorXd&) = relup;

class Node {
 public:
  // per-node data
  Node *left, *right;  // children (NULL for leaves)
  bool isLeft;         // is this node a left child of its parent?
  string word;         // token at this node
  VectorXd r, y;       // target and predicted label distributions
  VectorXd x[layers];  // hidden representation at each layer
  VectorXd dx[layers]; // gradient of the loss w.r.t. x
  VectorXd dropper;    // dropout mask

  Node() : left(NULL), right(NULL), isLeft(false) {}
  ~Node() { delete left; delete right; }

  // accessors picking the correct weight matrix for this node
  // (left vs right child, leaf vs internal) at layer l. for each
  // parameter, index 0 holds the values, 1 the gradients and
  // 2 the update history (momentum velocity or adagrad scaling).
  MatrixXd& W(const uint& l, uint type=0);
  MatrixXd& V(const uint& l, uint type=0);
  MatrixXd& U(const uint& l, uint type=0);

  uint read(string &treeText, uint index, bool init);
  void forward(bool test=false);
  void backward();
  void print(string indent);

  static void init();
  static void update();
  static void save(string fname);
  static void load(string fname);

  // model parameters, shared by all nodes
  static MatrixXd Whhl[layers][3], Whhr[layers][3]; // child-to-parent
  static MatrixXd Wxhl[layers][3], Wxhr[layers][3]; // word-to-parent
  static MatrixXd Vhh[layers][3], Vxh[3];           // layer-to-layer
  static MatrixXd Uhy[layers][3], Uxy[3];           // hidden-to-output
  static VectorXd b[layers][3], c[3];               // biases
  static LookupTable* LT;   // word embeddings
  static uint nx, nh, ny;   // input, hidden and output dimensions
  static double lr, la, mr; // learning, regularization, momentum rates
  static double nnorm;      // mean norm of the top layer (diagnostic)
  static uint fold;         // fold id, used only in the model file name
};

MatrixXd Node::Whhl[layers][3], Node::Whhr[layers][3];
MatrixXd Node::Wxhl[layers][3], Node::Wxhr[layers][3];
MatrixXd Node::Vhh[layers][3], Node::Vxh[3];
MatrixXd Node::Uhy[layers][3], Node::Uxy[3];
VectorXd Node::b[layers][3], Node::c[3];
LookupTable* Node::LT = NULL;
uint Node::nx, Node::nh, Node::ny;
double Node::lr, Node::la, Node::mr, Node::nnorm;
uint Node::fold = 0;

void Node::print(string indent) {
  cout << indent << word << endl;
  if (left != NULL) {
    left->print(indent);
    right->print(indent);
  }
}

MatrixXd& Node::W(const uint& l, uint type) {
  if (l > 0 || left != NULL) {
    if (isLeft)
      return Whhl[l][type];
    else
      return Whhr[l][type];
  } else if (l==0 && left == NULL) {
    if (isLeft)
      return Wxhl[l][type];
    else
      return Wxhr[l][type];
  } else
    assert(false);
}

MatrixXd& Node::V(const uint& l, uint type) {
  assert(l > 0);
  if (left == NULL && l == 1)
    return Vxh[type];
  else
    return Vhh[l][type];
}

MatrixXd& Node::U(const uint& l, uint type) {
  if (left == NULL && l == 0)
    return Uxy[type];
  else
    return Uhy[l][type];
}

// for a root node, read a PTB tree and construct
// the tree recursively
uint Node::read(string &treeText, uint index, bool init) {
  char c;   // current char
  string b; // buffer string
  uint numChild = 0;

  for (uint i=index; i<treeText.size(); i++) {
    c = treeText[i];
    if (c == '(') {
      if (numChild==0) {
        left = new Node();
        left->isLeft = true;
        i = left->read(treeText, i+1, false);
      } else if (numChild==1) {
        right = new Node();
        right->isLeft = false;
        i = right->read(treeText, i+1, false);
      } else
        assert(false);
      numChild++;
    } else if (c == ')') {
      word = b;
      assert(numChild == 2 || numChild == 0);
      return i;
    } else if (isspace(c)) {
      if (numChild == 0) {
        r = VectorXd::Zero(5); // buffer is label
        r[(int)str2double(b)] = 1;
      }
      b = ""; // reset buffer
    } else {
      b += c;
    }
  }
  return treeText.size(); // only reached on a malformed tree
}

void Node::forward(bool test) {
  if (left != NULL) { // not a leaf
    left->forward(test);
    right->forward(test);
  } else {
    if (x[0].size() != nx)
      x[0] = (*LT)[word]; // don't have to repeat this if no finetuning
  }

  for (uint l=0; l<layers; l++) {
    if (!test) // draw a fresh dropout mask for this layer
      dropper = dropout(VectorXd::Ones(nh));
    VectorXd v = b[l][0];
    if (left != NULL) {
      v.noalias() += (left->W(l))*(left->x[l]) + (right->W(l))*(right->x[l]);
    }
    if (l > 0)
      v.noalias() += V(l)*x[l-1];
    if (l > 0 || left != NULL) { // layer 0 leaves already have their x!!
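      // dropout: during training each hidden unit is zeroed independently
      // with probability DROP (the mask drawn above); at test time all
      // units are kept and activations are scaled by the keep probability
      // (1-DROP) so their expected magnitude matches training.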
      if (!test)
        x[l] = f(v).cwiseProduct(dropper);
      else
        x[l] = f(v)*(1-DROP);
    }
    dx[l] = VectorXd::Zero(x[l].size());
  }

  // class scores: accumulate contributions from every layer, then softmax
  VectorXd v = c[0];
  for (uint l=layers-1; l<layers; l--) // uint wraps below zero, ending the loop
    v.noalias() += U(l)*x[l];
  y = softmax(v);
}

void Node::backward() {
  // gradient of the cross entropy loss through the softmax
  VectorXd dy = smxntp(y, r);
  c[1].noalias() += dy;
  for (uint l=0; l<layers; l++) {
    U(l,1).noalias() += dy * x[l].transpose();
    dx[l].noalias() += U(l).transpose() * dy;
  }

  for (int l=layers-1; l>=0; l--) {
    VectorXd fpxd = fp(x[l]).cwiseProduct(dx[l]);
    if (left != NULL) {
      left->dx[l].noalias() += (left->W(l)).transpose() * fpxd;
      right->dx[l].noalias() += (right->W(l)).transpose() * fpxd;
      left->W(l,1).noalias() += fpxd * (left->x[l]).transpose();
      right->W(l,1).noalias() += fpxd * (right->x[l]).transpose();
    }
    if (l > 0 || left != NULL)
      b[l][1].noalias() += fpxd;
    if (l > 0) {
      dx[l-1].noalias() += V(l).transpose() * fpxd;
      V(l,1).noalias() += fpxd * x[l-1].transpose();
    }
  }

  if (left != NULL) {
    left->backward();
    right->backward();
  } else {
    ; // word vector fine tuning can go here
  }
}

void Node::save(string fname) {
  ofstream out(fname.c_str());
  out << ny << " " << nh << " " << nx << " " << layers << endl;
  // Eigen prints matrices whitespace-separated, which is exactly
  // how load() reads them back
  for (uint l=0; l<layers; l++) {
    out << Whhl[l][0] << endl;
    out << Whhr[l][0] << endl;
    out << Wxhl[l][0] << endl;
    out << Wxhr[l][0] << endl;
    out << Vhh[l][0] << endl;
    out << Uhy[l][0] << endl;
    out << b[l][0] << endl;
  }
  out << Vxh[0] << endl;
  out << Uxy[0] << endl;
  out << c[0] << endl;
}

void Node::load(string fname) {
  ifstream in(fname.c_str());
  uint layers_;
  in >> ny >> nh >> nx >> layers_;
  assert(layers == layers_);
  Node::init();
  for (uint l=0; l<layers; l++) {
    for (uint i=0; i<nh; i++)
      for (uint j=0; j<nh; j++)
        in >> Whhl[l][0](i,j);
    for (uint i=0; i<nh; i++)
      for (uint j=0; j<nh; j++)
        in >> Whhr[l][0](i,j);
    for (uint i=0; i<nh; i++)
      for (uint j=0; j<nx; j++)
        in >> Wxhl[l][0](i,j);
    for (uint i=0; i<nh; i++)
      for (uint j=0; j<nx; j++)
        in >> Wxhr[l][0](i,j);
    for (uint i=0; i<nh; i++)
      for (uint j=0; j<nh; j++)
        in >> Vhh[l][0](i,j);
    for (uint i=0; i<ny; i++)
      for (uint j=0; j<nh; j++)
        in >> Uhy[l][0](i,j);
    for (uint i=0; i<nh; i++)
      in >> b[l][0](i);
  }
  for (uint i=0; i<nh; i++)
    for (uint j=0; j<nx; j++)
      in >> Vxh[0](i,j);
  for (uint i=0; i<ny; i++)
    for (uint j=0; j<nx; j++)
      in >> Uxy[0](i,j);
  for (uint i=0; i<ny; i++)
    in >> c[0](i);
}

void Node::init() {
  // weights start small and random (recursive weights at 0.5*I if WI is
  // set); gradients start at zero, update histories at a small epsilon
  for (uint l=0; l<layers; l++) {
    Whhl[l][0] = MatrixXd::Zero(nh,nh).unaryExpr(&urand)*0.01;
    Whhr[l][0] = MatrixXd::Zero(nh,nh).unaryExpr(&urand)*0.01;
    if (WI) {
      Whhl[l][0] = 0.5*MatrixXd::Identity(nh,nh);
      Whhr[l][0] = 0.5*MatrixXd::Identity(nh,nh);
    }
    Wxhl[l][0] = MatrixXd::Zero(nh,nx).unaryExpr(&urand)*0.01;
    Wxhr[l][0] = MatrixXd::Zero(nh,nx).unaryExpr(&urand)*0.01;
    Vhh[l][0] = MatrixXd::Zero(nh,nh).unaryExpr(&urand)*0.01;
    Uhy[l][0] = MatrixXd::Zero(ny,nh).unaryExpr(&urand)*0.01;
    b[l][0] = VectorXd::Zero(nh);

    Whhl[l][1] = Whhr[l][1] = Vhh[l][1] = MatrixXd::Zero(nh,nh);
    Wxhl[l][1] = Wxhr[l][1] = MatrixXd::Zero(nh,nx);
    Uhy[l][1] = MatrixXd::Zero(ny,nh);
    b[l][1] = VectorXd::Zero(nh);

    Whhl[l][2] = Whhr[l][2] = Vhh[l][2] = MatrixXd::Constant(nh,nh,1e-6);
    Wxhl[l][2] = Wxhr[l][2] = MatrixXd::Constant(nh,nx,1e-6);
    Uhy[l][2] = MatrixXd::Constant(ny,nh,1e-6);
    b[l][2] = VectorXd::Constant(nh,1e-6);
  }
  Vxh[0] = MatrixXd::Zero(nh,nx).unaryExpr(&urand)*0.01;
  Uxy[0] = MatrixXd::Zero(ny,nx).unaryExpr(&urand)*0.01;
  c[0] = VectorXd::Zero(ny);
  Vxh[1] = MatrixXd::Zero(nh,nx);
  Uxy[1] = MatrixXd::Zero(ny,nx);
  c[1] = VectorXd::Zero(ny);
  Vxh[2] = MatrixXd::Constant(nh,nx,1e-6);
  Uxy[2] = MatrixXd::Constant(ny,nx,1e-6);
  c[2] = VectorXd::Constant(ny,1e-6);
}

// one momentum step on a parameter triple p (p[0]: value, p[1]: gradient,
// p[2]: velocity), regularizing the value towards reg
template <class M>
void velStep(M p[3], const M& reg, double norm) {
  p[2] = Node::mr*p[2] - Node::lr*(p[1]/norm + Node::la*(p[0] - reg));
  p[0] += p[2];
  p[1].setZero();
}

// one adagrad step; p[2] keeps the root of the accumulated squared
// gradients, which scales each coordinate's learning rate
template <class M>
void adaStep(M p[3], const M& reg) {
  M g = p[1] + Node::la*(p[0] - reg);
  p[2] = (p[2].cwiseProduct(p[2]) + g.cwiseProduct(g)).cwiseSqrt();
  p[0] -= Node::lr*g.cwiseQuotient(p[2]);
  p[1].setZero();
}

void Node::update() {
  // norm of the full gradient, for (optional) gradient clipping
  double cap = 1;
  double norm = Vxh[1].squaredNorm() + Uxy[1].squaredNorm()
              + c[1].squaredNorm();
  for (uint l=0; l<layers; l++)
    norm += Whhl[l][1].squaredNorm() + Whhr[l][1].squaredNorm()
          + Wxhl[l][1].squaredNorm() + Wxhr[l][1].squaredNorm()
          + Vhh[l][1].squaredNorm() + Uhy[l][1].squaredNorm()
          + b[l][1].squaredNorm();

  if (NORMALIZE)
    norm = (norm > cap*cap) ? sqrt(norm/(cap*cap)) : 1;
  else
    norm = 1;

  // recursive weights are regularized towards 0.5*I if WIr is set,
  // everything else towards zero
  MatrixXd hhReg = MatrixXd::Zero(nh,nh);
  if (WIr) hhReg = 0.5*MatrixXd::Identity(nh,nh);
  MatrixXd zhx = MatrixXd::Zero(nh,nx), zhh = MatrixXd::Zero(nh,nh);
  MatrixXd zyh = MatrixXd::Zero(ny,nh), zyx = MatrixXd::Zero(ny,nx);
  VectorXd zh = VectorXd::Zero(nh), zy = VectorXd::Zero(ny);

  if (!ADAGRAD) {
    // update velocities
    for (uint l=0; l<layers; l++) {
      velStep(Whhl[l], hhReg, norm);
      velStep(Whhr[l], hhReg, norm);
      velStep(Wxhl[l], zhx, norm);
      velStep(Wxhr[l], zhx, norm);
      velStep(Vhh[l], zhh, norm);
      velStep(Uhy[l], zyh, norm);
      velStep(b[l], zh, norm);
    }
    velStep(Vxh, zhx, norm);
    velStep(Uxy, zyx, norm);
    velStep(c, zy, norm);
  } else {
    // adagrad
    for (uint l=0; l<layers; l++) {
      adaStep(Whhl[l], hhReg);
      adaStep(Whhr[l], hhReg);
      adaStep(Wxhl[l], zhx);
      adaStep(Wxhr[l], zhx);
      adaStep(Vhh[l], zhh);
      adaStep(Uhy[l], zyh);
      adaStep(b[l], zh);
    }
    adaStep(Vxh, zhx);
    adaStep(Uxy, zyx);
    adaStep(c, zy);
  }
}

void readTrees(vector<Node*> &trees, std::string fname) {
  ifstream in(fname.c_str());
  string line;
  while(std::getline(in, line)) {
    std::istringstream ss(line);
    if (!isWhitespace(line)) {
      Node* t = new Node();
      t->read(line, 0, true);
      trees.push_back(t);
    }
  }
}

VectorXd error(vector<Node*>& trees) {
  // root level
  double err=0, errbin=0, nbin=0;
  double onenorm=0;
  for (uint i=0; i<trees.size(); i++) {
    trees[i]->forward(true);
    onenorm += trees[i]->x[layers-1].sum() / Node::nh;
    uint r = argmax(trees[i]->r);
    uint y = argmax(trees[i]->y);
    if (r != y)
      err++;
    if (r != 2) { // non-neutral roots count towards the binary measure
      nbin++;
      if (trees[i]->y(0) + trees[i]->y(1) >
          trees[i]->y(3) + trees[i]->y(4)) {
        if (r == 3 || r == 4)
          errbin++;
      } else {
        if (r == 0 || r == 1)
          errbin++;
      }
    }
  }
  Node::nnorm = onenorm / trees.size();
  //cout << onenorm / trees.size() << " ";
  VectorXd v(2); v << err / trees.size(), errbin / nbin;
  return v;
}

void train(vector<Node*>& tra, vector<Node*> dev, vector<Node*> test) {
  ostringstream strS;
  strS << "models/drsv_" << layers << "_" << Node::nh << "_"
       << (int)ADAGRAD << "_" << DROP << "_"
       << (int)NORMALIZE << "_"
       << MAXEPOCH << "_" << Node::lr << "_" << Node::la << "_"
       << Node::mr << "_" << Node::fold;
  string fname = strS.str();
  cout << tra.size() << " " << dev.size() << " " << test.size() << endl;
  vector<uint> perm;
  for (uint i=0; i<tra.size(); i++)
    perm.push_back(i);

  Node::init();
  ArrayXd bestDev = ArrayXd::Zero(2), bestTest = ArrayXd::Zero(2);
  ArrayXd devAcc, testAcc;
  for (uint epoch=0; epoch<MAXEPOCH; epoch++) {
    shuffle(perm); // visit the training trees in a fresh random order
    for (uint i=0; i<tra.size(); i++) {
      uint j = perm[i];
      tra[j]->forward(false);
      tra[j]->backward();
      if ((i+1) % MINIBATCH == 0 || i+1 == tra.size()) {
        Node::update();
      }
    }
    devAcc = 1-error(dev).array();
    testAcc = 1-error(test).array();
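    // error() returns a 2-vector: fine-grained (5-class) error over all
    // root labels, and binary (positive vs negative) error over the
    // non-neutral roots. 1 - error gives accuracy; the line below prints
    // train, dev and test accuracies for the current epoch.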
    cout << 1-error(tra).array().transpose()
         << " " << devAcc.transpose()
         << " " << testAcc.transpose() << endl;

    // diagnostic
    /*
    for (uint l=0; l<layers; l++)
      cout << Node::Whhl[l][0].norm() << " ";
    cout << endl;
    */

    if (devAcc[0] > bestDev[0]) {
      bestDev = devAcc;
      bestTest = testAcc;
      cout << "New Best: " << bestDev.transpose()
           << " " << bestTest.transpose() << endl;
      Node::save(fname);
    }

    if (Node::nnorm > 1e6) break; // exploded, do not continue training
    if (Node::LT != NULL) {
      delete Node::LT; // to save memory. don't do if finetuning
      Node::LT = NULL;
    }
  }
  cout << "best:" << endl << bestDev.transpose() << " "
       << bestTest.transpose() << endl;
}

--------------------------------------------------------------------------------
/drsv_main.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <vector>
#include "Eigen/Dense"
#include "drsv.cpp"

using namespace std;
using namespace Eigen;

int main(int argc, char** argv) {
  vector<Node*> tra, dev, test;

  srand(13457);

  readTrees(tra, "trees/train.txt");
  readTrees(dev, "trees/dev.txt");
  readTrees(test, "trees/test.txt");

  LookupTable *lt = new LookupTable();
  cout << "Loading word vectors..." << flush;
  // i used 300d word2vec in my own experiments.
  lt->load("glove.6B.50d.txt", 400000, 50, true);
  cout << " Done." << endl;
  Node::LT = lt;

  if (ADAGRAD)
    Node::lr = 0.01;
  else
    Node::lr = 0.002;
  Node::la = 0.0001;
  Node::mr = 0.9;

  Node::nx = 50;
  Node::nh = 50;
  Node::ny = 5;

  train(tra, dev, test);
  return 0;
}

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# embeddings
curl -O http://www-nlp.stanford.edu/data/glove.6B.50d.txt.gz
gzip -d glove.6B.50d.txt.gz

# dataset
curl -O http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
unzip trainDevTestTrees_PTB.zip

# Eigen
curl -L http://bitbucket.org/eigen/eigen/get/3.2.4.tar.gz -o eigen.tar.gz
tar -xzvf eigen.tar.gz --strip-components=1 eigen-eigen-10219c95fe65/Eigen

# directory for saved models (train() writes to models/)
mkdir -p models

# compile & run
g++ drsv_main.cpp -I ./Eigen/ -std=c++11 -O3 -fopenmp -o drsv
./drsv

--------------------------------------------------------------------------------
/utils.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <set>
#include <iterator>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include "Eigen/Dense"

#define uint unsigned int

using namespace Eigen;
using namespace std;

VectorXd relu(const VectorXd &x) {
  return x.array().max(0);
}

// derivative of relu, as a function of the activations y = relu(x)
VectorXd relup(const VectorXd &y) {
  return (y.array() > 0).cast<double>();
}

VectorXd softmax(const VectorXd &x) {
  double m = x.maxCoeff(); // shift by the max for numerical stability
  VectorXd v = (x.array() - m).exp();
  return v.array() / v.sum();
}

// gradient of softmax with cross entropy loss, given the
// prediction y and the target r
VectorXd smxntp(const VectorXd &y, const VectorXd &r) {
  return y-r;
}

double str2double(const string& s) {
  istringstream i(s);
  double x;
  if (!(i >> x))
    return 0;
  return x;
}

class LookupTable {
 public:
  void load(string fname, uint n, uint d, bool noUnknown=false);
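  // Embedding lookup: operator[] returns the d-dimensional vector for a
  // word, falling back to the *UNKNOWN* column for out-of-vocabulary
  // words. gradAdd() accumulates gradients per column and update()
  // applies a sparse adagrad step, touching only the modified columns;
  // these last two matter only if word vectors are fine-tuned.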
  //ColXpr operator[](string word);
  VectorXd operator[](string word);
  void gradAdd(string word, VectorXd v);
  void update();
  void clear();

 private:
  double lr;
  unordered_map<string, uint> table; // word -> index
  MatrixXd data;  // index -> vector representation
  MatrixXd gdata; // gradients
  MatrixXd adata; // adagrad past
  set<uint> modifiedCols;
};

void LookupTable::clear() {
  data = gdata = adata = MatrixXd();
}

void LookupTable::load(string fname, uint n, uint d, bool noUnknown) {
  ifstream in(fname.c_str());
  assert(in.is_open());
  string line;
  if (noUnknown) n++; // reserve a column for *UNKNOWN*
  data = MatrixXd(d,n);
  gdata = adata = MatrixXd::Zero(d,n);
  adata.fill(1e-6);
  uint j=0;
  while(std::getline(in, line)) {
    std::istringstream ss(line);
    std::istream_iterator<string> begin(ss), end;

    //putting all the tokens in the vector
    std::vector<string> tokens(begin, end);
    for (uint i=0; i<d; i++)
      data(i,j) = str2double(tokens[i+1]); // tokens[0] is the word itself
    table[tokens[0]] = j;
    j++;
  }
  if (noUnknown) {
    // the embedding file has no unknown token; add one as the
    // mean of all loaded vectors
    table["*UNKNOWN*"] = j;
    data.col(j) = data.leftCols(j).rowwise().mean();
  }
}

VectorXd LookupTable::operator[](string word) {
  unordered_map<string, uint>::iterator it;

  // this might not be the best place for this,
  // if i'm calling this frequently
  if (word == "-LRB-")
    word = "(";
  else if (word == "-RRB-")
    word = ")";
  else if (word == "-LSB-")
    word = "(";
  else if (word == "-RSB-")
    word = ")";
  else if (word == "-LCB-")
    word = "(";
  else if (word == "-RCB-")
    word = ")";

  it = table.find(word);
  if (it != table.end()) // exists
    return data.col(table[word]);
  else
    return data.col(table["*UNKNOWN*"]);
}

void LookupTable::gradAdd(string word, VectorXd v) {
  unordered_map<string, uint>::iterator it;
  if (word == "-LRB-")
    word = "(";
  else if (word == "-RRB-")
    word = ")";
  else if (word == "-LSB-")
    word = "(";
  else if (word == "-RSB-")
    word = ")";
  else if (word == "-LCB-")
    word = "(";
  else if (word == "-RCB-")
    word = ")";

  it = table.find(word);
  if (it != table.end()) { // exists
    gdata.col(table[word]) += v;
    modifiedCols.insert(table[word]);
  } else {
    gdata.col(table["*UNKNOWN*"]) += v;
    modifiedCols.insert(table["*UNKNOWN*"]);
  }
}

void LookupTable::update() {
  lr = 0.001;
  for (auto i : modifiedCols) {
    adata.col(i) = (adata.col(i).cwiseProduct(adata.col(i)) +
                    gdata.col(i).cwiseProduct(gdata.col(i))).cwiseSqrt();
    data.col(i) -= lr*gdata.col(i).cwiseQuotient(adata.col(i));
    gdata.col(i).setZero();
  }
  modifiedCols.clear();
}

// index of max in a vector
uint argmax(const VectorXd& x) {
  double max = x(0);
  uint maxi = 0;
  for (uint i=1; i<x.size(); i++) {
    if (x(i) > max) {
      max = x(i);
      maxi = i;
    }
  }
  return maxi;
}

// this is used for randomly initializing an Eigen matrix
double urand(double dummy) {
  double min = -1, max = 1;
  return (double(rand())/RAND_MAX)*(max-min) + min;
  //+ int((double(rand())/RAND_MAX) < 0.5);
}

// KFY shuffle (uniformly random) of a vector
template <class T>
void shuffle(vector<T>& v) {
  if (v.size() == 0) return;
  for (uint i=v.size()-1; i>0; i--) {
    uint j = (rand() % (i+1));
    T tmp = v[i];
    v[i] = v[j];
    v[j] = tmp;
  }
}

// same, shuffling two parallel vectors with one permutation
template <class T, class T2>
void shuffle(vector<T>& v, vector<T2>& w) {
  assert(w.size() == v.size());
  if (w.size() == 0) return;
  for (uint i=v.size()-1; i>0; i--) {
    uint j = (rand() % (i+1));
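    // j is uniform over {0, ..., i}, which makes every permutation
    // equally likely; elements of v and w are swapped in lockstep so
    // that corresponding entries stay aligned.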
    T tmp = v[i];
    v[i] = v[j];
    v[j] = tmp;
    T2 tmp2 = w[i];
    w[i] = w[j];
    w[j] = tmp2;
  }
}

bool isWhitespace(std::string str) {
  for (uint i=0; i<str.size(); i++) {
    if (str[i] != ' ' && str[i] != '\t')
      return false;
  }
  return true;
}

vector<string> split(const string &s, char delim) {
  stringstream ss(s);
  string item;
  vector<string> elems;
  while (getline(ss, item, delim)) {
    elems.push_back(item);
  }
  return elems;
}

--------------------------------------------------------------------------------