├── models
│   └── dummy.txt
├── DependencyTreeRNN++
│   ├── DependencyTreeRNN++.tmp
│   ├── CorpusWordReader.h
│   ├── ReadJson.h
│   ├── CommandLineParser.h
│   ├── CommandLineParser.cpp
│   ├── RnnWeights.h
│   ├── Vocabulary.h
│   ├── RnnDependencyTreeLib.h
│   ├── Utils.h
│   ├── CorpusUnrollsReader.h
│   ├── RnnState.h
│   ├── RnnWeights.cpp
│   ├── RnnTraining.h
│   ├── Vocabulary.cpp
│   ├── ReadJson.cpp
│   ├── CorpusUnrollsReader.cpp
│   └── RnnLib.h
├── books
│   ├── test.txt
│   ├── valid.txt
│   ├── train_small.txt
│   ├── test.labels
│   ├── valid.labels
│   └── all.labels
├── test_rnn_holmes_sequential.sh
├── ensemble.py
├── LICENSE.md
├── logs
│   ├── GutenbergHolmes_seq_mw5_h300_c250_m1000_d4_b5.acc
│   ├── GutenbergHolmes_p2_mw5_h50_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_p2_mw5_h200_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h50_c250_m0_d0_b5.acc
│   ├── GutenbergHolmes_p0_mw5_h100_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_p0_mw5_h50_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h100_c250_m0_d0_b5.acc
│   ├── GutenbergHolmes_p0_mw5_h200_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h200_c250_m0_d0_b5.acc
│   ├── GutenbergHolmes_p2_mw5_h100_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h50_c250_m1000_d4_b5_indep_.acc
│   ├── GutenbergHolmes_seq_mw5_h200_c250_m1000_d4_b5.acc
│   └── GutenbergHolmes_seq_mw5_h100_c250_m1000_d4_b5.acc
├── train_rnn_holmes_sequential.sh
├── test_rnn_holmes_example.sh
├── Makefile
├── Makefile.MacOS
├── train_rnn_holmes_p0_1000.sh
├── train_rnn_holmes_p0_2000.sh
├── train_rnn_holmes_p2_1000.sh
├── train_rnn_holmes_p2_2000.sh
├── train_rnn_holmes_example.sh
├── Makefile.Linux
├── results.txt
├── README.md
└── preprocessing
    ├── JSON2unrolls.py
    └── Text2Parsed2JSON.java

/models/dummy.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/DependencyTreeRNN++/DependencyTreeRNN++.tmp:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/books/test.txt:
--------------------------------------------------------------------------------
Holmes.test.json.unrolls.json
--------------------------------------------------------------------------------

/books/valid.txt:
--------------------------------------------------------------------------------
Holmes.valid.json.unrolls.json
--------------------------------------------------------------------------------

/books/train_small.txt:
--------------------------------------------------------------------------------
04TOM10.TXT.json.unrolls.json
AGENT10.TXT.json.unrolls.json
GOLDR10.TXT.json.unrolls.json
MOLLF10.TXT.json.unrolls.json
RUNNG10.TXT.json.unrolls.json
WARW11.TXT.json.unrolls.json
--------------------------------------------------------------------------------
/test_rnn_holmes_sequential.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_SEQUENTIAL=$PWD"/../Data/GutenbergHolmes_Sequential/"

# Get the model filename from the command line
FILE_MODEL=$1

# If we need to debug, change this to "true"
DEBUG_MODE="false"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
FILE_VALID=$PATH_SEQUENTIAL"/Holmes.valid.json.tokens.txt"
FILE_TEST=$PATH_SEQUENTIAL"/Holmes.test.json.tokens.txt"
FILE_SENTENCE_LABELS_VALID=$PATH_DATA"/valid.labels"
FILE_SENTENCE_LABELS_TEST=$PATH_DATA"/test.labels"
echo "RNN model is read from $FILE_MODEL..."

# Evaluate the dependency-parsing model on the validation set
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $FILE_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS_VALID \
  -debug $DEBUG_MODE

# Evaluate the dependency-parsing model on the test set
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $FILE_TEST \
  -sentence-labels $FILE_SENTENCE_LABELS_TEST \
  -debug $DEBUG_MODE
--------------------------------------------------------------------------------

/ensemble.py:
--------------------------------------------------------------------------------
# First argument: gold answers; following arguments: files with scores to ensemble

import sys

goldFile = sys.argv[1]

answers = []

for line in open(goldFile).readlines():
    answers.append(int(line.strip()))

print "loaded " + str(len(answers)) + " answers"

# An array with one array of scores per model to be ensembled
individualSentencePredictions = []

for file in sys.argv[2:]:
    sentencePredictions = []
    for line in open(file).readlines():
        sentencePredictions.append(float(line.strip()))

    individualSentencePredictions.append(sentencePredictions)

# Now, for each answer:
# take the scores of the 5 candidate sentence predictions,
# sum them across models,
# then pick the highest-scoring candidate and compare it to the gold answer
correct = 0.0
indiCounter = 0
for answer in answers:
    maxScore = float("-inf")
    bestAnswer = None
    for i in xrange(5):
        scoreSum = 0.0
        for preds in individualSentencePredictions:
            scoreSum += preds[indiCounter]
        #print scoreSum
        if scoreSum > maxScore:
            maxScore = scoreSum
            bestAnswer = i

        indiCounter += 1
    #print bestAnswer
    #print maxScore
    if answer == bestAnswer:
        correct += 1

print "accuracy: " + str(correct / len(answers))
--------------------------------------------------------------------------------
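For reference, the max-sum ensembling that ensemble.py performs, as a small self-contained C++ sketch. It assumes the same inputs as the Python script: a gold file with one 0-4 answer index per line, and one score file per model holding 5 scores (one per candidate sentence) per question. This is an illustration, not part of the codebase.

// Sketch of the max-sum ensembling done by ensemble.py (assumed input layout:
// gold answers 0-4, and 5 consecutive candidate scores per question per model).
#include <fstream>
#include <iostream>
#include <limits>
#include <vector>

int main(int argc, char *argv[]) {
  if (argc < 3) {
    std::cerr << "usage: ensemble gold scores1 [scores2 ...]" << std::endl;
    return 1;
  }
  std::ifstream gold(argv[1]);
  std::vector<int> answers;
  for (int a; gold >> a; ) answers.push_back(a);

  // One vector of scores per model to be ensembled
  std::vector<std::vector<double>> models;
  for (int m = 2; m < argc; ++m) {
    std::ifstream f(argv[m]);
    std::vector<double> scores;
    for (double s; f >> s; ) scores.push_back(s);
    models.push_back(scores);
  }

  int correct = 0;
  for (std::size_t q = 0; q < answers.size(); ++q) {
    int best = -1;
    double bestScore = -std::numeric_limits<double>::infinity();
    for (int i = 0; i < 5; ++i) {  // 5 candidate sentences per question
      double sum = 0.0;
      for (const auto &scores : models) sum += scores[5 * q + i];
      if (sum > bestScore) { bestScore = sum; best = i; }
    }
    if (best == answers[q]) ++correct;
  }
  std::cout << "accuracy: " << double(correct) / answers.size() << std::endl;
  return 0;
}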
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright (c) 2015, Piotr Mirowski
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of DependencyTreeRnn nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------

/DependencyTreeRNN++/CorpusWordReader.h:
--------------------------------------------------------------------------------
// Copyright (c) 2014-2015 Piotr Mirowski
//
// Piotr Mirowski, Andreas Vlachos
// "Dependency Recurrent Neural Language Models for Sentence Completion"
// ACL 2015

#ifndef DependencyTreeRNN___CorpusWordReader_h
#define DependencyTreeRNN___CorpusWordReader_h

#include <algorithm>
#include <cctype>
#include <fstream>
#include <string>


inline bool isSpace(char c) { return isspace(c); }
inline bool notIsSpace(char c) { return !isspace(c); }


/**
 * Simple class to read words, one by one, from a file.
 * When the end of a line is reached, it returns ""
 */
class WordReader {
protected:
  std::ifstream m_file;
  std::string m_line;

public:

  WordReader(const std::string &filename)
  : m_file(filename) {
  }


  std::string pop_first_word(std::string &s) {
    const auto p1 = std::find_if(s.begin(), s.end(), notIsSpace);
    const auto p2 = std::find_if(p1, s.end(), isSpace);
    const std::string word(p1, p2);
    s.erase(0, std::find_if(p2, s.end(), notIsSpace) - s.begin());
    return word;
  }


  std::string get_next() {
    std::string result;
    if (m_line.empty()) {
      if (std::getline(m_file, m_line)) {
        m_line += " ";
      } else {
        return result;
      }
    }
    result = pop_first_word(m_line);
    return result;
  }
};

#endif
--------------------------------------------------------------------------------
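A minimal usage sketch for the WordReader above. The input file name and the counting loop are illustrative only; an empty string from get_next() is treated here as the end of input.

// Minimal WordReader usage sketch; "sample.txt" is a hypothetical input file.
#include <iostream>
#include <string>
#include "DependencyTreeRNN++/CorpusWordReader.h"

int main() {
  WordReader reader("sample.txt");
  int numWords = 0;
  // An empty string means no further word could be read
  // (end of file, or a blank line in the input).
  for (std::string w = reader.get_next(); !w.empty(); w = reader.get_next()) {
    ++numWords;
  }
  std::cout << "read " << numWords << " words" << std::endl;
  return 0;
}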
/logs/GutenbergHolmes_seq_mw5_h300_c250_m1000_d4_b5.acc:
--------------------------------------------------------------------------------
Iter,0,Alpha,0.100000,VALIDacc,0.321154,VALIDent,7.395424,VALIDppx,168.362202,words/sec,0
Iter,1,Alpha,0.100000,VALIDacc,0.344231,VALIDent,7.152782,VALIDppx,142.299051,words/sec,0
Iter,2,Alpha,0.100000,VALIDacc,0.355769,VALIDent,7.050841,VALIDppx,132.591149,words/sec,0
Iter,3,Alpha,0.100000,VALIDacc,0.371154,VALIDent,6.990962,VALIDppx,127.200642,words/sec,0
Iter,4,Alpha,0.100000,VALIDacc,0.378846,VALIDent,6.968837,VALIDppx,125.264816,words/sec,0
Iter,5,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.959099,VALIDppx,124.422121,words/sec,0
Iter,6,Alpha,0.100000,VALIDacc,0.386538,VALIDent,6.956460,VALIDppx,124.194757,words/sec,0
Iter,7,Alpha,0.100000,VALIDacc,0.396154,VALIDent,6.960201,VALIDppx,124.517210,words/sec,0
Iter,8,Alpha,0.100000,VALIDacc,0.400000,VALIDent,6.966089,VALIDppx,125.026436,words/sec,0
Iter,9,Alpha,0.100000,VALIDacc,0.394231,VALIDent,6.974113,VALIDppx,125.723723,words/sec,0
Iter,10,Alpha,0.066667,VALIDacc,0.396154,VALIDent,6.835435,VALIDppx,114.201273,words/sec,0
Iter,11,Alpha,0.044444,VALIDacc,0.394231,VALIDent,6.744126,VALIDppx,107.197371,words/sec,0
Iter,12,Alpha,0.029630,VALIDacc,0.392308,VALIDent,6.680182,VALIDppx,102.549864,words/sec,0
Iter,13,Alpha,0.019753,VALIDacc,0.392308,VALIDent,6.632794,VALIDppx,99.236181,words/sec,0
Iter,14,Alpha,0.013169,VALIDacc,0.392308,VALIDent,6.598300,VALIDppx,96.891635,words/sec,0
Iter,15,Alpha,0.008779,VALIDacc,0.390385,VALIDent,6.571902,VALIDppx,95.134870,words/sec,0
Iter,16,Alpha,0.005853,VALIDacc,0.386538,VALIDent,6.549385,VALIDppx,93.661574,words/sec,0
Iter,17,Alpha,0.003902,VALIDacc,0.386538,VALIDent,6.529088,VALIDppx,92.353051,words/sec,0
--------------------------------------------------------------------------------
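Each row of these .acc logs is a flat CSV of alternating key,value fields: iteration number, learning rate (Alpha), validation accuracy, entropy and perplexity, plus a words/sec counter. A small C++ sketch of how such a row could be split into fields; the ParseAccRow helper is illustrative, not part of the codebase.

// Illustrative parser for one .acc log row; the key,value,key,value layout
// is taken directly from the log files above.
#include <iostream>
#include <map>
#include <sstream>
#include <string>

std::map<std::string, std::string> ParseAccRow(const std::string &row) {
  std::map<std::string, std::string> fields;
  std::istringstream ss(row);
  std::string key, value;
  while (std::getline(ss, key, ',') && std::getline(ss, value, ','))
    fields[key] = value;
  return fields;
}

int main() {
  auto f = ParseAccRow(
      "Iter,0,Alpha,0.100000,VALIDacc,0.321154,VALIDent,7.395424,"
      "VALIDppx,168.362202,words/sec,0");
  std::cout << "validation accuracy at iteration " << f["Iter"]
            << ": " << f["VALIDacc"] << std::endl;
  return 0;
}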
/train_rnn_holmes_sequential.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_SEQUENTIAL=$PWD"/../Data/GutenbergHolmes_Sequential/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
RNN_HIDDENS=200
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=3
BPTT_ORDER=5

# If we need to debug, change this to "true"
DEBUG_MODE="false"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
FILE_TRAIN=$PATH_SEQUENTIAL"/Holmes.train.json.tokens.txt"
FILE_VALID=$PATH_SEQUENTIAL"/Holmes.valid.json.tokens.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE"_sequential.txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_seq"
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $FILE_TRAIN \
  -valid $FILE_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -debug $DEBUG_MODE
# -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/test_rnn_holmes_example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON=$PWD"/../Data/GutenbergHolmes/"

# Get the model file, the vocabulary file and the label type from the arguments
FILE_MODEL=$1
FILE_VOCAB=$2
DEP_LABELS=$3
# Example of call when dependency labels are included (-feature-labels-type 2):
# $ ./test_rnn_holmes_example.sh models/GutenbergHolmes_p2_mw2_h100_c250_m100_d3_b5_g0.5.model books/vocab_mw5.txt 2
# Example of call when dependency labels are not included (-feature-labels-type 0):
# $ ./test_rnn_holmes_example.sh models/GutenbergHolmes_p0_mw2_h100_c250_m100_d3_b5_g0.5.model books/vocab_mw5.txt 0

# If we need to debug, change this to "true"
DEBUG_MODE="false"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TEST=$PATH_DATA"/test.txt"
FILE_SENTENCE_LABELS_VALID=$PATH_DATA"/valid.labels"
FILE_SENTENCE_LABELS_TEST=$PATH_DATA"/test.labels"
echo "RNN model is read from $FILE_MODEL..."

# Test the dependency-parsing model on the validation data
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS_VALID \
  -path-json-books $PATH_JSON \
  -vocab $FILE_VOCAB \
  -debug $DEBUG_MODE \
  -feature-labels-type $DEP_LABELS

# Test the dependency-parsing model on the test data
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $LIST_TEST \
  -sentence-labels $FILE_SENTENCE_LABELS_TEST \
  -path-json-books $PATH_JSON \
  -vocab $FILE_VOCAB \
  -debug $DEBUG_MODE \
  -feature-labels-type $DEP_LABELS
--------------------------------------------------------------------------------

/Makefile:
--------------------------------------------------------------------------------
CC = g++

BLASFLAGS = -I/opt/local/include
CPPFLAGS = -Wall -O3 -std=c++0x
OPTIMFLAGS = -funroll-loops -ffast-math
CXXFLAGS = -lm -lblas -g $(CPPFLAGS) $(OPTIMFLAGS) $(BLASFLAGS)

LDFLAGS = -lblas

BLASINCLUDE = /opt/local/include/cblas.h
SRCDIR = DependencyTreeRNN++
INCLUDES = $(BLASINCLUDE) $(SRCDIR)/*.h

OBJDIR = build

OBJ = $(OBJDIR)/ReadJson.o \
	$(OBJDIR)/CorpusUnrollsReader.o \
	$(OBJDIR)/CommandLineParser.o \
	$(OBJDIR)/Vocabulary.o \
	$(OBJDIR)/RnnWeights.o \
	$(OBJDIR)/RnnLib.o \
	$(OBJDIR)/RnnTraining.o \
	$(OBJDIR)/RnnDependencyTreeLib.o \
	$(OBJDIR)/main.o

all: $(OBJ) RnnDependencyTree

$(OBJDIR)/ReadJson.o: $(SRCDIR)/ReadJson.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CorpusUnrollsReader.o: $(SRCDIR)/CorpusUnrollsReader.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CommandLineParser.o: $(SRCDIR)/CommandLineParser.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/Vocabulary.o: $(SRCDIR)/Vocabulary.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnWeights.o: $(SRCDIR)/RnnWeights.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnLib.o: $(SRCDIR)/RnnLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnTraining.o: $(SRCDIR)/RnnTraining.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnDependencyTreeLib.o: $(SRCDIR)/RnnDependencyTreeLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/main.o: $(SRCDIR)/main.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

RnnDependencyTree: $(OBJ)
	$(CC) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(OBJDIR)/*.o
--------------------------------------------------------------------------------
/Makefile.MacOS:
--------------------------------------------------------------------------------
CC = g++

BLASFLAGS = -I/opt/local/include
CPPFLAGS = -Wall -O3 -std=c++0x
OPTIMFLAGS = -funroll-loops -ffast-math
CXXFLAGS = -lm -lblas -g $(CPPFLAGS) $(OPTIMFLAGS) $(BLASFLAGS)

LDFLAGS = -lblas

BLASINCLUDE = /opt/local/include/cblas.h
SRCDIR = DependencyTreeRNN++
INCLUDES = $(BLASINCLUDE) $(SRCDIR)/*.h

OBJDIR = build

OBJ = $(OBJDIR)/ReadJson.o \
	$(OBJDIR)/CorpusUnrollsReader.o \
	$(OBJDIR)/CommandLineParser.o \
	$(OBJDIR)/Vocabulary.o \
	$(OBJDIR)/RnnWeights.o \
	$(OBJDIR)/RnnLib.o \
	$(OBJDIR)/RnnTraining.o \
	$(OBJDIR)/RnnDependencyTreeLib.o \
	$(OBJDIR)/main.o

all: $(OBJ) RnnDependencyTree

$(OBJDIR)/ReadJson.o: $(SRCDIR)/ReadJson.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CorpusUnrollsReader.o: $(SRCDIR)/CorpusUnrollsReader.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CommandLineParser.o: $(SRCDIR)/CommandLineParser.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/Vocabulary.o: $(SRCDIR)/Vocabulary.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnWeights.o: $(SRCDIR)/RnnWeights.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnLib.o: $(SRCDIR)/RnnLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnTraining.o: $(SRCDIR)/RnnTraining.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnDependencyTreeLib.o: $(SRCDIR)/RnnDependencyTreeLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/main.o: $(SRCDIR)/main.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

RnnDependencyTree: $(OBJ)
	$(CC) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(OBJDIR)/*.o
--------------------------------------------------------------------------------
/train_rnn_holmes_p0_1000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=0
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=1000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/train_rnn_holmes_p0_2000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=0
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/train_rnn_holmes_p2_1000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=2
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=1000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/train_rnn_holmes_p2_2000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=2
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------

/train_rnn_holmes_example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON=$PWD"/../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=2
RNN_HIDDENS=600
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=3
BPTT_ORDER=5
FEATURE_GAMMA=0.5

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------

/DependencyTreeRNN++/ReadJson.h:
--------------------------------------------------------------------------------
// Copyright (c) 2014-2015 Piotr Mirowski
//
// Piotr Mirowski, Andreas Vlachos
// "Dependency Recurrent Neural Language Models for Sentence Completion"
// ACL 2015

#ifndef DependencyTreeRNN___readjson_h
#define DependencyTreeRNN___readjson_h

#include <string>
#include <vector>
#include "CorpusUnrollsReader.h"

using namespace std;

struct JsonToken {
  int pos;
  string word;
  double discount;
  string label;
};


class ReadJson {
public:

  /**
   * Constructor: read a text file in JSON format.
   * If required, insert words and labels to the vocabulary.
   * If required, insert tokens into the current book.
   */
  ReadJson(const string &filename,
           CorpusUnrolls &corpus,
           bool insert_vocab,
           bool read_book,
           bool merge_label_with_word);

  /**
   * Destructor
   */
  ~ReadJson() { }

protected:


  /**
   * Trim a word
   */
  string const Trim(const string &word) const;

  /**
   * Parse a token
   */
  size_t const ParseToken(const string &json_element,
                          JsonToken &tok) const;

  /**
   * Parse an unroll
   */
  size_t const ParseUnroll(const string &json_unrolls,
                           vector<JsonToken> &unroll) const;

  /**
   * Parse a sentence
   */
  size_t const ParseSentence(const string &json_sentences,
                             vector<vector<JsonToken>> &sentence) const;

  /**
   * Parse a book
   */
  size_t const ParseBook(const string &json_book,
                         vector<vector<vector<JsonToken>>> &book) const;
};

#endif
--------------------------------------------------------------------------------
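A hypothetical usage sketch for ReadJson. CorpusUnrolls is declared in CorpusUnrollsReader.h, which is not shown here, so how the corpus is set up is an assumption, as is the two-pass pattern suggested by the insert_vocab and read_book flags; the file name is a placeholder.

// Hypothetical sketch only: the corpus argument and two-pass usage are
// inferred from the constructor flags, not from the (unshown) .cpp file.
#include "DependencyTreeRNN++/ReadJson.h"

void LoadBook(CorpusUnrolls &corpus) {
  // First pass: grow the vocabulary from the JSON book, without storing tokens
  ReadJson("book.json.unrolls.json", corpus,
           /* insert_vocab */ true, /* read_book */ false,
           /* merge_label_with_word */ false);
  // Second pass: store the tokens of the book for training
  ReadJson("book.json.unrolls.json", corpus,
           /* insert_vocab */ false, /* read_book */ true,
           /* merge_label_with_word */ false);
}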
30 | */ 31 | ReadJson(const string &filename, 32 | CorpusUnrolls &corpus, 33 | bool insert_vocab, 34 | bool read_book, 35 | bool merge_label_with_word); 36 | 37 | /** 38 | * Destructor 39 | */ 40 | ~ReadJson() { } 41 | 42 | protected: 43 | 44 | 45 | /** 46 | * Trim a word 47 | */ 48 | string const Trim(const string &word) const; 49 | 50 | /** 51 | * Parse a token 52 | */ 53 | size_t const ParseToken(const string &json_element, 54 | JsonToken &tok) const; 55 | 56 | /** 57 | * Parse an unroll 58 | */ 59 | size_t const ParseUnroll(const string &json_unrolls, 60 | vector &unroll) const; 61 | 62 | /** 63 | * Parse a sentence 64 | */ 65 | size_t const ParseSentence(const string &json_sentences, 66 | vector> &sentence) const; 67 | 68 | /** 69 | * Parse a book 70 | */ 71 | size_t const ParseBook(const string &json_book, 72 | vector>> &book) const; 73 | }; 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /Makefile.Linux: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | 3 | # These paths need to be configured, depending on where cblas.h 4 | # and libcblas.so are located 5 | BLASINCLUDE = /usr/include/cblas.h 6 | BLASFLAGSINCLUDE = -I/usr/include 7 | BLASFLAGSLIB = -L/usr/lib64/atlas 8 | 9 | CPPFLAGS = -Wall -O3 -std=c++0x 10 | OPTIMFLAGS = -funroll-loops -ffast-math 11 | CXXFLAGS = -lm -lblas -g $(CPPFLAGS) $(OPTIMFLAGS) $(BLASFLAGSINCLUDE) 12 | LDFLAGS = -lcblas $(BLASFLAGSLIB) 13 | 14 | SRCDIR = DependencyTreeRNN++ 15 | INCLUDES = $(BLASINCLUDE) $(SRCDIR)/*.h 16 | 17 | OBJDIR = build 18 | 19 | OBJ = $(OBJDIR)/ReadJson.o \ 20 | $(OBJDIR)/CorpusUnrollsReader.o \ 21 | $(OBJDIR)/CommandLineParser.o \ 22 | $(OBJDIR)/Vocabulary.o \ 23 | $(OBJDIR)/RnnWeights.o \ 24 | $(OBJDIR)/RnnLib.o \ 25 | $(OBJDIR)/RnnTraining.o \ 26 | $(OBJDIR)/RnnDependencyTreeLib.o \ 27 | $(OBJDIR)/main.o 28 | 29 | all: $(OBJ) RnnDependencyTree 30 | 31 | $(OBJDIR)/ReadJson.o: $(SRCDIR)/ReadJson.cpp $(INCLUDES) 32 | $(CC) $(CXXFLAGS) -c -o $@ $< 33 | 34 | $(OBJDIR)/CorpusUnrollsReader.o: $(SRCDIR)/CorpusUnrollsReader.cpp $(INCLUDES) 35 | $(CC) $(CXXFLAGS) -c -o $@ $< 36 | 37 | $(OBJDIR)/CommandLineParser.o: $(SRCDIR)/CommandLineParser.cpp $(INCLUDES) 38 | $(CC) $(CXXFLAGS) -c -o $@ $< 39 | 40 | $(OBJDIR)/Vocabulary.o: $(SRCDIR)/Vocabulary.cpp $(INCLUDES) 41 | $(CC) $(CXXFLAGS) -c -o $@ $< 42 | 43 | $(OBJDIR)/RnnWeights.o: $(SRCDIR)/RnnWeights.cpp $(INCLUDES) 44 | $(CC) $(CXXFLAGS) -c -o $@ $< 45 | 46 | $(OBJDIR)/RnnLib.o: $(SRCDIR)/RnnLib.cpp $(INCLUDES) 47 | $(CC) $(CXXFLAGS) -c -o $@ $< 48 | 49 | $(OBJDIR)/RnnTraining.o: $(SRCDIR)/RnnTraining.cpp $(INCLUDES) 50 | $(CC) $(CXXFLAGS) -c -o $@ $< 51 | 52 | $(OBJDIR)/RnnDependencyTreeLib.o: $(SRCDIR)/RnnDependencyTreeLib.cpp $(INCLUDES) 53 | $(CC) $(CXXFLAGS) -c -o $@ $< 54 | 55 | $(OBJDIR)/main.o: $(SRCDIR)/main.cpp $(INCLUDES) 56 | $(CC) $(CXXFLAGS) -c -o $@ $< 57 | 58 | RnnDependencyTree: $(OBJ) 59 | $(CC) -o $@ $^ $(LDFLAGS) 60 | 61 | clean: 62 | rm -rf $(OBJDIR)/*.o 63 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p2_mw5_h50_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.375000,VALIDent,6.299215,VALIDppx,78.750394,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.428846,VALIDent,6.407313,VALIDppx,84.877636,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.425000,VALIDent,6.309835,VALIDppx,79.332195,words/sec,0 4 | 
Iter,3,Alpha,0.100000,VALIDacc,0.419231,VALIDent,6.423861,VALIDppx,85.856848,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.438462,VALIDent,6.334380,VALIDppx,80.693489,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.425000,VALIDent,6.324297,VALIDppx,80.131479,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.436538,VALIDent,6.403221,VALIDppx,84.637246,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.438462,VALIDent,6.186505,VALIDppx,72.832206,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.459615,VALIDent,6.082602,VALIDppx,67.771259,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.465385,VALIDent,6.093529,VALIDppx,68.286507,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.436538,VALIDent,6.042666,VALIDppx,65.921012,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.457692,VALIDent,6.060414,VALIDppx,66.736960,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.467308,VALIDent,6.099138,VALIDppx,68.552512,words/sec,0 14 | Iter,13,Alpha,0.003902,VALIDacc,0.451923,VALIDent,6.000667,VALIDppx,64.029604,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.455769,VALIDent,5.926755,VALIDppx,60.831857,words/sec,0 16 | Iter,15,Alpha,0.001734,VALIDacc,0.450000,VALIDent,5.998394,VALIDppx,63.928802,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.463462,VALIDent,5.933939,VALIDppx,61.135527,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.465385,VALIDent,5.891628,VALIDppx,59.368587,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.461538,VALIDent,5.901500,VALIDppx,59.776242,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.463462,VALIDent,5.897743,VALIDppx,59.620746,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.461538,VALIDent,5.890489,VALIDppx,59.321755,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.457692,VALIDent,5.898964,VALIDppx,59.671253,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.461538,VALIDent,5.902392,VALIDppx,59.813191,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p2_mw5_h200_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.369231,VALIDent,6.435766,VALIDppx,86.568258,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.415385,VALIDent,6.461425,VALIDppx,88.121651,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.451923,VALIDent,6.244529,VALIDppx,75.821191,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.423077,VALIDent,6.322476,VALIDppx,80.030403,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.457692,VALIDent,6.376928,VALIDppx,83.108738,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.450000,VALIDent,6.321707,VALIDppx,79.987751,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.463462,VALIDent,6.163304,VALIDppx,71.670303,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.480769,VALIDent,6.096885,VALIDppx,68.445538,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.480769,VALIDent,6.136001,VALIDppx,70.326705,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.461538,VALIDent,6.196881,VALIDppx,73.357940,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.471154,VALIDent,5.950891,VALIDppx,61.858107,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.482692,VALIDent,6.035864,VALIDppx,65.610931,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.498077,VALIDent,5.895801,VALIDppx,59.540550,words/sec,0 14 | Iter,13,Alpha,0.003902,VALIDacc,0.480769,VALIDent,5.950142,VALIDppx,61.826002,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.486538,VALIDent,5.928949,VALIDppx,60.924440,words/sec,0 16 | 
Iter,15,Alpha,0.001734,VALIDacc,0.494231,VALIDent,5.845973,VALIDppx,57.519254,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.492308,VALIDent,5.877454,VALIDppx,58.788187,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.490385,VALIDent,5.906932,VALIDppx,60.001706,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.475000,VALIDent,5.850823,VALIDppx,57.712936,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.490385,VALIDent,5.900016,VALIDppx,59.714759,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.488462,VALIDent,5.863343,VALIDppx,58.215961,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.486538,VALIDent,5.834466,VALIDppx,57.062314,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.484615,VALIDent,5.837756,VALIDppx,57.192570,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h50_c250_m0_d0_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.255769,VALIDent,7.465440,VALIDppx,176.734491,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.278846,VALIDent,7.379568,VALIDppx,166.521913,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.275000,VALIDent,7.341033,VALIDppx,162.132880,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.273077,VALIDent,7.319749,VALIDppx,159.758497,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.305861,VALIDppx,158.227973,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.273077,VALIDent,7.296104,VALIDppx,157.161493,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.269231,VALIDent,7.288831,VALIDppx,156.371244,words/sec,0 8 | Iter,7,Alpha,0.066667,VALIDacc,0.261538,VALIDent,7.226051,VALIDppx,149.712505,words/sec,0 9 | Iter,8,Alpha,0.044444,VALIDacc,0.276923,VALIDent,7.180029,VALIDppx,145.012099,words/sec,0 10 | Iter,9,Alpha,0.029630,VALIDacc,0.284615,VALIDent,7.144468,VALIDppx,141.481303,words/sec,0 11 | Iter,10,Alpha,0.019753,VALIDacc,0.276923,VALIDent,7.115754,VALIDppx,138.693277,words/sec,0 12 | Iter,11,Alpha,0.013169,VALIDacc,0.263462,VALIDent,7.092228,VALIDppx,136.449949,words/sec,0 13 | Iter,12,Alpha,0.008779,VALIDacc,0.267308,VALIDent,7.073162,VALIDppx,134.658539,words/sec,0 14 | Iter,13,Alpha,0.005853,VALIDacc,0.269231,VALIDent,7.057946,VALIDppx,133.245802,words/sec,0 15 | Iter,14,Alpha,0.003902,VALIDacc,0.271154,VALIDent,7.045805,VALIDppx,132.129131,words/sec,0 16 | Iter,15,Alpha,0.002601,VALIDacc,0.271154,VALIDent,7.036284,VALIDppx,131.260082,words/sec,0 17 | Iter,16,Alpha,0.001734,VALIDacc,0.282692,VALIDent,7.028620,VALIDppx,130.564636,words/sec,0 18 | Iter,17,Alpha,0.001156,VALIDacc,0.280769,VALIDent,7.021763,VALIDppx,129.945542,words/sec,0 19 | Iter,18,Alpha,0.000771,VALIDacc,0.286538,VALIDent,7.014895,VALIDppx,129.328384,words/sec,0 20 | Iter,19,Alpha,0.000514,VALIDacc,0.282692,VALIDent,7.007317,VALIDppx,128.650846,words/sec,0 21 | Iter,20,Alpha,0.000343,VALIDacc,0.276923,VALIDent,6.999086,VALIDppx,127.918919,words/sec,0 22 | Iter,21,Alpha,0.000228,VALIDacc,0.276923,VALIDent,6.991647,VALIDppx,127.261011,words/sec,0 23 | Iter,22,Alpha,0.000152,VALIDacc,0.273077,VALIDent,6.986392,VALIDppx,126.798312,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p0_mw5_h100_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.438462,VALIDent,8.706267,VALIDppx,417.683773,words/sec,0 2 | 
Iter,1,Alpha,0.100000,VALIDacc,0.425000,VALIDent,8.590784,VALIDppx,385.552680,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.490385,VALIDent,8.560620,VALIDppx,377.575040,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.465385,VALIDent,8.691508,VALIDppx,413.432440,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.507692,VALIDent,8.473436,VALIDppx,355.433653,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.482692,VALIDent,8.688202,VALIDppx,412.486120,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.526923,VALIDent,8.377757,VALIDppx,332.626040,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.505769,VALIDent,8.229300,VALIDppx,300.100179,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.513462,VALIDent,8.260622,VALIDppx,306.686646,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.540385,VALIDent,8.138515,VALIDppx,281.797551,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.517308,VALIDent,8.106643,VALIDppx,275.640257,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.513462,VALIDent,8.128033,VALIDppx,279.757475,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.515385,VALIDent,8.135802,VALIDppx,281.268162,words/sec,0 14 | Iter,13,Alpha,0.003902,VALIDacc,0.511538,VALIDent,8.103275,VALIDppx,274.997486,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.519231,VALIDent,8.038421,VALIDppx,262.909296,words/sec,0 16 | Iter,15,Alpha,0.001734,VALIDacc,0.515385,VALIDent,8.076482,VALIDppx,269.937517,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.523077,VALIDent,8.013359,VALIDppx,258.381453,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.530769,VALIDent,8.039020,VALIDppx,263.018380,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.530769,VALIDent,7.993173,VALIDppx,254.791517,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.515385,VALIDent,8.033077,VALIDppx,261.937200,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.517308,VALIDent,8.005406,VALIDppx,256.961029,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.526923,VALIDent,7.991467,VALIDppx,254.490402,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.528846,VALIDent,8.007949,VALIDppx,257.414455,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p0_mw5_h50_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.434615,VALIDent,8.571457,VALIDppx,380.422074,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.465385,VALIDent,8.530692,VALIDppx,369.823109,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.476923,VALIDent,8.687900,VALIDppx,412.399900,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.459615,VALIDent,8.653864,VALIDppx,402.784280,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.494231,VALIDent,8.315728,VALIDppx,318.627647,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.476923,VALIDent,8.607496,VALIDppx,390.044672,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.476923,VALIDent,8.364198,VALIDppx,329.514430,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.513462,VALIDent,8.270002,VALIDppx,308.687154,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.513462,VALIDent,8.196132,VALIDppx,293.279352,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.501923,VALIDent,8.082509,VALIDppx,271.067559,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.515385,VALIDent,8.125267,VALIDppx,279.221614,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.509615,VALIDent,8.135773,VALIDppx,281.262405,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.526923,VALIDent,8.158307,VALIDppx,285.690116,words/sec,0 14 | 
Iter,13,Alpha,0.003902,VALIDacc,0.503846,VALIDent,8.037455,VALIDppx,262.733245,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.496154,VALIDent,8.332756,VALIDppx,322.410691,words/sec,0 16 | Iter,15,Alpha,0.001734,VALIDacc,0.507692,VALIDent,8.017326,VALIDppx,259.093027,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.498077,VALIDent,8.044398,VALIDppx,264.000693,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.513462,VALIDent,8.051677,VALIDppx,265.336011,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.517308,VALIDent,7.999546,VALIDppx,255.919458,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.509615,VALIDent,8.001428,VALIDppx,256.253485,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.513462,VALIDent,8.002160,VALIDppx,256.383600,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.513462,VALIDent,7.997038,VALIDppx,255.474958,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.513462,VALIDent,7.995744,VALIDppx,255.245901,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h100_c250_m0_d0_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.292308,VALIDent,7.542692,VALIDppx,186.456028,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.300000,VALIDent,7.414324,VALIDppx,170.582271,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.286538,VALIDent,7.353363,VALIDppx,163.524482,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.290385,VALIDent,7.320493,VALIDppx,159.840874,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.296154,VALIDent,7.299796,VALIDppx,157.564177,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.303846,VALIDent,7.285241,VALIDppx,155.982561,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.301923,VALIDent,7.274643,VALIDppx,154.840929,words/sec,0 8 | Iter,7,Alpha,0.066667,VALIDacc,0.288462,VALIDent,7.203766,VALIDppx,147.417674,words/sec,0 9 | Iter,8,Alpha,0.044444,VALIDacc,0.276923,VALIDent,7.151940,VALIDppx,142.216029,words/sec,0 10 | Iter,9,Alpha,0.029630,VALIDacc,0.276923,VALIDent,7.111860,VALIDppx,138.319469,words/sec,0 11 | Iter,10,Alpha,0.019753,VALIDacc,0.273077,VALIDent,7.080894,VALIDppx,135.382151,words/sec,0 12 | Iter,11,Alpha,0.013169,VALIDacc,0.282692,VALIDent,7.056410,VALIDppx,133.103992,words/sec,0 13 | Iter,12,Alpha,0.008779,VALIDacc,0.278846,VALIDent,7.036203,VALIDppx,131.252642,words/sec,0 14 | Iter,13,Alpha,0.005853,VALIDacc,0.273077,VALIDent,7.018806,VALIDppx,129.679443,words/sec,0 15 | Iter,14,Alpha,0.003902,VALIDacc,0.271154,VALIDent,7.003961,VALIDppx,128.351890,words/sec,0 16 | Iter,15,Alpha,0.002601,VALIDacc,0.271154,VALIDent,6.991587,VALIDppx,127.255774,words/sec,0 17 | Iter,16,Alpha,0.001734,VALIDacc,0.282692,VALIDent,6.981513,VALIDppx,126.370256,words/sec,0 18 | Iter,17,Alpha,0.001156,VALIDacc,0.282692,VALIDent,6.973082,VALIDppx,125.633896,words/sec,0 19 | Iter,18,Alpha,0.000771,VALIDacc,0.288462,VALIDent,6.964960,VALIDppx,124.928588,words/sec,0 20 | Iter,19,Alpha,0.000514,VALIDacc,0.290385,VALIDent,6.956480,VALIDppx,124.196440,words/sec,0 21 | Iter,20,Alpha,0.000343,VALIDacc,0.294231,VALIDent,6.947785,VALIDppx,123.450160,words/sec,0 22 | Iter,21,Alpha,0.000228,VALIDacc,0.296154,VALIDent,6.938872,VALIDppx,122.689853,words/sec,0 23 | Iter,22,Alpha,0.000152,VALIDacc,0.296154,VALIDent,6.930441,VALIDppx,121.974939,words/sec,0 24 | Iter,23,Alpha,0.000101,VALIDacc,0.294231,VALIDent,6.923959,VALIDppx,121.428136,words/sec,0 25 | -------------------------------------------------------------------------------- 
/logs/GutenbergHolmes_p0_mw5_h200_c250_m2000_d4_b5_g0.5.acc:
--------------------------------------------------------------------------------
Iter,0,Alpha,0.100000,VALIDacc,0.355769,VALIDent,9.542599,VALIDppx,745.776148,words/sec,0
Iter,1,Alpha,0.100000,VALIDacc,0.444231,VALIDent,9.339977,VALIDppx,648.057059,words/sec,0
Iter,2,Alpha,0.100000,VALIDacc,0.438462,VALIDent,9.127300,VALIDppx,559.230902,words/sec,0
Iter,3,Alpha,0.100000,VALIDacc,0.480769,VALIDent,8.829454,VALIDppx,454.915154,words/sec,0
Iter,4,Alpha,0.100000,VALIDacc,0.480769,VALIDent,9.040739,VALIDppx,526.664168,words/sec,0
Iter,5,Alpha,0.100000,VALIDacc,0.488462,VALIDent,9.102036,VALIDppx,549.523101,words/sec,0
Iter,6,Alpha,0.100000,VALIDacc,0.482692,VALIDent,8.684370,VALIDppx,411.392145,words/sec,0
Iter,7,Alpha,0.066667,VALIDacc,0.503846,VALIDent,8.609831,VALIDppx,390.676706,words/sec,0
Iter,8,Alpha,0.044444,VALIDacc,0.478846,VALIDent,8.553101,VALIDppx,375.612336,words/sec,0
Iter,9,Alpha,0.029630,VALIDacc,0.505769,VALIDent,8.371286,VALIDppx,331.137457,words/sec,0
Iter,10,Alpha,0.019753,VALIDacc,0.490385,VALIDent,8.297414,VALIDppx,314.608641,words/sec,0
Iter,11,Alpha,0.013169,VALIDacc,0.513462,VALIDent,8.161090,VALIDppx,286.241738,words/sec,0
Iter,12,Alpha,0.008779,VALIDacc,0.515385,VALIDent,8.232216,VALIDppx,300.707203,words/sec,0
Iter,13,Alpha,0.005853,VALIDacc,0.501923,VALIDent,8.142231,VALIDppx,282.524272,words/sec,0
Iter,14,Alpha,0.003902,VALIDacc,0.511538,VALIDent,8.128911,VALIDppx,279.927823,words/sec,0
Iter,15,Alpha,0.002601,VALIDacc,0.505769,VALIDent,8.079572,VALIDppx,270.516378,words/sec,0
Iter,16,Alpha,0.001734,VALIDacc,0.515385,VALIDent,8.058873,VALIDppx,266.662813,words/sec,0
Iter,17,Alpha,0.001156,VALIDacc,0.507692,VALIDent,8.071768,VALIDppx,269.056923,words/sec,0
Iter,18,Alpha,0.000771,VALIDacc,0.500000,VALIDent,8.124827,VALIDppx,279.136594,words/sec,0
Iter,19,Alpha,0.000514,VALIDacc,0.523077,VALIDent,8.025258,VALIDppx,260.521378,words/sec,0
Iter,20,Alpha,0.000343,VALIDacc,0.526923,VALIDent,8.055572,VALIDppx,266.053408,words/sec,0
Iter,21,Alpha,0.000228,VALIDacc,0.511538,VALIDent,8.045317,VALIDppx,264.168894,words/sec,0
Iter,22,Alpha,0.000152,VALIDacc,0.519231,VALIDent,8.039515,VALIDppx,263.108596,words/sec,0
Iter,23,Alpha,0.000101,VALIDacc,0.513462,VALIDent,8.018300,VALIDppx,259.267879,words/sec,0
--------------------------------------------------------------------------------

/DependencyTreeRNN++/CommandLineParser.h:
--------------------------------------------------------------------------------
// Copyright (c) 2014-2015 Piotr Mirowski
//
// Piotr Mirowski, Andreas Vlachos
// "Dependency Recurrent Neural Language Models for Sentence Completion"
// ACL 2015

#ifndef __DependencyTreeRNN____CommandLineParser__
#define __DependencyTreeRNN____CommandLineParser__

#include <map>
#include <string>

class CommandLineArgument {
public:

  /**
   * Type of the argument
   */
  std::string m_type;

  /**
   * Description of the argument
   */
  std::string m_description;

  /**
   * Value of the argument
   */
  std::string m_value;

  /**
   * Is the argument required?
   */
  bool m_isRequired;

  /**
   * Constructors
   */
  CommandLineArgument(std::string t,
                      std::string desc,
                      std::string d,
                      bool r)
  : m_type(t), m_description(desc), m_value(d), m_isRequired(r) {
  }
  CommandLineArgument() {
    m_type = "UNDEFINED";
  }
};


class CommandLineParser {
public:
  /**
   * Map between command line argument names and structures containing their values
   */
  std::map<std::string, CommandLineArgument> args;

  /**
   * Register a command line argument
   */
  void Register(std::string name,
                std::string type,
                std::string desc,
                std::string defaultVal = "",
                bool isRequired = false) {
    args[name] = CommandLineArgument(type, desc, defaultVal, isRequired);
  }

  /**
   * Parse the arguments to extract their values and store them in the map
   */
  bool Parse(char *list[], int llen);

  /**
   * Get a command line argument
   */
  bool Get(std::string name, int &value);
  bool Get(std::string name, bool &value);
  bool Get(std::string name, double &value);
  bool Get(std::string name, std::string &value);
  bool Get(std::string name, long long &value);
};

#endif /* defined(__DependencyTreeRNN____CommandLineParser__) */
--------------------------------------------------------------------------------
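A minimal usage sketch for CommandLineParser. The registered argument names mirror the training scripts above, but this main() is illustrative, and the exact argv conventions (e.g. whether the program name is skipped) are defined in CommandLineParser.cpp, which is not shown here.

// Illustrative CommandLineParser usage; argument names follow the scripts.
#include <iostream>
#include <string>
#include "DependencyTreeRNN++/CommandLineParser.h"

int main(int argc, char *argv[]) {
  CommandLineParser parser;
  parser.Register("hidden", "int", "Number of hidden units", "100");
  parser.Register("rnnlm", "string", "Model file", "", true);  // required
  if (!parser.Parse(argv, argc)) {
    return 1;  // a required argument was missing or malformed
  }

  int hidden = 0;
  std::string model;
  parser.Get("hidden", hidden);
  parser.Get("rnnlm", model);
  std::cout << "model " << model << " with " << hidden
            << " hidden units" << std::endl;
  return 0;
}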
/logs/GutenbergHolmes_seq_mw5_h200_c250_m0_d0_b5.acc:
--------------------------------------------------------------------------------
Iter,0,Alpha,0.100000,VALIDacc,0.278846,VALIDent,7.680208,VALIDppx,205.103386,words/sec,0
Iter,1,Alpha,0.100000,VALIDacc,0.276923,VALIDent,7.517412,VALIDppx,183.217331,words/sec,0
Iter,2,Alpha,0.100000,VALIDacc,0.280769,VALIDent,7.442037,VALIDppx,173.890693,words/sec,0
Iter,3,Alpha,0.100000,VALIDacc,0.273077,VALIDent,7.404015,VALIDppx,169.367654,words/sec,0
Iter,4,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.380331,VALIDppx,166.610022,words/sec,0
Iter,5,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.365829,VALIDppx,164.943653,words/sec,0
Iter,6,Alpha,0.100000,VALIDacc,0.275000,VALIDent,7.356609,VALIDppx,163.892785,words/sec,0
Iter,7,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.349157,VALIDppx,163.048493,words/sec,0
Iter,8,Alpha,0.066667,VALIDacc,0.273077,VALIDent,7.261322,VALIDppx,153.417778,words/sec,0
Iter,9,Alpha,0.044444,VALIDacc,0.282692,VALIDent,7.196659,VALIDppx,146.693266,words/sec,0
Iter,10,Alpha,0.029630,VALIDacc,0.282692,VALIDent,7.143629,VALIDppx,141.399059,words/sec,0
Iter,11,Alpha,0.019753,VALIDacc,0.276923,VALIDent,7.099888,VALIDppx,137.176390,words/sec,0
Iter,12,Alpha,0.013169,VALIDacc,0.271154,VALIDent,7.064767,VALIDppx,133.877223,words/sec,0
Iter,13,Alpha,0.008779,VALIDacc,0.263462,VALIDent,7.036239,VALIDppx,131.255948,words/sec,0
Iter,14,Alpha,0.005853,VALIDacc,0.269231,VALIDent,7.012065,VALIDppx,129.074934,words/sec,0
Iter,15,Alpha,0.003902,VALIDacc,0.271154,VALIDent,6.990707,VALIDppx,127.178192,words/sec,0
Iter,16,Alpha,0.002601,VALIDacc,0.275000,VALIDent,6.972046,VALIDppx,125.543752,words/sec,0
Iter,17,Alpha,0.001734,VALIDacc,0.282692,VALIDent,6.956267,VALIDppx,124.178121,words/sec,0
Iter,18,Alpha,0.001156,VALIDacc,0.282692,VALIDent,6.943481,VALIDppx,123.082419,words/sec,0
Iter,19,Alpha,0.000771,VALIDacc,0.290385,VALIDent,6.933176,VALIDppx,122.206428,words/sec,0
Iter,20,Alpha,0.000514,VALIDacc,0.294231,VALIDent,6.924730,VALIDppx,121.493067,words/sec,0
Iter,21,Alpha,0.000343,VALIDacc,0.296154,VALIDent,6.917802,VALIDppx,120.911067,words/sec,0 23 | Iter,22,Alpha,0.000228,VALIDacc,0.296154,VALIDent,6.911363,VALIDppx,120.372581,words/sec,0 24 | Iter,23,Alpha,0.000152,VALIDacc,0.294231,VALIDent,6.904527,VALIDppx,119.803559,words/sec,0 25 | Iter,24,Alpha,0.000101,VALIDacc,0.296154,VALIDent,6.898090,VALIDppx,119.270248,words/sec,0 26 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p2_mw5_h100_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.401923,VALIDent,6.437281,VALIDppx,86.659189,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.415385,VALIDent,6.259147,VALIDppx,76.593341,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.430769,VALIDent,6.251194,VALIDppx,76.172295,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.417308,VALIDent,6.163204,VALIDppx,71.665336,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.421154,VALIDent,6.406475,VALIDppx,84.828349,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.450000,VALIDent,6.295965,VALIDppx,78.573160,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.465385,VALIDent,6.303201,VALIDppx,78.968259,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.465385,VALIDent,6.309432,VALIDppx,79.310048,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.463462,VALIDent,6.207847,VALIDppx,73.917631,words/sec,0 10 | Iter,9,Alpha,0.066667,VALIDacc,0.480769,VALIDent,6.363325,VALIDppx,82.328806,words/sec,0 11 | Iter,10,Alpha,0.044444,VALIDacc,0.480769,VALIDent,6.218611,VALIDppx,74.471202,words/sec,0 12 | Iter,11,Alpha,0.029630,VALIDacc,0.448077,VALIDent,6.202133,VALIDppx,73.625445,words/sec,0 13 | Iter,12,Alpha,0.019753,VALIDacc,0.488462,VALIDent,6.018031,VALIDppx,64.804906,words/sec,0 14 | Iter,13,Alpha,0.013169,VALIDacc,0.500000,VALIDent,6.023851,VALIDppx,65.066880,words/sec,0 15 | Iter,14,Alpha,0.008779,VALIDacc,0.475000,VALIDent,6.022283,VALIDppx,64.996158,words/sec,0 16 | Iter,15,Alpha,0.005853,VALIDacc,0.484615,VALIDent,5.998284,VALIDppx,63.923929,words/sec,0 17 | Iter,16,Alpha,0.003902,VALIDacc,0.471154,VALIDent,6.539653,VALIDppx,93.031885,words/sec,0 18 | Iter,17,Alpha,0.002601,VALIDacc,0.500000,VALIDent,5.929428,VALIDppx,60.944683,words/sec,0 19 | Iter,18,Alpha,0.001734,VALIDacc,0.478846,VALIDent,5.977251,VALIDppx,62.998728,words/sec,0 20 | Iter,19,Alpha,0.001156,VALIDacc,0.475000,VALIDent,5.935953,VALIDppx,61.220937,words/sec,0 21 | Iter,20,Alpha,0.000771,VALIDacc,0.473077,VALIDent,5.903441,VALIDppx,59.856688,words/sec,0 22 | Iter,21,Alpha,0.000514,VALIDacc,0.484615,VALIDent,5.940223,VALIDppx,61.402409,words/sec,0 23 | Iter,22,Alpha,0.000343,VALIDacc,0.476923,VALIDent,5.898638,VALIDppx,59.657756,words/sec,0 24 | Iter,23,Alpha,0.000228,VALIDacc,0.478846,VALIDent,5.915796,VALIDppx,60.371503,words/sec,0 25 | Iter,24,Alpha,0.000152,VALIDacc,0.482692,VALIDent,5.948535,VALIDppx,61.757201,words/sec,0 26 | Iter,25,Alpha,0.000101,VALIDacc,0.488462,VALIDent,5.909594,VALIDppx,60.112520,words/sec,0 27 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h50_c250_m1000_d4_b5_indep_.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.328846,VALIDent,6.935155,VALIDppx,122.374157,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.351923,VALIDent,6.796876,VALIDppx,111.189432,words/sec,0 3 | 
Iter,2,Alpha,0.100000,VALIDacc,0.353846,VALIDent,6.741020,VALIDppx,106.966833,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.359615,VALIDent,6.714073,VALIDppx,104.987435,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.367308,VALIDent,6.701541,VALIDppx,104.079425,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.380769,VALIDent,6.697558,VALIDppx,103.792480,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.699487,VALIDppx,103.931357,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.705390,VALIDppx,104.357449,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.714212,VALIDppx,104.997541,words/sec,0 10 | Iter,9,Alpha,0.066667,VALIDacc,0.392308,VALIDent,6.642499,VALIDppx,99.905962,words/sec,0 11 | Iter,10,Alpha,0.044444,VALIDacc,0.392308,VALIDent,6.590915,VALIDppx,96.396884,words/sec,0 12 | Iter,11,Alpha,0.029630,VALIDacc,0.394231,VALIDent,6.556514,VALIDppx,94.125512,words/sec,0 13 | Iter,12,Alpha,0.019753,VALIDacc,0.392308,VALIDent,6.531144,VALIDppx,92.484807,words/sec,0 14 | Iter,13,Alpha,0.013169,VALIDacc,0.390385,VALIDent,6.511226,VALIDppx,91.216690,words/sec,0 15 | Iter,14,Alpha,0.008779,VALIDacc,0.390385,VALIDent,6.495026,VALIDppx,90.198183,words/sec,0 16 | Iter,15,Alpha,0.005853,VALIDacc,0.380769,VALIDent,6.481460,VALIDppx,89.353969,words/sec,0 17 | Iter,16,Alpha,0.003902,VALIDacc,0.388462,VALIDent,6.469027,VALIDppx,88.587267,words/sec,0 18 | Iter,17,Alpha,0.002601,VALIDacc,0.386538,VALIDent,6.457031,VALIDppx,87.853664,words/sec,0 19 | Iter,18,Alpha,0.001734,VALIDacc,0.392308,VALIDent,6.446472,VALIDppx,87.213064,words/sec,0 20 | Iter,19,Alpha,0.001156,VALIDacc,0.394231,VALIDent,6.438324,VALIDppx,86.721897,words/sec,0 21 | Iter,20,Alpha,0.000771,VALIDacc,0.394231,VALIDent,6.432292,VALIDppx,86.360061,words/sec,0 22 | Iter,21,Alpha,0.000514,VALIDacc,0.394231,VALIDent,6.427471,VALIDppx,86.071938,words/sec,0 23 | Iter,22,Alpha,0.000343,VALIDacc,0.396154,VALIDent,6.422642,VALIDppx,85.784325,words/sec,0 24 | Iter,23,Alpha,0.000228,VALIDacc,0.398077,VALIDent,6.417652,VALIDppx,85.488138,words/sec,0 25 | Iter,24,Alpha,0.000152,VALIDacc,0.401923,VALIDent,6.413943,VALIDppx,85.268628,words/sec,0 26 | Iter,25,Alpha,0.000101,VALIDacc,0.400000,VALIDent,6.412369,VALIDppx,85.175656,words/sec,0 27 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h200_c250_m1000_d4_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.332692,VALIDent,7.207843,VALIDppx,147.834853,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.344231,VALIDent,7.012280,VALIDppx,129.094183,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.355769,VALIDent,6.934988,VALIDppx,122.359987,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.357692,VALIDent,6.891659,VALIDppx,118.739717,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.357692,VALIDent,6.869177,VALIDppx,116.903732,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.375000,VALIDent,6.858217,VALIDppx,116.018969,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.378846,VALIDent,6.855655,VALIDppx,115.813095,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.857846,VALIDppx,115.989114,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.862257,VALIDppx,116.344320,words/sec,0 10 | Iter,9,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.868663,VALIDppx,116.862093,words/sec,0 11 | Iter,10,Alpha,0.100000,VALIDacc,0.392308,VALIDent,6.877193,VALIDppx,117.555111,words/sec,0 12 | 
Iter,11,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.887047,VALIDppx,118.360766,words/sec,0 13 | Iter,12,Alpha,0.066667,VALIDacc,0.390385,VALIDent,6.789573,VALIDppx,110.628010,words/sec,0 14 | Iter,13,Alpha,0.044444,VALIDacc,0.390385,VALIDent,6.724826,VALIDppx,105.772854,words/sec,0 15 | Iter,14,Alpha,0.029630,VALIDacc,0.390385,VALIDent,6.676831,VALIDppx,102.311930,words/sec,0 16 | Iter,15,Alpha,0.019753,VALIDacc,0.386538,VALIDent,6.640517,VALIDppx,99.768782,words/sec,0 17 | Iter,16,Alpha,0.013169,VALIDacc,0.382692,VALIDent,6.612376,VALIDppx,97.841573,words/sec,0 18 | Iter,17,Alpha,0.008779,VALIDacc,0.382692,VALIDent,6.589223,VALIDppx,96.283933,words/sec,0 19 | Iter,18,Alpha,0.005853,VALIDacc,0.382692,VALIDent,6.569262,VALIDppx,94.960907,words/sec,0 20 | Iter,19,Alpha,0.003902,VALIDacc,0.386538,VALIDent,6.553112,VALIDppx,93.903810,words/sec,0 21 | Iter,20,Alpha,0.002601,VALIDacc,0.378846,VALIDent,6.538522,VALIDppx,92.958927,words/sec,0 22 | Iter,21,Alpha,0.001734,VALIDacc,0.384615,VALIDent,6.526329,VALIDppx,92.176609,words/sec,0 23 | Iter,22,Alpha,0.001156,VALIDacc,0.386538,VALIDent,6.516917,VALIDppx,91.577251,words/sec,0 24 | Iter,23,Alpha,0.000771,VALIDacc,0.388462,VALIDent,6.509623,VALIDppx,91.115420,words/sec,0 25 | Iter,24,Alpha,0.000514,VALIDacc,0.388462,VALIDent,6.504032,VALIDppx,90.762946,words/sec,0 26 | Iter,25,Alpha,0.000343,VALIDacc,0.388462,VALIDent,6.499585,VALIDppx,90.483618,words/sec,0 27 | Iter,26,Alpha,0.000228,VALIDacc,0.392308,VALIDent,6.495298,VALIDppx,90.215141,words/sec,0 28 | Iter,27,Alpha,0.000152,VALIDacc,0.386538,VALIDent,6.491204,VALIDppx,89.959497,words/sec,0 29 | Iter,28,Alpha,0.000101,VALIDacc,0.392308,VALIDent,6.488327,VALIDppx,89.780284,words/sec,0 30 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h100_c250_m1000_d4_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.334615,VALIDent,7.038659,VALIDppx,131.476297,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.346154,VALIDent,6.874015,VALIDppx,117.296426,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.359615,VALIDent,6.805409,VALIDppx,111.849017,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.363462,VALIDent,6.771734,VALIDppx,109.268546,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.373077,VALIDent,6.755499,VALIDppx,108.045793,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.375000,VALIDent,6.749179,VALIDppx,107.573495,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.748471,VALIDppx,107.520716,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.751653,VALIDppx,107.758162,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.392308,VALIDent,6.758135,VALIDppx,108.243407,words/sec,0 10 | Iter,9,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.767457,VALIDppx,108.945080,words/sec,0 11 | Iter,10,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.778716,VALIDppx,109.798622,words/sec,0 12 | Iter,11,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.791265,VALIDppx,110.757814,words/sec,0 13 | Iter,12,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.805175,VALIDppx,111.830860,words/sec,0 14 | Iter,13,Alpha,0.100000,VALIDacc,0.394231,VALIDent,6.820241,VALIDppx,113.004888,words/sec,0 15 | Iter,14,Alpha,0.066667,VALIDacc,0.396154,VALIDent,6.754421,VALIDppx,107.965108,words/sec,0 16 | Iter,15,Alpha,0.044444,VALIDacc,0.396154,VALIDent,6.705283,VALIDppx,104.349745,words/sec,0 17 | Iter,16,Alpha,0.029630,VALIDacc,0.394231,VALIDent,6.670727,VALIDppx,101.880015,words/sec,0 18 | 
Iter,17,Alpha,0.019753,VALIDacc,0.394231,VALIDent,6.645158,VALIDppx,100.090310,words/sec,0 19 | Iter,18,Alpha,0.013169,VALIDacc,0.396154,VALIDent,6.623484,VALIDppx,98.597800,words/sec,0 20 | Iter,19,Alpha,0.008779,VALIDacc,0.388462,VALIDent,6.603299,VALIDppx,97.227954,words/sec,0 21 | Iter,20,Alpha,0.005853,VALIDacc,0.390385,VALIDent,6.583628,VALIDppx,95.911262,words/sec,0 22 | Iter,21,Alpha,0.003902,VALIDacc,0.390385,VALIDent,6.566929,VALIDppx,94.807458,words/sec,0 23 | Iter,22,Alpha,0.002601,VALIDacc,0.386538,VALIDent,6.555035,VALIDppx,94.029082,words/sec,0 24 | Iter,23,Alpha,0.001734,VALIDacc,0.388462,VALIDent,6.545795,VALIDppx,93.428741,words/sec,0 25 | Iter,24,Alpha,0.001156,VALIDacc,0.388462,VALIDent,6.538449,VALIDppx,92.954264,words/sec,0 26 | Iter,25,Alpha,0.000771,VALIDacc,0.388462,VALIDent,6.532582,VALIDppx,92.576978,words/sec,0 27 | Iter,26,Alpha,0.000514,VALIDacc,0.390385,VALIDent,6.527838,VALIDppx,92.273109,words/sec,0 28 | Iter,27,Alpha,0.000343,VALIDacc,0.392308,VALIDent,6.523325,VALIDppx,91.984900,words/sec,0 29 | Iter,28,Alpha,0.000228,VALIDacc,0.390385,VALIDent,6.518669,VALIDppx,91.688531,words/sec,0 30 | Iter,29,Alpha,0.000152,VALIDacc,0.392308,VALIDent,6.514971,VALIDppx,91.453782,words/sec,0 31 | Iter,30,Alpha,0.000101,VALIDacc,0.388462,VALIDent,6.513047,VALIDppx,91.331887,words/sec,0 32 | -------------------------------------------------------------------------------- /books/test.labels: -------------------------------------------------------------------------------- 1 | 2 2 | 4 3 | 3 4 | 1 5 | 0 6 | 3 7 | 0 8 | 4 9 | 2 10 | 2 11 | 2 12 | 2 13 | 1 14 | 3 15 | 1 16 | 4 17 | 1 18 | 1 19 | 1 20 | 0 21 | 2 22 | 0 23 | 4 24 | 3 25 | 4 26 | 2 27 | 0 28 | 3 29 | 2 30 | 0 31 | 2 32 | 0 33 | 4 34 | 0 35 | 2 36 | 2 37 | 2 38 | 2 39 | 2 40 | 3 41 | 0 42 | 2 43 | 3 44 | 4 45 | 1 46 | 0 47 | 3 48 | 1 49 | 1 50 | 2 51 | 3 52 | 2 53 | 0 54 | 0 55 | 3 56 | 1 57 | 2 58 | 4 59 | 2 60 | 0 61 | 4 62 | 2 63 | 2 64 | 1 65 | 4 66 | 1 67 | 4 68 | 0 69 | 1 70 | 4 71 | 4 72 | 0 73 | 3 74 | 2 75 | 3 76 | 3 77 | 1 78 | 1 79 | 3 80 | 2 81 | 3 82 | 0 83 | 1 84 | 0 85 | 4 86 | 0 87 | 3 88 | 1 89 | 2 90 | 4 91 | 1 92 | 4 93 | 1 94 | 1 95 | 1 96 | 1 97 | 4 98 | 2 99 | 2 100 | 4 101 | 2 102 | 1 103 | 4 104 | 3 105 | 4 106 | 0 107 | 0 108 | 3 109 | 4 110 | 0 111 | 1 112 | 3 113 | 3 114 | 4 115 | 3 116 | 2 117 | 2 118 | 1 119 | 0 120 | 4 121 | 4 122 | 0 123 | 1 124 | 2 125 | 1 126 | 1 127 | 2 128 | 0 129 | 3 130 | 4 131 | 2 132 | 2 133 | 1 134 | 3 135 | 2 136 | 4 137 | 0 138 | 3 139 | 0 140 | 0 141 | 1 142 | 3 143 | 2 144 | 2 145 | 0 146 | 4 147 | 1 148 | 1 149 | 0 150 | 1 151 | 2 152 | 0 153 | 4 154 | 4 155 | 0 156 | 1 157 | 2 158 | 2 159 | 1 160 | 4 161 | 3 162 | 0 163 | 4 164 | 1 165 | 4 166 | 1 167 | 0 168 | 0 169 | 3 170 | 0 171 | 2 172 | 3 173 | 0 174 | 4 175 | 4 176 | 2 177 | 4 178 | 3 179 | 1 180 | 0 181 | 4 182 | 2 183 | 1 184 | 2 185 | 1 186 | 1 187 | 3 188 | 1 189 | 0 190 | 1 191 | 0 192 | 4 193 | 3 194 | 3 195 | 1 196 | 4 197 | 1 198 | 0 199 | 1 200 | 4 201 | 0 202 | 0 203 | 0 204 | 3 205 | 2 206 | 4 207 | 2 208 | 0 209 | 1 210 | 1 211 | 2 212 | 1 213 | 2 214 | 3 215 | 2 216 | 2 217 | 4 218 | 2 219 | 1 220 | 4 221 | 0 222 | 0 223 | 4 224 | 4 225 | 0 226 | 4 227 | 3 228 | 0 229 | 2 230 | 4 231 | 1 232 | 0 233 | 4 234 | 0 235 | 2 236 | 0 237 | 3 238 | 0 239 | 1 240 | 3 241 | 3 242 | 2 243 | 1 244 | 1 245 | 2 246 | 4 247 | 1 248 | 4 249 | 4 250 | 4 251 | 1 252 | 4 253 | 2 254 | 0 255 | 3 256 | 3 257 | 4 258 | 1 259 | 0 260 | 4 261 | 3 262 | 0 263 | 2 264 | 1 265 | 0 266 | 4 267 | 2 268 | 3 269 | 
3 270 | 1 271 | 2 272 | 0 273 | 0 274 | 3 275 | 3 276 | 0 277 | 2 278 | 4 279 | 0 280 | 2 281 | 3 282 | 1 283 | 0 284 | 1 285 | 2 286 | 4 287 | 1 288 | 0 289 | 4 290 | 1 291 | 1 292 | 1 293 | 0 294 | 4 295 | 2 296 | 0 297 | 2 298 | 1 299 | 3 300 | 0 301 | 0 302 | 3 303 | 1 304 | 0 305 | 3 306 | 2 307 | 3 308 | 1 309 | 3 310 | 4 311 | 3 312 | 4 313 | 3 314 | 3 315 | 2 316 | 0 317 | 3 318 | 0 319 | 0 320 | 0 321 | 0 322 | 0 323 | 3 324 | 4 325 | 2 326 | 0 327 | 3 328 | 2 329 | 1 330 | 1 331 | 0 332 | 4 333 | 2 334 | 0 335 | 3 336 | 1 337 | 4 338 | 0 339 | 4 340 | 0 341 | 3 342 | 4 343 | 3 344 | 2 345 | 2 346 | 4 347 | 4 348 | 3 349 | 3 350 | 1 351 | 2 352 | 4 353 | 0 354 | 2 355 | 4 356 | 2 357 | 0 358 | 3 359 | 2 360 | 3 361 | 3 362 | 2 363 | 2 364 | 2 365 | 2 366 | 3 367 | 2 368 | 2 369 | 0 370 | 0 371 | 2 372 | 4 373 | 2 374 | 0 375 | 0 376 | 4 377 | 0 378 | 2 379 | 4 380 | 3 381 | 1 382 | 2 383 | 2 384 | 0 385 | 1 386 | 3 387 | 2 388 | 2 389 | 3 390 | 0 391 | 3 392 | 4 393 | 4 394 | 4 395 | 0 396 | 3 397 | 1 398 | 0 399 | 2 400 | 3 401 | 3 402 | 4 403 | 4 404 | 0 405 | 4 406 | 0 407 | 3 408 | 1 409 | 1 410 | 4 411 | 1 412 | 4 413 | 3 414 | 4 415 | 3 416 | 1 417 | 2 418 | 0 419 | 3 420 | 1 421 | 1 422 | 0 423 | 4 424 | 1 425 | 1 426 | 1 427 | 4 428 | 0 429 | 4 430 | 1 431 | 0 432 | 1 433 | 0 434 | 0 435 | 1 436 | 1 437 | 1 438 | 3 439 | 3 440 | 0 441 | 4 442 | 3 443 | 2 444 | 1 445 | 0 446 | 2 447 | 3 448 | 3 449 | 2 450 | 3 451 | 4 452 | 1 453 | 1 454 | 3 455 | 1 456 | 4 457 | 0 458 | 4 459 | 0 460 | 4 461 | 2 462 | 1 463 | 1 464 | 0 465 | 1 466 | 0 467 | 3 468 | 1 469 | 2 470 | 0 471 | 1 472 | 3 473 | 1 474 | 3 475 | 0 476 | 2 477 | 1 478 | 1 479 | 4 480 | 4 481 | 1 482 | 0 483 | 1 484 | 2 485 | 3 486 | 2 487 | 3 488 | 4 489 | 0 490 | 4 491 | 1 492 | 1 493 | 0 494 | 0 495 | 1 496 | 4 497 | 3 498 | 3 499 | 3 500 | 4 501 | 1 502 | 0 503 | 2 504 | 4 505 | 4 506 | 1 507 | 0 508 | 4 509 | 4 510 | 1 511 | 4 512 | 3 513 | 1 514 | 2 515 | 3 516 | 1 517 | 4 518 | 1 519 | 0 520 | 2 521 | -------------------------------------------------------------------------------- /books/valid.labels: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | 3 4 | 4 5 | 3 6 | 3 7 | 3 8 | 1 9 | 3 10 | 0 11 | 1 12 | 4 13 | 2 14 | 0 15 | 3 16 | 4 17 | 1 18 | 0 19 | 0 20 | 0 21 | 4 22 | 2 23 | 3 24 | 4 25 | 4 26 | 0 27 | 0 28 | 2 29 | 2 30 | 4 31 | 3 32 | 0 33 | 0 34 | 0 35 | 1 36 | 4 37 | 4 38 | 0 39 | 4 40 | 4 41 | 3 42 | 4 43 | 0 44 | 1 45 | 1 46 | 2 47 | 3 48 | 2 49 | 2 50 | 1 51 | 1 52 | 2 53 | 2 54 | 1 55 | 3 56 | 3 57 | 4 58 | 4 59 | 0 60 | 1 61 | 1 62 | 2 63 | 4 64 | 3 65 | 3 66 | 0 67 | 4 68 | 1 69 | 4 70 | 4 71 | 2 72 | 4 73 | 2 74 | 3 75 | 2 76 | 4 77 | 4 78 | 4 79 | 2 80 | 1 81 | 1 82 | 1 83 | 3 84 | 4 85 | 0 86 | 2 87 | 2 88 | 2 89 | 4 90 | 3 91 | 3 92 | 0 93 | 3 94 | 2 95 | 4 96 | 3 97 | 4 98 | 4 99 | 4 100 | 0 101 | 0 102 | 3 103 | 4 104 | 1 105 | 4 106 | 2 107 | 2 108 | 3 109 | 1 110 | 0 111 | 1 112 | 0 113 | 0 114 | 2 115 | 1 116 | 2 117 | 4 118 | 4 119 | 4 120 | 4 121 | 4 122 | 3 123 | 2 124 | 3 125 | 3 126 | 3 127 | 4 128 | 1 129 | 2 130 | 4 131 | 1 132 | 1 133 | 0 134 | 3 135 | 1 136 | 2 137 | 2 138 | 1 139 | 2 140 | 1 141 | 0 142 | 3 143 | 2 144 | 2 145 | 3 146 | 3 147 | 2 148 | 2 149 | 1 150 | 4 151 | 4 152 | 4 153 | 1 154 | 4 155 | 2 156 | 2 157 | 4 158 | 4 159 | 1 160 | 3 161 | 0 162 | 2 163 | 0 164 | 4 165 | 3 166 | 1 167 | 3 168 | 4 169 | 0 170 | 3 171 | 0 172 | 4 173 | 0 174 | 2 175 | 2 176 | 1 177 | 0 178 | 1 179 | 3 180 | 3 181 | 4 182 | 3 183 | 3 
184 | 4 185 | 2 186 | 4 187 | 0 188 | 3 189 | 2 190 | 4 191 | 1 192 | 0 193 | 0 194 | 4 195 | 2 196 | 2 197 | 4 198 | 4 199 | 0 200 | 3 201 | 0 202 | 2 203 | 3 204 | 4 205 | 0 206 | 0 207 | 1 208 | 2 209 | 2 210 | 4 211 | 2 212 | 3 213 | 1 214 | 2 215 | 1 216 | 3 217 | 4 218 | 2 219 | 1 220 | 3 221 | 4 222 | 4 223 | 4 224 | 2 225 | 0 226 | 0 227 | 2 228 | 2 229 | 0 230 | 4 231 | 2 232 | 4 233 | 2 234 | 0 235 | 1 236 | 3 237 | 0 238 | 4 239 | 1 240 | 2 241 | 2 242 | 3 243 | 3 244 | 3 245 | 1 246 | 2 247 | 1 248 | 1 249 | 1 250 | 3 251 | 1 252 | 4 253 | 2 254 | 3 255 | 1 256 | 3 257 | 3 258 | 4 259 | 1 260 | 0 261 | 3 262 | 2 263 | 2 264 | 1 265 | 2 266 | 2 267 | 1 268 | 4 269 | 0 270 | 3 271 | 2 272 | 3 273 | 0 274 | 3 275 | 2 276 | 2 277 | 4 278 | 4 279 | 0 280 | 0 281 | 4 282 | 3 283 | 4 284 | 0 285 | 0 286 | 3 287 | 2 288 | 0 289 | 2 290 | 1 291 | 1 292 | 0 293 | 4 294 | 4 295 | 1 296 | 0 297 | 2 298 | 1 299 | 4 300 | 1 301 | 3 302 | 2 303 | 0 304 | 2 305 | 2 306 | 1 307 | 4 308 | 1 309 | 4 310 | 4 311 | 3 312 | 0 313 | 4 314 | 4 315 | 2 316 | 2 317 | 3 318 | 1 319 | 2 320 | 3 321 | 1 322 | 1 323 | 4 324 | 4 325 | 4 326 | 4 327 | 1 328 | 0 329 | 2 330 | 1 331 | 1 332 | 0 333 | 1 334 | 4 335 | 0 336 | 1 337 | 3 338 | 2 339 | 4 340 | 0 341 | 4 342 | 3 343 | 4 344 | 2 345 | 0 346 | 1 347 | 1 348 | 2 349 | 1 350 | 2 351 | 0 352 | 3 353 | 0 354 | 2 355 | 3 356 | 3 357 | 2 358 | 1 359 | 2 360 | 1 361 | 2 362 | 3 363 | 1 364 | 1 365 | 0 366 | 3 367 | 0 368 | 4 369 | 2 370 | 3 371 | 1 372 | 4 373 | 2 374 | 3 375 | 1 376 | 2 377 | 0 378 | 4 379 | 4 380 | 4 381 | 1 382 | 4 383 | 0 384 | 0 385 | 2 386 | 4 387 | 2 388 | 3 389 | 2 390 | 0 391 | 4 392 | 4 393 | 4 394 | 3 395 | 4 396 | 3 397 | 4 398 | 3 399 | 3 400 | 4 401 | 0 402 | 3 403 | 0 404 | 4 405 | 3 406 | 0 407 | 0 408 | 2 409 | 2 410 | 2 411 | 3 412 | 2 413 | 0 414 | 1 415 | 2 416 | 2 417 | 3 418 | 2 419 | 4 420 | 3 421 | 0 422 | 0 423 | 2 424 | 2 425 | 4 426 | 2 427 | 1 428 | 0 429 | 1 430 | 3 431 | 2 432 | 1 433 | 4 434 | 3 435 | 4 436 | 4 437 | 3 438 | 2 439 | 0 440 | 3 441 | 0 442 | 1 443 | 1 444 | 4 445 | 1 446 | 2 447 | 3 448 | 2 449 | 3 450 | 4 451 | 0 452 | 4 453 | 3 454 | 3 455 | 4 456 | 3 457 | 3 458 | 1 459 | 2 460 | 2 461 | 0 462 | 0 463 | 3 464 | 2 465 | 3 466 | 2 467 | 3 468 | 3 469 | 4 470 | 3 471 | 2 472 | 3 473 | 3 474 | 4 475 | 3 476 | 4 477 | 1 478 | 1 479 | 3 480 | 2 481 | 3 482 | 4 483 | 1 484 | 0 485 | 2 486 | 4 487 | 0 488 | 4 489 | 1 490 | 2 491 | 4 492 | 1 493 | 1 494 | 1 495 | 2 496 | 0 497 | 1 498 | 1 499 | 3 500 | 3 501 | 1 502 | 1 503 | 1 504 | 3 505 | 3 506 | 3 507 | 3 508 | 1 509 | 2 510 | 4 511 | 4 512 | 4 513 | 1 514 | 3 515 | 1 516 | 0 517 | 3 518 | 2 519 | 3 520 | 1 521 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/CommandLineParser.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #include 8 | #include 9 | #include 10 | #include "CommandLineParser.h" 11 | 12 | using namespace std; 13 | 14 | 15 | /** 16 | * Get a command line argument 17 | */ 18 | bool CommandLineParser::Get(string name, int &value) { 19 | if (args.find(name) == args.end()) { 20 | cout << name << " must be registered as parameter\n"; 21 | return false; 22 | } 23 | CommandLineArgument a = args[name]; 24 | value = atoi(a.m_value.c_str()); 25 | return true; 26 | 
} 27 | 28 | 29 | /** 30 | * Get a command line argument 31 | */ 32 | bool CommandLineParser::Get(string name, double &value) { 33 | if (args.find(name) == args.end()) { 34 | cout << name << " must be registered as a parameter\n"; 35 | return false; 36 | } 37 | CommandLineArgument a = args[name]; 38 | value = atof(a.m_value.c_str()); 39 | return true; 40 | } 41 | 42 | 43 | /** 44 | * Get a command line argument 45 | */ 46 | bool CommandLineParser::Get(string name, string &value) { 47 | if (args.find(name) == args.end()) { 48 | cout << name << " must be registered as a parameter\n"; 49 | return false; 50 | } 51 | CommandLineArgument a = args[name]; 52 | value = a.m_value; 53 | return (!value.empty()); 54 | } 55 | 56 | 57 | /** 58 | * Get a command line argument 59 | */ 60 | bool CommandLineParser::Get(string name, bool &value) { 61 | if (args.find(name) == args.end()) { 62 | cout << name << " must be registered as a parameter\n"; 63 | return false; 64 | } 65 | CommandLineArgument a = args[name]; 66 | value = (a.m_value.compare("true") == 0); 67 | return true; 68 | } 69 | 70 | 71 | /** 72 | * Get a command line argument 73 | */ 74 | bool CommandLineParser::Get(string name, long long &value) { 75 | if (args.find(name) == args.end()) { 76 | cout << name << " must be registered as a parameter\n"; 77 | return false; 78 | } 79 | CommandLineArgument a = args[name]; 80 | value = (long long)atoll(a.m_value.c_str()); 81 | return true; 82 | } 83 | 84 | 85 | /** 86 | * Parse the arguments to extract their values and store them in the map 87 | */ 88 | bool CommandLineParser::Parse(char *list[], int llen) { 89 | if (llen == 1) { 90 | // Show the arguments 91 | cout << "Usage: " << list[0] << "\n"; 92 | for (map<string, CommandLineArgument>::iterator mi = args.begin(); 93 | mi != args.end(); 94 | mi++) { 95 | if (!mi->second.m_isRequired) { 96 | cout << "[-" << mi->first << " (" 97 | << mi->second.m_type << ": " 98 | << mi->second.m_value << ")]: " << mi->second.m_description << "\n"; 99 | } else { 100 | cout << "-" << mi->first << " (" 101 | << mi->second.m_type << "): " 102 | << mi->second.m_description << "\n"; 103 | } 104 | } 105 | return false; 106 | } 107 | 108 | if ((llen % 2) == 0) { 109 | cout << "Command line must have an even number of arguments\n"; 110 | cout << "Check argument structure\n"; 111 | return false; 112 | } 113 | 114 | // List of seen arguments 115 | set<string> seen; 116 | for (int i = 1; i < llen; i += 2) { 117 | if (list[i][0] != '-') { 118 | cout << "Argument names must begin with -\n"; 119 | cout << "Saw: " << list[i] << endl; 120 | return false; 121 | } 122 | string aname(&list[i][1]); 123 | if (args.find(aname) == args.end()) { 124 | cout << "Unknown parameter on command line: " << aname << endl; 125 | return false; 126 | } 127 | args[aname].m_value = string(list[i+1]); 128 | seen.insert(aname); 129 | } 130 | 131 | // Check that the required arguments have been seen 132 | for (map<string, CommandLineArgument>::iterator mi = args.begin(); 133 | mi != args.end(); 134 | mi++) { 135 | if (mi->second.m_isRequired && !seen.count(mi->first)) { 136 | cout << "Required argument " << mi->first << " not set on command line\n"; 137 | return false; 138 | } 139 | } 140 | return true; 141 | } 142 |
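// Editorial note: a minimal, hypothetical usage sketch of this parser
// (the flag names below are illustrative; see the README for the real ones):
//
//   CommandLineParser parser;
//   parser.Register("rnnlm", "string", "RNN model file", "", true);
//   parser.Register("debug", "bool", "Debugging mode", "false");
//   if (parser.Parse(argv, argc)) {
//     std::string modelFile;
//     parser.Get("rnnlm", modelFile);
//     bool debugMode = false;
//     parser.Get("debug", debugMode);
//   }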
-------------------------------------------------------------------------------- /results.txt: -------------------------------------------------------------------------------- 1 | Sequential data, plain RNN: 2 | minimum 5 word occurrence 3 | V = 72846 words 4 | 250 classes 5 | 6 | 7 | No n-gram connections 8 | BPTT(5) every 1 step 9 | 50 hidden: 29.615385% (dev), 28.076923% (test) 10 | 100 hidden: 30.0000% (dev), 30.0000% (test) 11 | 200 hidden: 30.000000% (dev), 30.384615% (test) 12 | 300 hidden: 30.576923% (dev), 28.461538% (test) 13 | 14 | 15 | 1000MB of 2-gram connections 16 | BPTT(5) every 1 step 17 | 50 hidden: 29.615385% (dev), 30.769231% (test) 18 | 100 hidden: 28.653846% (dev), 30.384615% (test) 19 | 200 hidden: 30.000000% (dev), 28.653846% (test) 20 | 300 hidden: IN PROGRESS 21 | 22 | 23 | 1000MB of 3-gram connections 24 | BPTT(5) every 1 step 25 | 50 hidden: 39.2308% (dev), 40.769231% (test) 26 | 100 hidden: 39.423077% (dev), 40.576923% (test) 27 | 200 hidden: 38.846154% (dev), 40.192308% (test) 28 | 300 hidden: IN PROGRESS 29 | 30 | 31 | 1000MB of 4-gram connections 32 | BPTT(5) every 1 step 33 | 50 hidden: 40.192308% (dev), 42.307692% (test) 34 | 100 hidden: 40.576923% (dev), 41.153846% (test) 35 | 200 hidden: 40.000000% (dev), 40.384615% (test) 36 | 300 hidden: 40.192308% (dev), 39.230769% (test) 37 | 38 | 39 | 1000MB of 5-gram connections 40 | BPTT(5) every 1 step 41 | 50 hidden: 39.807692% (dev), 39.807692% (test) 42 | 100 hidden: 39.423077% (dev), 39.615385% (test) 43 | 200 hidden: TODO??? 44 | 300 hidden: TODO??? 45 | 46 | 47 | 1000MB of 3-gram connections 48 | BPTT(10) every 1 step 49 | 100 hidden: 39.423077% (dev), 40.576923% (test) 50 | -> not interesting 51 | 52 | 53 | 1000MB of 5-gram connections 54 | BPTT(10) every 1 step 55 | 50 hidden: 39.807692% (dev), 39.807692% (test) 56 | -> not interesting 57 | 58 | 59 | 2000MB of 3-gram connections 60 | BPTT(5) every 1 step 61 | 100 hidden: 39.423077% (dev), 40.384615% (test) 62 | 200 hidden: 39.423077% (dev), 40.384615% (test) 63 | -> ran into memory problems all the time 64 | 65 | 66 | 67 | Dependency Tree RNN: 68 | minimum 5 word occurrence 69 | V = 67824 words, 44 labels 70 | 250 classes 71 | 72 | labels as features (p2) 73 | 1000MB of 3-gram connections 74 | BPTT_ORDER=5 75 | 200 hidden: 50.576923% (dev), 50.000000% (test) 76 | 77 | labels as features (p2) 78 | 2000MB of 4-gram connections 79 | BPTT_ORDER=5 80 | 50 hidden 81 | gamma=0: 48.846154% (dev), 46.730769% (test) 82 | gamma=0.1: 47.115385% (dev), 48.269231% (test) 83 | gamma=0.5: 46.730769% (dev), 47.115385% (test) 84 | gamma=0.9: 47.307692% (dev), 46.730769% (test) 85 | gamma=1.0: 47.307692% (dev), 46.730769% (test) 86 | gamma=1.1: 48.076923% (dev), 45.576923% (test) 87 | 88 | labels as features (p2) 89 | 2000MB of 4-gram connections 90 | BPTT_ORDER=5 91 | 100 hidden 92 | gamma=0: TODO 93 | gamma=0.1: 49.807692% (dev), 49.807692% (test) 94 | gamma=0.5: 50.000000% (dev), 49.615385% (test) 95 | gamma=0.9: 49.230769% (dev), 47.500000% (test) 96 | gamma=1.0: 47.692308% (dev), 47.500000% (test) 97 | gamma=1.1: 48.653846% (dev), 48.076923% (test) 98 | 99 | labels as features (p2) 100 | 2000MB of 3-gram connections 101 | BPTT_ORDER=5 102 | gamma=0.5 103 | 50 hidden: 48.653846% (dev), 48.846154% (test) 104 | 100 hidden: 51.538462% (dev), 49.615385% (test) 105 | 200 hidden: 51.730769% (dev), 50.000000% (test) 106 | 300 hidden: TODO 107 | 108 | 109 | labels as features (p2) 110 | 2000MB of 4-gram connections 111 | BPTT_ORDER=5 112 | gamma=0.5 113 | 50 hidden: 46.730769% (dev), 47.115385% (test) 114 | 100 hidden: 50.000000% (dev), 49.615385% (test) 115 | 200 hidden: 49.807692% (dev), 49.615385% (test) 116 | ensemble of these: 52.6923076923% (dev), 51.7307692308% (test) 117 | 300 hidden: IN PROGRESS 118 | 119 | no labels (p0) 120 | 2000MB of 4-gram connections 121 | BPTT_ORDER=5 122 | 50 hidden: 52.692308% (dev), 48.846154% (test)
123 | 100 hidden: 54.038462% (dev), 51.346154% (test) 124 | 200 hidden: 52.692308% (dev), 50.769231% (test) 125 | 300 hidden: 126 | 127 | no labels (p0) 128 | 2000MB of 3-gram connections 129 | BPTT_ORDER=5 130 | 50 hidden: 53.269231% (dev), 51.923077% (test) 131 | 100 hidden: 54.230769% (dev), 52.692308% (test) 132 | 200 hidden: 54.230769% (dev), 51.923077% (test) 133 | 300 hidden: 53.269231% (dev), 52.884615% (test) 134 | 135 | no labels (p0) 136 | 2000MB of 2-gram connections 137 | BPTT_ORDER=5 138 | 50 hidden: 48.653846% (dev), 45.000000% (test) 139 | 100 hidden: 48.653846% (dev), 47.500000% (test) 140 | 200 hidden: 48.653846% (dev), 45.576923% (test) 141 | 300 hidden: IN PROGRESS 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DependencyTreeRnn 2 | Dependency tree-based RNN 3 | 4 | Copyright (c) 2014-2015 Piotr Mirowski, Andreas Vlachos 5 | 6 | Please refer to the following paper: 7 | Piotr Mirowski, Andreas Vlachos 8 | "Dependency Recurrent Neural Language Models for Sentence Completion" 9 | ACL 2015 10 | 11 | # Installation 12 | 0. Download the preprocessed training and validation/testing data from here: https://drive.google.com/file/d/0BwPdBcatuO0vS3JlUVBtZHpSb3M/view?usp=sharing 13 | 1. Modify the path to the BLAS header (cblas.h) file, i.e., $BLASINCLUDE 14 | and the BLAS path, i.e., $BLASFLAGS, in file Makefile. 15 | Alternatively, make your own version of that Makefile. 16 | 2. Build the project: 17 | ``` 18 | > make 19 | ``` 20 | or, using your custom Makefile: 21 | ``` 22 | > make -f YOUR_OWN_MAKEFILE 23 | ``` 24 | Note that the .o objects are stored in directory build/ and the executable is ./RnnDependencyTree 25 | 26 | # Sample training script 27 | Shell script train_rnn_holmes_example.sh trains an RNN on a subset of a few books. 28 | You need to modify the path to where the JSON book files are stored. 29 | 30 | # Important hyperparameters 31 | 32 | 1. Parameters relative to the dataset: 33 | * **train** (string) Training data file (pure text) 34 | * **valid** (string) Validation data file (pure text), used during training 35 | * **test** (string) Test data file (pure text) 36 | * **sentence-labels** (string) Validation/test sentence labels file (pure text) 37 | * **path-json-books** (string) Path to the book JSON files 38 | * **min-word-occurrence** (int) Minimum word occurrence to include a word into the vocabulary [default: 5] 39 | * **independent** (bool) Is each line in the training/testing file independent? [default: true] 40 | 41 | 2. Parameters relative to the dependency labels 42 | * **feature-labels-type** (int) Dependency parsing labels: 43 | * 0 = none, use words only 44 | * 1 = concatenate label to word 45 | * 2 = use features in the feature vector, separate from words 46 | * **feature-gamma** (double) Decay weight for features consisting of label vectors [default: 0.9]. 47 | * Values up to about 1.3 can be accepted (beyond that, the perplexity seems to become very large). 48 | * f(t) is a vector with D elements (e.g., D=44 types of dependency labels) 49 | * f(t) <- gamma * f(t-1), then set element at current label to 1 (see the sketch just after this list) 50 | * This value could be important for changing the weight given to dependency labels. 51 | * A value larger than 1 means that labels further in the past count more than those immediately in the past. 52 | * 1 means that there is no decay. 53 | * A value between 0 and 1 means that there is some decay. 54 | * 0 means that the decay is immediate.
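For illustration, here is a minimal sketch of this update, assuming the D label features are held in a plain `std::vector<double>` (in the code itself, the update happens inside `RnnTreeLM::UpdateFeatureLabelVector`):

```cpp
#include <vector>

// Decay all label features, then switch on the current label.
void UpdateLabelFeatures(std::vector<double> &f, int label, double gamma) {
  for (size_t k = 0; k < f.size(); k++)
    f[k] *= gamma;   // f(t) <- gamma * f(t-1)
  f[label] = 1.0;    // set the element at the current label to 1
}
```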
55 | 56 | 3. RNN architecture parameters 57 | * **rnnlm** (string) RNN language model file to use (save in training / read in test) 58 | * **classes** (int) Number of word classes used in hierarchical softmax [default: 200]. 59 | * If vocabulary size is W, choose C around sqrt(W). 60 | * C = W means 1 class per word. 61 | * C = 1 means standard softmax. 62 | * **hidden** (int) Number of nodes in the hidden layer [default: 100]. 63 | * Try to go higher, perhaps up to 1000 (for 1M-word vocabulary). 64 | * Linear impact on speed. 65 | * **direct** (int) Size of max-entropy hash table storing direct n-gram connections, in millions of entries [default: 0]. 66 | * Basically, direct=1000 means that 1000x1000000 = 1G direct connections between context words and target word are considered. 67 | * However, it is not a proper hashtable (which would take too much memory) but a simple vector of 1G entries, with a hashing function that hashes into specific entries in that vector. Hash collisions are totally ignored. 68 | * Try using direct=1000 or even 2000 hashes if possible. 69 | * **direct-order** (int) Order of direct n-gram connections; 2 is like bigram max entropy features [default: 3]. 70 | * It works on tokens only, and values of 4 or beyond did not bring improvement in other LM tasks. 71 | * **compression** (int) Number of nodes in the compression layer between the hidden and output layers [default: 0] 72 | 73 | 4. Training parameters 74 | * **alpha** (double) Initial learning rate during gradient descent [default: 0.1] 75 | * **beta** (double) L-2 norm regularization coefficient during gradient descent [default: 0.0000001] 76 | * **min-improvement** (double) Minimum improvement before the learning rate decreases [default: 1.001] 77 | * **bptt** (int) Number of steps to propagate error back in time [default: 4] 78 | * **bptt-block** (int) Number of time steps after which the error is backpropagated through time [default: 10] 79 | * **gradient-cutoff** (double) Value beyond which the gradients are clipped, used to avoid exploding gradients [default: 15] 80 | 81 | 5. Additional parameters 82 | * **debug** (bool) Debugging level [default: false] 83 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnWeights.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise.
17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | */ 36 | 37 | #ifndef DependencyTreeRNN_RnnWeights_h 38 | #define DependencyTreeRNN_RnnWeights_h 39 | 40 | #include <cstdio> 41 | #include <string> 42 | #include <vector> 43 | #include "Utils.h" 44 | 45 | 46 | /** 47 | * Tomas Mikolov decided to implement hash tables and hash functions 48 | * from scratch... 49 | */ 50 | const unsigned int c_Primes[] = {108641969, 116049371, 125925907, 133333309, 145678979, 175308587, 197530793, 234567803, 251851741, 264197411, 330864029, 399999781, 51 | 407407183, 459258997, 479012069, 545678687, 560493491, 607407037, 629629243, 656789717, 716048933, 718518067, 725925469, 733332871, 753085943, 755555077, 52 | 782715551, 790122953, 812345159, 814814293, 893826581, 923456189, 940740127, 953085797, 985184539, 990122807}; 53 | const unsigned int c_PrimesSize = sizeof(c_Primes)/sizeof(c_Primes[0]); 54 | 55 | 56 | /** 57 | * Weights of an RNN 58 | */ 59 | class RnnWeights { 60 | public: 61 | 62 | /** 63 | * Constructor 64 | */ 65 | RnnWeights(int sizeVocabulary, 66 | int sizeHidden, 67 | int sizeFeature, 68 | int sizeClasses, 69 | int sizeCompress, 70 | long long sizeDirectConnection); 71 | 72 | /** 73 | * Load the weights matrices from a file 74 | */ 75 | void Load(FILE *fi); 76 | 77 | /** 78 | * Clear the weights, before loading a new model, to save on memory 79 | */ 80 | void Clear(); 81 | 82 | /** 83 | * Save the weights matrices to a file 84 | */ 85 | void Save(FILE *fo); 86 | 87 | // Weights between input and hidden layer 88 | std::vector<double> Input2Hidden; 89 | // Weights between former hidden state and current hidden layer 90 | std::vector<double> Recurrent2Hidden; 91 | // Weights between features and hidden layer 92 | std::vector<double> Features2Hidden; 93 | // Weights between features and output layer 94 | std::vector<double> Features2Output; 95 | // Weights between hidden and output layer (or hidden and compression if compression>0) 96 | std::vector<double> Hidden2Output; 97 | // Optional weights between compression and output layer 98 | std::vector<double> Compress2Output; 99 | // Direct parameters between input and output layer 100 | // (similar to Maximum Entropy model parameters) 101 | std::vector<double> DirectNGram; 102 |
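// Editorial note (schematic, not part of the original header): DirectNGram
// is not a true hash table but one flat array of m_sizeDirectConnection
// weights. Each context n-gram is mapped to an index in this array by a
// hash function built from the c_Primes above, conceptually:
//   index = hash(context words, c_Primes) % m_sizeDirectConnection;
// Colliding n-grams simply share the same weight (collisions are ignored).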
103 | /** 104 | * Return the number of direct connections between input words 105 | * and the output word (i.e., n-gram features) 106 | */ 107 | int GetNumDirectConnection() const { 108 | return static_cast<int>(DirectNGram.size()); 109 | } // int GetNumDirectConnection() 110 | 111 | /** 112 | * Return the number of word classes 113 | */ 114 | int GetNumClasses() const { return m_sizeClasses; } 115 | 116 | /** 117 | * Debug function 118 | */ 119 | void Debug(); 120 | 121 | 122 | protected: 123 | 124 | /** 125 | * Dimensions of the network 126 | */ 127 | int m_sizeVocabulary; 128 | int m_sizeHidden; 129 | int m_sizeFeature; 130 | int m_sizeClasses; 131 | int m_sizeCompress; 132 | long long m_sizeDirectConnection; 133 | int m_sizeInput; 134 | int m_sizeOutput; 135 | }; // class RnnWeights 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /preprocessing/JSON2unrolls.py: -------------------------------------------------------------------------------- 1 | ''' 2 | // Copyright (c) 2014 Anonymized. All rights reserved. 3 | // 4 | // Code submitted as supplementary material for manuscript: 5 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 6 | // Do not redistribute. 7 | 8 | Created on Aug 4, 2014 9 | 10 | Take a corpus in the JSON format obtained from Stanford and convert it to this format: 11 | 12 | arg1 = input directory 13 | arg2 = output directory (arg3 = sentence length threshold; optional arg4 = TOKENS for plain token output) 14 | 15 | corpus = (list of sentences) 16 | sentence = (list of unrolls) 17 | unroll = (list of tokens) 18 | token = (map containing: index in sentence, string, discount, outDep) 19 | 20 | outDep is the dependency going from the current token to the next word on the path 21 | the last token on the path (leaf node) has a LEAF outDep 22 | 23 | ''' 24 | 25 | 26 | import networkx 27 | import json 28 | import sys 29 | from collections import Counter 30 | import glob 31 | import os 32 | import os.path 33 | 34 | def extractUnrolls(sentenceDAG): 35 | unrolls = [] 36 | 37 | # so each unroll is a path from ROOT to the leaves.
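    # Editorial example: if the root r has two children a and b that are both
    # leaves, there are two root-to-leaf paths, [r, a] and [r, b]; r occurs on
    # both paths, so discountFactors[r] == 2 while discountFactors[a] == 1.
    # The discount is presumably what lets training down-weight tokens that
    # are duplicated across the unrolls of one sentence.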
38 | root2leafPaths = [] 39 | # this counts the number of times a node appears in the path 40 | discountFactors = Counter() 41 | # traverse all tokens to find the root and the leaves: 42 | leaves = [] 43 | root = None 44 | for tokenNo in sentenceDAG.nodes(): 45 | # if a token is a leaf (avoid punctuation which has no incoming ones): 46 | if sentenceDAG.out_degree(tokenNo) == 0 and sentenceDAG.in_degree(tokenNo) > 0: 47 | leaves.append(tokenNo) 48 | if sentenceDAG.in_degree(tokenNo) == 0 and sentenceDAG.out_degree(tokenNo) > 0: 49 | root = tokenNo 50 | 51 | #print "leaves:" + str(leaves) 52 | #print "root:" + str(root) 53 | 54 | for leaf in leaves: 55 | # let's get the path from ROOT: 56 | try: 57 | path = networkx.shortest_path(sentenceDAG, source=root, target=leaf) 58 | root2leafPaths.append(path) 59 | # add the discounts: 60 | for tok in path: 61 | discountFactors[tok] += 1 62 | except networkx.exception.NetworkXNoPath: 63 | print "path did not exist among tokens " + str(root) + " and " + str(leaf) + " in sentence:" 64 | print str(sentenceDAG) 65 | #print root2leafPaths 66 | #print discountFactors 67 | 68 | for path in root2leafPaths: 69 | unroll = [] 70 | for idx_in_path, tokenNo in enumerate(path): 71 | #print sentenceDAG[tokenNo] 72 | word = sentenceDAG.node[tokenNo]['word'] 73 | # the last word has the dummy out edge 74 | if idx_in_path == len(path)-1: 75 | outDep = "LEAF" 76 | else: 77 | outDep = sentenceDAG[tokenNo][path[idx_in_path+1]]["label"] 78 | unroll.append([tokenNo, word, discountFactors[tokenNo], outDep]) 79 | 80 | unrolls.append(unroll) 81 | 82 | return unrolls 83 | 84 | def constructDAG(sentence): 85 | sentenceDAG = networkx.DiGraph() 86 | # first put the nodes in the graph 87 | # fields of interest 0 (tokenNo, starting at 0), 1 (token (lowercase it maybe?), 6 (ancestor), 7 (depLabel to ancestor)) 88 | # add the root 89 | #sentenceDAG.add_node(0, word="ROOT") 90 | # add the index of the token in the sentence, remember to start things from 1 as 0 is reserved for root 91 | for idx, token in enumerate(sentence["tokens"]): 92 | sentenceDAG.add_node(idx, word=token["word"].lower()) 93 | 94 | # and now the edges: 95 | for dependency in sentence["dependencies"]: 96 | sentenceDAG.add_edge(dependency["head"], dependency["dep"], label=dependency["label"]) 97 | #networkx.draw(sentenceDAG) 98 | #print sentenceDAG.nodes(data=True) 99 | #print sentenceDAG.edges(data=True) 100 | return sentenceDAG 101 | 102 | # Create the output path 103 | os.mkdir(sys.argv[2]) 104 | threshold = int(sys.argv[3]) 105 | tokensOnly = False 106 | # check if we are generating the text for the RNNs 107 | if len(sys.argv) == 5 and sys.argv[4] == "TOKENS": 108 | tokensOnly = True 109 | threshold = float("inf") 110 | 111 | 112 | tokensKeptCounter = 0 113 | wordTypesKept = [] 114 | for filename in glob.glob(sys.argv[1]+ "/*"): 115 | allSentences = [] 116 | 117 | jsonFile = open(filename) 118 | sentences = json.loads(jsonFile.read()) 119 | jsonFile.close() 120 | 121 | for sentence in sentences: 122 | sentenceDAG = constructDAG(sentence) 123 | if (len(sentenceDAG.nodes()) < threshold): 124 | gutenbergCheck = False 125 | 126 | nodes = sentenceDAG.nodes(data=True) 127 | 128 | for node in nodes: 129 | if node[1]["word"] == "gutenberg": 130 | #print nodes 131 | gutenbergCheck = True 132 | 133 | if not gutenbergCheck: 134 | tokensKeptCounter += len(nodes) 135 | for node in nodes: 136 | if node[1]["word"] not in wordTypesKept: 137 | wordTypesKept.append( node[1]["word"]) 138 | if tokensOnly: 139 | tokens = [] 140 | for 
node in nodes: 141 | tokens.append(node[1]["word"]) 142 | allSentences.append(" ".join(tokens)) 143 | else: 144 | unrolls = extractUnrolls(sentenceDAG) 145 | allSentences.append(unrolls) 146 | print "unique word types kept=" + str(len(wordTypesKept)) 147 | if tokensOnly: 148 | with open(sys.argv[2] + "/" + os.path.basename(filename) + ".tokens.txt", "wb") as out: 149 | out.write(("\n".join(allSentences)).encode('utf-8') + "\n") 150 | else: 151 | with open(sys.argv[2] + "/" + os.path.basename(filename) + ".unrolls.json", "wb") as out: 152 | json.dump(allSentences, out) 153 | 154 | print "tokens kept=" + str(tokensKeptCounter) 155 | print "unique word types kept=" + str(len(wordTypesKept)) 156 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/Vocabulary.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 | */ 36 | 37 | #ifndef DependencyTreeRNN_Vocabulary_h 38 | #define DependencyTreeRNN_Vocabulary_h 39 | 40 | #include <algorithm> 41 | #include <cstdio> 42 | #include <string> 43 | #include <unordered_map> 44 | #include <vector> 45 | 46 | 47 | /** 48 | * Element of vocabulary 49 | */ 50 | struct VocabWord { 51 | std::string word; 52 | double prob; 53 | int cn; 54 | int classIndex; 55 | }; 56 | 57 | 58 | /** 59 | * Class storing the words in the vocabulary, the word classes 60 | * and hash tables to associate them 61 | */ 62 | class Vocabulary { 63 | public: 64 | 65 | /** 66 | * Constructor. 67 | */ 68 | Vocabulary(int numClasses) 69 | : m_numClasses(numClasses), m_useClassFile(false) { 70 | } 71 | 72 | /** 73 | * Constructor that reads the vocabulary and classes from the model file. 74 | */ 75 | Vocabulary(FILE *fi, int sizeVocabulary, int numClasses); 76 | 77 | /** 78 | * Save the vocabulary to a model file. 79 | */ 80 | void Save(FILE *fo); 81 | 82 | /** 83 | * Return the index of a word in the vocabulary, or -1 if OOV. 84 | */ 85 | int SearchWordInVocabulary(const std::string& word) const; 86 | 87 | /** 88 | * Add a word to the vocabulary. 89 | */ 90 | int AddWordToVocabulary(const std::string& word); 91 | 92 | /** 93 | * Sort the words in the vocabulary by frequency. 94 | */ 95 | void SortVocabularyByFrequency(); 96 | 97 | /** 98 | * Read the classes of words. 99 | */ 100 | bool ReadClasses(const std::string &filename); 101 | 102 | /** 103 | * Assign all the words to a class. 104 | */ 105 | void AssignWordsToClasses(); 106 |
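  // Editorial note: the word classes implement the usual class-based
  // (hierarchical softmax) factorization
  //   P(w | h) = P(class(w) | h) * P(w | class(w), h),
  // so scoring one word costs O(#classes + |class(w)|) instead of O(|V|).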
107 | /** 108 | * Return the number of words/entity tokens in the vocabulary. 109 | */ 110 | int GetVocabularySize() const { 111 | return static_cast<int>(m_vocabularyStorage.size()); 112 | } 113 | 114 | /** 115 | * Manually set the word count. 116 | */ 117 | bool SetWordCount(std::string word, int count); 118 | 119 | /** 120 | * Return the n-th word in the vocabulary. 121 | */ 122 | std::string GetNthWord(int word) const { 123 | return m_vocabularyStorage[word].word; 124 | } 125 | 126 | /** 127 | * Return the word corresponding to a given word index in the vocabulary. 128 | */ 129 | std::string Word2WordIndex(int word) const { 130 | return m_vocabularyStorage[word].word; 131 | } 132 | 133 | /** 134 | * Return the size of a word class. 135 | */ 136 | int SizeTargetClass(int targetClass) const { 137 | return static_cast<int>(m_classWords[targetClass].size()); 138 | } 139 | 140 | /** 141 | * Return the class index of a word (referenced by an index). 142 | */ 143 | int WordIndex2Class(int word) const { 144 | return m_vocabularyStorage[word].classIndex; 145 | } 146 | 147 | /** 148 | * Return the n-th word in a word class. 149 | */ 150 | int GetNthWordInClass(int targetClass, int n) const { 151 | return static_cast<int>(m_classWords[targetClass][n]); 152 | } 153 | 154 | public: 155 | 156 | // Vocabulary storage 157 | std::vector<VocabWord> m_vocabularyStorage; 158 | 159 | // Vocabulary representation (word -> index of the word) 160 | std::unordered_map<std::string, int> m_mapWord2Index; 161 | 162 | // Inverse vocabulary representation (index of the word -> word) 163 | std::unordered_map<int, std::string> m_mapIndex2Word; 164 | 165 | // Hash table enabling a look-up of the class of a word 166 | // (word -> word class) 167 | std::unordered_map<std::string, int> m_mapWord2Class; 168 | 169 | // Information relative to the classes 170 | std::vector<std::vector<int> > m_classWords; 171 | 172 | protected: 173 | bool m_useClassFile; 174 | int m_numClasses; 175 | 176 | // Store information on which word is in which class 177 | void StoreClassAssociations(); 178 | }; // class Vocabulary 179 | 180 | #endif 181 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnDependencyTreeLib.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */ 36 | 37 | #ifndef __DependencyTreeRNN____RnnDependencyTreeLib__ 38 | #define __DependencyTreeRNN____RnnDependencyTreeLib__ 39 | 40 | #include "RnnLib.h" 41 | #include "RnnTraining.h" 42 | #include "CorpusUnrollsReader.h" 43 | 44 | class RnnTreeLM : public RnnLMTraining { 45 | public: 46 | 47 | /** 48 | * Constructor for training/testing the model 49 | */ 50 | RnnTreeLM(const std::string &filename, bool doLoadModel, bool debugMode) 51 | // We load the RNN if the model file is present; 52 | // otherwise, we simply set its filename 53 | : RnnLMTraining(filename, doLoadModel, debugMode), 54 | // Parameters set by default (can be overridden when loading the model) 55 | m_typeOfDepLabels(0), m_labels(1) { 56 | // If we use dependency labels, do not connect them to the outputs 57 | m_useFeatures2Output = false; 58 | std::cout << "RnnTreeLM\n"; 59 | } 60 | 61 | public: 62 | 63 | /** 64 | * Before learning the RNN model, we need to learn the vocabulary 65 | * from the corpus. Note that the word classes may have been initialized 66 | * beforehand using ReadClasses. Computes the unigram distribution 67 | * of words from a training file, assuming that the existing vocabulary 68 | * is empty. 69 | */ 70 | bool LearnVocabularyFromTrainFile(int numClasses); 71 | 72 | /** 73 | * Import the vocabulary from a text file. 74 | */ 75 | void ImportVocabularyFromFile(std::string &filename, int numClasses) { 76 | m_corpusTrain.ImportVocabulary(filename); 77 | m_corpusValidTest.ImportVocabulary(filename); 78 | AssignVocabularyFromCorpora(numClasses); 79 | } 80 | 81 | /** 82 | * Return the number of labels (features) used in the dependency parsing. 83 | */ 84 | int GetLabelSize() const { return m_labels.GetVocabularySize(); } 85 | 86 | /** 87 | * Set the mode of the dependency labels: 88 | * 0: no dependency labels used 89 | * 1: dependency labels concatenated to the word 90 | * 2: dependency labels used as features in the feature vector 91 | */ 92 | void SetDependencyLabelType(int type) { 93 | m_typeOfDepLabels = type; 94 | } 95 | 96 | /** 97 | * Set the minimum number of word occurrences 98 | */ 99 | void SetMinWordOccurrence(int val) { 100 | m_corpusVocabulary.SetMinWordOccurrence(val); 101 | m_corpusTrain.SetMinWordOccurrence(val); 102 | m_corpusValidTest.SetMinWordOccurrence(val); 103 | } 104 | 105 | /** 106 | * Add a book to the training corpus 107 | */ 108 | void AddBookTrain(const std::string &filename) { 109 | m_corpusVocabulary.AddBookFilename(filename); 110 | m_corpusTrain.AddBookFilename(filename); 111 | } 112 | 113 | /** 114 | * Add a book to the test/validation corpus 115 | */ 116 | void AddBookTestValid(const std::string &filename) { 117 | m_corpusValidTest.AddBookFilename(filename); 118 | } 119 | 120 | /** 121 | * Function that trains the RNN on JSON trees 122 | * of dependency parse 123 | */ 124 | bool TrainRnnModel(); 125 | 126 | /** 127 | * Function that tests the RNN on JSON trees 128 | * of dependency parse 129 | */ 130 | bool TestRnnModel(const std::string &testFile, 131 | const std::string &featureFile, 132 | std::vector<double> &sentenceScores, 133 | double &logProbability, 134 | double &perplexity, 135 | double &entropy, 136 | double &accuracy); 137 | 138 | protected: 139 | 140 | // Corpora 141 | CorpusUnrolls m_corpusVocabulary; 142 | CorpusUnrolls m_corpusTrain; 143 | CorpusUnrolls m_corpusValidTest; 144 | 145 | // Type of dependency labels (0, 1 or 2) 146 | int m_typeOfDepLabels; 147 | 148 | // Label vocabulary hashtables 149 | Vocabulary
m_labels; 150 | 151 | // Label vocabulary representation (label -> index of the label) 152 | std::unordered_map<std::string, int> m_mapLabel2Index; 153 | 154 | // Reset the vector of feature labels 155 | void ResetFeatureLabelVector(RnnState &state) const; 156 | 157 | // Update the vector of feature labels 158 | void UpdateFeatureLabelVector(int label, RnnState &state) const; 159 | 160 | // Assign the vocabulary from the corpora to the model, 161 | // and compute the word classes. 162 | bool AssignVocabularyFromCorpora(int numClasses); 163 | }; 164 | 165 | #endif /* defined(__DependencyTreeRNN____RnnDependencyTreeLib__) */ 166 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/Utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */
36 | 
37 | #ifndef DependencyTreeRNN_Utils_h
38 | #define DependencyTreeRNN_Utils_h
39 | 
40 | #include <stdio.h>
41 | #include <stdlib.h>
42 | 
43 | #include <fstream>
44 | #include <iostream>
45 | #include <sstream>
46 | #include <string>
47 | #include <vector>
48 | #include <cmath>
49 | 
50 | 
51 | /**
52 |  * Log to screen and to file (append)
53 |  */
54 | static void Log(std::string str, std::string logFilename) {
55 |   std::ostringstream buf;
56 |   std::ofstream logFile(logFilename, std::fstream::app);
57 |   buf << str;
58 |   logFile << buf.str() << std::flush;
59 |   std::cout << buf.str() << std::flush;
60 |   buf.str("");
61 |   buf.clear();
62 | }
63 | 
64 | 
65 | /**
66 |  * Log to screen only
67 |  */
68 | static void Log(std::string str) {
69 |   std::ostringstream buf;
70 |   buf << str;
71 |   std::cout << buf.str() << std::flush;
72 |   buf.str("");
73 |   buf.clear();
74 | }
75 | 
76 | 
77 | /**
78 |  * Read a matrix of floats in binary format
79 |  */
80 | static void ReadBinaryMatrix(FILE *fi, int sizeIn, int sizeOut,
81 |                              std::vector<double> &vec) {
82 |   if (sizeIn * sizeOut == 0) {
83 |     return;
84 |   }
85 |   for (int idxOut = 0; idxOut < sizeOut; idxOut++) {
86 |     for (int idxIn = 0; idxIn < sizeIn; idxIn++) {
87 |       float val;
88 |       fread(&val, 4, 1, fi);
89 |       vec[idxIn + idxOut * sizeIn] = val;
90 |     }
91 |   }
92 | }
93 | 
94 | 
95 | /**
96 |  * Read a vector of floats in binary format
97 |  */
98 | static void ReadBinaryVector(FILE *fi, long long size,
99 |                              std::vector<double> &vec) {
100 |   for (long long aa = 0; aa < size; aa++) {
101 |     float val;
102 |     fread(&val, 4, 1, fi);
103 |     vec[aa] = val;
104 |   }
105 | }
106 | 
107 | 
108 | /**
109 |  * Save a matrix of floats in binary format
110 |  */
111 | static void SaveBinaryMatrix(FILE *fo, int sizeIn, int sizeOut,
112 |                              const std::vector<double> &vec) {
113 |   if (sizeIn * sizeOut == 0) {
114 |     return;
115 |   }
116 |   for (int idxOut = 0; idxOut < sizeOut; idxOut++) {
117 |     for (int idxIn = 0; idxIn < sizeIn; idxIn++) {
118 |       float val = (float)(vec[idxIn + idxOut * sizeIn]);
119 |       fwrite(&val, 4, 1, fo);
120 |     }
121 |   }
122 | }
123 | 
124 | 
125 | /**
126 |  * Save a vector of floats in binary format
127 |  */
128 | static void SaveBinaryVector(FILE *fo, long long size,
129 |                              const std::vector<double> &vec) {
130 |   for (long long aa = 0; aa < size; aa++) {
131 |     float val = vec[aa];
132 |     fwrite(&val, 4, 1, fo);
133 |   }
134 | }
135 | 
136 | 
137 | /**
138 |  * Generate a uniform random double in the range [min, max]
139 |  */
140 | static double GenerateUniformRandomNumber(double min, double max) {
141 |   return rand() / ((double)RAND_MAX) * (max - min) + min;
142 | }
143 | 
144 | 
145 | /**
146 |  * Random number generator (approximate Gaussian distribution),
147 |  * zero-mean and standard deviation 0.1
148 |  */
149 | static double GenerateNormalRandomNumber() {
150 |   return (GenerateUniformRandomNumber(-0.1, 0.1)
151 |           + GenerateUniformRandomNumber(-0.1, 0.1)
152 |           + GenerateUniformRandomNumber(-0.1, 0.1));
153 | }
154 | 
155 | 
156 | /**
157 |  * Randomize a vector with small numbers to get zero-mean random numbers
158 |  */
159 | static void RandomizeVector(std::vector<double> &vec) {
160 |   for (size_t k = 0; k < vec.size(); k++) {
161 |     vec[k] = GenerateNormalRandomNumber();
162 |   }
163 | }
164 | 
165 | 
166 | /**
167 |  * Convert int to string
168 |  */
169 | static std::string ConvString(int val) {
170 |   return std::to_string(static_cast<long long>(val));
171 | }
172 | 
173 | 
174 | /**
175 |  * Convert size_t to string
176 |  */
177 | static std::string ConvString(size_t val) {
178 |   return std::to_string(static_cast<long long>(val));
179 | }
180 | 
181 | 
182 | /**
183 |  * Convert long int to string
184 |  */
185 | static std::string ConvString(long int val) {
186 |   return std::to_string(static_cast<long long>(val));
187 | }
188 | 
189 | 
190 | /**
191 |  * Convert long long int to string
192 |  */
193 | static std::string ConvString(long long int val) {
194 |   return std::to_string(val);
195 | }
196 | 
197 | 
198 | /**
199 |  * Convert double to string
200 |  */
201 | static std::string ConvString(double val) {
202 |   return std::to_string(val);
203 | }
204 | 
205 | 
206 | #endif
207 | 
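A minimal round-trip using the helpers above (illustrative only; the file name is a placeholder):

    #include "Utils.h"

    int main() {
      std::vector<double> v(8);
      RandomizeVector(v);                    // fill with small zero-mean values
      FILE *fo = fopen("weights.bin", "wb");
      SaveBinaryVector(fo, v.size(), v);     // stored on disk as 4-byte floats
      fclose(fo);

      std::vector<double> w(v.size());
      FILE *fi = fopen("weights.bin", "rb");
      ReadBinaryVector(fi, w.size(), w);     // values come back float-truncated
      fclose(fi);
      Log("Read back " + ConvString(w.size()) + " values\n");
      return 0;
    }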
--------------------------------------------------------------------------------
/preprocessing/Text2Parsed2JSON.java:
--------------------------------------------------------------------------------
1 | import edu.stanford.nlp.pipeline.*;
2 | import edu.stanford.nlp.util.*;
3 | import edu.stanford.nlp.semgraph.SemanticGraph;
4 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
5 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.*;
6 | import edu.stanford.nlp.ling.CoreAnnotations;
7 | import edu.stanford.nlp.ling.CoreLabel;
8 | import edu.stanford.nlp.ling.CoreAnnotations.*;
9 | 
10 | import java.io.*;
11 | import java.util.*;
12 | 
13 | import com.google.gson.*;
14 | 
15 | 
16 | 
17 | public class Text2Parsed2JSON {
18 | 
19 |   public class MyToken {
20 |     public String word;
21 |     public String lemma;
22 |     public String pos;
23 |     public String ner;
24 | 
25 |     public MyToken() {}
26 | 
27 |   }
28 | 
29 |   public class MyDependency {
30 |     public int head;
31 |     public int dep;
32 |     public String label;
33 | 
34 |     public MyDependency() {}
35 |   }
36 | 
37 |   public class MySentence {
38 | 
39 |     public ArrayList<MyToken> tokens;
40 |     public ArrayList<MyDependency> dependencies;
41 | 
42 |     public MySentence() {
43 |       tokens = new ArrayList<MyToken>();
44 |       dependencies = new ArrayList<MyDependency>();
45 |     }
46 | 
47 |   }
48 | 
49 |   // this holds the main pipeline for the processing
50 |   private StanfordCoreNLP mainPipeline;
51 | 
52 |   public static String readTextFromFile(File textFileName) throws IOException {
53 |     BufferedReader textFile = new BufferedReader(new FileReader(textFileName));
54 |     String line;
55 |     StringBuffer result = new StringBuffer();
56 |     while ((line = textFile.readLine()) != null) {
57 |       // add the newline back
58 |       result.append(line + "\n");
59 |     }
60 |     textFile.close();
61 |     return result.toString();
62 |   }
63 | 
64 |   // Dummy filter that returns the same text that was passed as input;
65 |   // override it to do more interesting things (might need extra initialization).
66 |   private String filterText(String text) {
67 |     return text;
68 |   }
69 | 
70 |   public Text2Parsed2JSON() {
71 |     // Initialize the parser:
72 |     Properties parser_props = new Properties();
73 |     parser_props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
74 |     // I assume that longer sentences are unlikely to be useful.
75 |     parser_props.put("parse.maxlen", "80");
76 |     //parser_props.put("tokenize.whitespace", "true");
77 |     //parser_props.put("ssplit.isOneSentence", "true");
78 |     mainPipeline = new StanfordCoreNLP(parser_props);
79 | 
80 |   }
81 | 
82 |   // this takes text, runs the main processor and returns the Stanford annotations
83 |   // for the sentences kept
84 |   public Annotation processText2Annotations(String text) {
85 |     // filter the text
86 |     String filteredText = filterText(text);
87 |     // create an empty Annotation just with the given text
88 |     Annotation annotatedText = new Annotation(filteredText);
89 | 
90 |     mainPipeline.annotate(annotatedText);
91 | 
92 |     return annotatedText;
93 |   }
94 | 
95 | 
96 |   public String processAnnotations2JSON(Annotation annotatedText) {
97 | 
98 |     // initialize the sentences array
99 |     ArrayList<MySentence> mySentences = new ArrayList<MySentence>();
100 | 
101 |     // get the sentences
102 |     List<CoreMap> sentences = annotatedText.get(SentencesAnnotation.class);
103 | 
104 |     for (CoreMap sentence : sentences) {
105 |       MySentence newSentence = new MySentence();
106 |       // traversing the words in the current sentence
107 |       // a CoreLabel is a CoreMap with additional token-specific methods
108 |       for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
109 |         MyToken newToken = new MyToken();
110 |         // this is the text of the token
111 |         String word = token.get(TextAnnotation.class);
112 |         // this is the POS tag of the token
113 |         String pos = token.get(PartOfSpeechAnnotation.class);
114 |         // this is the NER label of the token
115 |         String ne = token.get(NamedEntityTagAnnotation.class);
116 |         // this is the lemma
117 |         String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
118 |         newToken.lemma = lemma;
119 |         newToken.pos = pos;
120 |         newToken.ner = ne;
121 |         newToken.word = word;
122 | 
123 |         newSentence.tokens.add(newToken);
124 |       }
125 | 
126 | 
127 |       // this is the Stanford dependency graph of the current sentence.
128 |       // If a tree with all the tokens is required, use BasicDependenciesAnnotation.
129 |       // The ones that commonly work best for IE are CollapsedCCProcessedDependenciesAnnotation (careful, they are not even DAGs)
130 |       SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
131 |       //System.out.print(dependencies.toString("plain"));
132 | 
133 |       //Set<SemanticGraphEdge> allEdges = dependencies.getEdgeSet();
134 | 
135 |       for (SemanticGraphEdge edge : dependencies.edgeIterable()) {
136 |         MyDependency dep = new MyDependency();
137 |         // remember to subtract one so that the first word starts at 0
138 |         dep.head = edge.getGovernor().index() - 1;
139 |         dep.dep = edge.getDependent().index() - 1;
140 |         dep.label = edge.getRelation().toString();
141 | 
142 |         newSentence.dependencies.add(dep);
143 |       }
144 | 
145 |       mySentences.add(newSentence);
146 |     }
147 | 
148 | 
149 |     Gson gson = new Gson();
150 | 
151 | 
152 |     return gson.toJson(mySentences);
153 |   }
154 | 
155 | 
156 |   /**
157 |    * @param args
158 |    */
159 |   public static void main(String[] args) {
160 |     // initialize
161 |     Text2Parsed2JSON processor = new Text2Parsed2JSON();
162 | 
163 |     // get the directory with the text files
164 |     File extractsDirectory = new File(args[0]);
165 | 
166 |     // get the output directory
167 |     File outputDirectory = new File(args[1]);
168 |     outputDirectory.mkdir();
169 | 
170 |     // get a list of files:
171 |     File[] textFileNames = extractsDirectory.listFiles();
172 |     System.out.println("Files to process: " + textFileNames.length);
173 | 
174 |     // For each text file:
175 |     for (int i = 0; i < textFileNames.length; i++) {
176 | 
177 |       //
First get the filename 178 | //String filename = textFileNames[i].getName(); 179 | System.out.println(textFileNames[i]); 180 | // Read in the text 181 | String text; 182 | try { 183 | text = readTextFromFile(textFileNames[i]); 184 | // process 185 | Annotation annotatedText = processor.processText2Annotations(text); 186 | String JSONsentences = processor.processAnnotations2JSON(annotatedText); 187 | //System.out.println(JSONsentences); 188 | 189 | // Create the file for the output 190 | File JSONFile = new File(outputDirectory, textFileNames[i].getName() + ".json"); 191 | //System.out.println(JSONFile.getPath()); 192 | //System.out.println(JSONFile.getName()); 193 | BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(JSONFile), "utf-8")); 194 | out.write(JSONsentences); 195 | out.close(); 196 | } catch (IOException e) { 197 | // TODO Auto-generated catch block 198 | e.printStackTrace(); 199 | } 200 | 201 | 202 | 203 | } 204 | } 205 | 206 | } 207 | -------------------------------------------------------------------------------- /books/all.labels: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | 3 4 | 4 5 | 3 6 | 3 7 | 3 8 | 1 9 | 3 10 | 0 11 | 1 12 | 4 13 | 2 14 | 0 15 | 3 16 | 4 17 | 1 18 | 0 19 | 0 20 | 0 21 | 4 22 | 2 23 | 3 24 | 4 25 | 4 26 | 0 27 | 0 28 | 2 29 | 2 30 | 4 31 | 3 32 | 0 33 | 0 34 | 0 35 | 1 36 | 4 37 | 4 38 | 0 39 | 4 40 | 4 41 | 3 42 | 4 43 | 0 44 | 1 45 | 1 46 | 2 47 | 3 48 | 2 49 | 2 50 | 1 51 | 1 52 | 2 53 | 2 54 | 1 55 | 3 56 | 3 57 | 4 58 | 4 59 | 0 60 | 1 61 | 1 62 | 2 63 | 4 64 | 3 65 | 3 66 | 0 67 | 4 68 | 1 69 | 4 70 | 4 71 | 2 72 | 4 73 | 2 74 | 3 75 | 2 76 | 4 77 | 4 78 | 4 79 | 2 80 | 1 81 | 1 82 | 1 83 | 3 84 | 4 85 | 0 86 | 2 87 | 2 88 | 2 89 | 4 90 | 3 91 | 3 92 | 0 93 | 3 94 | 2 95 | 4 96 | 3 97 | 4 98 | 4 99 | 4 100 | 0 101 | 0 102 | 3 103 | 4 104 | 1 105 | 4 106 | 2 107 | 2 108 | 3 109 | 1 110 | 0 111 | 1 112 | 0 113 | 0 114 | 2 115 | 1 116 | 2 117 | 4 118 | 4 119 | 4 120 | 4 121 | 4 122 | 3 123 | 2 124 | 3 125 | 3 126 | 3 127 | 4 128 | 1 129 | 2 130 | 4 131 | 1 132 | 1 133 | 0 134 | 3 135 | 1 136 | 2 137 | 2 138 | 1 139 | 2 140 | 1 141 | 0 142 | 3 143 | 2 144 | 2 145 | 3 146 | 3 147 | 2 148 | 2 149 | 1 150 | 4 151 | 4 152 | 4 153 | 1 154 | 4 155 | 2 156 | 2 157 | 4 158 | 4 159 | 1 160 | 3 161 | 0 162 | 2 163 | 0 164 | 4 165 | 3 166 | 1 167 | 3 168 | 4 169 | 0 170 | 3 171 | 0 172 | 4 173 | 0 174 | 2 175 | 2 176 | 1 177 | 0 178 | 1 179 | 3 180 | 3 181 | 4 182 | 3 183 | 3 184 | 4 185 | 2 186 | 4 187 | 0 188 | 3 189 | 2 190 | 4 191 | 1 192 | 0 193 | 0 194 | 4 195 | 2 196 | 2 197 | 4 198 | 4 199 | 0 200 | 3 201 | 0 202 | 2 203 | 3 204 | 4 205 | 0 206 | 0 207 | 1 208 | 2 209 | 2 210 | 4 211 | 2 212 | 3 213 | 1 214 | 2 215 | 1 216 | 3 217 | 4 218 | 2 219 | 1 220 | 3 221 | 4 222 | 4 223 | 4 224 | 2 225 | 0 226 | 0 227 | 2 228 | 2 229 | 0 230 | 4 231 | 2 232 | 4 233 | 2 234 | 0 235 | 1 236 | 3 237 | 0 238 | 4 239 | 1 240 | 2 241 | 2 242 | 3 243 | 3 244 | 3 245 | 1 246 | 2 247 | 1 248 | 1 249 | 1 250 | 3 251 | 1 252 | 4 253 | 2 254 | 3 255 | 1 256 | 3 257 | 3 258 | 4 259 | 1 260 | 0 261 | 3 262 | 2 263 | 2 264 | 1 265 | 2 266 | 2 267 | 1 268 | 4 269 | 0 270 | 3 271 | 2 272 | 3 273 | 0 274 | 3 275 | 2 276 | 2 277 | 4 278 | 4 279 | 0 280 | 0 281 | 4 282 | 3 283 | 4 284 | 0 285 | 0 286 | 3 287 | 2 288 | 0 289 | 2 290 | 1 291 | 1 292 | 0 293 | 4 294 | 4 295 | 1 296 | 0 297 | 2 298 | 1 299 | 4 300 | 1 301 | 3 302 | 2 303 | 0 304 | 2 305 | 2 306 | 1 307 | 4 308 | 1 309 | 4 310 | 4 311 | 3 
312 | 0 313 | 4 314 | 4 315 | 2 316 | 2 317 | 3 318 | 1 319 | 2 320 | 3 321 | 1 322 | 1 323 | 4 324 | 4 325 | 4 326 | 4 327 | 1 328 | 0 329 | 2 330 | 1 331 | 1 332 | 0 333 | 1 334 | 4 335 | 0 336 | 1 337 | 3 338 | 2 339 | 4 340 | 0 341 | 4 342 | 3 343 | 4 344 | 2 345 | 0 346 | 1 347 | 1 348 | 2 349 | 1 350 | 2 351 | 0 352 | 3 353 | 0 354 | 2 355 | 3 356 | 3 357 | 2 358 | 1 359 | 2 360 | 1 361 | 2 362 | 3 363 | 1 364 | 1 365 | 0 366 | 3 367 | 0 368 | 4 369 | 2 370 | 3 371 | 1 372 | 4 373 | 2 374 | 3 375 | 1 376 | 2 377 | 0 378 | 4 379 | 4 380 | 4 381 | 1 382 | 4 383 | 0 384 | 0 385 | 2 386 | 4 387 | 2 388 | 3 389 | 2 390 | 0 391 | 4 392 | 4 393 | 4 394 | 3 395 | 4 396 | 3 397 | 4 398 | 3 399 | 3 400 | 4 401 | 0 402 | 3 403 | 0 404 | 4 405 | 3 406 | 0 407 | 0 408 | 2 409 | 2 410 | 2 411 | 3 412 | 2 413 | 0 414 | 1 415 | 2 416 | 2 417 | 3 418 | 2 419 | 4 420 | 3 421 | 0 422 | 0 423 | 2 424 | 2 425 | 4 426 | 2 427 | 1 428 | 0 429 | 1 430 | 3 431 | 2 432 | 1 433 | 4 434 | 3 435 | 4 436 | 4 437 | 3 438 | 2 439 | 0 440 | 3 441 | 0 442 | 1 443 | 1 444 | 4 445 | 1 446 | 2 447 | 3 448 | 2 449 | 3 450 | 4 451 | 0 452 | 4 453 | 3 454 | 3 455 | 4 456 | 3 457 | 3 458 | 1 459 | 2 460 | 2 461 | 0 462 | 0 463 | 3 464 | 2 465 | 3 466 | 2 467 | 3 468 | 3 469 | 4 470 | 3 471 | 2 472 | 3 473 | 3 474 | 4 475 | 3 476 | 4 477 | 1 478 | 1 479 | 3 480 | 2 481 | 3 482 | 4 483 | 1 484 | 0 485 | 2 486 | 4 487 | 0 488 | 4 489 | 1 490 | 2 491 | 4 492 | 1 493 | 1 494 | 1 495 | 2 496 | 0 497 | 1 498 | 1 499 | 3 500 | 3 501 | 1 502 | 1 503 | 1 504 | 3 505 | 3 506 | 3 507 | 3 508 | 1 509 | 2 510 | 4 511 | 4 512 | 4 513 | 1 514 | 3 515 | 1 516 | 0 517 | 3 518 | 2 519 | 3 520 | 1 521 | 2 522 | 4 523 | 3 524 | 1 525 | 0 526 | 3 527 | 0 528 | 4 529 | 2 530 | 2 531 | 2 532 | 2 533 | 1 534 | 3 535 | 1 536 | 4 537 | 1 538 | 1 539 | 1 540 | 0 541 | 2 542 | 0 543 | 4 544 | 3 545 | 4 546 | 2 547 | 0 548 | 3 549 | 2 550 | 0 551 | 2 552 | 0 553 | 4 554 | 0 555 | 2 556 | 2 557 | 2 558 | 2 559 | 2 560 | 3 561 | 0 562 | 2 563 | 3 564 | 4 565 | 1 566 | 0 567 | 3 568 | 1 569 | 1 570 | 2 571 | 3 572 | 2 573 | 0 574 | 0 575 | 3 576 | 1 577 | 2 578 | 4 579 | 2 580 | 0 581 | 4 582 | 2 583 | 2 584 | 1 585 | 4 586 | 1 587 | 4 588 | 0 589 | 1 590 | 4 591 | 4 592 | 0 593 | 3 594 | 2 595 | 3 596 | 3 597 | 1 598 | 1 599 | 3 600 | 2 601 | 3 602 | 0 603 | 1 604 | 0 605 | 4 606 | 0 607 | 3 608 | 1 609 | 2 610 | 4 611 | 1 612 | 4 613 | 1 614 | 1 615 | 1 616 | 1 617 | 4 618 | 2 619 | 2 620 | 4 621 | 2 622 | 1 623 | 4 624 | 3 625 | 4 626 | 0 627 | 0 628 | 3 629 | 4 630 | 0 631 | 1 632 | 3 633 | 3 634 | 4 635 | 3 636 | 2 637 | 2 638 | 1 639 | 0 640 | 4 641 | 4 642 | 0 643 | 1 644 | 2 645 | 1 646 | 1 647 | 2 648 | 0 649 | 3 650 | 4 651 | 2 652 | 2 653 | 1 654 | 3 655 | 2 656 | 4 657 | 0 658 | 3 659 | 0 660 | 0 661 | 1 662 | 3 663 | 2 664 | 2 665 | 0 666 | 4 667 | 1 668 | 1 669 | 0 670 | 1 671 | 2 672 | 0 673 | 4 674 | 4 675 | 0 676 | 1 677 | 2 678 | 2 679 | 1 680 | 4 681 | 3 682 | 0 683 | 4 684 | 1 685 | 4 686 | 1 687 | 0 688 | 0 689 | 3 690 | 0 691 | 2 692 | 3 693 | 0 694 | 4 695 | 4 696 | 2 697 | 4 698 | 3 699 | 1 700 | 0 701 | 4 702 | 2 703 | 1 704 | 2 705 | 1 706 | 1 707 | 3 708 | 1 709 | 0 710 | 1 711 | 0 712 | 4 713 | 3 714 | 3 715 | 1 716 | 4 717 | 1 718 | 0 719 | 1 720 | 4 721 | 0 722 | 0 723 | 0 724 | 3 725 | 2 726 | 4 727 | 2 728 | 0 729 | 1 730 | 1 731 | 2 732 | 1 733 | 2 734 | 3 735 | 2 736 | 2 737 | 4 738 | 2 739 | 1 740 | 4 741 | 0 742 | 0 743 | 4 744 | 4 745 | 0 746 | 4 747 | 3 748 | 0 749 | 2 750 | 4 751 | 1 752 | 0 753 | 4 754 | 0 755 | 2 
756 | 0 757 | 3 758 | 0 759 | 1 760 | 3 761 | 3 762 | 2 763 | 1 764 | 1 765 | 2 766 | 4 767 | 1 768 | 4 769 | 4 770 | 4 771 | 1 772 | 4 773 | 2 774 | 0 775 | 3 776 | 3 777 | 4 778 | 1 779 | 0 780 | 4 781 | 3 782 | 0 783 | 2 784 | 1 785 | 0 786 | 4 787 | 2 788 | 3 789 | 3 790 | 1 791 | 2 792 | 0 793 | 0 794 | 3 795 | 3 796 | 0 797 | 2 798 | 4 799 | 0 800 | 2 801 | 3 802 | 1 803 | 0 804 | 1 805 | 2 806 | 4 807 | 1 808 | 0 809 | 4 810 | 1 811 | 1 812 | 1 813 | 0 814 | 4 815 | 2 816 | 0 817 | 2 818 | 1 819 | 3 820 | 0 821 | 0 822 | 3 823 | 1 824 | 0 825 | 3 826 | 2 827 | 3 828 | 1 829 | 3 830 | 4 831 | 3 832 | 4 833 | 3 834 | 3 835 | 2 836 | 0 837 | 3 838 | 0 839 | 0 840 | 0 841 | 0 842 | 0 843 | 3 844 | 4 845 | 2 846 | 0 847 | 3 848 | 2 849 | 1 850 | 1 851 | 0 852 | 4 853 | 2 854 | 0 855 | 3 856 | 1 857 | 4 858 | 0 859 | 4 860 | 0 861 | 3 862 | 4 863 | 3 864 | 2 865 | 2 866 | 4 867 | 4 868 | 3 869 | 3 870 | 1 871 | 2 872 | 4 873 | 0 874 | 2 875 | 4 876 | 2 877 | 0 878 | 3 879 | 2 880 | 3 881 | 3 882 | 2 883 | 2 884 | 2 885 | 2 886 | 3 887 | 2 888 | 2 889 | 0 890 | 0 891 | 2 892 | 4 893 | 2 894 | 0 895 | 0 896 | 4 897 | 0 898 | 2 899 | 4 900 | 3 901 | 1 902 | 2 903 | 2 904 | 0 905 | 1 906 | 3 907 | 2 908 | 2 909 | 3 910 | 0 911 | 3 912 | 4 913 | 4 914 | 4 915 | 0 916 | 3 917 | 1 918 | 0 919 | 2 920 | 3 921 | 3 922 | 4 923 | 4 924 | 0 925 | 4 926 | 0 927 | 3 928 | 1 929 | 1 930 | 4 931 | 1 932 | 4 933 | 3 934 | 4 935 | 3 936 | 1 937 | 2 938 | 0 939 | 3 940 | 1 941 | 1 942 | 0 943 | 4 944 | 1 945 | 1 946 | 1 947 | 4 948 | 0 949 | 4 950 | 1 951 | 0 952 | 1 953 | 0 954 | 0 955 | 1 956 | 1 957 | 1 958 | 3 959 | 3 960 | 0 961 | 4 962 | 3 963 | 2 964 | 1 965 | 0 966 | 2 967 | 3 968 | 3 969 | 2 970 | 3 971 | 4 972 | 1 973 | 1 974 | 3 975 | 1 976 | 4 977 | 0 978 | 4 979 | 0 980 | 4 981 | 2 982 | 1 983 | 1 984 | 0 985 | 1 986 | 0 987 | 3 988 | 1 989 | 2 990 | 0 991 | 1 992 | 3 993 | 1 994 | 3 995 | 0 996 | 2 997 | 1 998 | 1 999 | 4 1000 | 4 1001 | 1 1002 | 0 1003 | 1 1004 | 2 1005 | 3 1006 | 2 1007 | 3 1008 | 4 1009 | 0 1010 | 4 1011 | 1 1012 | 1 1013 | 0 1014 | 0 1015 | 1 1016 | 4 1017 | 3 1018 | 3 1019 | 3 1020 | 4 1021 | 1 1022 | 0 1023 | 2 1024 | 4 1025 | 4 1026 | 1 1027 | 0 1028 | 4 1029 | 4 1030 | 1 1031 | 4 1032 | 3 1033 | 1 1034 | 2 1035 | 3 1036 | 1 1037 | 4 1038 | 1 1039 | 0 1040 | 2 1041 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/CorpusUnrollsReader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #ifndef __DependencyTreeRNN____corpus__ 8 | #define __DependencyTreeRNN____corpus__ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | /** 19 | * Basic unit of a text: a token 20 | */ 21 | struct Token { 22 | int pos; 23 | int wordAsContext; 24 | int wordAsTarget; 25 | double discount; 26 | int label; 27 | }; 28 | 29 | /** 30 | * Sentence unroll: a vector of tokens 31 | */ 32 | typedef std::vector Unroll; 33 | 34 | /** 35 | * Sentence: a vector of unrolls 36 | */ 37 | typedef std::vector Sentence; 38 | 39 | 40 | /** 41 | * Book: a class containing a vector of sentences 42 | */ 43 | class BookUnrolls { 44 | public: 45 | 46 | /** 47 | * Constructor and destructor 48 | */ 49 | BookUnrolls() { Burn(); } 50 | ~BookUnrolls() { } 51 | 52 | /** 53 | * Wipe-out all 
content of the book
54 |    */
55 |   void Burn() {
56 |     _sentences.clear();
57 |     _numUnrollsInSentence.clear();
58 |     _numTokensInUnrollSentence.clear();
59 |     _numSentences = 0;
60 |     _sentenceIndex = 0;
61 |     _unrollIndex = 0;
62 |     _tokenIndex = 0;
63 |     _numTokens = 0;
64 |   }
65 | 
66 |   /**
67 |    * Add a token to the book
68 |    */
69 |   void AddToken(bool new_sentence, bool new_unroll,
70 |                 int pos, int wordAsContext, int wordAsTarget,
71 |                 double discount, int label);
72 | 
73 |   /**
74 |    * Return the number of sentences
75 |    */
76 |   int NumSentences() { return _numSentences; }
77 | 
78 |   /**
79 |    * Return the number of unrolls in sentence
80 |    */
81 |   int NumUnrolls(int k) { return _numUnrollsInSentence[k]; }
82 | 
83 |   /**
84 |    * Return the number of tokens in unroll of a sentence
85 |    */
86 |   int NumTokens(int k, int j) { return _numTokensInUnrollSentence[k][j]; }
87 | 
88 |   /**
89 |    * Return the index of the current sentence
90 |    */
91 |   int CurrentSentenceIndex() { return _sentenceIndex; }
92 | 
93 |   /**
94 |    * Return the index of the current unroll
95 |    */
96 |   int CurrentUnrollIndex() { return _unrollIndex; }
97 | 
98 |   /**
99 |    * Go to a specific sentence
100 |    */
101 |   bool GoToSentence(int n);
102 | 
103 |   /**
104 |    * Go to the next sentence
105 |    */
106 |   int NextSentence();
107 | 
108 |   /**
109 |    * Go to the next unroll in the sentence
110 |    */
111 |   int NextUnrollInSentence();
112 | 
113 |   /**
114 |    * Go to the next token in the current unroll.
115 |    * Here, we do not loop over but stop (return -1)
116 |    * when the end of the unroll is reached.
117 |    */
118 |   int NextTokenInUnroll();
119 | 
120 |   /**
121 |    * Update the current token
122 |    */
123 |   void UpdateCurrentToken() {
124 |     _currentToken =
125 |       &(_sentences[_sentenceIndex][_unrollIndex][_tokenIndex]);
126 |   }
127 | 
128 |   /**
129 |    * Accessors to the current token's information
130 |    */
131 |   int CurrentTokenNumberInSentence() { return _currentToken->pos; }
132 |   double CurrentTokenDiscount() { return _currentToken->discount; }
133 |   int CurrentTokenWordAsContext() { return _currentToken->wordAsContext; }
134 |   int CurrentTokenWordAsTarget() { return _currentToken->wordAsTarget; }
135 |   int CurrentTokenLabel() { return _currentToken->label; }
136 | 
137 |   /**
138 |    * Reset to the first sentence of the book
139 |    */
140 |   void ResetSentence() {
141 |     _sentenceIndex = 0;
142 |     // Recursively reset the unroll of that first sentence
143 |     ResetUnroll();
144 |   }
145 | 
146 |   /**
147 |    * Reset the unroll in the current sentence
148 |    */
149 |   void ResetUnroll() {
150 |     _unrollIndex = 0;
151 |     // Recursively reset the token of that first unroll
152 |     ResetToken();
153 |   }
154 | 
155 |   /**
156 |    * Reset the token in the current sentence and unroll
157 |    */
158 |   void ResetToken() {
159 |     _tokenIndex = 0;
160 |     UpdateCurrentToken();
161 |   }
162 | 
163 |   /**
164 |    * Number of tokens
165 |    */
166 |   long NumTokens() { return _numTokens; }
167 | 
168 | protected:
169 | 
170 |   // All the sentences of the book
171 |   std::vector<Sentence> _sentences;
172 | 
173 |   // Pointer to the current token
174 |   Token *_currentToken;
175 | 
176 |   // Current sentence, unroll and token index
177 |   int _sentenceIndex;
178 |   int _unrollIndex;
179 |   int _tokenIndex;
180 | 
181 |   // Number of sentences
182 |   int _numSentences;
183 | 
184 |   // Number of unrolls in each sentence
185 |   std::vector<int> _numUnrollsInSentence;
186 | 
187 |   // Number of tokens in each unroll and sentence
188 |   std::vector<std::vector<int> > _numTokensInUnrollSentence;
189 | 
190 |   // Total number of tokens
191 |   long _numTokens;
192 | };
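// Example (illustrative sketch, not part of the original header): walk every
// token of a book with the iterator-style API above, assuming that
// NextTokenInUnroll() returns -1 once the current unroll is exhausted.
static inline void ExampleWalkBook(BookUnrolls &book) {
  book.ResetSentence();
  for (int s = 0; s < book.NumSentences(); s++) {
    for (int u = 0; u < book.NumUnrolls(s); u++) {
      int token = 0;
      while (token != -1) {
        int word = book.CurrentTokenWordAsContext();  // word used as input
        int label = book.CurrentTokenLabel();         // dependency label
        (void)word; (void)label;                      // consume them here
        token = book.NextTokenInUnroll();             // -1 at end of unroll
      }
      book.NextUnrollInSentence();
    }
    book.NextSentence();
  }
}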
193 | 
194 | 
195 | /**
196 |  * CorpusUnrolls: contains all vocabulary and the list of books
197 |  * but stores only one book at a time
198 |  */
199 | class CorpusUnrolls {
200 | public:
201 |   /**
202 |    * Constructor
203 |    */
204 |   CorpusUnrolls() :
205 |   _minWordOccurrence(3),
206 |   _oov(0),
207 |   _vocabSizeWords(0),
208 |   _vocabSizeLabels(0),
209 |   _currentBookIndex(-1) {
210 |     // Insert OOV and EOS tokens
211 |     InsertWord("<unk>", 1.0);
212 |     InsertWord("</s>", 1.0);
213 |     // Insert ROOT label
214 |     InsertLabel("ROOT");
215 |   }
216 | 
217 |   /**
218 |    * Destructor
219 |    */
220 |   ~CorpusUnrolls() { }
221 | 
222 | public:
223 |   /**
224 |    * Number of books
225 |    */
226 |   int NumBooks() { return (int)(_bookFilenames.size()); }
227 | 
228 |   /**
229 |    * Size of the vocabulary
230 |    */
231 |   int NumWords() { return _vocabSizeWords; }
232 | 
233 |   /**
234 |    * Number of labels
235 |    */
236 |   int NumLabels() { return _vocabSizeLabels; }
237 | 
238 |   /**
239 |    * Look-up a word in the vocabulary
240 |    */
241 |   int LookUpWord(const std::string &word);
242 | 
243 |   /**
244 |    * Look-up a label in the vocabulary
245 |    */
246 |   int LookUpLabel(const std::string &label);
247 | 
248 | public:
249 |   /**
250 |    * Set minimum number of word occurrences
251 |    */
252 |   void SetMinWordOccurrence(int val) { _minWordOccurrence = val; }
253 | 
254 |   /**
255 |    * Insert a word into the vocabulary, if new
256 |    */
257 |   int InsertWord(const std::string &word, double discount);
258 | 
259 |   /**
260 |    * Insert a label into the vocabulary, if new
261 |    */
262 |   int InsertLabel(const std::string &label);
263 | 
264 |   /**
265 |    * Read vocabulary from all books and return the number of tokens
266 |    */
267 |   long ReadVocabulary(bool mergeLabel);
268 | 
269 |   /**
270 |    * Filter and sort the vocabulary from another corpus
271 |    */
272 |   void FilterSortVocabulary(CorpusUnrolls &other);
273 | 
274 |   /**
275 |    * Copy the vocabulary from another corpus
276 |    */
277 |   void CopyVocabulary(CorpusUnrolls &other);
278 | 
279 |   /**
280 |    * Export the vocabulary to a text file
281 |    */
282 |   void ExportVocabulary(const std::string &filename);
283 | 
284 |   /**
285 |    * Import the vocabulary from a text file
286 |    */
287 |   void ImportVocabulary(const std::string &filename);
288 | 
289 |   /**
290 |    * Add a book
291 |    */
292 |   void AddBookFilename(const std::string &filename) {
293 |     _bookFilenames.push_back(filename);
294 |     NextBook();
295 |   }
296 | 
297 |   /**
298 |    * Go to next book
299 |    */
300 |   int NextBook() {
301 |     _currentBookIndex++;
302 |     if (_currentBookIndex == NumBooks()) { _currentBookIndex = 0; }
303 |     return _currentBookIndex;
304 |   }
305 | 
306 |   /**
307 |    * Shuffle the order of the books
308 |    */
309 |   void ShuffleBooks() {
310 |     std::random_shuffle(_bookFilenames.begin(), _bookFilenames.end());
311 |   }
312 | 
313 |   /**
314 |    * Read the current book into memory
315 |    */
316 |   void ReadBook(bool mergeLabel);
317 | 
318 | protected:
319 | 
320 |   // Minimum number of word occurrences not to be OOV
321 |   int _minWordOccurrence;
322 | 
323 |   // Out-of-vocabulary token
324 |   int _oov;
325 | 
326 |   // Number of words and labels in the vocabulary
327 |   int _vocabSizeWords;
328 |   int _vocabSizeLabels;
329 | 
330 |   // Current book
331 |   int _currentBookIndex;
332 | 
333 |   // List of books (filenames)
334 |   std::vector<std::string> _bookFilenames;
335 | 
336 | public:
337 | 
338 |   // Vocabulary: map between a string of text and an integer
339 |   std::unordered_map<std::string, int> vocabulary;
340 |   std::unordered_map<int, std::string> vocabularyReverse;
341 | 
342 |   // Discounted word counts
343 |   std::unordered_map<std::string, double> wordCountsDiscounted;
344 | 
345 |   // Labels: map between a string of text and an integer
346 |   std::unordered_map<std::string, int> labels;
347 |   std::unordered_map<int, std::string> labelsReverse;
348 | 
349 |   // Current book
350 |   BookUnrolls m_currentBook;
351 | };
352 | 
353 | #endif /* defined(__DependencyTreeRNN____corpus__) */
354 | 
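A sketch of the intended corpus workflow (hypothetical driver code; the member functions and signatures are the ones declared above, but the file names and the interpretation of the mergeLabel flag are assumptions):

    #include "CorpusUnrollsReader.h"

    int main() {
      CorpusUnrolls corpus;
      corpus.SetMinWordOccurrence(5);
      corpus.AddBookFilename("books/04TOM10.TXT.json.unrolls.json");
      corpus.AddBookFilename("books/AGENT10.TXT.json.unrolls.json");
      // Scan all books to build the word and label vocabularies
      long numTokens = corpus.ReadVocabulary(false);
      corpus.ExportVocabulary("models/vocabulary.txt");
      // Load the current book into corpus.m_currentBook
      corpus.ReadBook(false);
      return numTokens > 0 ? 0 : 1;
    }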
--------------------------------------------------------------------------------
/DependencyTreeRNN++/RnnState.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2014-2015 Piotr Mirowski
2 | //
3 | // Piotr Mirowski, Andreas Vlachos
4 | // "Dependency Recurrent Neural Language Models for Sentence Completion"
5 | // ACL 2015
6 | 
7 | // Based on code by Geoffrey Zweig and Tomas Mikolov
8 | // for the Feature-Augmented RNN Tool Kit
9 | // http://research.microsoft.com/en-us/projects/rnn/
10 | 
11 | /*
12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code").
13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code,
14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you
15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication,
16 | estoppel or otherwise.
17 | 
18 | RNNLM 0.3e by Tomas Mikolov
19 | 
20 | Provided for Informational Purposes Only
21 | 
22 | BSD License
23 | All rights reserved.
24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
26 | 
27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other
28 | materials provided with the distribution.
29 | 
30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */
36 | 
37 | #ifndef DependencyTreeRNN___RnnState_h
38 | #define DependencyTreeRNN___RnnState_h
39 | 
40 | #include <algorithm>
41 | #include <vector>
42 | 
43 | 
44 | /**
45 |  * Max n-gram order, used for word history and direct connections
46 |  * from the word history to the word output
47 |  */
48 | const int c_maxNGramOrder = 20;
49 | 
50 | 
51 | /**
52 |  * State vectors in the RNN model, storing per-word and per-class activations
53 |  */
54 | class RnnState {
55 | public:
56 | 
57 |   /**
58 |    * Constructor
59 |    */
60 |   RnnState(int sizeVocabulary,
61 |            int sizeHidden,
62 |            int sizeFeature,
63 |            int sizeClasses,
64 |            int sizeCompress,
65 |            long long sizeDirectConnection,
66 |            int orderDirectConnection)
67 |   : m_orderDirectConnection(orderDirectConnection) {
68 |     int sizeInput = sizeVocabulary;
69 |     int sizeOutput = sizeVocabulary + sizeClasses;
70 |     WordHistory.assign(c_maxNGramOrder, 0);
71 |     InputLayer.assign(sizeInput, 0.0);
72 |     InputGradient.assign(sizeInput, 0.0);
73 |     RecurrentLayer.assign(sizeHidden, 0.0);
74 |     RecurrentGradient.assign(sizeHidden, 0.0);
75 |     HiddenLayer.assign(sizeHidden, 0.0);
76 |     HiddenGradient.assign(sizeHidden, 0.0);
77 |     FeatureLayer.assign(sizeFeature, 0.0);
78 |     FeatureGradient.assign(sizeFeature, 0.0);
79 |     OutputLayer.assign(sizeOutput, 0.0);
80 |     OutputGradient.assign(sizeOutput, 0.0);
81 |     CompressLayer.assign(sizeCompress, 0.0);
82 |     CompressGradient.assign(sizeCompress, 0.0);
83 |   }
84 | 
85 |   // Input layer (i.e., words)
86 |   std::vector<double> InputLayer;
87 |   // Input feature layer (e.g., topics)
88 |   std::vector<double> FeatureLayer;
89 |   // Hidden layer at previous time step
90 |   std::vector<double> RecurrentLayer;
91 |   // Hidden layer
92 |   std::vector<double> HiddenLayer;
93 |   // Second (compression) hidden layer
94 |   std::vector<double> CompressLayer;
95 |   // Output layer
96 |   std::vector<double> OutputLayer;
97 | 
98 |   // Gradient to the words in input layer
99 |   std::vector<double> InputGradient;
100 |   // Gradient to the features in input layer
101 |   std::vector<double> FeatureGradient;
102 |   // Gradient to the hidden state at previous time step
103 |   std::vector<double> RecurrentGradient;
104 |   // Gradient to the hidden layer
105 |   std::vector<double> HiddenGradient;
106 |   // Gradient to the second (compression) hidden layer
107 |   std::vector<double> CompressGradient;
108 |   // Gradient to the output layer
109 |   std::vector<double> OutputGradient;
110 | 
111 |   // Word history
112 |   std::vector<int> WordHistory;
113 | 
114 | 
115 |   /**
116 |    * Return the number of units in the input (word) layer.
117 |    */
118 |   int GetInputSize() const {
119 |     return static_cast<int>(InputLayer.size());
120 |   }
121 | 
122 | 
123 |   /**
124 |    * Return the number of units in the hidden layer.
125 |    */
126 |   int GetHiddenSize() const {
127 |     return static_cast<int>(HiddenLayer.size());
128 |   }
129 | 
130 | 
131 |   /**
132 |    * Return the number of units in the optional hidden compression layer.
133 |    */
134 |   int GetCompressSize() const {
135 |     return static_cast<int>(CompressLayer.size());
136 |   }
137 | 
138 | 
139 |   /**
140 |    * Return the number of units in the feature (e.g., topic) layer.
141 |    */
142 |   int GetFeatureSize() const {
143 |     return static_cast<int>(FeatureLayer.size());
144 |   }
145 | 
146 | 
147 |   /**
148 |    * Return the number of units in the output layer.
149 |    */
150 |   int GetOutputSize() const {
151 |     return static_cast<int>(OutputLayer.size());
152 |   }
153 | 
154 | 
155 |   /**
156 |    * Return the n-gram order of the direct input->output connections.
157 |    */
158 |   int GetOrderDirectConnection() const { return m_orderDirectConnection; }
159 | 
160 | protected:
161 |   int m_orderDirectConnection;
162 | };
163 | 
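// Example (illustrative, not from the original header): state buffers for a
// 10k-word vocabulary, 200 hidden units, no input features, 250 word classes,
// no compression layer, and 4-gram direct connections hashed into 2M weights.
static inline RnnState ExampleMakeState() {
  return RnnState(10000, 200, 0, 250, 0, 2000000, 4);
}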
164 | 
165 | class RnnBptt {
166 | public:
167 | 
168 |   /**
169 |    * Constructor
170 |    */
171 |   RnnBptt(int sizeVocabulary, int sizeHidden, int sizeFeature,
172 |           int numBpttSteps, int bpttBlockSize)
173 |   : m_bpttSteps(numBpttSteps), m_bpttBlock(bpttBlockSize),
174 |   m_sizeHidden(sizeHidden), m_sizeFeature(sizeFeature),
175 |   m_steps(0) {
176 |     Reset();
177 |     WeightsInput2Hidden.assign(sizeVocabulary * sizeHidden, 0);
178 |     WeightsRecurrent2Hidden.assign(sizeHidden * sizeHidden, 0);
179 |     WeightsFeature2Hidden.assign(sizeFeature * sizeHidden, 0);
180 |   }
181 | 
182 | 
183 |   /**
184 |    * Number of BPTT steps that can be considered
185 |    */
186 |   int NumSteps() { return m_steps; }
187 | 
188 | 
189 |   /**
190 |    * Reset the BPTT memory
191 |    */
192 |   void Reset() {
193 |     m_steps = 0;
194 |     History.assign(m_bpttSteps + m_bpttBlock + 10, -1);
195 |     FeatureLayer.assign((m_bpttSteps + m_bpttBlock + 2) * m_sizeFeature, 0);
196 |     HiddenLayer.assign((m_bpttSteps + m_bpttBlock + 1) * m_sizeHidden, 0);
197 |     HiddenGradient.assign((m_bpttSteps + m_bpttBlock + 1) * m_sizeHidden, 0);
198 |   }
199 | 
200 | 
201 |   /**
202 |    * Shift the BPTT memory by one
203 |    */
204 |   void Shift(int lastWord) {
205 |     if (m_bpttSteps > 0) {
206 |       // Shift the history of words
207 |       for (int a = m_bpttSteps + m_bpttBlock - 1; a > 0; a--) {
208 |         History[a] = History[a - 1];
209 |       }
210 |       History[0] = lastWord;
211 | 
212 |       // Shift the history of hidden layer activations
213 |       for (int a = m_bpttSteps + m_bpttBlock - 1; a > 0; a--) {
214 |         for (int b = 0; b < m_sizeHidden; b++) {
215 |           HiddenLayer[a * m_sizeHidden + b] =
216 |             HiddenLayer[(a - 1) * m_sizeHidden + b];
217 |           HiddenGradient[a * m_sizeHidden + b] =
218 |             HiddenGradient[(a - 1) * m_sizeHidden + b];
219 |         }
220 |       }
221 | 
222 |       // Shift the history of feature activations
223 |       for (int a = m_bpttSteps + m_bpttBlock - 1; a > 0; a--) {
224 |         for (int b = 0; b < m_sizeFeature; b++) {
225 |           FeatureLayer[a * m_sizeFeature + b] =
226 |             FeatureLayer[(a - 1) * m_sizeFeature + b];
227 |         }
228 |       }
229 |     }
230 |     // Keep track of the number of steps that can be considered for BPTT
231 |     m_steps++;
232 |     m_steps = std::min(m_steps, m_bpttSteps + m_bpttBlock);
233 |   }
234 | 
235 | 
236 |   // Word history
237 |   std::vector<int> History;
238 |   // History of feature inputs
239 |   std::vector<double> FeatureLayer;
240 |   // History of hidden layer inputs
241 |   std::vector<double> HiddenLayer;
242 |   // History of gradients to the hidden layer
243 |   std::vector<double> HiddenGradient;
244 |   // Gradients to the weights, to be added to the SGD gradients
245 |   std::vector<double> WeightsInput2Hidden;
246 |   std::vector<double> WeightsRecurrent2Hidden;
247 |   std::vector<double> WeightsFeature2Hidden;
248 | 
249 | 
250 | protected:
251 |   // Number of steps gradients are back-propagated through time
252 |   int m_bpttSteps;
253 |   // How many steps (words) do we wait between consecutive BPTT?
254 |   int m_bpttBlock;
255 |   // How many steps have been stored since the last reset?
256 | int m_steps; 257 | // Number of hidden nodes 258 | int m_sizeHidden; 259 | // Number of features 260 | int m_sizeFeature; 261 | }; 262 | 263 | #endif 264 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnWeights.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 | */
36 | 
37 | #include <cassert>
38 | #include <cstdio>
39 | #include <iostream>
40 | #include <string>
41 | #include <vector>
42 | #include "Utils.h"
43 | #include "RnnWeights.h"
44 | 
45 | using namespace std;
46 | 
47 | /**
48 |  * Constructor
49 |  */
50 | RnnWeights::RnnWeights(int sizeVocabulary,
51 |                        int sizeHidden,
52 |                        int sizeFeature,
53 |                        int sizeClasses,
54 |                        int sizeCompress,
55 |                        long long sizeDirectConnection)
56 | : m_sizeVocabulary(sizeVocabulary),
57 |   m_sizeHidden(sizeHidden),
58 |   m_sizeFeature(sizeFeature),
59 |   m_sizeClasses(sizeClasses),
60 |   m_sizeCompress(sizeCompress),
61 |   m_sizeDirectConnection(sizeDirectConnection),
62 |   m_sizeInput(sizeVocabulary),
63 |   m_sizeOutput(sizeVocabulary + sizeClasses) {
64 | 
65 |   // Sanity check
66 |   assert(sizeClasses <= sizeVocabulary);
67 |   cout << "RnnWeights: allocate " << m_sizeInput << " inputs ("
68 |        << sizeVocabulary << " words), "
69 |        << m_sizeClasses << " classes, "
70 |        << m_sizeHidden << " hiddens, "
71 |        << m_sizeFeature << " features, "
72 |        << m_sizeCompress << " compressed, "
73 |        << m_sizeDirectConnection << " n-grams\n";
74 | 
75 |   // Allocate the weights connecting those layers
76 |   // (will be assigned random values later)
77 |   Input2Hidden.resize(m_sizeInput * m_sizeHidden);
78 |   Recurrent2Hidden.resize(m_sizeHidden * m_sizeHidden);
79 |   Features2Hidden.resize(m_sizeFeature * m_sizeHidden);
80 |   Features2Output.resize(m_sizeFeature * m_sizeOutput);
81 |   if (sizeCompress == 0) {
82 |     Hidden2Output.resize(m_sizeHidden * m_sizeOutput);
83 |   } else {
84 |     // Add a compression layer between hidden nodes and outputs
85 |     Hidden2Output.resize(m_sizeHidden * m_sizeCompress);
86 |     Compress2Output.resize(m_sizeCompress * m_sizeOutput);
87 |   }
88 |   // TODO: change that to a proper normal distribution
89 |   // http://en.cppreference.com/w/cpp/numeric/random/normal_distribution
90 |   RandomizeVector(Input2Hidden);
91 |   RandomizeVector(Recurrent2Hidden);
92 |   if (sizeFeature > 0) {
93 |     RandomizeVector(Features2Hidden);
94 |     RandomizeVector(Features2Output);
95 |   }
96 |   if (sizeCompress > 0) {
97 |     RandomizeVector(Compress2Output);
98 |   }
99 |   RandomizeVector(Hidden2Output);
100 | 
101 |   // Initialize the direct n-gram connections
102 |   DirectNGram.assign(m_sizeDirectConnection, 0.0);
103 | } // RnnWeights()
104 | 
105 | 
106 | /**
107 |  * Clear all the weights (before loading a new copy), to save memory
108 |  */
109 | void RnnWeights::Clear() {
110 |   Input2Hidden.clear();
111 |   Recurrent2Hidden.clear();
112 |   Features2Hidden.clear();
113 |   Features2Output.clear();
114 |   if (m_sizeCompress == 0) {
115 |     Hidden2Output.clear();
116 |   } else {
117 |     Hidden2Output.clear();
118 |     Compress2Output.clear();
119 |   }
120 |   DirectNGram.clear();
121 | }
122 | 
123 | 
124 | /**
125 |  * Load the weights matrices from a file
126 |  */
127 | void RnnWeights::Load(FILE *fi) {
128 |   // Read the weights of input -> hidden connections
129 |   Log("Reading " + ConvString(m_sizeHidden) +
130 |       "x" + ConvString(m_sizeInput) + " input->hidden weights...\n");
131 |   ReadBinaryMatrix(fi, m_sizeInput, m_sizeHidden, Input2Hidden);
132 |   // Read the weights of recurrent hidden -> hidden connections
133 |   Log("Reading " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeHidden) +
134 |       " recurrent hidden->hidden weights...\n");
135 |   ReadBinaryMatrix(fi, m_sizeHidden, m_sizeHidden, Recurrent2Hidden);
136 |   // Read the weights of feature -> hidden connections
137 |   Log("Reading " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeFeature) +
138 |       " feature->hidden weights...\n");
139 |   ReadBinaryMatrix(fi,
m_sizeFeature, m_sizeHidden, Features2Hidden); 140 | // Read the weights of feature -> output connections 141 | Log("Reading " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeFeature) + 142 | " feature->output weights...\n"); 143 | ReadBinaryMatrix(fi, m_sizeFeature, m_sizeOutput, Features2Output); 144 | if (m_sizeCompress == 0) { 145 | // Read the weights of hidden -> output connections 146 | Log("Reading " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeHidden) + 147 | " hidden->output weights...\n"); 148 | ReadBinaryMatrix(fi, m_sizeHidden, m_sizeOutput, Hidden2Output); 149 | } else { 150 | // Read the weights of hidden -> compression connections 151 | Log("Reading " + ConvString(m_sizeCompress) + "x" + ConvString(m_sizeHidden) + 152 | " hidden->compress weights...\n"); 153 | ReadBinaryMatrix(fi, m_sizeHidden, m_sizeCompress, Hidden2Output); 154 | // Read the weights of compression -> output connections 155 | Log("Reading " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeCompress) + 156 | " compress->output weights...\n"); 157 | ReadBinaryMatrix(fi, m_sizeCompress, m_sizeOutput, Compress2Output); 158 | } 159 | if (m_sizeDirectConnection > 0) { 160 | Log("Reading " + ConvString(m_sizeDirectConnection) + 161 | " n-gram connections...\n"); 162 | // Read the direct connections 163 | ReadBinaryVector(fi, m_sizeDirectConnection, DirectNGram); 164 | } 165 | } // void Load() 166 | 167 | 168 | /** 169 | * Save the weights matrices to a file 170 | */ 171 | void RnnWeights::Save(FILE *fo) { 172 | string logFilename = "log_saving.txt"; 173 | // Save the weights U: input -> hidden (i.e., the word embeddings) 174 | Log("Saving " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeInput) + 175 | " input->hidden weights...\n", logFilename); 176 | SaveBinaryMatrix(fo, m_sizeInput, m_sizeHidden, Input2Hidden); 177 | // Save the weights W: recurrent hidden -> hidden (i.e., the time-delay) 178 | Log("Saving " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeHidden) + 179 | " recurrent hidden->hidden weights...\n", logFilename); 180 | SaveBinaryMatrix(fo, m_sizeHidden, m_sizeHidden, Recurrent2Hidden); 181 | // Save the weights feature -> hidden 182 | Log("Saving " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeFeature) + 183 | " feature->hidden weights...\n", logFilename); 184 | SaveBinaryMatrix(fo, m_sizeFeature, m_sizeHidden, Features2Hidden); 185 | // Save the weights G: feature -> output 186 | Log("Saving " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeFeature) + 187 | " feature->output weights...\n", logFilename); 188 | SaveBinaryMatrix(fo, m_sizeFeature, m_sizeOutput, Features2Output); 189 | // Save the weights hidden -> compress and compress -> output 190 | // or simply the weights V: hidden -> output 191 | if (m_sizeCompress > 0) { 192 | Log("Saving " + ConvString(m_sizeCompress) + "x" + ConvString(m_sizeHidden) + 193 | " hidden->compress weights...\n", logFilename); 194 | SaveBinaryMatrix(fo, m_sizeHidden, m_sizeCompress, Hidden2Output); 195 | Log("Saving " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeCompress) + 196 | " compress->output weights...\n", logFilename); 197 | SaveBinaryMatrix(fo, m_sizeCompress, m_sizeOutput, Compress2Output); 198 | } else { 199 | Log("Saving " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeHidden) + 200 | " hidden->output weights...\n", logFilename); 201 | SaveBinaryMatrix(fo, m_sizeHidden, m_sizeOutput, Hidden2Output); 202 | } 203 | if (m_sizeDirectConnection > 0) { 204 | // Save the direct connections 205 | Log("Saving " + 
ConvString(m_sizeDirectConnection) + 206 | " n-gram connections...\n", logFilename); 207 | for (long long aa = 0; aa < m_sizeDirectConnection; aa++) { 208 | float fl = (float)(DirectNGram[aa]); 209 | fwrite(&fl, 4, 1, fo); 210 | } 211 | } 212 | } // void Save() 213 | 214 | 215 | /** 216 | * Debug function 217 | */ 218 | void RnnWeights::Debug() { 219 | Log("input2hidden: " + ConvString(m_sizeInput) + "x" + 220 | ConvString(m_sizeHidden) + " " + 221 | ConvString(Input2Hidden[(m_sizeInput-1)*(m_sizeHidden-1)]) + "\n"); 222 | Log("recurrent2hidden: " + ConvString(m_sizeHidden) + "x" + 223 | ConvString(m_sizeHidden) + " " + 224 | ConvString(Recurrent2Hidden[(m_sizeHidden-1)*(m_sizeHidden-1)]) + "\n"); 225 | Log("hidden2output: " + ConvString(m_sizeHidden) + "x" + 226 | ConvString(m_sizeOutput) + " " + 227 | ConvString(Hidden2Output[(m_sizeOutput-1)*(m_sizeHidden-1)]) + "\n"); 228 | if (m_sizeFeature > 0) { 229 | Log("features2hidden: " + ConvString(m_sizeFeature) + "x" + 230 | ConvString(m_sizeHidden) + " " + 231 | ConvString(Features2Hidden[(m_sizeFeature-1)*(m_sizeHidden-1)]) + "\n"); 232 | Log("features2output: " + ConvString(m_sizeFeature) + "x" + 233 | ConvString(m_sizeOutput) + " " + 234 | ConvString(Features2Output[(m_sizeFeature-1)*(m_sizeOutput-1)]) + "\n"); 235 | } 236 | if (m_sizeDirectConnection > 0) 237 | Log("direct: " + ConvString(m_sizeDirectConnection) + " " + 238 | ConvString(DirectNGram[m_sizeDirectConnection-1]) + "\n"); 239 | } // void Debug() 240 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnTraining.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 
29 | 
30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */
36 | 
37 | #ifndef __DependencyTreeRNN____RnnTraining__
38 | #define __DependencyTreeRNN____RnnTraining__
39 | 
40 | #include <cstdio>
41 | #include <iostream>
42 | #include <string>
43 | #include <vector>
44 | #include "CorpusWordReader.h"
45 | #include "Utils.h"
46 | #include "RnnLib.h"
47 | #include "RnnState.h"
48 | 
49 | 
50 | /**
51 |  * Main class for training and testing the RNN model;
52 |  * not meant to run in a production online environment
53 |  * (not thread-safe).
54 |  */
55 | class RnnLMTraining : public RnnLM {
56 | public:
57 | 
58 |   /**
59 |    * Constructor for training the model
60 |    */
61 |   RnnLMTraining(const std::string &filename, bool doLoadModel, bool debugMode)
62 |   // We load the RNN or not, depending on whether the model file is present;
63 |   // otherwise simply set its filename
64 |   : RnnLM(filename, doLoadModel),
65 |   m_debugMode(debugMode),
66 |   m_wordCounter(0),
67 |   m_minWordOccurrences(5),
68 |   m_oov(1),
69 |   m_eof(-2),
70 |   m_fileCorrectSentenceLabels("") {
71 |     Log("RnnLMTraining: debug mode is " + ConvString(debugMode) + "\n");
72 |   }
73 | 
74 |   void SetTrainFile(const std::string &str) { m_trainFile = str; }
75 | 
76 |   void SetValidFile(const std::string &str) { m_validationFile = str; }
77 | 
78 |   void SetSentenceLabelsFile(const std::string &str) {
79 |     m_fileCorrectSentenceLabels = str;
80 |   }
81 | 
82 |   void SetFeatureTrainOrTestFile(const std::string &str) {
83 |     m_featureFile = str;
84 |   }
85 | 
86 |   void SetFeatureValidationFile(const std::string &str) {
87 |     m_featureValidationFile = str;
88 |   }
89 | 
90 |   void SetFeatureMatrixFile(const std::string &str) {
91 |     m_featureMatrixFile = str;
92 |   }
93 | 
94 |   void SetUnkPenalty(double penalty) { m_logProbabilityPenaltyUnk = penalty; }
95 | 
96 |   void SetGradientCutoff(double newGradient) {
97 |     m_gradientCutoff = newGradient;
98 |   }
99 | 
100 |   void SetIndependent(bool newVal) { m_areSentencesIndependent = newVal; }
101 | 
102 |   void SetLearningRate(double newAlpha) {
103 |     m_learningRate = newAlpha;
104 |     m_initialLearningRate = newAlpha;
105 |   }
106 | 
107 |   void SetRegularization(double newBeta) { m_regularizationRate = newBeta; }
108 | 
109 |   void SetMinImprovement(double newMinImprovement) {
110 |     m_minLogProbaImprovement = newMinImprovement;
111 |   }
112 | 
113 |   /**
114 |    * (Re)set the number of steps of BPTT
115 |    */
116 |   void SetNumStepsBPTT(int val) {
117 |     m_numBpttSteps = val;
118 |     m_bpttVectors = RnnBptt(GetVocabularySize(), GetHiddenSize(),
119 |                             GetFeatureSize(),
120 |                             m_numBpttSteps, m_bpttBlockSize);
121 |   }
122 | 
123 |   /**
124 |    * (Re)set the number of steps/words when BPTT is called
125 |    */
126 |   void SetBPTTBlock(int val) {
127 |     m_bpttBlockSize = val;
128 |     m_bpttVectors = RnnBptt(GetVocabularySize(), GetHiddenSize(),
129 |                             GetFeatureSize(),
130 |                             m_numBpttSteps, m_bpttBlockSize);
131 |   }
132 | 
133 |   void SetDebugMode(bool mode) { m_debugMode = mode; }
134 | 
135 |   void SetFeatureGamma(double val) { m_featureGammaCoeff = val; }
136 | 
137 | public:
138 | 
139 |   /**
140 |    * Main function to train the RNN model
141 |    */
142 |   virtual bool TrainRnnModel();
143 | 
144 |   /**
145 |    * Before learning the RNN model, we need to learn the vocabulary
146 |    * from the corpus. Note that the word classes may have been initialized
147 |    * beforehand using ReadClasses. Computes the unigram distribution
148 |    * of words from a training file, assuming that the existing vocabulary
149 |    * is empty.
150 |    */
151 |   virtual bool LearnVocabularyFromTrainFile(int numClasses);
152 | 
153 | 
154 |   /**
155 |    * Set the minimum number of word occurrences
156 |    */
157 |   virtual void SetMinWordOccurrence(int val) {
158 |     m_minWordOccurrences = val;
159 |   }
160 | 
161 |   /**
162 |    * Read the classes from a file in the following format:
163 |    * word [TAB] class_index
164 |    * where class index is between 0 and n-1 and there are n classes.
165 |    */
166 |   bool ReadClasses(const std::string &filename) {
167 |     m_usesClassFile = m_vocab.ReadClasses(filename);
168 |     return m_usesClassFile;
169 |   }
170 | 
171 |   /**
172 |    * Once the RNN model is trained, it can be saved to a text or binary file
173 |    */
174 |   bool SaveRnnModelToFile();
175 | 
176 |   /**
177 |    * Simply write the word projections/embeddings to a text file.
178 |    */
179 |   void SaveWordEmbeddings(const std::string &filename);
180 | 
181 |   /**
182 |    * Main function to test the RNN model
183 |    */
184 |   virtual bool TestRnnModel(const std::string &testFile,
185 |                             const std::string &featureFile,
186 |                             std::vector<double> &sentenceScores,
187 |                             double &logProbability,
188 |                             double &perplexity,
189 |                             double &entropy,
190 |                             double &accuracy);
191 | 
192 |   /**
193 |    * Load a file containing the classification labels
194 |    */
195 |   void LoadCorrectSentenceLabels(const std::string &labelFile);
196 | 
197 | protected:
198 | 
199 |   /**
200 |    * Get the next token (word or multi-word entity) from a text file
201 |    * and return it as an integer in the vocabulary vector.
202 |    * Returns -1 for OOV words and -2 for end of file.
203 |    */
204 |   int ReadWordIndexFromFile(WordReader &reader);
205 | 
206 |   /**
207 |    * Sort the vocabulary by decreasing count of words in the corpus
208 |    * (used for frequency-based word classes, where class 0 contains
209 |    * </s>, class 1 contains {the} or another most frequent token,
210 |    * class 2 contains a few very frequent tokens, etc...)
211 |    */
212 |   void SortVocabularyByFrequency();
213 | 
214 |   /**
215 |    * Sort the words by class, in increasing class order
216 |    * (used when the classes are provided by an external tool,
217 |    * e.g., based on maximum entropy features on word bigrams)
218 |    */
219 |   void SortVocabularyByClass();
220 | 
221 |   /**
222 |    * One step of backpropagation of the errors through the RNN
223 |    * (optionally, backpropagation through time, BPTT) and of gradient descent.
224 | */ 225 | void BackPropagateErrorsThenOneStepGradientDescent(int last_word, int word); 226 | 227 | /** 228 | * Read the feature vector for the current word 229 | * in the train/test/valid file and update the feature vector 230 | * in the state. 231 | * TODO: convert to ifstream 232 | */ 233 | bool LoadFeatureVectorAtCurrentWord(FILE *f, RnnState &state); 234 | 235 | /** 236 | * Compute the accuracy of selecting the top candidate (based on score) 237 | * among n-best lists 238 | */ 239 | double AccuracyNBestList(std::vector<double> scores, 240 | std::vector<int> &correctClasses) const; 241 | 242 | /** 243 | * Cleans all activations and error vectors, in the input, hidden, 244 | * compression, feature and output layers, and resets word history 245 | */ 246 | void ResetAllRnnActivations(RnnState &state) const; 247 | 248 | /** 249 | * Matrix-vector multiplication routine, accelerated using BLAS. 250 | * Computes x <- x + A' * y, 251 | * i.e., the "inverse" operation to y = A * x (adding the result to x) 252 | * where A is of size N x M, x is of length M and y is of length N. 253 | * The operation can be done on a contiguous subset of indices 254 | * j in [idxYFrom, idxYTo[ of vector y. 255 | */ 256 | void GradientMatrixXvectorBlas(std::vector<double> &vectorX, 257 | std::vector<double> &vectorY, 258 | std::vector<double> &matrixA, 259 | int widthMatrix, 260 | int idxYFrom, 261 | int idxYTo) const; 262 | 263 | /** 264 | * Matrix-matrix multiplication routine, accelerated using BLAS. 265 | * Computes C <- alpha * A * B + beta * C. 266 | * The operation can be done on a contiguous subset of row indices 267 | * j in [idxRowCFrom, idxRowCTo[ in matrix A and C. 268 | */ 269 | void MultiplyMatrixXmatrixBlas(std::vector<double> &matrixA, 270 | std::vector<double> &matrixB, 271 | std::vector<double> &matrixC, 272 | double alpha, 273 | double beta, 274 | int numRowsA, 275 | int numRowsB, 276 | int numColsC, 277 | int idxRowCFrom, 278 | int idxRowCTo) const; 279 | 280 | /** 281 | * Matrix-matrix or vector-vector addition routine using BLAS. 282 | * Computes Y <- alpha * X + beta * Y. 283 | */ 284 | void AddMatrixToMatrixBlas(std::vector<double> &matrixX, 285 | std::vector<double> &matrixY, 286 | double alpha, 287 | double beta, 288 | int numRows, 289 | int numCols) const; 290 | 291 | protected: 292 | 293 | // Are we in debug mode? 294 | bool m_debugMode; 295 | 296 | // Word counter 297 | long m_wordCounter; 298 | 299 | // Index of the OOV (<unk>) word 300 | int m_oov; 301 | 302 | // Index of the EOF token 303 | int m_eof; 304 | 305 | // Minimum number of word occurrences 306 | int m_minWordOccurrences; 307 | 308 | // Classification labels 309 | std::vector<int> m_correctSentenceLabels; 310 | 311 | // File containing the correct classification labels 312 | std::string m_fileCorrectSentenceLabels; 313 | }; 314 | 315 | #endif /* defined(__DependencyTreeRNN____RnnTraining__) */ 316 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/Vocabulary.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code").
13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | */ 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include "Vocabulary.h" 45 | 46 | 47 | /** 48 | * Constructor that reads the vocabulary and classes from the model file. 
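 * Each vocabulary line holds four whitespace-separated columns,
 * index count word class, e.g. (hypothetical values):
 *   42   1337   holmes   7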
49 | */ 50 | Vocabulary::Vocabulary(FILE *fi, int sizeVocabulary, int numClasses) { 51 | // Read the vocabulary, stored in text format as follows: 52 | // index_number count word_token class_number 53 | // There are tabs and spaces separating the 4 columns 54 | m_vocabularyStorage.resize(sizeVocabulary); 55 | for (int a = 0; a < sizeVocabulary; a++) { 56 | 57 | // Read the word index and the word count 58 | int wordIndex; 59 | int wordCount; 60 | fscanf(fi, "%d%d", &wordIndex, &wordCount); 61 | assert(wordIndex == a); 62 | m_vocabularyStorage[a].cn = wordCount; 63 | m_vocabularyStorage[a].prob = 0; 64 | 65 | // Read the word token 66 | char buffer[2048] = {0}; 67 | if (fscanf(fi, "%s", buffer)) 68 | m_vocabularyStorage[a].word = buffer; 69 | std::string word = m_vocabularyStorage[a].word; 70 | 71 | // Read the class index 72 | int classIndex; 73 | fscanf(fi, "%d", &classIndex); 74 | 75 | // Store the class information 76 | m_vocabularyStorage[a].classIndex = classIndex; 77 | m_mapWord2Class[word] = classIndex; 78 | 79 | // Associate the word (string) to the word token number using two maps 80 | m_mapWord2Index[word] = wordIndex; 81 | m_mapIndex2Word[wordIndex] = word; 82 | } 83 | 84 | // Store which words are in which class, using a vector 85 | // (length number of classes) of vectors (num words in that class) 86 | m_numClasses = numClasses; 87 | StoreClassAssociations(); 88 | 89 | m_useClassFile = false; 90 | } 91 | 92 | 93 | /** 94 | * Save the vocabulary to a model file 95 | */ 96 | void Vocabulary::Save(FILE *fo) { 97 | // Save the vocabulary, one word per line 98 | int sizeVocabulary = GetVocabularySize(); 99 | fprintf(fo, "\nVocabulary:\n"); 100 | for (int wordIndex = 0; wordIndex < sizeVocabulary; wordIndex++) { 101 | int wordCount = m_vocabularyStorage[wordIndex].cn; 102 | std::string word = m_vocabularyStorage[wordIndex].word; 103 | int wordClass = m_vocabularyStorage[wordIndex].classIndex; 104 | fprintf(fo, "%6d\t%10d\t%s\t%d\n", 105 | wordIndex, wordCount, word.c_str(), wordClass); 106 | } 107 | } 108 | 109 | 110 | 111 | /** 112 | * Add a token (word or multi-word entity) to the vocabulary vector 113 | * and store it in the map from word string to word index 114 | * and in the map from word index to word string. 115 | */ 116 | int Vocabulary::AddWordToVocabulary(const std::string& word) 117 | { 118 | int index = SearchWordInVocabulary(word); 119 | // When a word is unknown, add it to the vocabulary 120 | if (index == -1) { 121 | // Initialize the new word with a count of 1 and a probability of 0 122 | VocabWord w = VocabWord(); 123 | w.word = word; 124 | w.prob = 0.0; 125 | w.cn = 1; 126 | index = static_cast<int>(m_vocabularyStorage.size()); 127 | m_vocabularyStorage.push_back(std::move(w)); 128 | // We need to store the word - index pair in the hash table word -> index 129 | // but we will rewrite that map later after sorting the vocabulary by frequency 130 | m_mapWord2Index[word] = index; 131 | m_mapIndex2Word[index] = word; 132 | } else { 133 | // ... otherwise simply increase its count 134 | m_vocabularyStorage[index].cn++; 135 | } 136 | return (index); 137 | } 138 | 139 | 140 | /** 141 | * Manually set the word count.
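 * Hypothetical usage: SetWordCount("holmes", 42) overwrites the stored
 * count and returns true; for a word that is not in the vocabulary,
 * it returns false and adds nothing.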
142 | */ 143 | bool Vocabulary::SetWordCount(std::string word, int count) { 144 | int index = SearchWordInVocabulary(word); 145 | // Update the count only if the word is already in the vocabulary 146 | if (index > -1) { 147 | m_vocabularyStorage[index].cn = count; 148 | return true; 149 | } else 150 | return false; 151 | } 152 | 153 | 154 | /** 155 | * Sort the vocabulary by decreasing count of words in the corpus 156 | * (used for frequency-based word classes, where class 0 contains 157 | * </s>, class 1 contains {the} or another most frequent token, 158 | * class 2 contains a few very frequent tokens, etc.) 159 | */ 160 | bool OrderWordCounts(const VocabWord& a, const VocabWord& b) { 161 | return a.cn > b.cn; 162 | } 163 | void Vocabulary::SortVocabularyByFrequency() { 164 | // Simply sort the words by frequency, making sure that </s> is first 165 | int indexEos = SearchWordInVocabulary("</s>"); 166 | int countEos = m_vocabularyStorage[indexEos].cn; 167 | m_vocabularyStorage[indexEos].cn = INT_MAX; 168 | std::sort(m_vocabularyStorage.begin(), 169 | m_vocabularyStorage.end(), 170 | OrderWordCounts); 171 | m_vocabularyStorage[indexEos].cn = countEos; 172 | 173 | // Rebuild the maps of word <-> word index 174 | m_mapWord2Index.clear(); 175 | m_mapIndex2Word.clear(); 176 | for (int index = 0; index < GetVocabularySize(); index++) { 177 | std::string word = m_vocabularyStorage[index].word; 178 | // Add the word to the hash table word -> index 179 | m_mapWord2Index[word] = index; 180 | // Add the word to the hash table index -> word 181 | m_mapIndex2Word[index] = word; 182 | } 183 | } 184 | 185 | 186 | /** 187 | * Return the index of a word in the vocabulary, or -1 if OOV. 188 | */ 189 | int Vocabulary::SearchWordInVocabulary(const std::string& word) const { 190 | auto i = m_mapWord2Index.find(word); 191 | if (i == m_mapWord2Index.end()) { 192 | return -1; 193 | } else { 194 | return (i->second); 195 | } 196 | } 197 | 198 | 199 | /** 200 | * Read the classes from a file in the following format: 201 | * word [TAB] class_index 202 | * where class index is between 0 and n-1 and there are n classes. 203 | */ 204 | bool Vocabulary::ReadClasses(const std::string &filename) 205 | { 206 | FILE *fin = fopen(filename.c_str(), "r"); 207 | if (!fin) { 208 | printf("Error: unable to open %s\n", filename.c_str()); 209 | return false; 210 | } 211 | 212 | char w[8192]; 213 | int clnum; 214 | int eos_class = -1; 215 | int max_class = -1; 216 | std::set<std::string> words; 217 | while (fscanf(fin, "%s%d", w, &clnum) != EOF) { 218 | if (!strcmp(w, "<s>")) { 219 | printf("Error: <s> should not be in vocab\n"); 220 | return false; 221 | } 222 | 223 | m_mapWord2Class[w] = clnum; 224 | words.insert(w); 225 | 226 | max_class = (clnum > max_class) ? (clnum) : (max_class); 227 | eos_class = (std::string(w) == "</s>") ? (clnum) : (eos_class); 228 | } 229 | 230 | if (eos_class == -1) { 231 | printf("Error: </s> must be present in the vocabulary\n"); 232 | return false; 233 | } 234 | 235 | if (m_mapWord2Class.size() == 0) { 236 | printf("Error: Empty class file!\n"); 237 | return false; 238 | } 239 | 240 | // </s> needs to have the highest class index because it needs to come first in the vocabulary...
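  // Example with hypothetical indices: if the class file assigned </s> to
  // class 3 while the largest class index read was 9, the loop below swaps
  // the two groups, so every word of class 3 becomes class 9 and vice
  // versa, which moves </s> (and any word sharing its class) to the
  // highest class index.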
241 | for (auto si=words.begin(); si!=words.end(); si++) { 242 | if (m_mapWord2Class[*si] == eos_class) { 243 | m_mapWord2Class[*si] = max_class; 244 | } else { 245 | if (m_mapWord2Class[*si] == max_class) { 246 | m_mapWord2Class[*si] = eos_class; 247 | } 248 | } 249 | } 250 | return true; 251 | } 252 | 253 | 254 | 255 | /** 256 | * Assign words in vocabulary to classes (for hierarchical softmax). 257 | */ 258 | void Vocabulary::AssignWordsToClasses() { 259 | int sizeVocabulary = GetVocabularySize(); 260 | if (m_useClassFile) { 261 | // Custom-specified classes, provided in a file, were used 262 | // at training time. There is nothing to do at this point, 263 | // just copy the class index for each word. 264 | int cnum = -1; 265 | int last = -1; 266 | for (int i = 0; i < sizeVocabulary; i++) { 267 | if (m_vocabularyStorage[i].classIndex != last) { 268 | last = m_vocabularyStorage[i].classIndex; 269 | m_vocabularyStorage[i].classIndex = ++cnum; 270 | } else { 271 | m_vocabularyStorage[i].classIndex = cnum; 272 | } 273 | // Unused 274 | m_vocabularyStorage[i].prob = 0.0; 275 | } 276 | } else { 277 | // Frequency-based classes (povey-style) 278 | // Re-assign classes based on the sqrt(word_count / total_word_count) 279 | // so that the classes contain equal weight of word occurrences. 280 | int b = 0; 281 | for (int i = 0; i < sizeVocabulary; i++) { 282 | b += m_vocabularyStorage[i].cn; 283 | } 284 | double dd = 0; 285 | for (int i = 0; i < sizeVocabulary; i++) { 286 | dd += sqrt(m_vocabularyStorage[i].cn/ (double)b); 287 | } 288 | double df = 0; 289 | int a = 0; 290 | for (int i = 0; i < sizeVocabulary; i++) { 291 | df += sqrt(m_vocabularyStorage[i].cn / (double)b)/dd; 292 | if (df > 1) { 293 | df = 1; 294 | } 295 | if (df > (a + 1) / (double)m_numClasses) { 296 | m_vocabularyStorage[i].classIndex = a; 297 | if (a < m_numClasses - 1) { 298 | a++; 299 | } 300 | } else { 301 | m_vocabularyStorage[i].classIndex = a; 302 | } 303 | // Unused 304 | m_vocabularyStorage[i].prob = 0.0; 305 | } 306 | } 307 | 308 | // Store which words are in which class, using a vector 309 | // (length number of classes) of vectors (num words in that class) 310 | StoreClassAssociations(); 311 | } 312 | 313 | 314 | /** 315 | * Store information on which word is in which class 316 | */ 317 | void Vocabulary::StoreClassAssociations() { 318 | // Store which words are in which class, 319 | // using a vector (length number of classes) of vectors (num words in that class) 320 | m_classWords.resize(m_numClasses); 321 | for (int i = 0; i < m_numClasses; i++) { 322 | m_classWords[i].clear(); 323 | } 324 | for (int i = 0; i < GetVocabularySize(); i++) { 325 | // Assign each word into its class 326 | int wordClass = m_vocabularyStorage[i].classIndex; 327 | m_classWords[wordClass].push_back(i); 328 | } 329 | 330 | // Check that there is no empty class 331 | for (int i = 0; i < m_numClasses; i++) { 332 | assert(!(m_classWords[i].empty())); 333 | } 334 | } 335 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/ReadJson.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "ReadJson.h" 14 | #include "CorpusUnrollsReader.h" 15 | 16 | using namespace std; 17 | 18 | 19 | /** 
20 | * Trim a word 21 | */ 22 | string const ReadJson::Trim(const string &word) const { 23 | assert(word.length() > 1); 24 | string res(word); 25 | if (res[0] == '"') { 26 | res = res.substr(1, res.length()-1); 27 | } 28 | if (res[res.length()-1] == '"') { 29 | res = res.substr(0, res.length()-1); 30 | } 31 | return res; 32 | } 33 | 34 | 35 | /** 36 | * Parse a token 37 | */ 38 | size_t const ReadJson::ParseToken(const string &json_element, JsonToken &tok) const { 39 | 40 | //cout << "parseToken: " << json_element << endl; 41 | 42 | size_t len = json_element.length(); 43 | if (len < 14) { return 0; } 44 | size_t begin = 0; 45 | 46 | // Avoid situations with empty tokens [] 47 | if ((json_element[0] == '[') && (json_element[1] == ']')) 48 | return 2; 49 | 50 | // Consume the [ 51 | if (json_element[0] == '[') { begin++; } 52 | // Parse the token number 53 | size_t end = json_element.find(",", begin); 54 | assert(end != string::npos); 55 | string pos_string = json_element.substr(begin, end - begin); 56 | int token_pos = stoi(pos_string); 57 | begin = end + 1; 58 | assert(begin < len); 59 | 60 | // Consume the space and the first " 61 | if (json_element[begin] == ' ') { begin++; } 62 | if (json_element[begin] == '"') { begin++; } 63 | // Parse the word and trim the " 64 | end = json_element.find("\", ", begin); 65 | assert(end != string::npos); 66 | end = end + 1; 67 | string token_word = json_element.substr(begin, end - begin); 68 | if (token_word.length() <= 1) { 69 | cout << json_element << endl; 70 | } 71 | assert(token_word.length() > 1); 72 | token_word = Trim(token_word); 73 | begin = end + 1; 74 | assert(begin < len); 75 | 76 | // Parse the discount 77 | end = json_element.find(",", begin); 78 | assert(end != string::npos); 79 | string discount_string = json_element.substr(begin, end - begin); 80 | double token_discount = stod(discount_string); 81 | begin = end + 1; 82 | assert(begin < len); 83 | 84 | // Consume the space and the first " 85 | if (json_element[begin] == ' ') { begin++; } 86 | if (json_element[begin] == '"') { begin++; } 87 | // Parse the label 88 | end = json_element.find("]", begin); 89 | assert(end != string::npos); 90 | string token_label = json_element.substr(begin, end - begin); 91 | assert(token_label.length() > 2); 92 | token_label = Trim(token_label); 93 | 94 | // Fill the token 95 | tok.pos = token_pos; 96 | tok.word = token_word; 97 | tok.discount = token_discount; 98 | tok.label = token_label; 99 | 100 | //cout << "token: " << token_pos << " " << token_word 101 | // << " " << token_discount << " " << token_label << endl; 102 | return end; 103 | } 104 | 105 | 106 | /** 107 | * Parse an unroll 108 | */ 109 | size_t const ReadJson::ParseUnroll(const string &json_unrolls, 110 | vector &unroll) const { 111 | 112 | //cout << "parseUnroll: " << json_unrolls << endl; 113 | 114 | size_t end_unroll = json_unrolls.find("]]", 0); 115 | assert(end_unroll != string::npos); 116 | 117 | // Avoid situations with empty unrolls [] 118 | if ((json_unrolls[0] == '[') && (json_unrolls[1] == ']')) 119 | return 2; 120 | assert(json_unrolls[0] == '['); 121 | assert(json_unrolls[1] == '['); 122 | string json_tokens(json_unrolls.substr(0, end_unroll + 1)); 123 | size_t begin = 1; 124 | size_t end = end_unroll + 1; 125 | 126 | while (begin < end_unroll + 1) { 127 | // Find the next end of the token 128 | //cout << "parseToken[" << begin << ", " << end << "]\n" << flush; 129 | JsonToken tok; 130 | end = ParseToken(json_tokens.substr(begin, end - begin), tok); 131 | if (end > 0) { 132 | 
// Store the token in the unroll 133 | unroll.push_back(tok); 134 | // Go to next token 135 | // Consume the ], comma and space 136 | begin += end; 137 | if (json_tokens[begin] == ']') { begin++; } 138 | if (json_tokens[begin] == ',') { begin++; } 139 | if (json_tokens[begin] == ' ') { begin++; } 140 | end = end_unroll + 1; 141 | } else 142 | break; 143 | } 144 | 145 | return end_unroll; 146 | } 147 | 148 | 149 | /** 150 | * Parse a sentence 151 | */ 152 | size_t const ReadJson::ParseSentence(const string &json_sentences, 153 | vector<vector<JsonToken>> &sentence) const { 154 | 155 | //cout << "parseSentence: " << json_sentences << endl; 156 | assert(json_sentences.length() >= 6); 157 | size_t end_sentence = json_sentences.find("]]]", 0); 158 | assert(end_sentence != string::npos); 159 | 160 | // Avoid situations with empty sentences [] 161 | if ((json_sentences[0] == '[') && (json_sentences[1] == ']')) 162 | return 2; 163 | assert(json_sentences[0] == '['); 164 | assert(json_sentences[1] == '['); 165 | assert(json_sentences[2] == '['); 166 | string json_unrolls(json_sentences.substr(0, end_sentence + 2)); 167 | size_t begin = 1; 168 | size_t end = end_sentence + 2; 169 | 170 | while (begin < end_sentence + 2) { 171 | // Find the next end of the unroll 172 | //cout << "parseUnroll[" << begin << ", " << end << "]\n" << flush; 173 | vector<JsonToken> unroll; 174 | end = ParseUnroll(json_unrolls.substr(begin, end - begin), 175 | unroll); 176 | if (end > 2) { 177 | // Store the unroll in the sentence 178 | sentence.push_back(unroll); 179 | } 180 | // Go to the next unroll 181 | begin += end; 182 | // Consume the ], comma and space 183 | if (json_unrolls[begin] == ']') { begin++; } 184 | if (json_unrolls[begin] == ']') { begin++; } 185 | if (json_unrolls[begin] == ',') { begin++; } 186 | if (json_unrolls[begin] == ' ') { begin++; } 187 | end = end_sentence + 2; 188 | } 189 | 190 | return end_sentence; 191 | } 192 | 193 | 194 | /** 195 | * Parse a book 196 | */ 197 | size_t const ReadJson::ParseBook(const string &json_book, 198 | vector<vector<vector<JsonToken>>> &book) const { 199 | 200 | //cout << "parseBook: " << json_book << endl; 201 | assert(json_book.length() >= 8); 202 | size_t end_book = json_book.find("]]]]", 0); 203 | if (end_book == string::npos) { 204 | end_book = json_book.find("]]], []]", 0); 205 | if (end_book == string::npos) { 206 | end_book = json_book.find("]]], [], []]", 0); 207 | assert(end_book != string::npos); 208 | } 209 | } 210 | 211 | assert(json_book[0] == '['); 212 | size_t begin = 1; 213 | if ((json_book[begin] == '[') && (json_book[begin+1] == ']') && 214 | (json_book[begin+2] == ',') && (json_book[begin+3] == ' ')) { 215 | begin += 4; 216 | } 217 | if ((json_book[begin] == '[') && (json_book[begin+1] == ']') && 218 | (json_book[begin+2] == ',') && (json_book[begin+3] == ' ')) { 219 | begin += 4; 220 | } 221 | assert(json_book[begin] == '['); 222 | assert(json_book[begin + 1] == '['); 223 | assert(json_book[begin + 2] == '['); 224 | string json_sentences(json_book.substr(0, end_book + 3)); 225 | size_t end = end_book + 3; 226 | 227 | while (begin < end_book + 3) { 228 | // Find the next end of the sentence 229 | //cout << "parseSentence[" << begin << ", " << end << "]\n" << flush; 230 | vector<vector<JsonToken>> sentence; 231 | end = ParseSentence(json_sentences.substr(begin, end - begin), 232 | sentence); 233 | if (end > 2) { 234 | // Store the sentence 235 | book.push_back(sentence); 236 | } 237 | // Go to next sentence 238 | begin += end; 239 | // Consume the ], ], ], comma and space 240 | if (json_sentences[begin] == ']') { begin++; }
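      // (A book is a list of sentences, a sentence is a list of unrolls,
      // and an unroll is a list of tokens [pos, "word", discount, "label"],
      // so a sentence boundary closes up to three nested ']', consumed one
      // by one in the checks surrounding this comment.)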
241 | if (json_sentences[begin] == ']') { begin++; } 242 | if (json_sentences[begin] == ']') { begin++; } 243 | if (json_sentences[begin] == ',') { begin++; } 244 | if (json_sentences[begin] == ' ') { begin++; } 245 | end = end_book + 3; 246 | } 247 | 248 | return end_book; 249 | } 250 | 251 | 252 | /** 253 | * Constructor: read a text file in JSON format. 254 | * If required, insert words and labels to the vocabulary. 255 | * If required, insert tokens into the current book. 256 | */ 257 | ReadJson::ReadJson(const string &filename, 258 | CorpusUnrolls &corpus, 259 | bool insert_vocab, 260 | bool read_book, 261 | bool merge_label_with_word) { 262 | 263 | cout << "Reading book " << filename << "..." << endl; 264 | ifstream t(filename); 265 | string book_text((istreambuf_iterator<char>(t)), 266 | istreambuf_iterator<char>()); 267 | 268 | vector<vector<vector<JsonToken>>> sentences; 269 | cout << "Parsing book " << filename << "..." << endl; 270 | ParseBook(book_text, sentences); 271 | cout << "Parsing done.\n"; 272 | 273 | // Pointer to the current book 274 | BookUnrolls *book = &(corpus.m_currentBook); 275 | 276 | // First, iterate over sentences 277 | int numSentences = 0; 278 | 279 | for (int idx_sentence = 0; idx_sentence < sentences.size(); idx_sentence++) { 280 | 281 | int numUnrollsInThatSentence = 0; 282 | bool isNewSentence = true; 283 | 284 | // Second, iterate over unrolls in each sentence 285 | vector<vector<JsonToken>> unrolls = sentences[idx_sentence]; 286 | for (int idx_unroll = 0; idx_unroll < unrolls.size(); idx_unroll++) { 287 | bool isNewUnroll = true; 288 | 289 | // Third, iterate over tokens in each unroll 290 | vector<JsonToken> tokens = unrolls[idx_unroll]; 291 | for (int idx_token = 0; idx_token < tokens.size(); idx_token++) { 292 | 293 | // Process the token to get: 294 | // its position in sentence, 295 | // word, discount and label 296 | string tokenWordAsTarget = tokens[idx_token].word; 297 | string tokenLabel = tokens[idx_token].label; 298 | int tokenPos = tokens[idx_token].pos; 299 | double tokenDiscount = 1.0 / (tokens[idx_token].discount); 300 | string tokenWordAsContext(tokenWordAsTarget); 301 | 302 | // Concatenate word with label, when it is used as context? 303 | if (merge_label_with_word) { 304 | tokenWordAsContext += ":" + tokenLabel; 305 | } 306 | 307 | // Shall we insert new words/labels 308 | // into the vocabulary?
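        // (Sketch of the two modes on a hypothetical token with word "the"
        // and label "det": with merge_label_with_word, the context form
        // enters the word vocabulary as "the:det"; otherwise "the" goes to
        // the word vocabulary and "det" to the separate label vocabulary.)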
309 | if (insert_vocab) { 310 | if (merge_label_with_word) { 311 | if (tokenLabel == "LEAF") { 312 | // Insert target word to vocabulary 313 | corpus.InsertWord(tokenWordAsTarget, tokenDiscount); 314 | } else { 315 | // Insert concatenated context word and label to vocabulary 316 | corpus.InsertWord(tokenWordAsContext, tokenDiscount); 317 | } 318 | } else { 319 | // Insert word and label to two different vocabularies 320 | corpus.InsertWord(tokenWordAsContext, tokenDiscount); 321 | if (tokenLabel != "LEAF") { 322 | corpus.InsertLabel(tokenLabel); 323 | } 324 | } 325 | } 326 | // Insert new words to the book 327 | int wordIndexAsContext = 0, wordIndexAsTarget = 0, labelIndex = 0; 328 | if (merge_label_with_word) { 329 | wordIndexAsContext = corpus.LookUpWord(tokenWordAsContext); 330 | wordIndexAsTarget = corpus.LookUpWord(tokenWordAsTarget); 331 | } else { 332 | wordIndexAsContext = corpus.LookUpWord(tokenWordAsContext); 333 | wordIndexAsTarget = wordIndexAsContext; 334 | labelIndex = corpus.LookUpLabel(tokenLabel); 335 | } 336 | book->AddToken(isNewSentence, isNewUnroll, 337 | tokenPos, wordIndexAsContext, wordIndexAsTarget, 338 | tokenDiscount, labelIndex); 339 | // We are no longer at beginning of a sentence or unroll 340 | isNewSentence = false; 341 | isNewUnroll = false; 342 | } 343 | tokens.clear(); 344 | numUnrollsInThatSentence++; 345 | } 346 | unrolls.clear(); 347 | numSentences++; 348 | } 349 | sentences.clear(); 350 | book_text.clear(); 351 | cout << "ReadJSON: " << filename << endl; 352 | cout << " (" << numSentences << " sentences, including empty ones; "; 353 | cout << book->NumTokens() << " tokens)\n"; 354 | if (insert_vocab) { 355 | cout << " Corpus now contains " << corpus.NumWords() 356 | << " words and " << corpus.NumLabels() << " labels\n"; 357 | } 358 | } 359 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/CorpusUnrollsReader.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "CorpusUnrollsReader.h" 16 | #include "ReadJson.h" 17 | 18 | using namespace std; 19 | 20 | /** 21 | * Add a token to the book 22 | */ 23 | void BookUnrolls::AddToken(bool isNewSentence, bool isNewUnroll, 24 | int pos, int wordAsContext, int wordAsTarget, 25 | double discount, int label) { 26 | 27 | // Add a new sentence? 28 | if (isNewSentence) { 29 | Sentence s; 30 | _sentences.push_back(s); 31 | _numUnrollsInSentence.push_back(0); 32 | vector v; 33 | _numTokensInUnrollSentence.push_back(v); 34 | // Bookkeeping of sentences and unrolls 35 | _numSentences++; 36 | _sentenceIndex = _numSentences - 1; 37 | _unrollIndex = 0; 38 | _tokenIndex = 0; 39 | } 40 | // Add a new unroll? 
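  // (This mirrors the new-sentence branch above: it grows the current
  // sentence by one empty Unroll and resets the per-unroll bookkeeping.)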
41 | if (isNewUnroll) { 42 | Unroll u; 43 | _sentences[_sentenceIndex].push_back(u); 44 | // Bookkeeping of unrolls 45 | _numUnrollsInSentence[_sentenceIndex]++; 46 | _unrollIndex = _numUnrollsInSentence[_sentenceIndex] - 1; 47 | _numTokensInUnrollSentence[_sentenceIndex].push_back(0); 48 | _tokenIndex = 0; 49 | } 50 | // Add a new token 51 | Token newToken; 52 | newToken.pos = pos; 53 | newToken.wordAsContext = wordAsContext; 54 | newToken.wordAsTarget = wordAsTarget; 55 | newToken.discount = discount; 56 | newToken.label = label; 57 | _sentences[_sentenceIndex][_unrollIndex].push_back(newToken); 58 | _numTokensInUnrollSentence[_sentenceIndex][_unrollIndex]++; 59 | _numTokens++; 60 | } 61 | 62 | 63 | /** 64 | * Go to a specific sentence 65 | */ 66 | bool BookUnrolls::GoToSentence(int n) { 67 | // Sanity check 68 | if ((n < 0) || (n >= _numSentences)) { 69 | return false; 70 | } 71 | // Set the new sentence 72 | _sentenceIndex = n; 73 | // Reset the index of the unroll 74 | ResetUnroll(); 75 | return true; 76 | } 77 | 78 | 79 | /** 80 | * Go to the next sentence 81 | */ 82 | int BookUnrolls::NextSentence() { 83 | // Set the new sentence by incrementing its index 84 | if (_sentenceIndex >= (_numSentences - 1)) { 85 | // Return to sentence 0... 86 | ResetSentence(); 87 | } else { 88 | // ... or simply go to the next sentence 89 | _sentenceIndex++; 90 | // Reset the index of the unroll 91 | ResetUnroll(); 92 | } 93 | return _sentenceIndex; 94 | } 95 | 96 | 97 | /** 98 | * Go to the next unroll in the sentence 99 | */ 100 | int BookUnrolls::NextUnrollInSentence() { 101 | int n_unrolls = _numUnrollsInSentence[_sentenceIndex]; 102 | if (_unrollIndex >= (n_unrolls - 1)) { 103 | // Return to unroll 0 in the current sentence... 104 | ResetUnroll(); 105 | } else { 106 | // ... or simply go to the next unroll 107 | _unrollIndex++; 108 | // Reset the token in that unroll 109 | ResetToken(); 110 | } 111 | return _unrollIndex; 112 | } 113 | 114 | 115 | /** 116 | * Go to the next token in the current unroll. 117 | * Unlike the iterators above, this does not loop back to the start 118 | * but stops (returns -1) when the end of the unroll is reached.
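 * Sketch of a typical consumer loop (ProcessToken is a hypothetical
 * caller-side function; the first token of the unroll is assumed to be
 * current on entry):
 *   do { ProcessToken(...); } while (book.NextTokenInUnroll() >= 0);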
119 | */ 120 | int BookUnrolls::NextTokenInUnroll() { 121 | // If we have reached the end of the unroll 122 | if (_tokenIndex < 0) 123 | return -1; 124 | // Number of tokens in the current unroll 125 | int numTokens = _numTokensInUnrollSentence[_sentenceIndex][_unrollIndex]; 126 | // Go to the next token or stop 127 | if (_tokenIndex < (numTokens - 1)) { 128 | _tokenIndex++; 129 | UpdateCurrentToken(); 130 | } else { 131 | _tokenIndex = -1; 132 | } 133 | return _tokenIndex; 134 | } 135 | 136 | 137 | /** 138 | * Custom comparator for sorting a vector<pair<string, double>> 139 | * by decreasing value 140 | */ 141 | struct reverseSortByValue { 142 | bool operator() (const pair<string, double> &left, 143 | const pair<string, double> &right) { 144 | return (left.second > right.second); 145 | } 146 | }; 147 | 148 | 149 | /** 150 | * Filter and sort the vocabulary from another corpus 151 | */ 152 | void CorpusUnrolls::FilterSortVocabulary(CorpusUnrolls &other) { 153 | 154 | // Copy the labels as they are 155 | for (int k = 0; k < other.NumLabels(); k++) { 156 | InsertLabel(other.labelsReverse[k]); 157 | } 158 | 159 | // Initialize a vector of filtered word counts 160 | // that contains OOV and EOS 161 | vector<pair<string, double>> filteredWords; 162 | filteredWords.push_back(pair<string, double>("</s>", 0.0)); 163 | filteredWords.push_back(pair<string, double>("<unk>", 0.0)); 164 | double freqOOV = 0.0; 165 | double countWords = 0; 166 | 167 | // Copy only words with at least _minWordOccurrence occurrences 168 | // into that vector and keep statistics about OOV words. 169 | // Note that we start the indexing at 2 because we already stored 170 | // </s> and <unk> 171 | for (int k = 2; k < other.NumWords(); k++) { 172 | string word = other.vocabularyReverse[k]; 173 | double wordFreq = ceil(other.wordCountsDiscounted[k]); 174 | if (wordFreq >= _minWordOccurrence) { 175 | pair<string, double> p(word, wordFreq); 176 | filteredWords.push_back(p); 177 | } else { 178 | freqOOV += wordFreq; 179 | } 180 | countWords += wordFreq; 181 | } 182 | // Set the number of </s> tokens to a large value 183 | filteredWords[0].second = INT_MAX; 184 | // Count the number of <unk> 185 | filteredWords[1].second = freqOOV; 186 | 187 | // Sort that vector by value 188 | // The sorting should keep </s> at position 0 189 | sort(filteredWords.begin(), 190 | filteredWords.end(), 191 | reverseSortByValue()); 192 | 193 | // Completely clear the corpus word vocabulary 194 | // (not the labels) 195 | vocabulary.clear(); 196 | vocabularyReverse.clear(); 197 | wordCountsDiscounted.clear(); 198 | _vocabSizeWords = 0; 199 | 200 | // Now we can set the number of </s> tokens to 0 201 | // (</s> never actually occurs, because of the tree parsing) 202 | filteredWords[0].second = 0.0; 203 | 204 | // Copy the content of that vector 205 | for (int k = 0; k < filteredWords.size(); k++) { 206 | string word = filteredWords[k].first; 207 | double wordFreq = filteredWords[k].second; 208 | InsertWord(word, wordFreq); 209 | } 210 | // Note the OOV tag 211 | _oov = vocabulary["<unk>"]; 212 | } 213 | 214 | 215 | /** 216 | * Copy the vocabulary from another corpus 217 | */ 218 | void CorpusUnrolls::CopyVocabulary(CorpusUnrolls &other) { 219 | 220 | // Completely clear the corpus word vocabulary and labels 221 | labels.clear(); 222 | labelsReverse.clear(); 223 | vocabulary.clear(); 224 | vocabularyReverse.clear(); 225 | wordCountsDiscounted.clear(); 226 | _vocabSizeWords = 0; 227 | _vocabSizeLabels = 0; 228 | 229 | // Copy the labels as they are 230 | for (int k = 0; k < other.NumLabels(); k++) { 231 | InsertLabel(other.labelsReverse[k]); 232 | } 233 | 234 | // Insert the words from the other corpus into the vocabulary 235 | for (int k = 0; k < other.NumWords();
k++) { 236 | string word = other.vocabularyReverse[k]; 237 | double wordFreq = other.wordCountsDiscounted[k]; 238 | InsertWord(word, wordFreq); 239 | } 240 | 241 | // Note the OOV tag 242 | _oov = vocabulary["<unk>"]; 243 | } 244 | 245 | 246 | /** 247 | * Export the vocabulary to a text file 248 | */ 249 | void CorpusUnrolls::ExportVocabulary(const string &filename) { 250 | // Write the header 251 | ofstream vocabFile(filename); 252 | vocabFile << NumWords() << "\t" << NumLabels() << "\n"; 253 | // Write the labels 254 | for (int k = 0; k < NumLabels(); k++) { 255 | vocabFile << k << "\t" << labelsReverse[k] << "\n"; 256 | } 257 | // Write the words and their discount factors 258 | for (int k = 0; k < NumWords(); k++) { 259 | vocabFile << k << "\t" << vocabularyReverse[k] 260 | << "\t" << wordCountsDiscounted[k] << "\n"; 261 | } 262 | vocabFile.close(); 263 | } 264 | 265 | 266 | /** 267 | * Import the vocabulary from a text file 268 | */ 269 | void CorpusUnrolls::ImportVocabulary(const string &filename) { 270 | 271 | // Read the header 272 | ifstream vocabFile(filename); 273 | cout << "Reading vocabulary file " << filename << endl; 274 | assert(vocabFile.is_open()); 275 | 276 | // Completely clear the corpus word vocabulary and labels 277 | labels.clear(); 278 | labelsReverse.clear(); 279 | vocabulary.clear(); 280 | vocabularyReverse.clear(); 281 | wordCountsDiscounted.clear(); 282 | _vocabSizeWords = 0; 283 | _vocabSizeLabels = 0; 284 | 285 | // Read the header line 286 | string line; 287 | getline(vocabFile, line); 288 | stringstream lineStream(line); 289 | string strNumWords; 290 | string strNumLabels; 291 | getline(lineStream, strNumWords, '\t'); 292 | getline(lineStream, strNumLabels); 293 | int numWords = stoi(strNumWords); 294 | int numLabels = stoi(strNumLabels); 295 | cout << "Vocabulary file contains " << numWords << " words and " 296 | << numLabels << " labels\n"; 297 | 298 | // Read the labels one by one 299 | for (int k = 0; k < numLabels; k++) { 300 | getline(vocabFile, line); 301 | stringstream lineStream(line); 302 | string strIdx; 303 | string label; 304 | getline(lineStream, strIdx, '\t'); 305 | getline(lineStream, label); 306 | InsertLabel(label); 307 | } 308 | 309 | // Read the words one by one 310 | for (int k = 0; k < numWords; k++) { 311 | getline(vocabFile, line); 312 | stringstream lineStream(line); 313 | string strIdx; 314 | string word; 315 | string strWordFreq; 316 | getline(lineStream, strIdx, '\t'); 317 | getline(lineStream, word, '\t'); 318 | getline(lineStream, strWordFreq); 319 | double wordFreq = stof(strWordFreq); 320 | InsertWord(word, wordFreq); 321 | } 322 | 323 | vocabFile.close(); 324 | 325 | // Note the OOV tag 326 | _oov = vocabulary["<unk>"]; 327 | 328 | printf("Vocab size: %d\n", NumWords()); 329 | printf("Unknown tag at: %d\n", _oov); 330 | printf("Label vocab size: %d\n", NumLabels()); 331 | } 332 | 333 | 334 | /** 335 | * Read vocabulary from all books and return the number of tokens 336 | */ 337 | long CorpusUnrolls::ReadVocabulary(bool mergeLabel) { 338 | 339 | long nTokens = 0; 340 | // Loop over the books 341 | for (int k = 0; k < NumBooks(); k++) { 342 | // Open the training file, load it to a JSON structure 343 | // and add words to the corpus 344 | ReadJson *train_json = 345 | new ReadJson(_bookFilenames[k], *this, true, false, mergeLabel); 346 | nTokens = m_currentBook.NumTokens(); 347 | // Free the memory 348 | delete train_json; 349 | } 350 | return nTokens; 351 | } 352 | 353 | 354 | /** 355 | * Read the current book into memory 356 | */ 357 |
void CorpusUnrolls::ReadBook(bool mergeLabel) { 358 | 359 | // "Burn" the previous book, if any, to initialize it 360 | m_currentBook.Burn(); 361 | // Open the training file, load it to a JSON structure 362 | // and add words to the corpus 363 | ReadJson *train_json = 364 | new ReadJson(_bookFilenames[_currentBookIndex], *this, false, true, mergeLabel); 365 | // Free the memory 366 | delete train_json; 367 | } 368 | 369 | 370 | /** 371 | * Insert a word into the vocabulary, if new 372 | */ 373 | int CorpusUnrolls::InsertWord(const string &word, double discount) { 374 | 375 | // Try to find the word 376 | int wordIndex = LookUpWord(word); 377 | if (wordIndex == _oov) { 378 | // Could not find the word: insert it into the vocabulary 379 | wordIndex = _vocabSizeWords; 380 | pair<string, int> kv(word, wordIndex); 381 | vocabulary.insert(kv); 382 | pair<int, string> kv2(wordIndex, word); 383 | vocabularyReverse.insert(kv2); 384 | _vocabSizeWords++; 385 | } else { 386 | wordIndex = vocabulary[word]; 387 | } 388 | 389 | // Find the current (dis)count of the word 390 | unordered_map<int, double>::iterator it2 = 391 | wordCountsDiscounted.find(wordIndex); 392 | if (it2 == wordCountsDiscounted.end()) { 393 | pair<int, double> kv(wordIndex, discount); 394 | wordCountsDiscounted.insert(kv); 395 | } else { 396 | wordCountsDiscounted[wordIndex] += discount; 397 | } 398 | 399 | // Simply return the word index 400 | return wordIndex; 401 | } 402 | 403 | 404 | /** 405 | * Insert a label into the vocabulary, if new 406 | */ 407 | int CorpusUnrolls::InsertLabel(const string &label) { 408 | 409 | // Try to find the label 410 | int labelIndex = LookUpLabel(label); 411 | if (labelIndex == -1) { 412 | // Could not find the label: insert it into the vocabulary 413 | labelIndex = _vocabSizeLabels; 414 | pair<string, int> kv(label, labelIndex); 415 | labels.insert(kv); 416 | pair<int, string> kv2(labelIndex, label); 417 | labelsReverse.insert(kv2); 418 | _vocabSizeLabels++; 419 | } else { 420 | labelIndex = labels[label]; 421 | } 422 | 423 | // Simply return the label index 424 | return labelIndex; 425 | } 426 | 427 | 428 | /** 429 | * Look-up a word in the vocabulary 430 | */ 431 | int CorpusUnrolls::LookUpWord(const string &word) { 432 | 433 | // Try to find the word 434 | int wordIndex = _oov; 435 | unordered_map<string, int>::iterator it = 436 | vocabulary.find(word); 437 | if (it != vocabulary.end()) { 438 | wordIndex = vocabulary[word]; 439 | } 440 | return wordIndex; 441 | } 442 | 443 | 444 | /** 445 | * Look-up a label in the vocabulary 446 | */ 447 | int CorpusUnrolls::LookUpLabel(const string &label) { 448 | 449 | // Try to find the label 450 | int labelIndex = -1; 451 | unordered_map<string, int>::iterator it = 452 | labels.find(label); 453 | if (it != labels.end()) { 454 | labelIndex = labels[label]; 455 | } 456 | return labelIndex; 457 | } 458 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnLib.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code.
The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | */ 36 | 37 | #ifndef __DependencyTreeRNN____rnnlmlib__ 38 | #define __DependencyTreeRNN____rnnlmlib__ 39 | 40 | #include <math.h> // NB: the bracketed header names were stripped by the HTML rendering; these are plausible standard headers reconstructed from the code below 41 | #include <stdio.h> 42 | #include <stdlib.h> 43 | #include <string.h> 44 | #include <string> 45 | #include <unordered_map> 46 | #include <vector> 47 | #include "RnnState.h" 48 | #include "RnnWeights.h" 49 | #include "CorpusWordReader.h" 50 | #include "Vocabulary.h" 51 | 52 | 53 | /** 54 | * Main class storing the RNN model 55 | */ 56 | class RnnLM 57 | { 58 | public: 59 | 60 | /** 61 | * Constructor 62 | */ 63 | RnnLM(const std::string &filename, 64 | bool doLoadModel); 65 | 66 | /** 67 | * Load the model. 68 | */ 69 | void LoadRnnModelFromFile(); 70 | 71 | /** 72 | * Return the number of words/entity tokens in the vocabulary. 73 | */ 74 | int GetVocabularySize() const { return m_vocab.GetVocabularySize(); } 75 | 76 | /** 77 | * Return the number of units in the input (word) layer. 78 | */ 79 | int GetInputSize() const { return m_state.GetInputSize(); } 80 | 81 | /** 82 | * Return the number of units in the hidden layer. 83 | */ 84 | int GetHiddenSize() const { return m_state.GetHiddenSize(); } 85 | 86 | /** 87 | * Return the number of units in the optional hidden compression layer. 88 | */ 89 | int GetCompressSize() const { return m_state.GetCompressSize(); } 90 | 91 | /** 92 | * Return the number of units in the feature (e.g., topic) layer. 93 | */ 94 | int GetFeatureSize() const { return m_state.GetFeatureSize(); } 95 | 96 | /** 97 | * Return the number of units in the output layer.
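 * (In class-factored models of this family, this is typically the
 * vocabulary size plus the number of classes, since the output layer
 * holds both softmax distributions.)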
98 | */ 99 | int GetOutputSize() const { return m_state.GetOutputSize(); } 100 | 101 | /** 102 | * Return the number of direct connections between input words 103 | * and the output word (i.e., n-gram features) 104 | */ 105 | int GetNumDirectConnection() const { return m_weights.GetNumDirectConnection(); } 106 | 107 | /** 108 | * Return the number of direct connections between input words 109 | * and the output word (i.e., n-gram features) 110 | */ 111 | int GetOrderDirectConnection() const { return m_state.GetOrderDirectConnection(); } 112 | 113 | /** 114 | * Return the number of vocabulary classes. These are specified 115 | * at training time and can be frequency-based or rely on more 116 | * complex max-entropy features of the word bigrams. 117 | */ 118 | int GetNumClasses() const { return m_weights.GetNumClasses(); } 119 | 120 | protected: 121 | 122 | /** 123 | * Exponentiates x. 124 | */ 125 | double SafeExponentiate(double val) const 126 | { 127 | // for numerical stability 128 | val = (val > 50) ? 50 : ((val < -50) ? -50 : val); 129 | return (exp(val)); 130 | } 131 | 132 | /** 133 | * Exponentiates x in base 10. 134 | */ 135 | double ExponentiateBase10(double num) const 136 | { 137 | return exp(num * 2.302585093); 138 | } 139 | 140 | /** 141 | * Apply the logistic sigmoid function to x. 142 | */ 143 | double LogisticSigmoid(double val) const 144 | { 145 | return (1 / (1 + SafeExponentiate(-val))); 146 | } 147 | 148 | /** 149 | * Matrix-vector multiplication routine, somewhat accelerated using loop 150 | * unrolling over 8 registers. Computes y <- y + A * x, (i.e. adds A * x to y) 151 | * where A is of size N x M, x is of length M and y is of length N. 152 | * The operation can done on a contiguous subset of indices 153 | * i in [idxYFrom, idxYTo[ of vector y 154 | * and on a contiguous subset of indices j in [idxXFrom, idxXTo[ of vector x. 155 | */ 156 | void MultiplyMatrixXvectorBlas(std::vector &vectorY, 157 | std::vector &vectorX, 158 | std::vector &matrixA, 159 | int widthMatrix, 160 | int idxYFrom, 161 | int idxYTo) const; 162 | 163 | public: 164 | 165 | /** 166 | * Return the index of a word in the vocabulary, or -1 if OOV. 167 | */ 168 | int SearchWordInVocabulary(const std::string& word) const; 169 | 170 | /** 171 | * Go to the next char delim when reading a file. 172 | */ 173 | bool GoToDelimiterInFile(int delim, FILE *fi) const; 174 | 175 | /** 176 | * Function used to initialize the RNN model to the specified dimensions 177 | * of the layers and weight vectors. This is done at construction 178 | * of the RNN model object and also during training time (not at runtime). 179 | * It is not thread safe yet because there is this file (m_featureMatrixFile) 180 | * that contains the topic model for the words (LDA-style, see the paper), 181 | * that is loaded by the function. It also modifies the vocabulary hash tables. 182 | */ 183 | bool InitializeRnnModel(int sizeInput, 184 | int sizeHidden, 185 | int sizeFeature, 186 | int sizeClasses, 187 | int sizeCompress, 188 | long long sizeDirectConnection, 189 | int orderDirectConnection); 190 | 191 | /** 192 | * Erase the hidden layer state and the word history. 193 | * Needed when processing sentences/queries in independent mode. 194 | * Updates the RnnState object. 195 | */ 196 | void ResetHiddenRnnStateAndWordHistory(RnnState &state) const; 197 | void ResetHiddenRnnStateAndWordHistory(RnnState &state, 198 | RnnBptt &bpttState) const; 199 | 200 | /** 201 | * Erases only the word history. 
202 | * Needed when processing sentences/queries in independent mode. 203 | * Updates the RnnState object. 204 | */ 205 | void ResetWordHistory(RnnState &state) const; 206 | void ResetWordHistory(RnnState &state, 207 | RnnBptt &bpttState) const; 208 | 209 | /** 210 | * Forward-propagate the RNN through one full step, starting from 211 | * the lastWord w(t) and the previous hidden state activation s(t-1), 212 | * as well as optional feature vector f(t) 213 | * and direct n-gram connections to the word history, 214 | * computing the new hidden state activation s(t) 215 | * s(t) = sigmoid(W * s(t-1) + U * w(t) + F * f(t)) 216 | * x = V * s(t) + G * f(t) + n-gram_connections 217 | * y(t) = softmax_class(x) * softmax_word_given_class(x) 218 | * Updates the RnnState object (but not the weights). 219 | */ 220 | void ForwardPropagateOneStep(int lastWord, 221 | int word, 222 | RnnState &state); 223 | 224 | /** 225 | * Given a target word class, compute the conditional distribution 226 | * of all words within that class. The hidden state activation s(t) 227 | * is assumed to be already computed. Essentially, computes: 228 | * x = V * s(t) + G * f(t) + n-gram_connections 229 | * y(t) = softmax_class(x) * softmax_word_given_class(x) 230 | * but for a specific targetClass. 231 | * Updates the RnnState object (but not the weights). 232 | */ 233 | void ComputeRnnOutputsForGivenClass(const int targetClass, 234 | RnnState &state); 235 | 236 | /** 237 | * Copies the hidden layer activation s(t) to the recurrent connections. 238 | * That copy will become s(t-1) at the next call of ForwardPropagateOneStep 239 | */ 240 | void ForwardPropagateRecurrentConnectionOnly(RnnState &state) const; 241 | 242 | /** 243 | * Shift the word history by one and update last word. 244 | */ 245 | void ForwardPropagateWordHistory(RnnState &state, 246 | int &lastWord, 247 | const int word) const; 248 | 249 | /** 250 | * One way of having additional features to the RNN is to fit a topic 251 | * model to the past history of words. This can be achieved in a simple 252 | * way if such a topic matrix (words vs. topics) has been computed. 253 | * The feature vector f(t) is then simply an autoregressive 254 | * (exponentially decaying) function of the topic model vectors 255 | * for each word in the history. 256 | * This works well when processing sentence in English but might not 257 | * be appropriate for short queries, since the topic feature 258 | * will be continuously reset. 259 | */ 260 | void UpdateFeatureVectorUsingTopicModel(int word, RnnState &state) const; 261 | 262 | /** 263 | * This is currently unused, and we might not use topic model features at all. 264 | * The idea is to load a matrix of size W * T, where W is the number of words 265 | * and T is the number of topics. Each word is embedding into a topic vector. 266 | * The algorithm for word embedding can be Latent Dirichlet Allocation, 267 | * Latent Semantic Indexing, DSSM, etc... 268 | * It however assumes that the topic of the sentence changes with each word 269 | * and is based on longer word history, which is more appropriate for 270 | * long English sentences than for queries. 271 | * The function that needs to be called at runtime or during training is 272 | * UpdateFeatureVectorUsingTopicModel 273 | */ 274 | bool LoadTopicModelFeatureMatrix(); 275 | 276 | // Simply copy the hidden activations and gradients, as well as 277 | // the word history, from one state object to another state object. 
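// A sketch of the intended use (hypothetical caller code): when scoring
// several candidate completions of the same left context, checkpoint the
// state once and restore it before each candidate:
//   RnnState saved(model.m_state);                   // copy with the right sizes
//   model.SaveHiddenRnnState(model.m_state, saved);  // checkpoint
//   // ... score one candidate ...
//   model.SaveHiddenRnnState(saved, model.m_state);  // restore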
278 | void SaveHiddenRnnState(const RnnState &stateFrom, 279 | RnnState &stateTo) const; 280 | 281 | public: 282 | 283 | // Log-probability of unknown words 284 | double m_logProbabilityPenaltyUnk; 285 | 286 | // Vocabulary hashtables 287 | Vocabulary m_vocab; 288 | 289 | // State variable representing all the input/feature/hidden/output layer 290 | // activations of the RNN. This specific variable is just an initial 291 | // value that is created when the RNN model is loaded or initialized. 292 | // The training/testing functions do not modify it, simply make 293 | // a copy of it (convenient way to initialize the state vectors 294 | // of the right sizes). 295 | RnnState m_state; 296 | 297 | // The RNN model weights are stored in this object. Once loaded, 298 | // they will not be updated if the RNN is simply run on new data 299 | // (e.g., NextWord). Of course, the training algorithm will change them. 300 | RnnWeights m_weights; 301 | 302 | // These BPTT data are not used when the RNN model is run, 303 | // only during training, but it was easier to store them here. 304 | RnnBptt m_bpttVectors; 305 | 306 | protected: 307 | 308 | /** 309 | * Is the training file set? 310 | */ 311 | bool m_isTrainFileSet; 312 | 313 | /** 314 | * Is the model loaded? 315 | */ 316 | bool m_isModelLoaded; 317 | 318 | /** 319 | * Training and validation files 320 | */ 321 | std::string m_trainFile; 322 | std::string m_validationFile; 323 | 324 | /** 325 | * RNN model file, version and type 326 | */ 327 | std::string m_rnnModelFile; 328 | int m_rnnModelVersion; 329 | 330 | /** 331 | * Topic features 332 | */ 333 | std::string m_featureFile; 334 | std::string m_featureValidationFile; 335 | std::string m_featureMatrixFile; 336 | double m_featureGammaCoeff; 337 | int m_featureMatrixUsed; 338 | bool m_useFeatures2Output; 339 | 340 | /** 341 | * This is used for the second way of adding features 342 | * to the RNN: only the matrix W * T is specified, 343 | * where W = number of words (m_vocabSize) 344 | * and T = number of topics (m_featureSize) 345 | */ 346 | std::vector<double> m_featureMatrix; 347 | 348 | /** 349 | * RNN model learning parameters. All this information will simply 350 | * be loaded from the model file and not used when the RNN is run. 351 | */ 352 | double m_learningRate; 353 | double m_initialLearningRate; 354 | bool m_doStartReducingLearningRate; 355 | double m_regularizationRate; 356 | double m_minLogProbaImprovement; 357 | double m_gradientCutoff; 358 | int m_numBpttSteps; 359 | int m_bpttBlockSize; 360 | 361 | /** 362 | * Information relative to the training of the RNN 363 | */ 364 | int m_iteration; 365 | long m_numTrainWords; 366 | long m_currentPosTrainFile; 367 | 368 | /** 369 | * Information relative to the classes 370 | */ 371 | bool m_usesClassFile; 372 | 373 | /** 374 | * Are the sentences independent? 375 | */ 376 | bool m_areSentencesIndependent; 377 | }; 378 | 379 | #endif /* defined(__DependencyTreeRNN____rnnlmlib__) */ 380 | --------------------------------------------------------------------------------