├── models
│   └── dummy.txt
├── DependencyTreeRNN++
│   ├── DependencyTreeRNN++.tmp
│   ├── CorpusWordReader.h
│   ├── ReadJson.h
│   ├── CommandLineParser.h
│   ├── CommandLineParser.cpp
│   ├── RnnWeights.h
│   ├── Vocabulary.h
│   ├── RnnDependencyTreeLib.h
│   ├── Utils.h
│   ├── CorpusUnrollsReader.h
│   ├── RnnState.h
│   ├── RnnWeights.cpp
│   ├── RnnTraining.h
│   ├── Vocabulary.cpp
│   ├── ReadJson.cpp
│   ├── CorpusUnrollsReader.cpp
│   └── RnnLib.h
├── books
│   ├── test.txt
│   ├── valid.txt
│   ├── train_small.txt
│   ├── test.labels
│   ├── valid.labels
│   └── all.labels
├── test_rnn_holmes_sequential.sh
├── ensemble.py
├── LICENSE.md
├── logs
│   ├── GutenbergHolmes_seq_mw5_h300_c250_m1000_d4_b5.acc
│   ├── GutenbergHolmes_p2_mw5_h50_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_p2_mw5_h200_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h50_c250_m0_d0_b5.acc
│   ├── GutenbergHolmes_p0_mw5_h100_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_p0_mw5_h50_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h100_c250_m0_d0_b5.acc
│   ├── GutenbergHolmes_p0_mw5_h200_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h200_c250_m0_d0_b5.acc
│   ├── GutenbergHolmes_p2_mw5_h100_c250_m2000_d4_b5_g0.5.acc
│   ├── GutenbergHolmes_seq_mw5_h50_c250_m1000_d4_b5_indep_.acc
│   ├── GutenbergHolmes_seq_mw5_h200_c250_m1000_d4_b5.acc
│   └── GutenbergHolmes_seq_mw5_h100_c250_m1000_d4_b5.acc
├── train_rnn_holmes_sequential.sh
├── test_rnn_holmes_example.sh
├── Makefile
├── Makefile.MacOS
├── train_rnn_holmes_p0_1000.sh
├── train_rnn_holmes_p0_2000.sh
├── train_rnn_holmes_p2_1000.sh
├── train_rnn_holmes_p2_2000.sh
├── train_rnn_holmes_example.sh
├── Makefile.Linux
├── results.txt
├── README.md
└── preprocessing
    ├── JSON2unrolls.py
    └── Text2Parsed2JSON.java

/models/dummy.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/DependencyTreeRNN++/DependencyTreeRNN++.tmp:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------

/books/test.txt:
--------------------------------------------------------------------------------
Holmes.test.json.unrolls.json
--------------------------------------------------------------------------------

/books/valid.txt:
--------------------------------------------------------------------------------
Holmes.valid.json.unrolls.json
--------------------------------------------------------------------------------

/books/train_small.txt:
--------------------------------------------------------------------------------
04TOM10.TXT.json.unrolls.json
AGENT10.TXT.json.unrolls.json
GOLDR10.TXT.json.unrolls.json
MOLLF10.TXT.json.unrolls.json
RUNNG10.TXT.json.unrolls.json
WARW11.TXT.json.unrolls.json
--------------------------------------------------------------------------------
/test_rnn_holmes_sequential.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_SEQUENTIAL=$PWD"/../Data/GutenbergHolmes_Sequential/"

# Get the model filename from the command line
FILE_MODEL=$1

# If we need to debug, change this to "true"
DEBUG_MODE="false"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
FILE_VALID=$PATH_SEQUENTIAL"/Holmes.valid.json.tokens.txt"
FILE_TEST=$PATH_SEQUENTIAL"/Holmes.test.json.tokens.txt"
FILE_SENTENCE_LABELS_VALID=$PATH_DATA"/valid.labels"
FILE_SENTENCE_LABELS_TEST=$PATH_DATA"/test.labels"
echo "RNN model is read from $FILE_MODEL..."

# Evaluate the dependency-parsing model on the validation set
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $FILE_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS_VALID \
  -debug $DEBUG_MODE

# Evaluate the dependency-parsing model on the test set
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $FILE_TEST \
  -sentence-labels $FILE_SENTENCE_LABELS_TEST \
  -debug $DEBUG_MODE
--------------------------------------------------------------------------------

/ensemble.py:
--------------------------------------------------------------------------------
# First argument: gold answers; following arguments: files with scores to ensemble

import sys

goldFile = sys.argv[1]

answers = []

for line in open(goldFile).readlines():
    answers.append(int(line.strip()))

print "loaded " + str(len(answers)) + " answers"

# An array with one array of scores per model to be ensembled
individualSentencePredictions = []

for file in sys.argv[2:]:
    sentencePredictions = []
    for line in open(file).readlines():
        sentencePredictions.append(float(line.strip()))

    individualSentencePredictions.append(sentencePredictions)

# Now, for each answer:
# take the scores of the 5 candidate sentence predictions,
# sum them across models,
# then pick the highest-scoring candidate and compare it to the gold answer
correct = 0.0
indiCounter = 0
for answer in answers:
    maxScore = float("-inf")
    bestAnswer = None
    for i in xrange(5):
        scoreSum = 0.0
        for preds in individualSentencePredictions:
            scoreSum += preds[indiCounter]
        #print scoreSum
        if scoreSum > maxScore:
            maxScore = scoreSum
            bestAnswer = i

        indiCounter += 1
    #print bestAnswer
    #print maxScore
    if answer == bestAnswer:
        correct += 1

print "accuracy: " + str(correct / len(answers))
--------------------------------------------------------------------------------
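For reference, the max-sum ensembling that ensemble.py performs, as a small self-contained C++ sketch. It assumes the same inputs as the Python script: a gold file with one 0-4 answer index per line, and one score file per model holding 5 scores (one per candidate sentence) per question. This is an illustration, not part of the codebase.

// Sketch of the max-sum ensembling done by ensemble.py (assumed input layout:
// gold answers 0-4, and 5 consecutive candidate scores per question per model).
#include <fstream>
#include <iostream>
#include <limits>
#include <vector>

int main(int argc, char *argv[]) {
  if (argc < 3) {
    std::cerr << "usage: ensemble gold scores1 [scores2 ...]" << std::endl;
    return 1;
  }
  std::ifstream gold(argv[1]);
  std::vector<int> answers;
  for (int a; gold >> a; ) answers.push_back(a);

  // One vector of scores per model to be ensembled
  std::vector<std::vector<double>> models;
  for (int m = 2; m < argc; ++m) {
    std::ifstream f(argv[m]);
    std::vector<double> scores;
    for (double s; f >> s; ) scores.push_back(s);
    models.push_back(scores);
  }

  int correct = 0;
  for (std::size_t q = 0; q < answers.size(); ++q) {
    int best = -1;
    double bestScore = -std::numeric_limits<double>::infinity();
    for (int i = 0; i < 5; ++i) {  // 5 candidate sentences per question
      double sum = 0.0;
      for (const auto &scores : models) sum += scores[5 * q + i];
      if (sum > bestScore) { bestScore = sum; best = i; }
    }
    if (best == answers[q]) ++correct;
  }
  std::cout << "accuracy: " << double(correct) / answers.size() << std::endl;
  return 0;
}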
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright (c) 2015, Piotr Mirowski
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of DependencyTreeRnn nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------

/DependencyTreeRNN++/CorpusWordReader.h:
--------------------------------------------------------------------------------
// Copyright (c) 2014-2015 Piotr Mirowski
//
// Piotr Mirowski, Andreas Vlachos
// "Dependency Recurrent Neural Language Models for Sentence Completion"
// ACL 2015

#ifndef DependencyTreeRNN___CorpusWordReader_h
#define DependencyTreeRNN___CorpusWordReader_h

#include <algorithm>
#include <cctype>
#include <fstream>
#include <string>


inline bool isSpace(char c) { return isspace(c); }
inline bool notIsSpace(char c) { return !isspace(c); }


/**
 * Simple class to read words, one by one, from a file.
 * When the end of a line is reached, it returns ""
 */
class WordReader {
protected:
  std::ifstream m_file;
  std::string m_line;

public:

  WordReader(const std::string &filename)
  : m_file(filename) {
  }


  std::string pop_first_word(std::string &s) {
    const auto p1 = std::find_if(s.begin(), s.end(), notIsSpace);
    const auto p2 = std::find_if(p1, s.end(), isSpace);
    const std::string word(p1, p2);
    s.erase(0, std::find_if(p2, s.end(), notIsSpace) - s.begin());
    return word;
  }


  std::string get_next() {
    std::string result;
    if (m_line.empty()) {
      if (std::getline(m_file, m_line)) {
        m_line += " ";
      } else {
        return result;
      }
    }
    result = pop_first_word(m_line);
    return result;
  }
};

#endif
--------------------------------------------------------------------------------
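A minimal usage sketch for the WordReader above. The input file name and the counting loop are illustrative only; an empty string from get_next() is treated here as the end of input.

// Minimal WordReader usage sketch; "sample.txt" is a hypothetical input file.
#include <iostream>
#include <string>
#include "DependencyTreeRNN++/CorpusWordReader.h"

int main() {
  WordReader reader("sample.txt");
  int numWords = 0;
  // An empty string means no further word could be read
  // (end of file, or a blank line in the input).
  for (std::string w = reader.get_next(); !w.empty(); w = reader.get_next()) {
    ++numWords;
  }
  std::cout << "read " << numWords << " words" << std::endl;
  return 0;
}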
/logs/GutenbergHolmes_seq_mw5_h300_c250_m1000_d4_b5.acc:
--------------------------------------------------------------------------------
Iter,0,Alpha,0.100000,VALIDacc,0.321154,VALIDent,7.395424,VALIDppx,168.362202,words/sec,0
Iter,1,Alpha,0.100000,VALIDacc,0.344231,VALIDent,7.152782,VALIDppx,142.299051,words/sec,0
Iter,2,Alpha,0.100000,VALIDacc,0.355769,VALIDent,7.050841,VALIDppx,132.591149,words/sec,0
Iter,3,Alpha,0.100000,VALIDacc,0.371154,VALIDent,6.990962,VALIDppx,127.200642,words/sec,0
Iter,4,Alpha,0.100000,VALIDacc,0.378846,VALIDent,6.968837,VALIDppx,125.264816,words/sec,0
Iter,5,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.959099,VALIDppx,124.422121,words/sec,0
Iter,6,Alpha,0.100000,VALIDacc,0.386538,VALIDent,6.956460,VALIDppx,124.194757,words/sec,0
Iter,7,Alpha,0.100000,VALIDacc,0.396154,VALIDent,6.960201,VALIDppx,124.517210,words/sec,0
Iter,8,Alpha,0.100000,VALIDacc,0.400000,VALIDent,6.966089,VALIDppx,125.026436,words/sec,0
Iter,9,Alpha,0.100000,VALIDacc,0.394231,VALIDent,6.974113,VALIDppx,125.723723,words/sec,0
Iter,10,Alpha,0.066667,VALIDacc,0.396154,VALIDent,6.835435,VALIDppx,114.201273,words/sec,0
Iter,11,Alpha,0.044444,VALIDacc,0.394231,VALIDent,6.744126,VALIDppx,107.197371,words/sec,0
Iter,12,Alpha,0.029630,VALIDacc,0.392308,VALIDent,6.680182,VALIDppx,102.549864,words/sec,0
Iter,13,Alpha,0.019753,VALIDacc,0.392308,VALIDent,6.632794,VALIDppx,99.236181,words/sec,0
Iter,14,Alpha,0.013169,VALIDacc,0.392308,VALIDent,6.598300,VALIDppx,96.891635,words/sec,0
Iter,15,Alpha,0.008779,VALIDacc,0.390385,VALIDent,6.571902,VALIDppx,95.134870,words/sec,0
Iter,16,Alpha,0.005853,VALIDacc,0.386538,VALIDent,6.549385,VALIDppx,93.661574,words/sec,0
Iter,17,Alpha,0.003902,VALIDacc,0.386538,VALIDent,6.529088,VALIDppx,92.353051,words/sec,0
--------------------------------------------------------------------------------
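Each row of these .acc logs is a flat CSV of alternating key,value fields: iteration number, learning rate (Alpha), validation accuracy, entropy and perplexity, plus a words/sec counter. A small C++ sketch of how such a row could be split into fields; the ParseAccRow helper is illustrative, not part of the codebase.

// Illustrative parser for one .acc log row; the key,value,key,value layout
// is taken directly from the log files above.
#include <iostream>
#include <map>
#include <sstream>
#include <string>

std::map<std::string, std::string> ParseAccRow(const std::string &row) {
  std::map<std::string, std::string> fields;
  std::istringstream ss(row);
  std::string key, value;
  while (std::getline(ss, key, ',') && std::getline(ss, value, ','))
    fields[key] = value;
  return fields;
}

int main() {
  auto f = ParseAccRow(
      "Iter,0,Alpha,0.100000,VALIDacc,0.321154,VALIDent,7.395424,"
      "VALIDppx,168.362202,words/sec,0");
  std::cout << "validation accuracy at iteration " << f["Iter"]
            << ": " << f["VALIDacc"] << std::endl;
  return 0;
}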
/train_rnn_holmes_sequential.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_SEQUENTIAL=$PWD"/../Data/GutenbergHolmes_Sequential/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
RNN_HIDDENS=200
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=3
BPTT_ORDER=5

# If we need to debug, change this to "true"
DEBUG_MODE="false"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
FILE_TRAIN=$PATH_SEQUENTIAL"/Holmes.train.json.tokens.txt"
FILE_VALID=$PATH_SEQUENTIAL"/Holmes.valid.json.tokens.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE"_sequential.txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_seq"
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $FILE_TRAIN \
  -valid $FILE_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -debug $DEBUG_MODE
# -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/test_rnn_holmes_example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON=$PWD"/../Data/GutenbergHolmes/"

# Get the model file, the vocabulary file and the label type from the arguments
FILE_MODEL=$1
FILE_VOCAB=$2
DEP_LABELS=$3
# Example of call when dependency labels are included (-feature-labels-type 2):
# $ ./test_rnn_holmes_example.sh models/GutenbergHolmes_p2_mw2_h100_c250_m100_d3_b5_g0.5.model books/vocab_mw5.txt 2
# Example of call when dependency labels are not included (-feature-labels-type 0):
# $ ./test_rnn_holmes_example.sh models/GutenbergHolmes_p0_mw2_h100_c250_m100_d3_b5_g0.5.model books/vocab_mw5.txt 0

# If we need to debug, change this to "true"
DEBUG_MODE="false"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TEST=$PATH_DATA"/test.txt"
FILE_SENTENCE_LABELS_VALID=$PATH_DATA"/valid.labels"
FILE_SENTENCE_LABELS_TEST=$PATH_DATA"/test.labels"
echo "RNN model is read from $FILE_MODEL..."

# Test the dependency-parsing model on the validation data
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS_VALID \
  -path-json-books $PATH_JSON \
  -vocab $FILE_VOCAB \
  -debug $DEBUG_MODE \
  -feature-labels-type $DEP_LABELS

# Test the dependency-parsing model on the test data
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -test $LIST_TEST \
  -sentence-labels $FILE_SENTENCE_LABELS_TEST \
  -path-json-books $PATH_JSON \
  -vocab $FILE_VOCAB \
  -debug $DEBUG_MODE \
  -feature-labels-type $DEP_LABELS
--------------------------------------------------------------------------------

/Makefile:
--------------------------------------------------------------------------------
CC = g++

BLASFLAGS = -I/opt/local/include
CPPFLAGS = -Wall -O3 -std=c++0x
OPTIMFLAGS = -funroll-loops -ffast-math
CXXFLAGS = -lm -lblas -g $(CPPFLAGS) $(OPTIMFLAGS) $(BLASFLAGS)

LDFLAGS = -lblas

BLASINCLUDE = /opt/local/include/cblas.h
SRCDIR = DependencyTreeRNN++
INCLUDES = $(BLASINCLUDE) $(SRCDIR)/*.h

OBJDIR = build

OBJ = $(OBJDIR)/ReadJson.o \
	$(OBJDIR)/CorpusUnrollsReader.o \
	$(OBJDIR)/CommandLineParser.o \
	$(OBJDIR)/Vocabulary.o \
	$(OBJDIR)/RnnWeights.o \
	$(OBJDIR)/RnnLib.o \
	$(OBJDIR)/RnnTraining.o \
	$(OBJDIR)/RnnDependencyTreeLib.o \
	$(OBJDIR)/main.o

all: $(OBJ) RnnDependencyTree

$(OBJDIR)/ReadJson.o: $(SRCDIR)/ReadJson.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CorpusUnrollsReader.o: $(SRCDIR)/CorpusUnrollsReader.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CommandLineParser.o: $(SRCDIR)/CommandLineParser.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/Vocabulary.o: $(SRCDIR)/Vocabulary.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnWeights.o: $(SRCDIR)/RnnWeights.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnLib.o: $(SRCDIR)/RnnLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnTraining.o: $(SRCDIR)/RnnTraining.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnDependencyTreeLib.o: $(SRCDIR)/RnnDependencyTreeLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/main.o: $(SRCDIR)/main.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

RnnDependencyTree: $(OBJ)
	$(CC) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(OBJDIR)/*.o
--------------------------------------------------------------------------------
/Makefile.MacOS:
--------------------------------------------------------------------------------
CC = g++

BLASFLAGS = -I/opt/local/include
CPPFLAGS = -Wall -O3 -std=c++0x
OPTIMFLAGS = -funroll-loops -ffast-math
CXXFLAGS = -lm -lblas -g $(CPPFLAGS) $(OPTIMFLAGS) $(BLASFLAGS)

LDFLAGS = -lblas

BLASINCLUDE = /opt/local/include/cblas.h
SRCDIR = DependencyTreeRNN++
INCLUDES = $(BLASINCLUDE) $(SRCDIR)/*.h

OBJDIR = build

OBJ = $(OBJDIR)/ReadJson.o \
	$(OBJDIR)/CorpusUnrollsReader.o \
	$(OBJDIR)/CommandLineParser.o \
	$(OBJDIR)/Vocabulary.o \
	$(OBJDIR)/RnnWeights.o \
	$(OBJDIR)/RnnLib.o \
	$(OBJDIR)/RnnTraining.o \
	$(OBJDIR)/RnnDependencyTreeLib.o \
	$(OBJDIR)/main.o

all: $(OBJ) RnnDependencyTree

$(OBJDIR)/ReadJson.o: $(SRCDIR)/ReadJson.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CorpusUnrollsReader.o: $(SRCDIR)/CorpusUnrollsReader.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/CommandLineParser.o: $(SRCDIR)/CommandLineParser.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/Vocabulary.o: $(SRCDIR)/Vocabulary.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnWeights.o: $(SRCDIR)/RnnWeights.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnLib.o: $(SRCDIR)/RnnLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnTraining.o: $(SRCDIR)/RnnTraining.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/RnnDependencyTreeLib.o: $(SRCDIR)/RnnDependencyTreeLib.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

$(OBJDIR)/main.o: $(SRCDIR)/main.cpp $(INCLUDES)
	$(CC) $(CXXFLAGS) -c -o $@ $<

RnnDependencyTree: $(OBJ)
	$(CC) -o $@ $^ $(LDFLAGS)

clean:
	rm -rf $(OBJDIR)/*.o
--------------------------------------------------------------------------------
/train_rnn_holmes_p0_1000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=0
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=1000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/train_rnn_holmes_p0_2000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=0
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/train_rnn_holmes_p2_1000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=2
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=1000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------
/train_rnn_holmes_p2_2000.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON="../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=2
RNN_HIDDENS=$1
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=$2
BPTT_ORDER=5
FEATURE_GAMMA=0.0

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------

/train_rnn_holmes_example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This is the path that should be edited,
# depending on where the JSON books are stored
PATH_JSON=$PWD"/../Data/GutenbergHolmes/"

# Define the minimum number of word occurrences as 5 and use existing vocabulary file
MIN_WORD_OCCURRENCE=5
DEP_LABELS=2
RNN_HIDDENS=600
RNN_CLASSES=250
NGRAM_SIZE_MB=2000
NGRAM_ORDER=3
BPTT_ORDER=5
FEATURE_GAMMA=0.5

# If we need to debug, change this to "true"
DEBUG_MODE="true"

# Automatic path generation
PATH_DATA="./books"
PATH_MODELS="./models"
LIST_VALID=$PATH_DATA"/valid.txt"
LIST_TRAIN=$PATH_DATA"/train.txt"
FILE_SENTENCE_LABELS=$PATH_DATA"/valid.labels"
FILE_VOCAB=$PATH_DATA"/vocab_mw"$MIN_WORD_OCCURRENCE".txt"
FILE_MODEL=$PATH_MODELS"/GutenbergHolmes_p"$DEP_LABELS
FILE_MODEL=$FILE_MODEL"_mw"$MIN_WORD_OCCURRENCE
FILE_MODEL=$FILE_MODEL"_h"$RNN_HIDDENS
FILE_MODEL=$FILE_MODEL"_c"$RNN_CLASSES
FILE_MODEL=$FILE_MODEL"_m"$NGRAM_SIZE_MB
FILE_MODEL=$FILE_MODEL"_d"$NGRAM_ORDER
FILE_MODEL=$FILE_MODEL"_b"$BPTT_ORDER
FILE_MODEL=$FILE_MODEL"_g"$FEATURE_GAMMA
FILE_MODEL=$FILE_MODEL".model"
echo "RNN model will be stored in $FILE_MODEL..."

# Train the dependency-parsing model
RnnDependencyTree \
  -rnnlm $FILE_MODEL \
  -train $LIST_TRAIN \
  -valid $LIST_VALID \
  -sentence-labels $FILE_SENTENCE_LABELS \
  -path-json-books $PATH_JSON \
  -min-word-occurrence $MIN_WORD_OCCURRENCE \
  -feature-labels-type $DEP_LABELS \
  -hidden $RNN_HIDDENS \
  -direct $NGRAM_SIZE_MB \
  -direct-order $NGRAM_ORDER \
  -bptt $BPTT_ORDER \
  -bptt-block 1 \
  -class $RNN_CLASSES \
  -feature-gamma $FEATURE_GAMMA \
  -debug $DEBUG_MODE \
  -vocab $FILE_VOCAB
--------------------------------------------------------------------------------

/DependencyTreeRNN++/ReadJson.h:
--------------------------------------------------------------------------------
// Copyright (c) 2014-2015 Piotr Mirowski
//
// Piotr Mirowski, Andreas Vlachos
// "Dependency Recurrent Neural Language Models for Sentence Completion"
// ACL 2015

#ifndef DependencyTreeRNN___readjson_h
#define DependencyTreeRNN___readjson_h

#include <string>
#include <vector>
#include "CorpusUnrollsReader.h"

using namespace std;

struct JsonToken {
  int pos;
  string word;
  double discount;
  string label;
};


class ReadJson {
public:

  /**
   * Constructor: read a text file in JSON format.
   * If required, insert words and labels to the vocabulary.
   * If required, insert tokens into the current book.
   */
  ReadJson(const string &filename,
           CorpusUnrolls &corpus,
           bool insert_vocab,
           bool read_book,
           bool merge_label_with_word);

  /**
   * Destructor
   */
  ~ReadJson() { }

protected:


  /**
   * Trim a word
   */
  string const Trim(const string &word) const;

  /**
   * Parse a token
   */
  size_t const ParseToken(const string &json_element,
                          JsonToken &tok) const;

  /**
   * Parse an unroll
   */
  size_t const ParseUnroll(const string &json_unrolls,
                           vector<JsonToken> &unroll) const;

  /**
   * Parse a sentence
   */
  size_t const ParseSentence(const string &json_sentences,
                             vector<vector<JsonToken>> &sentence) const;

  /**
   * Parse a book
   */
  size_t const ParseBook(const string &json_book,
                         vector<vector<vector<JsonToken>>> &book) const;
};

#endif
--------------------------------------------------------------------------------
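A hypothetical usage sketch for ReadJson. CorpusUnrolls is declared in CorpusUnrollsReader.h, which is not shown here, so how the corpus is set up is an assumption, as is the two-pass pattern suggested by the insert_vocab and read_book flags; the file name is a placeholder.

// Hypothetical sketch only: the corpus argument and two-pass usage are
// inferred from the constructor flags, not from the (unshown) .cpp file.
#include "DependencyTreeRNN++/ReadJson.h"

void LoadBook(CorpusUnrolls &corpus) {
  // First pass: grow the vocabulary from the JSON book, without storing tokens
  ReadJson("book.json.unrolls.json", corpus,
           /* insert_vocab */ true, /* read_book */ false,
           /* merge_label_with_word */ false);
  // Second pass: store the tokens of the book for training
  ReadJson("book.json.unrolls.json", corpus,
           /* insert_vocab */ false, /* read_book */ true,
           /* merge_label_with_word */ false);
}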
30 | */ 31 | ReadJson(const string &filename, 32 | CorpusUnrolls &corpus, 33 | bool insert_vocab, 34 | bool read_book, 35 | bool merge_label_with_word); 36 | 37 | /** 38 | * Destructor 39 | */ 40 | ~ReadJson() { } 41 | 42 | protected: 43 | 44 | 45 | /** 46 | * Trim a word 47 | */ 48 | string const Trim(const string &word) const; 49 | 50 | /** 51 | * Parse a token 52 | */ 53 | size_t const ParseToken(const string &json_element, 54 | JsonToken &tok) const; 55 | 56 | /** 57 | * Parse an unroll 58 | */ 59 | size_t const ParseUnroll(const string &json_unrolls, 60 | vector &unroll) const; 61 | 62 | /** 63 | * Parse a sentence 64 | */ 65 | size_t const ParseSentence(const string &json_sentences, 66 | vector> &sentence) const; 67 | 68 | /** 69 | * Parse a book 70 | */ 71 | size_t const ParseBook(const string &json_book, 72 | vector>> &book) const; 73 | }; 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /Makefile.Linux: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | 3 | # These paths need to be configured, depending on where cblas.h 4 | # and libcblas.so are located 5 | BLASINCLUDE = /usr/include/cblas.h 6 | BLASFLAGSINCLUDE = -I/usr/include 7 | BLASFLAGSLIB = -L/usr/lib64/atlas 8 | 9 | CPPFLAGS = -Wall -O3 -std=c++0x 10 | OPTIMFLAGS = -funroll-loops -ffast-math 11 | CXXFLAGS = -lm -lblas -g $(CPPFLAGS) $(OPTIMFLAGS) $(BLASFLAGSINCLUDE) 12 | LDFLAGS = -lcblas $(BLASFLAGSLIB) 13 | 14 | SRCDIR = DependencyTreeRNN++ 15 | INCLUDES = $(BLASINCLUDE) $(SRCDIR)/*.h 16 | 17 | OBJDIR = build 18 | 19 | OBJ = $(OBJDIR)/ReadJson.o \ 20 | $(OBJDIR)/CorpusUnrollsReader.o \ 21 | $(OBJDIR)/CommandLineParser.o \ 22 | $(OBJDIR)/Vocabulary.o \ 23 | $(OBJDIR)/RnnWeights.o \ 24 | $(OBJDIR)/RnnLib.o \ 25 | $(OBJDIR)/RnnTraining.o \ 26 | $(OBJDIR)/RnnDependencyTreeLib.o \ 27 | $(OBJDIR)/main.o 28 | 29 | all: $(OBJ) RnnDependencyTree 30 | 31 | $(OBJDIR)/ReadJson.o: $(SRCDIR)/ReadJson.cpp $(INCLUDES) 32 | $(CC) $(CXXFLAGS) -c -o $@ $< 33 | 34 | $(OBJDIR)/CorpusUnrollsReader.o: $(SRCDIR)/CorpusUnrollsReader.cpp $(INCLUDES) 35 | $(CC) $(CXXFLAGS) -c -o $@ $< 36 | 37 | $(OBJDIR)/CommandLineParser.o: $(SRCDIR)/CommandLineParser.cpp $(INCLUDES) 38 | $(CC) $(CXXFLAGS) -c -o $@ $< 39 | 40 | $(OBJDIR)/Vocabulary.o: $(SRCDIR)/Vocabulary.cpp $(INCLUDES) 41 | $(CC) $(CXXFLAGS) -c -o $@ $< 42 | 43 | $(OBJDIR)/RnnWeights.o: $(SRCDIR)/RnnWeights.cpp $(INCLUDES) 44 | $(CC) $(CXXFLAGS) -c -o $@ $< 45 | 46 | $(OBJDIR)/RnnLib.o: $(SRCDIR)/RnnLib.cpp $(INCLUDES) 47 | $(CC) $(CXXFLAGS) -c -o $@ $< 48 | 49 | $(OBJDIR)/RnnTraining.o: $(SRCDIR)/RnnTraining.cpp $(INCLUDES) 50 | $(CC) $(CXXFLAGS) -c -o $@ $< 51 | 52 | $(OBJDIR)/RnnDependencyTreeLib.o: $(SRCDIR)/RnnDependencyTreeLib.cpp $(INCLUDES) 53 | $(CC) $(CXXFLAGS) -c -o $@ $< 54 | 55 | $(OBJDIR)/main.o: $(SRCDIR)/main.cpp $(INCLUDES) 56 | $(CC) $(CXXFLAGS) -c -o $@ $< 57 | 58 | RnnDependencyTree: $(OBJ) 59 | $(CC) -o $@ $^ $(LDFLAGS) 60 | 61 | clean: 62 | rm -rf $(OBJDIR)/*.o 63 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p2_mw5_h50_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.375000,VALIDent,6.299215,VALIDppx,78.750394,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.428846,VALIDent,6.407313,VALIDppx,84.877636,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.425000,VALIDent,6.309835,VALIDppx,79.332195,words/sec,0 4 | 
Iter,3,Alpha,0.100000,VALIDacc,0.419231,VALIDent,6.423861,VALIDppx,85.856848,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.438462,VALIDent,6.334380,VALIDppx,80.693489,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.425000,VALIDent,6.324297,VALIDppx,80.131479,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.436538,VALIDent,6.403221,VALIDppx,84.637246,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.438462,VALIDent,6.186505,VALIDppx,72.832206,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.459615,VALIDent,6.082602,VALIDppx,67.771259,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.465385,VALIDent,6.093529,VALIDppx,68.286507,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.436538,VALIDent,6.042666,VALIDppx,65.921012,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.457692,VALIDent,6.060414,VALIDppx,66.736960,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.467308,VALIDent,6.099138,VALIDppx,68.552512,words/sec,0 14 | Iter,13,Alpha,0.003902,VALIDacc,0.451923,VALIDent,6.000667,VALIDppx,64.029604,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.455769,VALIDent,5.926755,VALIDppx,60.831857,words/sec,0 16 | Iter,15,Alpha,0.001734,VALIDacc,0.450000,VALIDent,5.998394,VALIDppx,63.928802,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.463462,VALIDent,5.933939,VALIDppx,61.135527,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.465385,VALIDent,5.891628,VALIDppx,59.368587,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.461538,VALIDent,5.901500,VALIDppx,59.776242,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.463462,VALIDent,5.897743,VALIDppx,59.620746,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.461538,VALIDent,5.890489,VALIDppx,59.321755,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.457692,VALIDent,5.898964,VALIDppx,59.671253,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.461538,VALIDent,5.902392,VALIDppx,59.813191,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p2_mw5_h200_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.369231,VALIDent,6.435766,VALIDppx,86.568258,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.415385,VALIDent,6.461425,VALIDppx,88.121651,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.451923,VALIDent,6.244529,VALIDppx,75.821191,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.423077,VALIDent,6.322476,VALIDppx,80.030403,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.457692,VALIDent,6.376928,VALIDppx,83.108738,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.450000,VALIDent,6.321707,VALIDppx,79.987751,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.463462,VALIDent,6.163304,VALIDppx,71.670303,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.480769,VALIDent,6.096885,VALIDppx,68.445538,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.480769,VALIDent,6.136001,VALIDppx,70.326705,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.461538,VALIDent,6.196881,VALIDppx,73.357940,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.471154,VALIDent,5.950891,VALIDppx,61.858107,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.482692,VALIDent,6.035864,VALIDppx,65.610931,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.498077,VALIDent,5.895801,VALIDppx,59.540550,words/sec,0 14 | Iter,13,Alpha,0.003902,VALIDacc,0.480769,VALIDent,5.950142,VALIDppx,61.826002,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.486538,VALIDent,5.928949,VALIDppx,60.924440,words/sec,0 16 | 
Iter,15,Alpha,0.001734,VALIDacc,0.494231,VALIDent,5.845973,VALIDppx,57.519254,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.492308,VALIDent,5.877454,VALIDppx,58.788187,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.490385,VALIDent,5.906932,VALIDppx,60.001706,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.475000,VALIDent,5.850823,VALIDppx,57.712936,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.490385,VALIDent,5.900016,VALIDppx,59.714759,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.488462,VALIDent,5.863343,VALIDppx,58.215961,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.486538,VALIDent,5.834466,VALIDppx,57.062314,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.484615,VALIDent,5.837756,VALIDppx,57.192570,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h50_c250_m0_d0_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.255769,VALIDent,7.465440,VALIDppx,176.734491,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.278846,VALIDent,7.379568,VALIDppx,166.521913,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.275000,VALIDent,7.341033,VALIDppx,162.132880,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.273077,VALIDent,7.319749,VALIDppx,159.758497,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.305861,VALIDppx,158.227973,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.273077,VALIDent,7.296104,VALIDppx,157.161493,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.269231,VALIDent,7.288831,VALIDppx,156.371244,words/sec,0 8 | Iter,7,Alpha,0.066667,VALIDacc,0.261538,VALIDent,7.226051,VALIDppx,149.712505,words/sec,0 9 | Iter,8,Alpha,0.044444,VALIDacc,0.276923,VALIDent,7.180029,VALIDppx,145.012099,words/sec,0 10 | Iter,9,Alpha,0.029630,VALIDacc,0.284615,VALIDent,7.144468,VALIDppx,141.481303,words/sec,0 11 | Iter,10,Alpha,0.019753,VALIDacc,0.276923,VALIDent,7.115754,VALIDppx,138.693277,words/sec,0 12 | Iter,11,Alpha,0.013169,VALIDacc,0.263462,VALIDent,7.092228,VALIDppx,136.449949,words/sec,0 13 | Iter,12,Alpha,0.008779,VALIDacc,0.267308,VALIDent,7.073162,VALIDppx,134.658539,words/sec,0 14 | Iter,13,Alpha,0.005853,VALIDacc,0.269231,VALIDent,7.057946,VALIDppx,133.245802,words/sec,0 15 | Iter,14,Alpha,0.003902,VALIDacc,0.271154,VALIDent,7.045805,VALIDppx,132.129131,words/sec,0 16 | Iter,15,Alpha,0.002601,VALIDacc,0.271154,VALIDent,7.036284,VALIDppx,131.260082,words/sec,0 17 | Iter,16,Alpha,0.001734,VALIDacc,0.282692,VALIDent,7.028620,VALIDppx,130.564636,words/sec,0 18 | Iter,17,Alpha,0.001156,VALIDacc,0.280769,VALIDent,7.021763,VALIDppx,129.945542,words/sec,0 19 | Iter,18,Alpha,0.000771,VALIDacc,0.286538,VALIDent,7.014895,VALIDppx,129.328384,words/sec,0 20 | Iter,19,Alpha,0.000514,VALIDacc,0.282692,VALIDent,7.007317,VALIDppx,128.650846,words/sec,0 21 | Iter,20,Alpha,0.000343,VALIDacc,0.276923,VALIDent,6.999086,VALIDppx,127.918919,words/sec,0 22 | Iter,21,Alpha,0.000228,VALIDacc,0.276923,VALIDent,6.991647,VALIDppx,127.261011,words/sec,0 23 | Iter,22,Alpha,0.000152,VALIDacc,0.273077,VALIDent,6.986392,VALIDppx,126.798312,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p0_mw5_h100_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.438462,VALIDent,8.706267,VALIDppx,417.683773,words/sec,0 2 | 
Iter,1,Alpha,0.100000,VALIDacc,0.425000,VALIDent,8.590784,VALIDppx,385.552680,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.490385,VALIDent,8.560620,VALIDppx,377.575040,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.465385,VALIDent,8.691508,VALIDppx,413.432440,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.507692,VALIDent,8.473436,VALIDppx,355.433653,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.482692,VALIDent,8.688202,VALIDppx,412.486120,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.526923,VALIDent,8.377757,VALIDppx,332.626040,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.505769,VALIDent,8.229300,VALIDppx,300.100179,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.513462,VALIDent,8.260622,VALIDppx,306.686646,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.540385,VALIDent,8.138515,VALIDppx,281.797551,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.517308,VALIDent,8.106643,VALIDppx,275.640257,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.513462,VALIDent,8.128033,VALIDppx,279.757475,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.515385,VALIDent,8.135802,VALIDppx,281.268162,words/sec,0 14 | Iter,13,Alpha,0.003902,VALIDacc,0.511538,VALIDent,8.103275,VALIDppx,274.997486,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.519231,VALIDent,8.038421,VALIDppx,262.909296,words/sec,0 16 | Iter,15,Alpha,0.001734,VALIDacc,0.515385,VALIDent,8.076482,VALIDppx,269.937517,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.523077,VALIDent,8.013359,VALIDppx,258.381453,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.530769,VALIDent,8.039020,VALIDppx,263.018380,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.530769,VALIDent,7.993173,VALIDppx,254.791517,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.515385,VALIDent,8.033077,VALIDppx,261.937200,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.517308,VALIDent,8.005406,VALIDppx,256.961029,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.526923,VALIDent,7.991467,VALIDppx,254.490402,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.528846,VALIDent,8.007949,VALIDppx,257.414455,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p0_mw5_h50_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.434615,VALIDent,8.571457,VALIDppx,380.422074,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.465385,VALIDent,8.530692,VALIDppx,369.823109,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.476923,VALIDent,8.687900,VALIDppx,412.399900,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.459615,VALIDent,8.653864,VALIDppx,402.784280,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.494231,VALIDent,8.315728,VALIDppx,318.627647,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.476923,VALIDent,8.607496,VALIDppx,390.044672,words/sec,0 7 | Iter,6,Alpha,0.066667,VALIDacc,0.476923,VALIDent,8.364198,VALIDppx,329.514430,words/sec,0 8 | Iter,7,Alpha,0.044444,VALIDacc,0.513462,VALIDent,8.270002,VALIDppx,308.687154,words/sec,0 9 | Iter,8,Alpha,0.029630,VALIDacc,0.513462,VALIDent,8.196132,VALIDppx,293.279352,words/sec,0 10 | Iter,9,Alpha,0.019753,VALIDacc,0.501923,VALIDent,8.082509,VALIDppx,271.067559,words/sec,0 11 | Iter,10,Alpha,0.013169,VALIDacc,0.515385,VALIDent,8.125267,VALIDppx,279.221614,words/sec,0 12 | Iter,11,Alpha,0.008779,VALIDacc,0.509615,VALIDent,8.135773,VALIDppx,281.262405,words/sec,0 13 | Iter,12,Alpha,0.005853,VALIDacc,0.526923,VALIDent,8.158307,VALIDppx,285.690116,words/sec,0 14 | 
Iter,13,Alpha,0.003902,VALIDacc,0.503846,VALIDent,8.037455,VALIDppx,262.733245,words/sec,0 15 | Iter,14,Alpha,0.002601,VALIDacc,0.496154,VALIDent,8.332756,VALIDppx,322.410691,words/sec,0 16 | Iter,15,Alpha,0.001734,VALIDacc,0.507692,VALIDent,8.017326,VALIDppx,259.093027,words/sec,0 17 | Iter,16,Alpha,0.001156,VALIDacc,0.498077,VALIDent,8.044398,VALIDppx,264.000693,words/sec,0 18 | Iter,17,Alpha,0.000771,VALIDacc,0.513462,VALIDent,8.051677,VALIDppx,265.336011,words/sec,0 19 | Iter,18,Alpha,0.000514,VALIDacc,0.517308,VALIDent,7.999546,VALIDppx,255.919458,words/sec,0 20 | Iter,19,Alpha,0.000343,VALIDacc,0.509615,VALIDent,8.001428,VALIDppx,256.253485,words/sec,0 21 | Iter,20,Alpha,0.000228,VALIDacc,0.513462,VALIDent,8.002160,VALIDppx,256.383600,words/sec,0 22 | Iter,21,Alpha,0.000152,VALIDacc,0.513462,VALIDent,7.997038,VALIDppx,255.474958,words/sec,0 23 | Iter,22,Alpha,0.000101,VALIDacc,0.513462,VALIDent,7.995744,VALIDppx,255.245901,words/sec,0 24 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h100_c250_m0_d0_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.292308,VALIDent,7.542692,VALIDppx,186.456028,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.300000,VALIDent,7.414324,VALIDppx,170.582271,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.286538,VALIDent,7.353363,VALIDppx,163.524482,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.290385,VALIDent,7.320493,VALIDppx,159.840874,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.296154,VALIDent,7.299796,VALIDppx,157.564177,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.303846,VALIDent,7.285241,VALIDppx,155.982561,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.301923,VALIDent,7.274643,VALIDppx,154.840929,words/sec,0 8 | Iter,7,Alpha,0.066667,VALIDacc,0.288462,VALIDent,7.203766,VALIDppx,147.417674,words/sec,0 9 | Iter,8,Alpha,0.044444,VALIDacc,0.276923,VALIDent,7.151940,VALIDppx,142.216029,words/sec,0 10 | Iter,9,Alpha,0.029630,VALIDacc,0.276923,VALIDent,7.111860,VALIDppx,138.319469,words/sec,0 11 | Iter,10,Alpha,0.019753,VALIDacc,0.273077,VALIDent,7.080894,VALIDppx,135.382151,words/sec,0 12 | Iter,11,Alpha,0.013169,VALIDacc,0.282692,VALIDent,7.056410,VALIDppx,133.103992,words/sec,0 13 | Iter,12,Alpha,0.008779,VALIDacc,0.278846,VALIDent,7.036203,VALIDppx,131.252642,words/sec,0 14 | Iter,13,Alpha,0.005853,VALIDacc,0.273077,VALIDent,7.018806,VALIDppx,129.679443,words/sec,0 15 | Iter,14,Alpha,0.003902,VALIDacc,0.271154,VALIDent,7.003961,VALIDppx,128.351890,words/sec,0 16 | Iter,15,Alpha,0.002601,VALIDacc,0.271154,VALIDent,6.991587,VALIDppx,127.255774,words/sec,0 17 | Iter,16,Alpha,0.001734,VALIDacc,0.282692,VALIDent,6.981513,VALIDppx,126.370256,words/sec,0 18 | Iter,17,Alpha,0.001156,VALIDacc,0.282692,VALIDent,6.973082,VALIDppx,125.633896,words/sec,0 19 | Iter,18,Alpha,0.000771,VALIDacc,0.288462,VALIDent,6.964960,VALIDppx,124.928588,words/sec,0 20 | Iter,19,Alpha,0.000514,VALIDacc,0.290385,VALIDent,6.956480,VALIDppx,124.196440,words/sec,0 21 | Iter,20,Alpha,0.000343,VALIDacc,0.294231,VALIDent,6.947785,VALIDppx,123.450160,words/sec,0 22 | Iter,21,Alpha,0.000228,VALIDacc,0.296154,VALIDent,6.938872,VALIDppx,122.689853,words/sec,0 23 | Iter,22,Alpha,0.000152,VALIDacc,0.296154,VALIDent,6.930441,VALIDppx,121.974939,words/sec,0 24 | Iter,23,Alpha,0.000101,VALIDacc,0.294231,VALIDent,6.923959,VALIDppx,121.428136,words/sec,0 25 | -------------------------------------------------------------------------------- 
/logs/GutenbergHolmes_p0_mw5_h200_c250_m2000_d4_b5_g0.5.acc:
--------------------------------------------------------------------------------
Iter,0,Alpha,0.100000,VALIDacc,0.355769,VALIDent,9.542599,VALIDppx,745.776148,words/sec,0
Iter,1,Alpha,0.100000,VALIDacc,0.444231,VALIDent,9.339977,VALIDppx,648.057059,words/sec,0
Iter,2,Alpha,0.100000,VALIDacc,0.438462,VALIDent,9.127300,VALIDppx,559.230902,words/sec,0
Iter,3,Alpha,0.100000,VALIDacc,0.480769,VALIDent,8.829454,VALIDppx,454.915154,words/sec,0
Iter,4,Alpha,0.100000,VALIDacc,0.480769,VALIDent,9.040739,VALIDppx,526.664168,words/sec,0
Iter,5,Alpha,0.100000,VALIDacc,0.488462,VALIDent,9.102036,VALIDppx,549.523101,words/sec,0
Iter,6,Alpha,0.100000,VALIDacc,0.482692,VALIDent,8.684370,VALIDppx,411.392145,words/sec,0
Iter,7,Alpha,0.066667,VALIDacc,0.503846,VALIDent,8.609831,VALIDppx,390.676706,words/sec,0
Iter,8,Alpha,0.044444,VALIDacc,0.478846,VALIDent,8.553101,VALIDppx,375.612336,words/sec,0
Iter,9,Alpha,0.029630,VALIDacc,0.505769,VALIDent,8.371286,VALIDppx,331.137457,words/sec,0
Iter,10,Alpha,0.019753,VALIDacc,0.490385,VALIDent,8.297414,VALIDppx,314.608641,words/sec,0
Iter,11,Alpha,0.013169,VALIDacc,0.513462,VALIDent,8.161090,VALIDppx,286.241738,words/sec,0
Iter,12,Alpha,0.008779,VALIDacc,0.515385,VALIDent,8.232216,VALIDppx,300.707203,words/sec,0
Iter,13,Alpha,0.005853,VALIDacc,0.501923,VALIDent,8.142231,VALIDppx,282.524272,words/sec,0
Iter,14,Alpha,0.003902,VALIDacc,0.511538,VALIDent,8.128911,VALIDppx,279.927823,words/sec,0
Iter,15,Alpha,0.002601,VALIDacc,0.505769,VALIDent,8.079572,VALIDppx,270.516378,words/sec,0
Iter,16,Alpha,0.001734,VALIDacc,0.515385,VALIDent,8.058873,VALIDppx,266.662813,words/sec,0
Iter,17,Alpha,0.001156,VALIDacc,0.507692,VALIDent,8.071768,VALIDppx,269.056923,words/sec,0
Iter,18,Alpha,0.000771,VALIDacc,0.500000,VALIDent,8.124827,VALIDppx,279.136594,words/sec,0
Iter,19,Alpha,0.000514,VALIDacc,0.523077,VALIDent,8.025258,VALIDppx,260.521378,words/sec,0
Iter,20,Alpha,0.000343,VALIDacc,0.526923,VALIDent,8.055572,VALIDppx,266.053408,words/sec,0
Iter,21,Alpha,0.000228,VALIDacc,0.511538,VALIDent,8.045317,VALIDppx,264.168894,words/sec,0
Iter,22,Alpha,0.000152,VALIDacc,0.519231,VALIDent,8.039515,VALIDppx,263.108596,words/sec,0
Iter,23,Alpha,0.000101,VALIDacc,0.513462,VALIDent,8.018300,VALIDppx,259.267879,words/sec,0
--------------------------------------------------------------------------------

/DependencyTreeRNN++/CommandLineParser.h:
--------------------------------------------------------------------------------
// Copyright (c) 2014-2015 Piotr Mirowski
//
// Piotr Mirowski, Andreas Vlachos
// "Dependency Recurrent Neural Language Models for Sentence Completion"
// ACL 2015

#ifndef __DependencyTreeRNN____CommandLineParser__
#define __DependencyTreeRNN____CommandLineParser__

#include <map>
#include <string>

class CommandLineArgument {
public:

  /**
   * Type of the argument
   */
  std::string m_type;

  /**
   * Description of the argument
   */
  std::string m_description;

  /**
   * Value of the argument
   */
  std::string m_value;

  /**
   * Is the argument required?
   */
  bool m_isRequired;

  /**
   * Constructors
   */
  CommandLineArgument(std::string t,
                      std::string desc,
                      std::string d,
                      bool r)
  : m_type(t), m_description(desc), m_value(d), m_isRequired(r) {
  }
  CommandLineArgument() {
    m_type = "UNDEFINED";
  }
};


class CommandLineParser {
public:
  /**
   * Map between command line argument names and structures containing their values
   */
  std::map<std::string, CommandLineArgument> args;

  /**
   * Register a command line argument
   */
  void Register(std::string name,
                std::string type,
                std::string desc,
                std::string defaultVal = "",
                bool isRequired = false) {
    args[name] = CommandLineArgument(type, desc, defaultVal, isRequired);
  }

  /**
   * Parse the arguments to extract their values and store them in the map
   */
  bool Parse(char *list[], int llen);

  /**
   * Get a command line argument
   */
  bool Get(std::string name, int &value);
  bool Get(std::string name, bool &value);
  bool Get(std::string name, double &value);
  bool Get(std::string name, std::string &value);
  bool Get(std::string name, long long &value);
};

#endif /* defined(__DependencyTreeRNN____CommandLineParser__) */
--------------------------------------------------------------------------------
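A minimal usage sketch for CommandLineParser. The registered argument names mirror the training scripts above, but this main() is illustrative, and the exact argv conventions (e.g. whether the program name is skipped) are defined in CommandLineParser.cpp, which is not shown here.

// Illustrative CommandLineParser usage; argument names follow the scripts.
#include <iostream>
#include <string>
#include "DependencyTreeRNN++/CommandLineParser.h"

int main(int argc, char *argv[]) {
  CommandLineParser parser;
  parser.Register("hidden", "int", "Number of hidden units", "100");
  parser.Register("rnnlm", "string", "Model file", "", true);  // required
  if (!parser.Parse(argv, argc)) {
    return 1;  // a required argument was missing or malformed
  }

  int hidden = 0;
  std::string model;
  parser.Get("hidden", hidden);
  parser.Get("rnnlm", model);
  std::cout << "model " << model << " with " << hidden
            << " hidden units" << std::endl;
  return 0;
}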
/logs/GutenbergHolmes_seq_mw5_h200_c250_m0_d0_b5.acc:
--------------------------------------------------------------------------------
Iter,0,Alpha,0.100000,VALIDacc,0.278846,VALIDent,7.680208,VALIDppx,205.103386,words/sec,0
Iter,1,Alpha,0.100000,VALIDacc,0.276923,VALIDent,7.517412,VALIDppx,183.217331,words/sec,0
Iter,2,Alpha,0.100000,VALIDacc,0.280769,VALIDent,7.442037,VALIDppx,173.890693,words/sec,0
Iter,3,Alpha,0.100000,VALIDacc,0.273077,VALIDent,7.404015,VALIDppx,169.367654,words/sec,0
Iter,4,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.380331,VALIDppx,166.610022,words/sec,0
Iter,5,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.365829,VALIDppx,164.943653,words/sec,0
Iter,6,Alpha,0.100000,VALIDacc,0.275000,VALIDent,7.356609,VALIDppx,163.892785,words/sec,0
Iter,7,Alpha,0.100000,VALIDacc,0.271154,VALIDent,7.349157,VALIDppx,163.048493,words/sec,0
Iter,8,Alpha,0.066667,VALIDacc,0.273077,VALIDent,7.261322,VALIDppx,153.417778,words/sec,0
Iter,9,Alpha,0.044444,VALIDacc,0.282692,VALIDent,7.196659,VALIDppx,146.693266,words/sec,0
Iter,10,Alpha,0.029630,VALIDacc,0.282692,VALIDent,7.143629,VALIDppx,141.399059,words/sec,0
Iter,11,Alpha,0.019753,VALIDacc,0.276923,VALIDent,7.099888,VALIDppx,137.176390,words/sec,0
Iter,12,Alpha,0.013169,VALIDacc,0.271154,VALIDent,7.064767,VALIDppx,133.877223,words/sec,0
Iter,13,Alpha,0.008779,VALIDacc,0.263462,VALIDent,7.036239,VALIDppx,131.255948,words/sec,0
Iter,14,Alpha,0.005853,VALIDacc,0.269231,VALIDent,7.012065,VALIDppx,129.074934,words/sec,0
Iter,15,Alpha,0.003902,VALIDacc,0.271154,VALIDent,6.990707,VALIDppx,127.178192,words/sec,0
Iter,16,Alpha,0.002601,VALIDacc,0.275000,VALIDent,6.972046,VALIDppx,125.543752,words/sec,0
Iter,17,Alpha,0.001734,VALIDacc,0.282692,VALIDent,6.956267,VALIDppx,124.178121,words/sec,0
Iter,18,Alpha,0.001156,VALIDacc,0.282692,VALIDent,6.943481,VALIDppx,123.082419,words/sec,0
Iter,19,Alpha,0.000771,VALIDacc,0.290385,VALIDent,6.933176,VALIDppx,122.206428,words/sec,0
Iter,20,Alpha,0.000514,VALIDacc,0.294231,VALIDent,6.924730,VALIDppx,121.493067,words/sec,0
Iter,21,Alpha,0.000343,VALIDacc,0.296154,VALIDent,6.917802,VALIDppx,120.911067,words/sec,0 23 | Iter,22,Alpha,0.000228,VALIDacc,0.296154,VALIDent,6.911363,VALIDppx,120.372581,words/sec,0 24 | Iter,23,Alpha,0.000152,VALIDacc,0.294231,VALIDent,6.904527,VALIDppx,119.803559,words/sec,0 25 | Iter,24,Alpha,0.000101,VALIDacc,0.296154,VALIDent,6.898090,VALIDppx,119.270248,words/sec,0 26 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_p2_mw5_h100_c250_m2000_d4_b5_g0.5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.401923,VALIDent,6.437281,VALIDppx,86.659189,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.415385,VALIDent,6.259147,VALIDppx,76.593341,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.430769,VALIDent,6.251194,VALIDppx,76.172295,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.417308,VALIDent,6.163204,VALIDppx,71.665336,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.421154,VALIDent,6.406475,VALIDppx,84.828349,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.450000,VALIDent,6.295965,VALIDppx,78.573160,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.465385,VALIDent,6.303201,VALIDppx,78.968259,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.465385,VALIDent,6.309432,VALIDppx,79.310048,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.463462,VALIDent,6.207847,VALIDppx,73.917631,words/sec,0 10 | Iter,9,Alpha,0.066667,VALIDacc,0.480769,VALIDent,6.363325,VALIDppx,82.328806,words/sec,0 11 | Iter,10,Alpha,0.044444,VALIDacc,0.480769,VALIDent,6.218611,VALIDppx,74.471202,words/sec,0 12 | Iter,11,Alpha,0.029630,VALIDacc,0.448077,VALIDent,6.202133,VALIDppx,73.625445,words/sec,0 13 | Iter,12,Alpha,0.019753,VALIDacc,0.488462,VALIDent,6.018031,VALIDppx,64.804906,words/sec,0 14 | Iter,13,Alpha,0.013169,VALIDacc,0.500000,VALIDent,6.023851,VALIDppx,65.066880,words/sec,0 15 | Iter,14,Alpha,0.008779,VALIDacc,0.475000,VALIDent,6.022283,VALIDppx,64.996158,words/sec,0 16 | Iter,15,Alpha,0.005853,VALIDacc,0.484615,VALIDent,5.998284,VALIDppx,63.923929,words/sec,0 17 | Iter,16,Alpha,0.003902,VALIDacc,0.471154,VALIDent,6.539653,VALIDppx,93.031885,words/sec,0 18 | Iter,17,Alpha,0.002601,VALIDacc,0.500000,VALIDent,5.929428,VALIDppx,60.944683,words/sec,0 19 | Iter,18,Alpha,0.001734,VALIDacc,0.478846,VALIDent,5.977251,VALIDppx,62.998728,words/sec,0 20 | Iter,19,Alpha,0.001156,VALIDacc,0.475000,VALIDent,5.935953,VALIDppx,61.220937,words/sec,0 21 | Iter,20,Alpha,0.000771,VALIDacc,0.473077,VALIDent,5.903441,VALIDppx,59.856688,words/sec,0 22 | Iter,21,Alpha,0.000514,VALIDacc,0.484615,VALIDent,5.940223,VALIDppx,61.402409,words/sec,0 23 | Iter,22,Alpha,0.000343,VALIDacc,0.476923,VALIDent,5.898638,VALIDppx,59.657756,words/sec,0 24 | Iter,23,Alpha,0.000228,VALIDacc,0.478846,VALIDent,5.915796,VALIDppx,60.371503,words/sec,0 25 | Iter,24,Alpha,0.000152,VALIDacc,0.482692,VALIDent,5.948535,VALIDppx,61.757201,words/sec,0 26 | Iter,25,Alpha,0.000101,VALIDacc,0.488462,VALIDent,5.909594,VALIDppx,60.112520,words/sec,0 27 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h50_c250_m1000_d4_b5_indep_.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.328846,VALIDent,6.935155,VALIDppx,122.374157,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.351923,VALIDent,6.796876,VALIDppx,111.189432,words/sec,0 3 | 
Iter,2,Alpha,0.100000,VALIDacc,0.353846,VALIDent,6.741020,VALIDppx,106.966833,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.359615,VALIDent,6.714073,VALIDppx,104.987435,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.367308,VALIDent,6.701541,VALIDppx,104.079425,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.380769,VALIDent,6.697558,VALIDppx,103.792480,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.699487,VALIDppx,103.931357,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.705390,VALIDppx,104.357449,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.714212,VALIDppx,104.997541,words/sec,0 10 | Iter,9,Alpha,0.066667,VALIDacc,0.392308,VALIDent,6.642499,VALIDppx,99.905962,words/sec,0 11 | Iter,10,Alpha,0.044444,VALIDacc,0.392308,VALIDent,6.590915,VALIDppx,96.396884,words/sec,0 12 | Iter,11,Alpha,0.029630,VALIDacc,0.394231,VALIDent,6.556514,VALIDppx,94.125512,words/sec,0 13 | Iter,12,Alpha,0.019753,VALIDacc,0.392308,VALIDent,6.531144,VALIDppx,92.484807,words/sec,0 14 | Iter,13,Alpha,0.013169,VALIDacc,0.390385,VALIDent,6.511226,VALIDppx,91.216690,words/sec,0 15 | Iter,14,Alpha,0.008779,VALIDacc,0.390385,VALIDent,6.495026,VALIDppx,90.198183,words/sec,0 16 | Iter,15,Alpha,0.005853,VALIDacc,0.380769,VALIDent,6.481460,VALIDppx,89.353969,words/sec,0 17 | Iter,16,Alpha,0.003902,VALIDacc,0.388462,VALIDent,6.469027,VALIDppx,88.587267,words/sec,0 18 | Iter,17,Alpha,0.002601,VALIDacc,0.386538,VALIDent,6.457031,VALIDppx,87.853664,words/sec,0 19 | Iter,18,Alpha,0.001734,VALIDacc,0.392308,VALIDent,6.446472,VALIDppx,87.213064,words/sec,0 20 | Iter,19,Alpha,0.001156,VALIDacc,0.394231,VALIDent,6.438324,VALIDppx,86.721897,words/sec,0 21 | Iter,20,Alpha,0.000771,VALIDacc,0.394231,VALIDent,6.432292,VALIDppx,86.360061,words/sec,0 22 | Iter,21,Alpha,0.000514,VALIDacc,0.394231,VALIDent,6.427471,VALIDppx,86.071938,words/sec,0 23 | Iter,22,Alpha,0.000343,VALIDacc,0.396154,VALIDent,6.422642,VALIDppx,85.784325,words/sec,0 24 | Iter,23,Alpha,0.000228,VALIDacc,0.398077,VALIDent,6.417652,VALIDppx,85.488138,words/sec,0 25 | Iter,24,Alpha,0.000152,VALIDacc,0.401923,VALIDent,6.413943,VALIDppx,85.268628,words/sec,0 26 | Iter,25,Alpha,0.000101,VALIDacc,0.400000,VALIDent,6.412369,VALIDppx,85.175656,words/sec,0 27 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h200_c250_m1000_d4_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.332692,VALIDent,7.207843,VALIDppx,147.834853,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.344231,VALIDent,7.012280,VALIDppx,129.094183,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.355769,VALIDent,6.934988,VALIDppx,122.359987,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.357692,VALIDent,6.891659,VALIDppx,118.739717,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.357692,VALIDent,6.869177,VALIDppx,116.903732,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.375000,VALIDent,6.858217,VALIDppx,116.018969,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.378846,VALIDent,6.855655,VALIDppx,115.813095,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.857846,VALIDppx,115.989114,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.862257,VALIDppx,116.344320,words/sec,0 10 | Iter,9,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.868663,VALIDppx,116.862093,words/sec,0 11 | Iter,10,Alpha,0.100000,VALIDacc,0.392308,VALIDent,6.877193,VALIDppx,117.555111,words/sec,0 12 | 
Iter,11,Alpha,0.100000,VALIDacc,0.388462,VALIDent,6.887047,VALIDppx,118.360766,words/sec,0 13 | Iter,12,Alpha,0.066667,VALIDacc,0.390385,VALIDent,6.789573,VALIDppx,110.628010,words/sec,0 14 | Iter,13,Alpha,0.044444,VALIDacc,0.390385,VALIDent,6.724826,VALIDppx,105.772854,words/sec,0 15 | Iter,14,Alpha,0.029630,VALIDacc,0.390385,VALIDent,6.676831,VALIDppx,102.311930,words/sec,0 16 | Iter,15,Alpha,0.019753,VALIDacc,0.386538,VALIDent,6.640517,VALIDppx,99.768782,words/sec,0 17 | Iter,16,Alpha,0.013169,VALIDacc,0.382692,VALIDent,6.612376,VALIDppx,97.841573,words/sec,0 18 | Iter,17,Alpha,0.008779,VALIDacc,0.382692,VALIDent,6.589223,VALIDppx,96.283933,words/sec,0 19 | Iter,18,Alpha,0.005853,VALIDacc,0.382692,VALIDent,6.569262,VALIDppx,94.960907,words/sec,0 20 | Iter,19,Alpha,0.003902,VALIDacc,0.386538,VALIDent,6.553112,VALIDppx,93.903810,words/sec,0 21 | Iter,20,Alpha,0.002601,VALIDacc,0.378846,VALIDent,6.538522,VALIDppx,92.958927,words/sec,0 22 | Iter,21,Alpha,0.001734,VALIDacc,0.384615,VALIDent,6.526329,VALIDppx,92.176609,words/sec,0 23 | Iter,22,Alpha,0.001156,VALIDacc,0.386538,VALIDent,6.516917,VALIDppx,91.577251,words/sec,0 24 | Iter,23,Alpha,0.000771,VALIDacc,0.388462,VALIDent,6.509623,VALIDppx,91.115420,words/sec,0 25 | Iter,24,Alpha,0.000514,VALIDacc,0.388462,VALIDent,6.504032,VALIDppx,90.762946,words/sec,0 26 | Iter,25,Alpha,0.000343,VALIDacc,0.388462,VALIDent,6.499585,VALIDppx,90.483618,words/sec,0 27 | Iter,26,Alpha,0.000228,VALIDacc,0.392308,VALIDent,6.495298,VALIDppx,90.215141,words/sec,0 28 | Iter,27,Alpha,0.000152,VALIDacc,0.386538,VALIDent,6.491204,VALIDppx,89.959497,words/sec,0 29 | Iter,28,Alpha,0.000101,VALIDacc,0.392308,VALIDent,6.488327,VALIDppx,89.780284,words/sec,0 30 | -------------------------------------------------------------------------------- /logs/GutenbergHolmes_seq_mw5_h100_c250_m1000_d4_b5.acc: -------------------------------------------------------------------------------- 1 | Iter,0,Alpha,0.100000,VALIDacc,0.334615,VALIDent,7.038659,VALIDppx,131.476297,words/sec,0 2 | Iter,1,Alpha,0.100000,VALIDacc,0.346154,VALIDent,6.874015,VALIDppx,117.296426,words/sec,0 3 | Iter,2,Alpha,0.100000,VALIDacc,0.359615,VALIDent,6.805409,VALIDppx,111.849017,words/sec,0 4 | Iter,3,Alpha,0.100000,VALIDacc,0.363462,VALIDent,6.771734,VALIDppx,109.268546,words/sec,0 5 | Iter,4,Alpha,0.100000,VALIDacc,0.373077,VALIDent,6.755499,VALIDppx,108.045793,words/sec,0 6 | Iter,5,Alpha,0.100000,VALIDacc,0.375000,VALIDent,6.749179,VALIDppx,107.573495,words/sec,0 7 | Iter,6,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.748471,VALIDppx,107.520716,words/sec,0 8 | Iter,7,Alpha,0.100000,VALIDacc,0.384615,VALIDent,6.751653,VALIDppx,107.758162,words/sec,0 9 | Iter,8,Alpha,0.100000,VALIDacc,0.392308,VALIDent,6.758135,VALIDppx,108.243407,words/sec,0 10 | Iter,9,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.767457,VALIDppx,108.945080,words/sec,0 11 | Iter,10,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.778716,VALIDppx,109.798622,words/sec,0 12 | Iter,11,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.791265,VALIDppx,110.757814,words/sec,0 13 | Iter,12,Alpha,0.100000,VALIDacc,0.398077,VALIDent,6.805175,VALIDppx,111.830860,words/sec,0 14 | Iter,13,Alpha,0.100000,VALIDacc,0.394231,VALIDent,6.820241,VALIDppx,113.004888,words/sec,0 15 | Iter,14,Alpha,0.066667,VALIDacc,0.396154,VALIDent,6.754421,VALIDppx,107.965108,words/sec,0 16 | Iter,15,Alpha,0.044444,VALIDacc,0.396154,VALIDent,6.705283,VALIDppx,104.349745,words/sec,0 17 | Iter,16,Alpha,0.029630,VALIDacc,0.394231,VALIDent,6.670727,VALIDppx,101.880015,words/sec,0 18 | 
Iter,17,Alpha,0.019753,VALIDacc,0.394231,VALIDent,6.645158,VALIDppx,100.090310,words/sec,0 19 | Iter,18,Alpha,0.013169,VALIDacc,0.396154,VALIDent,6.623484,VALIDppx,98.597800,words/sec,0 20 | Iter,19,Alpha,0.008779,VALIDacc,0.388462,VALIDent,6.603299,VALIDppx,97.227954,words/sec,0 21 | Iter,20,Alpha,0.005853,VALIDacc,0.390385,VALIDent,6.583628,VALIDppx,95.911262,words/sec,0 22 | Iter,21,Alpha,0.003902,VALIDacc,0.390385,VALIDent,6.566929,VALIDppx,94.807458,words/sec,0 23 | Iter,22,Alpha,0.002601,VALIDacc,0.386538,VALIDent,6.555035,VALIDppx,94.029082,words/sec,0 24 | Iter,23,Alpha,0.001734,VALIDacc,0.388462,VALIDent,6.545795,VALIDppx,93.428741,words/sec,0 25 | Iter,24,Alpha,0.001156,VALIDacc,0.388462,VALIDent,6.538449,VALIDppx,92.954264,words/sec,0 26 | Iter,25,Alpha,0.000771,VALIDacc,0.388462,VALIDent,6.532582,VALIDppx,92.576978,words/sec,0 27 | Iter,26,Alpha,0.000514,VALIDacc,0.390385,VALIDent,6.527838,VALIDppx,92.273109,words/sec,0 28 | Iter,27,Alpha,0.000343,VALIDacc,0.392308,VALIDent,6.523325,VALIDppx,91.984900,words/sec,0 29 | Iter,28,Alpha,0.000228,VALIDacc,0.390385,VALIDent,6.518669,VALIDppx,91.688531,words/sec,0 30 | Iter,29,Alpha,0.000152,VALIDacc,0.392308,VALIDent,6.514971,VALIDppx,91.453782,words/sec,0 31 | Iter,30,Alpha,0.000101,VALIDacc,0.388462,VALIDent,6.513047,VALIDppx,91.331887,words/sec,0 32 | -------------------------------------------------------------------------------- /books/test.labels: -------------------------------------------------------------------------------- 1 | 2 2 | 4 3 | 3 4 | 1 5 | 0 6 | 3 7 | 0 8 | 4 9 | 2 10 | 2 11 | 2 12 | 2 13 | 1 14 | 3 15 | 1 16 | 4 17 | 1 18 | 1 19 | 1 20 | 0 21 | 2 22 | 0 23 | 4 24 | 3 25 | 4 26 | 2 27 | 0 28 | 3 29 | 2 30 | 0 31 | 2 32 | 0 33 | 4 34 | 0 35 | 2 36 | 2 37 | 2 38 | 2 39 | 2 40 | 3 41 | 0 42 | 2 43 | 3 44 | 4 45 | 1 46 | 0 47 | 3 48 | 1 49 | 1 50 | 2 51 | 3 52 | 2 53 | 0 54 | 0 55 | 3 56 | 1 57 | 2 58 | 4 59 | 2 60 | 0 61 | 4 62 | 2 63 | 2 64 | 1 65 | 4 66 | 1 67 | 4 68 | 0 69 | 1 70 | 4 71 | 4 72 | 0 73 | 3 74 | 2 75 | 3 76 | 3 77 | 1 78 | 1 79 | 3 80 | 2 81 | 3 82 | 0 83 | 1 84 | 0 85 | 4 86 | 0 87 | 3 88 | 1 89 | 2 90 | 4 91 | 1 92 | 4 93 | 1 94 | 1 95 | 1 96 | 1 97 | 4 98 | 2 99 | 2 100 | 4 101 | 2 102 | 1 103 | 4 104 | 3 105 | 4 106 | 0 107 | 0 108 | 3 109 | 4 110 | 0 111 | 1 112 | 3 113 | 3 114 | 4 115 | 3 116 | 2 117 | 2 118 | 1 119 | 0 120 | 4 121 | 4 122 | 0 123 | 1 124 | 2 125 | 1 126 | 1 127 | 2 128 | 0 129 | 3 130 | 4 131 | 2 132 | 2 133 | 1 134 | 3 135 | 2 136 | 4 137 | 0 138 | 3 139 | 0 140 | 0 141 | 1 142 | 3 143 | 2 144 | 2 145 | 0 146 | 4 147 | 1 148 | 1 149 | 0 150 | 1 151 | 2 152 | 0 153 | 4 154 | 4 155 | 0 156 | 1 157 | 2 158 | 2 159 | 1 160 | 4 161 | 3 162 | 0 163 | 4 164 | 1 165 | 4 166 | 1 167 | 0 168 | 0 169 | 3 170 | 0 171 | 2 172 | 3 173 | 0 174 | 4 175 | 4 176 | 2 177 | 4 178 | 3 179 | 1 180 | 0 181 | 4 182 | 2 183 | 1 184 | 2 185 | 1 186 | 1 187 | 3 188 | 1 189 | 0 190 | 1 191 | 0 192 | 4 193 | 3 194 | 3 195 | 1 196 | 4 197 | 1 198 | 0 199 | 1 200 | 4 201 | 0 202 | 0 203 | 0 204 | 3 205 | 2 206 | 4 207 | 2 208 | 0 209 | 1 210 | 1 211 | 2 212 | 1 213 | 2 214 | 3 215 | 2 216 | 2 217 | 4 218 | 2 219 | 1 220 | 4 221 | 0 222 | 0 223 | 4 224 | 4 225 | 0 226 | 4 227 | 3 228 | 0 229 | 2 230 | 4 231 | 1 232 | 0 233 | 4 234 | 0 235 | 2 236 | 0 237 | 3 238 | 0 239 | 1 240 | 3 241 | 3 242 | 2 243 | 1 244 | 1 245 | 2 246 | 4 247 | 1 248 | 4 249 | 4 250 | 4 251 | 1 252 | 4 253 | 2 254 | 0 255 | 3 256 | 3 257 | 4 258 | 1 259 | 0 260 | 4 261 | 3 262 | 0 263 | 2 264 | 1 265 | 0 266 | 4 267 | 2 268 | 3 269 | 
3 270 | 1 271 | 2 272 | 0 273 | 0 274 | 3 275 | 3 276 | 0 277 | 2 278 | 4 279 | 0 280 | 2 281 | 3 282 | 1 283 | 0 284 | 1 285 | 2 286 | 4 287 | 1 288 | 0 289 | 4 290 | 1 291 | 1 292 | 1 293 | 0 294 | 4 295 | 2 296 | 0 297 | 2 298 | 1 299 | 3 300 | 0 301 | 0 302 | 3 303 | 1 304 | 0 305 | 3 306 | 2 307 | 3 308 | 1 309 | 3 310 | 4 311 | 3 312 | 4 313 | 3 314 | 3 315 | 2 316 | 0 317 | 3 318 | 0 319 | 0 320 | 0 321 | 0 322 | 0 323 | 3 324 | 4 325 | 2 326 | 0 327 | 3 328 | 2 329 | 1 330 | 1 331 | 0 332 | 4 333 | 2 334 | 0 335 | 3 336 | 1 337 | 4 338 | 0 339 | 4 340 | 0 341 | 3 342 | 4 343 | 3 344 | 2 345 | 2 346 | 4 347 | 4 348 | 3 349 | 3 350 | 1 351 | 2 352 | 4 353 | 0 354 | 2 355 | 4 356 | 2 357 | 0 358 | 3 359 | 2 360 | 3 361 | 3 362 | 2 363 | 2 364 | 2 365 | 2 366 | 3 367 | 2 368 | 2 369 | 0 370 | 0 371 | 2 372 | 4 373 | 2 374 | 0 375 | 0 376 | 4 377 | 0 378 | 2 379 | 4 380 | 3 381 | 1 382 | 2 383 | 2 384 | 0 385 | 1 386 | 3 387 | 2 388 | 2 389 | 3 390 | 0 391 | 3 392 | 4 393 | 4 394 | 4 395 | 0 396 | 3 397 | 1 398 | 0 399 | 2 400 | 3 401 | 3 402 | 4 403 | 4 404 | 0 405 | 4 406 | 0 407 | 3 408 | 1 409 | 1 410 | 4 411 | 1 412 | 4 413 | 3 414 | 4 415 | 3 416 | 1 417 | 2 418 | 0 419 | 3 420 | 1 421 | 1 422 | 0 423 | 4 424 | 1 425 | 1 426 | 1 427 | 4 428 | 0 429 | 4 430 | 1 431 | 0 432 | 1 433 | 0 434 | 0 435 | 1 436 | 1 437 | 1 438 | 3 439 | 3 440 | 0 441 | 4 442 | 3 443 | 2 444 | 1 445 | 0 446 | 2 447 | 3 448 | 3 449 | 2 450 | 3 451 | 4 452 | 1 453 | 1 454 | 3 455 | 1 456 | 4 457 | 0 458 | 4 459 | 0 460 | 4 461 | 2 462 | 1 463 | 1 464 | 0 465 | 1 466 | 0 467 | 3 468 | 1 469 | 2 470 | 0 471 | 1 472 | 3 473 | 1 474 | 3 475 | 0 476 | 2 477 | 1 478 | 1 479 | 4 480 | 4 481 | 1 482 | 0 483 | 1 484 | 2 485 | 3 486 | 2 487 | 3 488 | 4 489 | 0 490 | 4 491 | 1 492 | 1 493 | 0 494 | 0 495 | 1 496 | 4 497 | 3 498 | 3 499 | 3 500 | 4 501 | 1 502 | 0 503 | 2 504 | 4 505 | 4 506 | 1 507 | 0 508 | 4 509 | 4 510 | 1 511 | 4 512 | 3 513 | 1 514 | 2 515 | 3 516 | 1 517 | 4 518 | 1 519 | 0 520 | 2 521 | -------------------------------------------------------------------------------- /books/valid.labels: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | 3 4 | 4 5 | 3 6 | 3 7 | 3 8 | 1 9 | 3 10 | 0 11 | 1 12 | 4 13 | 2 14 | 0 15 | 3 16 | 4 17 | 1 18 | 0 19 | 0 20 | 0 21 | 4 22 | 2 23 | 3 24 | 4 25 | 4 26 | 0 27 | 0 28 | 2 29 | 2 30 | 4 31 | 3 32 | 0 33 | 0 34 | 0 35 | 1 36 | 4 37 | 4 38 | 0 39 | 4 40 | 4 41 | 3 42 | 4 43 | 0 44 | 1 45 | 1 46 | 2 47 | 3 48 | 2 49 | 2 50 | 1 51 | 1 52 | 2 53 | 2 54 | 1 55 | 3 56 | 3 57 | 4 58 | 4 59 | 0 60 | 1 61 | 1 62 | 2 63 | 4 64 | 3 65 | 3 66 | 0 67 | 4 68 | 1 69 | 4 70 | 4 71 | 2 72 | 4 73 | 2 74 | 3 75 | 2 76 | 4 77 | 4 78 | 4 79 | 2 80 | 1 81 | 1 82 | 1 83 | 3 84 | 4 85 | 0 86 | 2 87 | 2 88 | 2 89 | 4 90 | 3 91 | 3 92 | 0 93 | 3 94 | 2 95 | 4 96 | 3 97 | 4 98 | 4 99 | 4 100 | 0 101 | 0 102 | 3 103 | 4 104 | 1 105 | 4 106 | 2 107 | 2 108 | 3 109 | 1 110 | 0 111 | 1 112 | 0 113 | 0 114 | 2 115 | 1 116 | 2 117 | 4 118 | 4 119 | 4 120 | 4 121 | 4 122 | 3 123 | 2 124 | 3 125 | 3 126 | 3 127 | 4 128 | 1 129 | 2 130 | 4 131 | 1 132 | 1 133 | 0 134 | 3 135 | 1 136 | 2 137 | 2 138 | 1 139 | 2 140 | 1 141 | 0 142 | 3 143 | 2 144 | 2 145 | 3 146 | 3 147 | 2 148 | 2 149 | 1 150 | 4 151 | 4 152 | 4 153 | 1 154 | 4 155 | 2 156 | 2 157 | 4 158 | 4 159 | 1 160 | 3 161 | 0 162 | 2 163 | 0 164 | 4 165 | 3 166 | 1 167 | 3 168 | 4 169 | 0 170 | 3 171 | 0 172 | 4 173 | 0 174 | 2 175 | 2 176 | 1 177 | 0 178 | 1 179 | 3 180 | 3 181 | 4 182 | 3 183 | 3 
184 | 4 185 | 2 186 | 4 187 | 0 188 | 3 189 | 2 190 | 4 191 | 1 192 | 0 193 | 0 194 | 4 195 | 2 196 | 2 197 | 4 198 | 4 199 | 0 200 | 3 201 | 0 202 | 2 203 | 3 204 | 4 205 | 0 206 | 0 207 | 1 208 | 2 209 | 2 210 | 4 211 | 2 212 | 3 213 | 1 214 | 2 215 | 1 216 | 3 217 | 4 218 | 2 219 | 1 220 | 3 221 | 4 222 | 4 223 | 4 224 | 2 225 | 0 226 | 0 227 | 2 228 | 2 229 | 0 230 | 4 231 | 2 232 | 4 233 | 2 234 | 0 235 | 1 236 | 3 237 | 0 238 | 4 239 | 1 240 | 2 241 | 2 242 | 3 243 | 3 244 | 3 245 | 1 246 | 2 247 | 1 248 | 1 249 | 1 250 | 3 251 | 1 252 | 4 253 | 2 254 | 3 255 | 1 256 | 3 257 | 3 258 | 4 259 | 1 260 | 0 261 | 3 262 | 2 263 | 2 264 | 1 265 | 2 266 | 2 267 | 1 268 | 4 269 | 0 270 | 3 271 | 2 272 | 3 273 | 0 274 | 3 275 | 2 276 | 2 277 | 4 278 | 4 279 | 0 280 | 0 281 | 4 282 | 3 283 | 4 284 | 0 285 | 0 286 | 3 287 | 2 288 | 0 289 | 2 290 | 1 291 | 1 292 | 0 293 | 4 294 | 4 295 | 1 296 | 0 297 | 2 298 | 1 299 | 4 300 | 1 301 | 3 302 | 2 303 | 0 304 | 2 305 | 2 306 | 1 307 | 4 308 | 1 309 | 4 310 | 4 311 | 3 312 | 0 313 | 4 314 | 4 315 | 2 316 | 2 317 | 3 318 | 1 319 | 2 320 | 3 321 | 1 322 | 1 323 | 4 324 | 4 325 | 4 326 | 4 327 | 1 328 | 0 329 | 2 330 | 1 331 | 1 332 | 0 333 | 1 334 | 4 335 | 0 336 | 1 337 | 3 338 | 2 339 | 4 340 | 0 341 | 4 342 | 3 343 | 4 344 | 2 345 | 0 346 | 1 347 | 1 348 | 2 349 | 1 350 | 2 351 | 0 352 | 3 353 | 0 354 | 2 355 | 3 356 | 3 357 | 2 358 | 1 359 | 2 360 | 1 361 | 2 362 | 3 363 | 1 364 | 1 365 | 0 366 | 3 367 | 0 368 | 4 369 | 2 370 | 3 371 | 1 372 | 4 373 | 2 374 | 3 375 | 1 376 | 2 377 | 0 378 | 4 379 | 4 380 | 4 381 | 1 382 | 4 383 | 0 384 | 0 385 | 2 386 | 4 387 | 2 388 | 3 389 | 2 390 | 0 391 | 4 392 | 4 393 | 4 394 | 3 395 | 4 396 | 3 397 | 4 398 | 3 399 | 3 400 | 4 401 | 0 402 | 3 403 | 0 404 | 4 405 | 3 406 | 0 407 | 0 408 | 2 409 | 2 410 | 2 411 | 3 412 | 2 413 | 0 414 | 1 415 | 2 416 | 2 417 | 3 418 | 2 419 | 4 420 | 3 421 | 0 422 | 0 423 | 2 424 | 2 425 | 4 426 | 2 427 | 1 428 | 0 429 | 1 430 | 3 431 | 2 432 | 1 433 | 4 434 | 3 435 | 4 436 | 4 437 | 3 438 | 2 439 | 0 440 | 3 441 | 0 442 | 1 443 | 1 444 | 4 445 | 1 446 | 2 447 | 3 448 | 2 449 | 3 450 | 4 451 | 0 452 | 4 453 | 3 454 | 3 455 | 4 456 | 3 457 | 3 458 | 1 459 | 2 460 | 2 461 | 0 462 | 0 463 | 3 464 | 2 465 | 3 466 | 2 467 | 3 468 | 3 469 | 4 470 | 3 471 | 2 472 | 3 473 | 3 474 | 4 475 | 3 476 | 4 477 | 1 478 | 1 479 | 3 480 | 2 481 | 3 482 | 4 483 | 1 484 | 0 485 | 2 486 | 4 487 | 0 488 | 4 489 | 1 490 | 2 491 | 4 492 | 1 493 | 1 494 | 1 495 | 2 496 | 0 497 | 1 498 | 1 499 | 3 500 | 3 501 | 1 502 | 1 503 | 1 504 | 3 505 | 3 506 | 3 507 | 3 508 | 1 509 | 2 510 | 4 511 | 4 512 | 4 513 | 1 514 | 3 515 | 1 516 | 0 517 | 3 518 | 2 519 | 3 520 | 1 521 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/CommandLineParser.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #include 8 | #include 9 | #include 10 | #include "CommandLineParser.h" 11 | 12 | using namespace std; 13 | 14 | 15 | /** 16 | * Get a command line argument 17 | */ 18 | bool CommandLineParser::Get(string name, int &value) { 19 | if (args.find(name) == args.end()) { 20 | cout << name << " must be registered as parameter\n"; 21 | return false; 22 | } 23 | CommandLineArgument a = args[name]; 24 | value = atoi(a.m_value.c_str()); 25 | return true; 26 | 
} 27 | 28 | 29 | /** 30 | * Get a command line argument 31 | */ 32 | bool CommandLineParser::Get(string name, double &value) { 33 | if (args.find(name) == args.end()) { 34 | cout << name << " must be registered as a parameter\n"; 35 | return false; 36 | } 37 | CommandLineArgument a = args[name]; 38 | value = atof(a.m_value.c_str()); 39 | return true; 40 | } 41 | 42 | 43 | /** 44 | * Get a command line argument 45 | */ 46 | bool CommandLineParser::Get(string name, string &value) { 47 | if (args.find(name) == args.end()) { 48 | cout << name << " must be registered as a parameter\n"; 49 | return false; 50 | } 51 | CommandLineArgument a = args[name]; 52 | value = a.m_value; 53 | return (!value.empty()); 54 | } 55 | 56 | 57 | /** 58 | * Get a command line argument 59 | */ 60 | bool CommandLineParser::Get(string name, bool &value) { 61 | if (args.find(name) == args.end()) { 62 | cout << name << " must be registered as a parameter\n"; 63 | return false; 64 | } 65 | CommandLineArgument a = args[name]; 66 | value = (a.m_value.compare("true") == 0); 67 | return true; 68 | } 69 | 70 | 71 | /** 72 | * Get a command line argument 73 | */ 74 | bool CommandLineParser::Get(string name, long long &value) { 75 | if (args.find(name) == args.end()) { 76 | cout << name << " must be registered as a parameter\n"; 77 | return false; 78 | } 79 | CommandLineArgument a = args[name]; 80 | value = (long long)atoll(a.m_value.c_str()); 81 | return true; 82 | } 83 | 84 | 85 | /** 86 | * Parse the arguments to extract their values and store them in the map 87 | */ 88 | bool CommandLineParser::Parse(char *list[], int llen) { 89 | if (llen == 1) { 90 | // Show the arguments 91 | cout << "Usage: " << list[0] << "\n"; 92 | for (map<string, CommandLineArgument>::iterator mi = args.begin(); 93 | mi != args.end(); 94 | mi++) { 95 | if (!mi->second.m_isRequired) { 96 | cout << "[-" << mi->first << " (" 97 | << mi->second.m_type << ": " 98 | << mi->second.m_value << ")]: " << mi->second.m_description << "\n"; 99 | } else { 100 | cout << "-" << mi->first << " (" 101 | << mi->second.m_type << "): " 102 | << mi->second.m_description << "\n"; 103 | } 104 | } 105 | return false; 106 | } 107 | 108 | if ((llen % 2) == 0) { 109 | cout << "Command line must have an even number of arguments\n"; 110 | cout << "Check argument structure\n"; 111 | return false; 112 | } 113 | 114 | // List of seen arguments 115 | set<string> seen; 116 | for (int i = 1; i < llen; i += 2) { 117 | if (list[i][0] != '-') { 118 | cout << "Argument names must begin with -\n"; 119 | cout << "Saw: " << list[i] << endl; 120 | return false; 121 | } 122 | string aname(&list[i][1]); 123 | if (args.find(aname) == args.end()) { 124 | cout << "Unknown parameter on command line: " << aname << endl; 125 | return false; 126 | } 127 | args[aname].m_value = string(list[i+1]); 128 | seen.insert(aname); 129 | } 130 | 131 | // Check that the required arguments have been seen 132 | for (map<string, CommandLineArgument>::iterator mi = args.begin(); 133 | mi != args.end(); 134 | mi++) { 135 | if (mi->second.m_isRequired && !seen.count(mi->first)) { 136 | cout << "Required argument " << mi->first << " not set on command line\n"; 137 | return false; 138 | } 139 | } 140 | return true; 141 | } 142 |
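// Editorial note: a minimal, hypothetical usage sketch of this parser
// (the flag names below are illustrative; see the README for the real ones):
//
//   CommandLineParser parser;
//   parser.Register("rnnlm", "string", "RNN model file", "", true);
//   parser.Register("debug", "bool", "Debugging mode", "false");
//   if (parser.Parse(argv, argc)) {
//     std::string modelFile;
//     parser.Get("rnnlm", modelFile);
//     bool debugMode = false;
//     parser.Get("debug", debugMode);
//   }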
-------------------------------------------------------------------------------- /results.txt: -------------------------------------------------------------------------------- 1 | Sequential data, plain RNN: 2 | minimum 5 word occurrence 3 | V = 72846 words 4 | 250 classes 5 | 6 | 7 | No n-gram connections 8 | BPTT(5) every 1 step 9 | 50 hidden: 29.615385% (dev), 28.076923% (test) 10 | 100 hidden: 30.0000% (dev), 30.0000% (test) 11 | 200 hidden: 30.000000% (dev), 30.384615% (test) 12 | 300 hidden: 30.576923% (dev), 28.461538% (test) 13 | 14 | 15 | 1000MB of 2-gram connections 16 | BPTT(5) every 1 step 17 | 50 hidden: 29.615385% (dev), 30.769231% (test) 18 | 100 hidden: 28.653846% (dev), 30.384615% (test) 19 | 200 hidden: 30.000000% (dev), 28.653846% (test) 20 | 300 hidden: IN PROGRESS 21 | 22 | 23 | 1000MB of 3-gram connections 24 | BPTT(5) every 1 step 25 | 50 hidden: 39.2308% (dev), 40.769231% (test) 26 | 100 hidden: 39.423077% (dev), 40.576923% (test) 27 | 200 hidden: 38.846154% (dev), 40.192308% (test) 28 | 300 hidden: IN PROGRESS 29 | 30 | 31 | 1000MB of 4-gram connections 32 | BPTT(5) every 1 step 33 | 50 hidden: 40.192308% (dev), 42.307692% (test) 34 | 100 hidden: 40.576923% (dev), 41.153846% (test) 35 | 200 hidden: 40.000000% (dev), 40.384615% (test) 36 | 300 hidden: 40.192308% (dev), 39.230769% (test) 37 | 38 | 39 | 1000MB of 5-gram connections 40 | BPTT(5) every 1 step 41 | 50 hidden: 39.807692% (dev), 39.807692% (test) 42 | 100 hidden: 39.423077% (dev), 39.615385% (test) 43 | 200 hidden: TODO??? 44 | 300 hidden: TODO??? 45 | 46 | 47 | 1000MB of 3-gram connections 48 | BPTT(10) every 1 step 49 | 100 hidden: 39.423077% (dev), 40.576923% (test) 50 | -> not interesting 51 | 52 | 53 | 1000MB of 5-gram connections 54 | BPTT(10) every 1 step 55 | 50 hidden: 39.807692% (dev), 39.807692% (test) 56 | -> not interesting 57 | 58 | 59 | 2000MB of 3-gram connections 60 | BPTT(5) every 1 step 61 | 100 hidden: 39.423077% (dev), 40.384615% (test) 62 | 200 hidden: 39.423077% (dev), 40.384615% (test) 63 | -> ran into memory problems all the time 64 | 65 | 66 | 67 | Dependency Tree RNN: 68 | minimum 5 word occurrence 69 | V = 67824 words, 44 labels 70 | 250 classes 71 | 72 | labels as features (p2) 73 | 1000MB of 3-gram connections 74 | BPTT_ORDER=5 75 | 200 hidden: 50.576923% (dev), 50.000000% (test) 76 | 77 | labels as features (p2) 78 | 2000MB of 4-gram connections 79 | BPTT_ORDER=5 80 | 50 hidden 81 | gamma=0: 48.846154% (dev), 46.730769% (test) 82 | gamma=0.1: 47.115385% (dev), 48.269231% (test) 83 | gamma=0.5: 46.730769% (dev), 47.115385% (test) 84 | gamma=0.9: 47.307692% (dev), 46.730769% (test) 85 | gamma=1.0: 47.307692% (dev), 46.730769% (test) 86 | gamma=1.1: 48.076923% (dev), 45.576923% (test) 87 | 88 | labels as features (p2) 89 | 2000MB of 4-gram connections 90 | BPTT_ORDER=5 91 | 100 hidden 92 | gamma=0: TODO 93 | gamma=0.1: 49.807692% (dev), 49.807692% (test) 94 | gamma=0.5: 50.000000% (dev), 49.615385% (test) 95 | gamma=0.9: 49.230769% (dev), 47.500000% (test) 96 | gamma=1.0: 47.692308% (dev), 47.500000% (test) 97 | gamma=1.1: 48.653846% (dev), 48.076923% (test) 98 | 99 | labels as features (p2) 100 | 2000MB of 3-gram connections 101 | BPTT_ORDER=5 102 | gamma=0.5 103 | 50 hidden: 48.653846% (dev), 48.846154% (test) 104 | 100 hidden: 51.538462% (dev), 49.615385% (test) 105 | 200 hidden: 51.730769% (dev), 50.000000% (test) 106 | 300 hidden: TODO 107 | 108 | 109 | labels as features (p2) 110 | 2000MB of 4-gram connections 111 | BPTT_ORDER=5 112 | gamma=0.5 113 | 50 hidden: 46.730769% (dev), 47.115385% (test) 114 | 100 hidden: 50.000000% (dev), 49.615385% (test) 115 | 200 hidden: 49.807692% (dev), 49.615385% (test) 116 | ensemble of these: 52.6923076923% (dev), 51.7307692308% (test) 117 | 300 hidden: IN PROGRESS 118 | 119 | no labels (p0) 120 | 2000MB of 4-gram connections 121 | BPTT_ORDER=5 122 | 50 hidden: 52.692308% (dev), 48.846154% (test)
123 | 100 hidden: 54.038462% (dev), 51.346154% (test) 124 | 200 hidden: 52.692308% (dev), 50.769231% (test) 125 | 300 hidden: 126 | 127 | no labels (p0) 128 | 2000MB of 3-gram connections 129 | BPTT_ORDER=5 130 | 50 hidden: 53.269231% (dev), 51.923077% (test) 131 | 100 hidden: 54.230769% (dev), 52.692308% (test) 132 | 200 hidden: 54.230769% (dev), 51.923077% (test) 133 | 300 hidden: 53.269231% (dev), 52.884615% (test) 134 | 135 | no labels (p0) 136 | 2000MB of 2-gram connections 137 | BPTT_ORDER=5 138 | 50 hidden: 48.653846% (dev), 45.000000% (test) 139 | 100 hidden: 48.653846% (dev), 47.500000% (test) 140 | 200 hidden: 48.653846% (dev), 45.576923% (test) 141 | 300 hidden: IN PROGRESS 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DependencyTreeRnn 2 | Dependency tree-based RNN 3 | 4 | Copyright (c) 2014-2015 Piotr Mirowski, Andreas Vlachos 5 | 6 | Please refer to the following paper: 7 | Piotr Mirowski, Andreas Vlachos 8 | "Dependency Recurrent Neural Language Models for Sentence Completion" 9 | ACL 2015 10 | 11 | # Installation 12 | 0. Download the preprocessed training and validation/testing data from here: https://drive.google.com/file/d/0BwPdBcatuO0vS3JlUVBtZHpSb3M/view?usp=sharing 13 | 1. Modify the path to the BLAS header (cblas.h) file, i.e., $BLASINCLUDE 14 | and the BLAS path, i.e., $BLASFLAGS, in file Makefile. 15 | Alternatively, make your own version of that Makefile. 16 | 2. Build the project: 17 | ``` 18 | > make 19 | ``` 20 | or, using your custom Makefile: 21 | ``` 22 | > make -f YOUR_OWN_MAKEFILE 23 | ``` 24 | Note that the .o objects are stored in directory build/ and the executable is ./RnnDependencyTree 25 | 26 | # Sample training script 27 | Shell script train_rnn_holmes_example.sh trains an RNN on a subset of a few books. 28 | You need to modify the path to where the JSON book files are stored. 29 | 30 | # Important hyperparameters 31 | 32 | 1. Parameters relative to the dataset: 33 | * **train** (string) Training data file (pure text) 34 | * **valid** (string) Validation data file (pure text), used during training 35 | * **test** (string) Test data file (pure text) 36 | * **sentence-labels** (string) Validation/test sentence labels file (pure text) 37 | * **path-json-books** (string) Path to the book JSON files 38 | * **min-word-occurrence** (int) Minimum word occurrence to include a word into the vocabulary [default: 5] 39 | * **independent** (bool) Is each line in the training/testing file independent? [default: true] 40 | 41 | 2. Parameters relative to the dependency labels 42 | * **feature-labels-type** (int) Dependency parsing labels: 43 | * 0 = none, use words only 44 | * 1 = concatenate label to word 45 | * 2 = use features in the feature vector, separate from words 46 | * **feature-gamma** (double) Decay weight for features consisting of label vectors [default: 0.9]. 47 | * Values up to about 1.3 can be accepted (beyond that, the perplexity seems to become very large). 48 | * f(t) is a vector with D elements (e.g., D=44 types of dependency labels) 49 | * f(t) <- gamma * f(t-1), then set element at current label to 1 (see the sketch just after this list) 50 | * This value could be important for changing the weight given to dependency labels. 51 | * A value larger than 1 means that labels further in the past count more than those immediately in the past. 52 | * 1 means that there is no decay. 53 | * A value between 0 and 1 means that there is some decay. 54 | * 0 means that the decay is immediate.
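For illustration, here is a minimal sketch of this update, assuming the D label features are held in a plain `std::vector<double>` (in the code itself, the update happens inside `RnnTreeLM::UpdateFeatureLabelVector`):

```cpp
#include <vector>

// Decay all label features, then switch on the current label.
void UpdateLabelFeatures(std::vector<double> &f, int label, double gamma) {
  for (size_t k = 0; k < f.size(); k++)
    f[k] *= gamma;   // f(t) <- gamma * f(t-1)
  f[label] = 1.0;    // set the element at the current label to 1
}
```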
55 | 56 | 3. RNN architecture parameters 57 | * **rnnlm** (string) RNN language model file to use (save in training / read in test) 58 | * **classes** (int) Number of word classes used in hierarchical softmax [default: 200]. 59 | * If vocabulary size is W, choose C around sqrt(W). 60 | * C = W means 1 class per word. 61 | * C = 1 means standard softmax. 62 | * **hidden** (int) Number of nodes in the hidden layer [default: 100]. 63 | * Try to go higher, perhaps up to 1000 (for 1M-word vocabulary). 64 | * Linear impact on speed. 65 | * **direct** (int) Size of max-entropy hash table storing direct n-gram connections, in millions of entries [default: 0]. 66 | * Basically, direct=1000 means that 1000x1000000 = 1G direct connections between context words and target word are considered. 67 | * However, it is not a proper hashtable (which would take too much memory) but a simple vector of 1G entries, with a hashing function that hashes into specific entries in that vector. Hash collisions are totally ignored. 68 | * Try using direct=1000 or even 2000 hashes if possible. 69 | * **direct-order** (int) Order of direct n-gram connections; 2 is like bigram max entropy features [default: 3]. 70 | * It works on tokens only, and values of 4 or beyond did not bring improvement in other LM tasks. 71 | * **compression** (int) Number of nodes in the compression layer between the hidden and output layers [default: 0] 72 | 73 | 4. Training parameters 74 | * **alpha** (double) Initial learning rate during gradient descent [default: 0.1] 75 | * **beta** (double) L-2 norm regularization coefficient during gradient descent [default: 0.0000001] 76 | * **min-improvement** (double) Minimum improvement before the learning rate decreases [default: 1.001] 77 | * **bptt** (int) Number of steps to propagate error back in time [default: 4] 78 | * **bptt-block** (int) Number of time steps after which the error is backpropagated through time [default: 10] 79 | * **gradient-cutoff** (double) Value beyond which the gradients are clipped, used to avoid exploding gradients [default: 15] 80 | 81 | 5. Additional parameters 82 | * **debug** (bool) Debugging level [default: false] 83 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnWeights.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise.
17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | */ 36 | 37 | #ifndef DependencyTreeRNN_RnnWeights_h 38 | #define DependencyTreeRNN_RnnWeights_h 39 | 40 | #include <cstdio> 41 | #include <string> 42 | #include <vector> 43 | #include "Utils.h" 44 | 45 | 46 | /** 47 | * Tomas Mikolov decided to implement hash tables and hash functions 48 | * from scratch... 49 | */ 50 | const unsigned int c_Primes[] = {108641969, 116049371, 125925907, 133333309, 145678979, 175308587, 197530793, 234567803, 251851741, 264197411, 330864029, 399999781, 51 | 407407183, 459258997, 479012069, 545678687, 560493491, 607407037, 629629243, 656789717, 716048933, 718518067, 725925469, 733332871, 753085943, 755555077, 52 | 782715551, 790122953, 812345159, 814814293, 893826581, 923456189, 940740127, 953085797, 985184539, 990122807}; 53 | const unsigned int c_PrimesSize = sizeof(c_Primes)/sizeof(c_Primes[0]); 54 | 55 | 56 | /** 57 | * Weights of an RNN 58 | */ 59 | class RnnWeights { 60 | public: 61 | 62 | /** 63 | * Constructor 64 | */ 65 | RnnWeights(int sizeVocabulary, 66 | int sizeHidden, 67 | int sizeFeature, 68 | int sizeClasses, 69 | int sizeCompress, 70 | long long sizeDirectConnection); 71 | 72 | /** 73 | * Load the weights matrices from a file 74 | */ 75 | void Load(FILE *fi); 76 | 77 | /** 78 | * Clear the weights, before loading a new model, to save on memory 79 | */ 80 | void Clear(); 81 | 82 | /** 83 | * Save the weights matrices to a file 84 | */ 85 | void Save(FILE *fo); 86 | 87 | // Weights between input and hidden layer 88 | std::vector<double> Input2Hidden; 89 | // Weights between former hidden state and current hidden layer 90 | std::vector<double> Recurrent2Hidden; 91 | // Weights between features and hidden layer 92 | std::vector<double> Features2Hidden; 93 | // Weights between features and output layer 94 | std::vector<double> Features2Output; 95 | // Weights between hidden and output layer (or hidden and compression if compression>0) 96 | std::vector<double> Hidden2Output; 97 | // Optional weights between compression and output layer 98 | std::vector<double> Compress2Output; 99 | // Direct parameters between input and output layer 100 | // (similar to Maximum Entropy model parameters) 101 | std::vector<double> DirectNGram; 102 |
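// Editorial note (schematic, not part of the original header): DirectNGram
// is not a true hash table but one flat array of m_sizeDirectConnection
// weights. Each context n-gram is mapped to an index in this array by a
// hash function built from the c_Primes above, conceptually:
//   index = hash(context words, c_Primes) % m_sizeDirectConnection;
// Colliding n-grams simply share the same weight (collisions are ignored).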
103 | /** 104 | * Return the number of direct connections between input words 105 | * and the output word (i.e., n-gram features) 106 | */ 107 | int GetNumDirectConnection() const { 108 | return static_cast<int>(DirectNGram.size()); 109 | } // int GetNumDirectConnection() 110 | 111 | /** 112 | * Return the number of word classes 113 | */ 114 | int GetNumClasses() const { return m_sizeClasses; } 115 | 116 | /** 117 | * Debug function 118 | */ 119 | void Debug(); 120 | 121 | 122 | protected: 123 | 124 | /** 125 | * Dimensions of the network 126 | */ 127 | int m_sizeVocabulary; 128 | int m_sizeHidden; 129 | int m_sizeFeature; 130 | int m_sizeClasses; 131 | int m_sizeCompress; 132 | long long m_sizeDirectConnection; 133 | int m_sizeInput; 134 | int m_sizeOutput; 135 | }; // class RnnWeights 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /preprocessing/JSON2unrolls.py: -------------------------------------------------------------------------------- 1 | ''' 2 | // Copyright (c) 2014 Anonymized. All rights reserved. 3 | // 4 | // Code submitted as supplementary material for manuscript: 5 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 6 | // Do not redistribute. 7 | 8 | Created on Aug 4, 2014 9 | 10 | Take a corpus in the JSON format obtained from Stanford and convert it to this format: 11 | 12 | arg1 = input directory 13 | arg2 = output directory (arg3 = sentence length threshold; optional arg4 = TOKENS for plain token output) 14 | 15 | corpus = (list of sentences) 16 | sentence = (list of unrolls) 17 | unroll = (list of tokens) 18 | token = (map containing: index in sentence, string, discount, outDep) 19 | 20 | outDep is the dependency going from the current token to the next word on the path 21 | the last token on the path (leaf node) has a LEAF outDep 22 | 23 | ''' 24 | 25 | 26 | import networkx 27 | import json 28 | import sys 29 | from collections import Counter 30 | import glob 31 | import os 32 | import os.path 33 | 34 | def extractUnrolls(sentenceDAG): 35 | unrolls = [] 36 | 37 | # so each unroll is a path from ROOT to the leaves.
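    # Editorial example: if the root r has two children a and b that are both
    # leaves, there are two root-to-leaf paths, [r, a] and [r, b]; r occurs on
    # both paths, so discountFactors[r] == 2 while discountFactors[a] == 1.
    # The discount is presumably what lets training down-weight tokens that
    # are duplicated across the unrolls of one sentence.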
38 | root2leafPaths = [] 39 | # this counts the number of times a node appears in the path 40 | discountFactors = Counter() 41 | # traverse all tokens to find the root and the leaves: 42 | leaves = [] 43 | root = None 44 | for tokenNo in sentenceDAG.nodes(): 45 | # if a token is a leaf (avoid punctuation which has no incoming ones): 46 | if sentenceDAG.out_degree(tokenNo) == 0 and sentenceDAG.in_degree(tokenNo) > 0: 47 | leaves.append(tokenNo) 48 | if sentenceDAG.in_degree(tokenNo) == 0 and sentenceDAG.out_degree(tokenNo) > 0: 49 | root = tokenNo 50 | 51 | #print "leaves:" + str(leaves) 52 | #print "root:" + str(root) 53 | 54 | for leaf in leaves: 55 | # let's get the path from ROOT: 56 | try: 57 | path = networkx.shortest_path(sentenceDAG, source=root, target=leaf) 58 | root2leafPaths.append(path) 59 | # add the discounts: 60 | for tok in path: 61 | discountFactors[tok] += 1 62 | except networkx.exception.NetworkXNoPath: 63 | print "path did not exist among tokens " + str(root) + " and " + str(leaf) + " in sentence:" 64 | print str(sentenceDAG) 65 | #print root2leafPaths 66 | #print discountFactors 67 | 68 | for path in root2leafPaths: 69 | unroll = [] 70 | for idx_in_path, tokenNo in enumerate(path): 71 | #print sentenceDAG[tokenNo] 72 | word = sentenceDAG.node[tokenNo]['word'] 73 | # the last word has the dummy out edge 74 | if idx_in_path == len(path)-1: 75 | outDep = "LEAF" 76 | else: 77 | outDep = sentenceDAG[tokenNo][path[idx_in_path+1]]["label"] 78 | unroll.append([tokenNo, word, discountFactors[tokenNo], outDep]) 79 | 80 | unrolls.append(unroll) 81 | 82 | return unrolls 83 | 84 | def constructDAG(sentence): 85 | sentenceDAG = networkx.DiGraph() 86 | # first put the nodes in the graph 87 | # fields of interest 0 (tokenNo, starting at 0), 1 (token (lowercase it maybe?), 6 (ancestor), 7 (depLabel to ancestor)) 88 | # add the root 89 | #sentenceDAG.add_node(0, word="ROOT") 90 | # add the index of the token in the sentence, remember to start things from 1 as 0 is reserved for root 91 | for idx, token in enumerate(sentence["tokens"]): 92 | sentenceDAG.add_node(idx, word=token["word"].lower()) 93 | 94 | # and now the edges: 95 | for dependency in sentence["dependencies"]: 96 | sentenceDAG.add_edge(dependency["head"], dependency["dep"], label=dependency["label"]) 97 | #networkx.draw(sentenceDAG) 98 | #print sentenceDAG.nodes(data=True) 99 | #print sentenceDAG.edges(data=True) 100 | return sentenceDAG 101 | 102 | # Create the output path 103 | os.mkdir(sys.argv[2]) 104 | threshold = int(sys.argv[3]) 105 | tokensOnly = False 106 | # check if we are generating the text for the RNNs 107 | if len(sys.argv) == 5 and sys.argv[4] == "TOKENS": 108 | tokensOnly = True 109 | threshold = float("inf") 110 | 111 | 112 | tokensKeptCounter = 0 113 | wordTypesKept = [] 114 | for filename in glob.glob(sys.argv[1]+ "/*"): 115 | allSentences = [] 116 | 117 | jsonFile = open(filename) 118 | sentences = json.loads(jsonFile.read()) 119 | jsonFile.close() 120 | 121 | for sentence in sentences: 122 | sentenceDAG = constructDAG(sentence) 123 | if (len(sentenceDAG.nodes()) < threshold): 124 | gutenbergCheck = False 125 | 126 | nodes = sentenceDAG.nodes(data=True) 127 | 128 | for node in nodes: 129 | if node[1]["word"] == "gutenberg": 130 | #print nodes 131 | gutenbergCheck = True 132 | 133 | if not gutenbergCheck: 134 | tokensKeptCounter += len(nodes) 135 | for node in nodes: 136 | if node[1]["word"] not in wordTypesKept: 137 | wordTypesKept.append( node[1]["word"]) 138 | if tokensOnly: 139 | tokens = [] 140 | for 
node in nodes: 141 | tokens.append(node[1]["word"]) 142 | allSentences.append(" ".join(tokens)) 143 | else: 144 | unrolls = extractUnrolls(sentenceDAG) 145 | allSentences.append(unrolls) 146 | print "unique word types kept=" + str(len(wordTypesKept)) 147 | if tokensOnly: 148 | with open(sys.argv[2] + "/" + os.path.basename(filename) + ".tokens.txt", "wb") as out: 149 | out.write(("\n".join(allSentences)).encode('utf-8') + "\n") 150 | else: 151 | with open(sys.argv[2] + "/" + os.path.basename(filename) + ".unrolls.json", "wb") as out: 152 | json.dump(allSentences, out) 153 | 154 | print "tokens kept=" + str(tokensKeptCounter) 155 | print "unique word types kept=" + str(len(wordTypesKept)) 156 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/Vocabulary.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 | */ 36 | 37 | #ifndef DependencyTreeRNN_Vocabulary_h 38 | #define DependencyTreeRNN_Vocabulary_h 39 | 40 | #include <algorithm> 41 | #include <cstdio> 42 | #include <string> 43 | #include <unordered_map> 44 | #include <vector> 45 | 46 | 47 | /** 48 | * Element of vocabulary 49 | */ 50 | struct VocabWord { 51 | std::string word; 52 | double prob; 53 | int cn; 54 | int classIndex; 55 | }; 56 | 57 | 58 | /** 59 | * Class storing the words in the vocabulary, the word classes 60 | * and hash tables to associate them 61 | */ 62 | class Vocabulary { 63 | public: 64 | 65 | /** 66 | * Constructor. 67 | */ 68 | Vocabulary(int numClasses) 69 | : m_numClasses(numClasses), m_useClassFile(false) { 70 | } 71 | 72 | /** 73 | * Constructor that reads the vocabulary and classes from the model file. 74 | */ 75 | Vocabulary(FILE *fi, int sizeVocabulary, int numClasses); 76 | 77 | /** 78 | * Save the vocabulary to a model file. 79 | */ 80 | void Save(FILE *fo); 81 | 82 | /** 83 | * Return the index of a word in the vocabulary, or -1 if OOV. 84 | */ 85 | int SearchWordInVocabulary(const std::string& word) const; 86 | 87 | /** 88 | * Add a word to the vocabulary. 89 | */ 90 | int AddWordToVocabulary(const std::string& word); 91 | 92 | /** 93 | * Sort the words in the vocabulary by frequency. 94 | */ 95 | void SortVocabularyByFrequency(); 96 | 97 | /** 98 | * Read the classes of words. 99 | */ 100 | bool ReadClasses(const std::string &filename); 101 | 102 | /** 103 | * Assign all the words to a class. 104 | */ 105 | void AssignWordsToClasses(); 106 |
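  // Editorial note: the word classes implement the usual class-based
  // (hierarchical softmax) factorization
  //   P(w | h) = P(class(w) | h) * P(w | class(w), h),
  // so scoring one word costs O(#classes + |class(w)|) instead of O(|V|).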
107 | /** 108 | * Return the number of words/entity tokens in the vocabulary. 109 | */ 110 | int GetVocabularySize() const { 111 | return static_cast<int>(m_vocabularyStorage.size()); 112 | } 113 | 114 | /** 115 | * Manually set the word count. 116 | */ 117 | bool SetWordCount(std::string word, int count); 118 | 119 | /** 120 | * Return the n-th word in the vocabulary. 121 | */ 122 | std::string GetNthWord(int word) const { 123 | return m_vocabularyStorage[word].word; 124 | } 125 | 126 | /** 127 | * Return the word corresponding to a given word index in the vocabulary. 128 | */ 129 | std::string Word2WordIndex(int word) const { 130 | return m_vocabularyStorage[word].word; 131 | } 132 | 133 | /** 134 | * Return the size of a word class. 135 | */ 136 | int SizeTargetClass(int targetClass) const { 137 | return static_cast<int>(m_classWords[targetClass].size()); 138 | } 139 | 140 | /** 141 | * Return the class index of a word (referenced by an index). 142 | */ 143 | int WordIndex2Class(int word) const { 144 | return m_vocabularyStorage[word].classIndex; 145 | } 146 | 147 | /** 148 | * Return the n-th word in a word class. 149 | */ 150 | int GetNthWordInClass(int targetClass, int n) const { 151 | return static_cast<int>(m_classWords[targetClass][n]); 152 | } 153 | 154 | public: 155 | 156 | // Vocabulary storage 157 | std::vector<VocabWord> m_vocabularyStorage; 158 | 159 | // Vocabulary representation (word -> index of the word) 160 | std::unordered_map<std::string, int> m_mapWord2Index; 161 | 162 | // Inverse vocabulary representation (index of the word -> word) 163 | std::unordered_map<int, std::string> m_mapIndex2Word; 164 | 165 | // Hash table enabling a look-up of the class of a word 166 | // (word -> word class) 167 | std::unordered_map<std::string, int> m_mapWord2Class; 168 | 169 | // Information relative to the classes 170 | std::vector<std::vector<int> > m_classWords; 171 | 172 | protected: 173 | bool m_useClassFile; 174 | int m_numClasses; 175 | 176 | // Store information on which word is in which class 177 | void StoreClassAssociations(); 178 | }; // class Vocabulary 179 | 180 | #endif 181 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnDependencyTreeLib.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */ 36 | 37 | #ifndef __DependencyTreeRNN____RnnDependencyTreeLib__ 38 | #define __DependencyTreeRNN____RnnDependencyTreeLib__ 39 | 40 | #include "RnnLib.h" 41 | #include "RnnTraining.h" 42 | #include "CorpusUnrollsReader.h" 43 | 44 | class RnnTreeLM : public RnnLMTraining { 45 | public: 46 | 47 | /** 48 | * Constructor for training/testing the model 49 | */ 50 | RnnTreeLM(const std::string &filename, bool doLoadModel, bool debugMode) 51 | // We load the RNN if the model file is present; 52 | // otherwise, we simply set its filename 53 | : RnnLMTraining(filename, doLoadModel, debugMode), 54 | // Parameters set by default (can be overridden when loading the model) 55 | m_typeOfDepLabels(0), m_labels(1) { 56 | // If we use dependency labels, do not connect them to the outputs 57 | m_useFeatures2Output = false; 58 | std::cout << "RnnTreeLM\n"; 59 | } 60 | 61 | public: 62 | 63 | /** 64 | * Before learning the RNN model, we need to learn the vocabulary 65 | * from the corpus. Note that the word classes may have been initialized 66 | * beforehand using ReadClasses. Computes the unigram distribution 67 | * of words from a training file, assuming that the existing vocabulary 68 | * is empty. 69 | */ 70 | bool LearnVocabularyFromTrainFile(int numClasses); 71 | 72 | /** 73 | * Import the vocabulary from a text file. 74 | */ 75 | void ImportVocabularyFromFile(std::string &filename, int numClasses) { 76 | m_corpusTrain.ImportVocabulary(filename); 77 | m_corpusValidTest.ImportVocabulary(filename); 78 | AssignVocabularyFromCorpora(numClasses); 79 | } 80 | 81 | /** 82 | * Return the number of labels (features) used in the dependency parsing. 83 | */ 84 | int GetLabelSize() const { return m_labels.GetVocabularySize(); } 85 | 86 | /** 87 | * Set the mode of the dependency labels: 88 | * 0: no dependency labels used 89 | * 1: dependency labels concatenated to the word 90 | * 2: dependency labels used as features in the feature vector 91 | */ 92 | void SetDependencyLabelType(int type) { 93 | m_typeOfDepLabels = type; 94 | } 95 | 96 | /** 97 | * Set the minimum number of word occurrences 98 | */ 99 | void SetMinWordOccurrence(int val) { 100 | m_corpusVocabulary.SetMinWordOccurrence(val); 101 | m_corpusTrain.SetMinWordOccurrence(val); 102 | m_corpusValidTest.SetMinWordOccurrence(val); 103 | } 104 | 105 | /** 106 | * Add a book to the training corpus 107 | */ 108 | void AddBookTrain(const std::string &filename) { 109 | m_corpusVocabulary.AddBookFilename(filename); 110 | m_corpusTrain.AddBookFilename(filename); 111 | } 112 | 113 | /** 114 | * Add a book to the test/validation corpus 115 | */ 116 | void AddBookTestValid(const std::string &filename) { 117 | m_corpusValidTest.AddBookFilename(filename); 118 | } 119 | 120 | /** 121 | * Function that trains the RNN on JSON trees 122 | * of dependency parse 123 | */ 124 | bool TrainRnnModel(); 125 | 126 | /** 127 | * Function that tests the RNN on JSON trees 128 | * of dependency parse 129 | */ 130 | bool TestRnnModel(const std::string &testFile, 131 | const std::string &featureFile, 132 | std::vector<double> &sentenceScores, 133 | double &logProbability, 134 | double &perplexity, 135 | double &entropy, 136 | double &accuracy); 137 | 138 | protected: 139 | 140 | // Corpora 141 | CorpusUnrolls m_corpusVocabulary; 142 | CorpusUnrolls m_corpusTrain; 143 | CorpusUnrolls m_corpusValidTest; 144 | 145 | // Type of dependency labels (0, 1 or 2) 146 | int m_typeOfDepLabels; 147 | 148 | // Label vocabulary hashtables 149 | Vocabulary
m_labels; 150 | 151 | // Label vocabulary representation (label -> index of the label) 152 | std::unordered_map<std::string, int> m_mapLabel2Index; 153 | 154 | // Reset the vector of feature labels 155 | void ResetFeatureLabelVector(RnnState &state) const; 156 | 157 | // Update the vector of feature labels 158 | void UpdateFeatureLabelVector(int label, RnnState &state) const; 159 | 160 | // Assign the vocabulary from the corpora to the model, 161 | // and compute the word classes. 162 | bool AssignVocabularyFromCorpora(int numClasses); 163 | }; 164 | 165 | #endif /* defined(__DependencyTreeRNN____RnnDependencyTreeLib__) */ 166 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/Utils.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */
36 | 
37 | #ifndef DependencyTreeRNN_Utils_h
38 | #define DependencyTreeRNN_Utils_h
39 | 
40 | #include <stdio.h>
41 | #include <stdlib.h>
42 | 
43 | #include <fstream>
44 | #include <iostream>
45 | #include <sstream>
46 | #include <string>
47 | #include <vector>
48 | #include <cmath>
49 | 
50 | 
51 | /**
52 |  * Log to screen and to file (append)
53 |  */
54 | static void Log(std::string str, std::string logFilename) {
55 |   std::ostringstream buf;
56 |   std::ofstream logFile(logFilename, std::fstream::app);
57 |   buf << str;
58 |   logFile << buf.str() << std::flush;
59 |   std::cout << buf.str() << std::flush;
60 |   buf.str("");
61 |   buf.clear();
62 | }
63 | 
64 | 
65 | /**
66 |  * Log to screen only
67 |  */
68 | static void Log(std::string str) {
69 |   std::ostringstream buf;
70 |   buf << str;
71 |   std::cout << buf.str() << std::flush;
72 |   buf.str("");
73 |   buf.clear();
74 | }
75 | 
76 | 
77 | /**
78 |  * Read a matrix of floats in binary format
79 |  */
80 | static void ReadBinaryMatrix(FILE *fi, int sizeIn, int sizeOut,
81 |                              std::vector<double> &vec) {
82 |   if (sizeIn * sizeOut == 0) {
83 |     return;
84 |   }
85 |   for (int idxOut = 0; idxOut < sizeOut; idxOut++) {
86 |     for (int idxIn = 0; idxIn < sizeIn; idxIn++) {
87 |       float val;
88 |       fread(&val, 4, 1, fi);
89 |       vec[idxIn + idxOut * sizeIn] = val;
90 |     }
91 |   }
92 | }
93 | 
94 | 
95 | /**
96 |  * Read a vector of floats in binary format
97 |  */
98 | static void ReadBinaryVector(FILE *fi, long long size,
99 |                              std::vector<double> &vec) {
100 |   for (long long aa = 0; aa < size; aa++) {
101 |     float val;
102 |     fread(&val, 4, 1, fi);
103 |     vec[aa] = val;
104 |   }
105 | }
106 | 
107 | 
108 | /**
109 |  * Save a matrix of floats in binary format
110 |  */
111 | static void SaveBinaryMatrix(FILE *fo, int sizeIn, int sizeOut,
112 |                              const std::vector<double> &vec) {
113 |   if (sizeIn * sizeOut == 0) {
114 |     return;
115 |   }
116 |   for (int idxOut = 0; idxOut < sizeOut; idxOut++) {
117 |     for (int idxIn = 0; idxIn < sizeIn; idxIn++) {
118 |       float val = (float)(vec[idxIn + idxOut * sizeIn]);
119 |       fwrite(&val, 4, 1, fo);
120 |     }
121 |   }
122 | }
123 | 
124 | 
125 | /**
126 |  * Save a vector of floats in binary format
127 |  */
128 | static void SaveBinaryVector(FILE *fo, long long size,
129 |                              const std::vector<double> &vec) {
130 |   for (long long aa = 0; aa < size; aa++) {
131 |     float val = vec[aa];
132 |     fwrite(&val, 4, 1, fo);
133 |   }
134 | }
135 | 
136 | 
137 | /**
138 |  * Generate a uniform random double in the range [min, max]
139 |  */
140 | static double GenerateUniformRandomNumber(double min, double max) {
141 |   return rand() / ((double)RAND_MAX) * (max - min) + min;
142 | }
143 | 
144 | 
145 | /**
146 |  * Random number generator (approximate Gaussian distribution),
147 |  * zero-mean and standard deviation 0.1
148 |  */
149 | static double GenerateNormalRandomNumber() {
150 |   return (GenerateUniformRandomNumber(-0.1, 0.1)
151 |           + GenerateUniformRandomNumber(-0.1, 0.1)
152 |           + GenerateUniformRandomNumber(-0.1, 0.1));
153 | }
154 | 
155 | 
156 | /**
157 |  * Randomize a vector with small numbers to get zero-mean random numbers
158 |  */
159 | static void RandomizeVector(std::vector<double> &vec) {
160 |   for (size_t k = 0; k < vec.size(); k++) {
161 |     vec[k] = GenerateNormalRandomNumber();
162 |   }
163 | }
164 | 
165 | 
166 | /**
167 |  * Convert int to string
168 |  */
169 | static std::string ConvString(int val) {
170 |   return std::to_string(static_cast<long long>(val));
171 | }
172 | 
173 | 
174 | /**
175 |  * Convert size_t to string
176 |  */
177 | static std::string ConvString(size_t val) {
178 |   return std::to_string(static_cast<long long>(val));
179 | }
180 | 
181 | 
182 | /**
183 |  * Convert long int to string
184 |  */
185 | static std::string ConvString(long int val) {
186 |   return std::to_string(static_cast<long long>(val));
187 | }
188 | 
189 | 
190 | /**
191 |  * Convert long long int to string
192 |  */
193 | static std::string ConvString(long long int val) {
194 |   return std::to_string(val);
195 | }
196 | 
197 | 
198 | /**
199 |  * Convert double to string
200 |  */
201 | static std::string ConvString(double val) {
202 |   return std::to_string(val);
203 | }
204 | 
205 | 
206 | #endif
207 | 
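A minimal round-trip using the helpers above (illustrative only; the file name is a placeholder):

    #include "Utils.h"

    int main() {
      std::vector<double> v(8);
      RandomizeVector(v);                    // fill with small zero-mean values
      FILE *fo = fopen("weights.bin", "wb");
      SaveBinaryVector(fo, v.size(), v);     // stored on disk as 4-byte floats
      fclose(fo);

      std::vector<double> w(v.size());
      FILE *fi = fopen("weights.bin", "rb");
      ReadBinaryVector(fi, w.size(), w);     // values come back float-truncated
      fclose(fi);
      Log("Read back " + ConvString(w.size()) + " values\n");
      return 0;
    }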
--------------------------------------------------------------------------------
/preprocessing/Text2Parsed2JSON.java:
--------------------------------------------------------------------------------
1 | import edu.stanford.nlp.pipeline.*;
2 | import edu.stanford.nlp.util.*;
3 | import edu.stanford.nlp.semgraph.SemanticGraph;
4 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
5 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.*;
6 | import edu.stanford.nlp.ling.CoreAnnotations;
7 | import edu.stanford.nlp.ling.CoreLabel;
8 | import edu.stanford.nlp.ling.CoreAnnotations.*;
9 | 
10 | import java.io.*;
11 | import java.util.*;
12 | 
13 | import com.google.gson.*;
14 | 
15 | 
16 | 
17 | public class Text2Parsed2JSON {
18 | 
19 |   public class MyToken {
20 |     public String word;
21 |     public String lemma;
22 |     public String pos;
23 |     public String ner;
24 | 
25 |     public MyToken() {}
26 | 
27 |   }
28 | 
29 |   public class MyDependency {
30 |     public int head;
31 |     public int dep;
32 |     public String label;
33 | 
34 |     public MyDependency() {}
35 |   }
36 | 
37 |   public class MySentence {
38 | 
39 |     public ArrayList<MyToken> tokens;
40 |     public ArrayList<MyDependency> dependencies;
41 | 
42 |     public MySentence() {
43 |       tokens = new ArrayList<MyToken>();
44 |       dependencies = new ArrayList<MyDependency>();
45 |     }
46 | 
47 |   }
48 | 
49 |   // this holds the main pipeline for the processing
50 |   private StanfordCoreNLP mainPipeline;
51 | 
52 |   public static String readTextFromFile(File textFileName) throws IOException {
53 |     BufferedReader textFile = new BufferedReader(new FileReader(textFileName));
54 |     String line;
55 |     StringBuffer result = new StringBuffer();
56 |     while ((line = textFile.readLine()) != null) {
57 |       // add the newline back
58 |       result.append(line + "\n");
59 |     }
60 |     textFile.close();
61 |     return result.toString();
62 |   }
63 | 
64 |   // Dummy filter that returns the same text that was passed as input;
65 |   // override it to do more interesting things (might need extra initialization).
66 |   private String filterText(String text) {
67 |     return text;
68 |   }
69 | 
70 |   public Text2Parsed2JSON() {
71 |     // Initialize the parser:
72 |     Properties parser_props = new Properties();
73 |     parser_props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
74 |     // I assume that longer sentences are unlikely to be useful.
75 |     parser_props.put("parse.maxlen", "80");
76 |     //parser_props.put("tokenize.whitespace", "true");
77 |     //parser_props.put("ssplit.isOneSentence", "true");
78 |     mainPipeline = new StanfordCoreNLP(parser_props);
79 | 
80 |   }
81 | 
82 |   // this takes text, runs the main processor and returns the Stanford annotations
83 |   // for the sentences kept
84 |   public Annotation processText2Annotations(String text) {
85 |     // filter the text
86 |     String filteredText = filterText(text);
87 |     // create an empty Annotation just with the given text
88 |     Annotation annotatedText = new Annotation(filteredText);
89 | 
90 |     mainPipeline.annotate(annotatedText);
91 | 
92 |     return annotatedText;
93 |   }
94 | 
95 | 
96 |   public String processAnnotations2JSON(Annotation annotatedText) {
97 | 
98 |     // initialize the sentences array
99 |     ArrayList<MySentence> mySentences = new ArrayList<MySentence>();
100 | 
101 |     // get the sentences
102 |     List<CoreMap> sentences = annotatedText.get(SentencesAnnotation.class);
103 | 
104 |     for (CoreMap sentence : sentences) {
105 |       MySentence newSentence = new MySentence();
106 |       // traversing the words in the current sentence
107 |       // a CoreLabel is a CoreMap with additional token-specific methods
108 |       for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
109 |         MyToken newToken = new MyToken();
110 |         // this is the text of the token
111 |         String word = token.get(TextAnnotation.class);
112 |         // this is the POS tag of the token
113 |         String pos = token.get(PartOfSpeechAnnotation.class);
114 |         // this is the NER label of the token
115 |         String ne = token.get(NamedEntityTagAnnotation.class);
116 |         // this is the lemma
117 |         String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
118 |         newToken.lemma = lemma;
119 |         newToken.pos = pos;
120 |         newToken.ner = ne;
121 |         newToken.word = word;
122 | 
123 |         newSentence.tokens.add(newToken);
124 |       }
125 | 
126 | 
127 |       // this is the Stanford dependency graph of the current sentence.
128 |       // If a tree with all the tokens is required, use BasicDependenciesAnnotation.
129 |       // The ones that commonly work best for IE are CollapsedCCProcessedDependenciesAnnotation (careful, they are not even DAGs)
130 |       SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
131 |       //System.out.print(dependencies.toString("plain"));
132 | 
133 |       //Set<SemanticGraphEdge> allEdges = dependencies.getEdgeSet();
134 | 
135 |       for (SemanticGraphEdge edge : dependencies.edgeIterable()) {
136 |         MyDependency dep = new MyDependency();
137 |         // remember to subtract one so that the first word starts at 0
138 |         dep.head = edge.getGovernor().index() - 1;
139 |         dep.dep = edge.getDependent().index() - 1;
140 |         dep.label = edge.getRelation().toString();
141 | 
142 |         newSentence.dependencies.add(dep);
143 |       }
144 | 
145 |       mySentences.add(newSentence);
146 |     }
147 | 
148 | 
149 |     Gson gson = new Gson();
150 | 
151 | 
152 |     return gson.toJson(mySentences);
153 |   }
154 | 
155 | 
156 |   /**
157 |    * @param args
158 |    */
159 |   public static void main(String[] args) {
160 |     // initialize
161 |     Text2Parsed2JSON processor = new Text2Parsed2JSON();
162 | 
163 |     // get the directory with the text files
164 |     File extractsDirectory = new File(args[0]);
165 | 
166 |     // get the output directory
167 |     File outputDirectory = new File(args[1]);
168 |     outputDirectory.mkdir();
169 | 
170 |     // get a list of files:
171 |     File[] textFileNames = extractsDirectory.listFiles();
172 |     System.out.println("Files to process: " + textFileNames.length);
173 | 
174 |     // For each text file:
175 |     for (int i = 0; i < textFileNames.length; i++) {
176 | 
177 |       //
First get the filename 178 | //String filename = textFileNames[i].getName(); 179 | System.out.println(textFileNames[i]); 180 | // Read in the text 181 | String text; 182 | try { 183 | text = readTextFromFile(textFileNames[i]); 184 | // process 185 | Annotation annotatedText = processor.processText2Annotations(text); 186 | String JSONsentences = processor.processAnnotations2JSON(annotatedText); 187 | //System.out.println(JSONsentences); 188 | 189 | // Create the file for the output 190 | File JSONFile = new File(outputDirectory, textFileNames[i].getName() + ".json"); 191 | //System.out.println(JSONFile.getPath()); 192 | //System.out.println(JSONFile.getName()); 193 | BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(JSONFile), "utf-8")); 194 | out.write(JSONsentences); 195 | out.close(); 196 | } catch (IOException e) { 197 | // TODO Auto-generated catch block 198 | e.printStackTrace(); 199 | } 200 | 201 | 202 | 203 | } 204 | } 205 | 206 | } 207 | -------------------------------------------------------------------------------- /books/all.labels: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | 3 4 | 4 5 | 3 6 | 3 7 | 3 8 | 1 9 | 3 10 | 0 11 | 1 12 | 4 13 | 2 14 | 0 15 | 3 16 | 4 17 | 1 18 | 0 19 | 0 20 | 0 21 | 4 22 | 2 23 | 3 24 | 4 25 | 4 26 | 0 27 | 0 28 | 2 29 | 2 30 | 4 31 | 3 32 | 0 33 | 0 34 | 0 35 | 1 36 | 4 37 | 4 38 | 0 39 | 4 40 | 4 41 | 3 42 | 4 43 | 0 44 | 1 45 | 1 46 | 2 47 | 3 48 | 2 49 | 2 50 | 1 51 | 1 52 | 2 53 | 2 54 | 1 55 | 3 56 | 3 57 | 4 58 | 4 59 | 0 60 | 1 61 | 1 62 | 2 63 | 4 64 | 3 65 | 3 66 | 0 67 | 4 68 | 1 69 | 4 70 | 4 71 | 2 72 | 4 73 | 2 74 | 3 75 | 2 76 | 4 77 | 4 78 | 4 79 | 2 80 | 1 81 | 1 82 | 1 83 | 3 84 | 4 85 | 0 86 | 2 87 | 2 88 | 2 89 | 4 90 | 3 91 | 3 92 | 0 93 | 3 94 | 2 95 | 4 96 | 3 97 | 4 98 | 4 99 | 4 100 | 0 101 | 0 102 | 3 103 | 4 104 | 1 105 | 4 106 | 2 107 | 2 108 | 3 109 | 1 110 | 0 111 | 1 112 | 0 113 | 0 114 | 2 115 | 1 116 | 2 117 | 4 118 | 4 119 | 4 120 | 4 121 | 4 122 | 3 123 | 2 124 | 3 125 | 3 126 | 3 127 | 4 128 | 1 129 | 2 130 | 4 131 | 1 132 | 1 133 | 0 134 | 3 135 | 1 136 | 2 137 | 2 138 | 1 139 | 2 140 | 1 141 | 0 142 | 3 143 | 2 144 | 2 145 | 3 146 | 3 147 | 2 148 | 2 149 | 1 150 | 4 151 | 4 152 | 4 153 | 1 154 | 4 155 | 2 156 | 2 157 | 4 158 | 4 159 | 1 160 | 3 161 | 0 162 | 2 163 | 0 164 | 4 165 | 3 166 | 1 167 | 3 168 | 4 169 | 0 170 | 3 171 | 0 172 | 4 173 | 0 174 | 2 175 | 2 176 | 1 177 | 0 178 | 1 179 | 3 180 | 3 181 | 4 182 | 3 183 | 3 184 | 4 185 | 2 186 | 4 187 | 0 188 | 3 189 | 2 190 | 4 191 | 1 192 | 0 193 | 0 194 | 4 195 | 2 196 | 2 197 | 4 198 | 4 199 | 0 200 | 3 201 | 0 202 | 2 203 | 3 204 | 4 205 | 0 206 | 0 207 | 1 208 | 2 209 | 2 210 | 4 211 | 2 212 | 3 213 | 1 214 | 2 215 | 1 216 | 3 217 | 4 218 | 2 219 | 1 220 | 3 221 | 4 222 | 4 223 | 4 224 | 2 225 | 0 226 | 0 227 | 2 228 | 2 229 | 0 230 | 4 231 | 2 232 | 4 233 | 2 234 | 0 235 | 1 236 | 3 237 | 0 238 | 4 239 | 1 240 | 2 241 | 2 242 | 3 243 | 3 244 | 3 245 | 1 246 | 2 247 | 1 248 | 1 249 | 1 250 | 3 251 | 1 252 | 4 253 | 2 254 | 3 255 | 1 256 | 3 257 | 3 258 | 4 259 | 1 260 | 0 261 | 3 262 | 2 263 | 2 264 | 1 265 | 2 266 | 2 267 | 1 268 | 4 269 | 0 270 | 3 271 | 2 272 | 3 273 | 0 274 | 3 275 | 2 276 | 2 277 | 4 278 | 4 279 | 0 280 | 0 281 | 4 282 | 3 283 | 4 284 | 0 285 | 0 286 | 3 287 | 2 288 | 0 289 | 2 290 | 1 291 | 1 292 | 0 293 | 4 294 | 4 295 | 1 296 | 0 297 | 2 298 | 1 299 | 4 300 | 1 301 | 3 302 | 2 303 | 0 304 | 2 305 | 2 306 | 1 307 | 4 308 | 1 309 | 4 310 | 4 311 | 3 
312 | 0 313 | 4 314 | 4 315 | 2 316 | 2 317 | 3 318 | 1 319 | 2 320 | 3 321 | 1 322 | 1 323 | 4 324 | 4 325 | 4 326 | 4 327 | 1 328 | 0 329 | 2 330 | 1 331 | 1 332 | 0 333 | 1 334 | 4 335 | 0 336 | 1 337 | 3 338 | 2 339 | 4 340 | 0 341 | 4 342 | 3 343 | 4 344 | 2 345 | 0 346 | 1 347 | 1 348 | 2 349 | 1 350 | 2 351 | 0 352 | 3 353 | 0 354 | 2 355 | 3 356 | 3 357 | 2 358 | 1 359 | 2 360 | 1 361 | 2 362 | 3 363 | 1 364 | 1 365 | 0 366 | 3 367 | 0 368 | 4 369 | 2 370 | 3 371 | 1 372 | 4 373 | 2 374 | 3 375 | 1 376 | 2 377 | 0 378 | 4 379 | 4 380 | 4 381 | 1 382 | 4 383 | 0 384 | 0 385 | 2 386 | 4 387 | 2 388 | 3 389 | 2 390 | 0 391 | 4 392 | 4 393 | 4 394 | 3 395 | 4 396 | 3 397 | 4 398 | 3 399 | 3 400 | 4 401 | 0 402 | 3 403 | 0 404 | 4 405 | 3 406 | 0 407 | 0 408 | 2 409 | 2 410 | 2 411 | 3 412 | 2 413 | 0 414 | 1 415 | 2 416 | 2 417 | 3 418 | 2 419 | 4 420 | 3 421 | 0 422 | 0 423 | 2 424 | 2 425 | 4 426 | 2 427 | 1 428 | 0 429 | 1 430 | 3 431 | 2 432 | 1 433 | 4 434 | 3 435 | 4 436 | 4 437 | 3 438 | 2 439 | 0 440 | 3 441 | 0 442 | 1 443 | 1 444 | 4 445 | 1 446 | 2 447 | 3 448 | 2 449 | 3 450 | 4 451 | 0 452 | 4 453 | 3 454 | 3 455 | 4 456 | 3 457 | 3 458 | 1 459 | 2 460 | 2 461 | 0 462 | 0 463 | 3 464 | 2 465 | 3 466 | 2 467 | 3 468 | 3 469 | 4 470 | 3 471 | 2 472 | 3 473 | 3 474 | 4 475 | 3 476 | 4 477 | 1 478 | 1 479 | 3 480 | 2 481 | 3 482 | 4 483 | 1 484 | 0 485 | 2 486 | 4 487 | 0 488 | 4 489 | 1 490 | 2 491 | 4 492 | 1 493 | 1 494 | 1 495 | 2 496 | 0 497 | 1 498 | 1 499 | 3 500 | 3 501 | 1 502 | 1 503 | 1 504 | 3 505 | 3 506 | 3 507 | 3 508 | 1 509 | 2 510 | 4 511 | 4 512 | 4 513 | 1 514 | 3 515 | 1 516 | 0 517 | 3 518 | 2 519 | 3 520 | 1 521 | 2 522 | 4 523 | 3 524 | 1 525 | 0 526 | 3 527 | 0 528 | 4 529 | 2 530 | 2 531 | 2 532 | 2 533 | 1 534 | 3 535 | 1 536 | 4 537 | 1 538 | 1 539 | 1 540 | 0 541 | 2 542 | 0 543 | 4 544 | 3 545 | 4 546 | 2 547 | 0 548 | 3 549 | 2 550 | 0 551 | 2 552 | 0 553 | 4 554 | 0 555 | 2 556 | 2 557 | 2 558 | 2 559 | 2 560 | 3 561 | 0 562 | 2 563 | 3 564 | 4 565 | 1 566 | 0 567 | 3 568 | 1 569 | 1 570 | 2 571 | 3 572 | 2 573 | 0 574 | 0 575 | 3 576 | 1 577 | 2 578 | 4 579 | 2 580 | 0 581 | 4 582 | 2 583 | 2 584 | 1 585 | 4 586 | 1 587 | 4 588 | 0 589 | 1 590 | 4 591 | 4 592 | 0 593 | 3 594 | 2 595 | 3 596 | 3 597 | 1 598 | 1 599 | 3 600 | 2 601 | 3 602 | 0 603 | 1 604 | 0 605 | 4 606 | 0 607 | 3 608 | 1 609 | 2 610 | 4 611 | 1 612 | 4 613 | 1 614 | 1 615 | 1 616 | 1 617 | 4 618 | 2 619 | 2 620 | 4 621 | 2 622 | 1 623 | 4 624 | 3 625 | 4 626 | 0 627 | 0 628 | 3 629 | 4 630 | 0 631 | 1 632 | 3 633 | 3 634 | 4 635 | 3 636 | 2 637 | 2 638 | 1 639 | 0 640 | 4 641 | 4 642 | 0 643 | 1 644 | 2 645 | 1 646 | 1 647 | 2 648 | 0 649 | 3 650 | 4 651 | 2 652 | 2 653 | 1 654 | 3 655 | 2 656 | 4 657 | 0 658 | 3 659 | 0 660 | 0 661 | 1 662 | 3 663 | 2 664 | 2 665 | 0 666 | 4 667 | 1 668 | 1 669 | 0 670 | 1 671 | 2 672 | 0 673 | 4 674 | 4 675 | 0 676 | 1 677 | 2 678 | 2 679 | 1 680 | 4 681 | 3 682 | 0 683 | 4 684 | 1 685 | 4 686 | 1 687 | 0 688 | 0 689 | 3 690 | 0 691 | 2 692 | 3 693 | 0 694 | 4 695 | 4 696 | 2 697 | 4 698 | 3 699 | 1 700 | 0 701 | 4 702 | 2 703 | 1 704 | 2 705 | 1 706 | 1 707 | 3 708 | 1 709 | 0 710 | 1 711 | 0 712 | 4 713 | 3 714 | 3 715 | 1 716 | 4 717 | 1 718 | 0 719 | 1 720 | 4 721 | 0 722 | 0 723 | 0 724 | 3 725 | 2 726 | 4 727 | 2 728 | 0 729 | 1 730 | 1 731 | 2 732 | 1 733 | 2 734 | 3 735 | 2 736 | 2 737 | 4 738 | 2 739 | 1 740 | 4 741 | 0 742 | 0 743 | 4 744 | 4 745 | 0 746 | 4 747 | 3 748 | 0 749 | 2 750 | 4 751 | 1 752 | 0 753 | 4 754 | 0 755 | 2 
756 | 0 757 | 3 758 | 0 759 | 1 760 | 3 761 | 3 762 | 2 763 | 1 764 | 1 765 | 2 766 | 4 767 | 1 768 | 4 769 | 4 770 | 4 771 | 1 772 | 4 773 | 2 774 | 0 775 | 3 776 | 3 777 | 4 778 | 1 779 | 0 780 | 4 781 | 3 782 | 0 783 | 2 784 | 1 785 | 0 786 | 4 787 | 2 788 | 3 789 | 3 790 | 1 791 | 2 792 | 0 793 | 0 794 | 3 795 | 3 796 | 0 797 | 2 798 | 4 799 | 0 800 | 2 801 | 3 802 | 1 803 | 0 804 | 1 805 | 2 806 | 4 807 | 1 808 | 0 809 | 4 810 | 1 811 | 1 812 | 1 813 | 0 814 | 4 815 | 2 816 | 0 817 | 2 818 | 1 819 | 3 820 | 0 821 | 0 822 | 3 823 | 1 824 | 0 825 | 3 826 | 2 827 | 3 828 | 1 829 | 3 830 | 4 831 | 3 832 | 4 833 | 3 834 | 3 835 | 2 836 | 0 837 | 3 838 | 0 839 | 0 840 | 0 841 | 0 842 | 0 843 | 3 844 | 4 845 | 2 846 | 0 847 | 3 848 | 2 849 | 1 850 | 1 851 | 0 852 | 4 853 | 2 854 | 0 855 | 3 856 | 1 857 | 4 858 | 0 859 | 4 860 | 0 861 | 3 862 | 4 863 | 3 864 | 2 865 | 2 866 | 4 867 | 4 868 | 3 869 | 3 870 | 1 871 | 2 872 | 4 873 | 0 874 | 2 875 | 4 876 | 2 877 | 0 878 | 3 879 | 2 880 | 3 881 | 3 882 | 2 883 | 2 884 | 2 885 | 2 886 | 3 887 | 2 888 | 2 889 | 0 890 | 0 891 | 2 892 | 4 893 | 2 894 | 0 895 | 0 896 | 4 897 | 0 898 | 2 899 | 4 900 | 3 901 | 1 902 | 2 903 | 2 904 | 0 905 | 1 906 | 3 907 | 2 908 | 2 909 | 3 910 | 0 911 | 3 912 | 4 913 | 4 914 | 4 915 | 0 916 | 3 917 | 1 918 | 0 919 | 2 920 | 3 921 | 3 922 | 4 923 | 4 924 | 0 925 | 4 926 | 0 927 | 3 928 | 1 929 | 1 930 | 4 931 | 1 932 | 4 933 | 3 934 | 4 935 | 3 936 | 1 937 | 2 938 | 0 939 | 3 940 | 1 941 | 1 942 | 0 943 | 4 944 | 1 945 | 1 946 | 1 947 | 4 948 | 0 949 | 4 950 | 1 951 | 0 952 | 1 953 | 0 954 | 0 955 | 1 956 | 1 957 | 1 958 | 3 959 | 3 960 | 0 961 | 4 962 | 3 963 | 2 964 | 1 965 | 0 966 | 2 967 | 3 968 | 3 969 | 2 970 | 3 971 | 4 972 | 1 973 | 1 974 | 3 975 | 1 976 | 4 977 | 0 978 | 4 979 | 0 980 | 4 981 | 2 982 | 1 983 | 1 984 | 0 985 | 1 986 | 0 987 | 3 988 | 1 989 | 2 990 | 0 991 | 1 992 | 3 993 | 1 994 | 3 995 | 0 996 | 2 997 | 1 998 | 1 999 | 4 1000 | 4 1001 | 1 1002 | 0 1003 | 1 1004 | 2 1005 | 3 1006 | 2 1007 | 3 1008 | 4 1009 | 0 1010 | 4 1011 | 1 1012 | 1 1013 | 0 1014 | 0 1015 | 1 1016 | 4 1017 | 3 1018 | 3 1019 | 3 1020 | 4 1021 | 1 1022 | 0 1023 | 2 1024 | 4 1025 | 4 1026 | 1 1027 | 0 1028 | 4 1029 | 4 1030 | 1 1031 | 4 1032 | 3 1033 | 1 1034 | 2 1035 | 3 1036 | 1 1037 | 4 1038 | 1 1039 | 0 1040 | 2 1041 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/CorpusUnrollsReader.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #ifndef __DependencyTreeRNN____corpus__ 8 | #define __DependencyTreeRNN____corpus__ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | /** 19 | * Basic unit of a text: a token 20 | */ 21 | struct Token { 22 | int pos; 23 | int wordAsContext; 24 | int wordAsTarget; 25 | double discount; 26 | int label; 27 | }; 28 | 29 | /** 30 | * Sentence unroll: a vector of tokens 31 | */ 32 | typedef std::vector Unroll; 33 | 34 | /** 35 | * Sentence: a vector of unrolls 36 | */ 37 | typedef std::vector Sentence; 38 | 39 | 40 | /** 41 | * Book: a class containing a vector of sentences 42 | */ 43 | class BookUnrolls { 44 | public: 45 | 46 | /** 47 | * Constructor and destructor 48 | */ 49 | BookUnrolls() { Burn(); } 50 | ~BookUnrolls() { } 51 | 52 | /** 53 | * Wipe-out all 
content of the book
54 |    */
55 |   void Burn() {
56 |     _sentences.clear();
57 |     _numUnrollsInSentence.clear();
58 |     _numTokensInUnrollSentence.clear();
59 |     _numSentences = 0;
60 |     _sentenceIndex = 0;
61 |     _unrollIndex = 0;
62 |     _tokenIndex = 0;
63 |     _numTokens = 0;
64 |   }
65 | 
66 |   /**
67 |    * Add a token to the book
68 |    */
69 |   void AddToken(bool new_sentence, bool new_unroll,
70 |                 int pos, int wordAsContext, int wordAsTarget,
71 |                 double discount, int label);
72 | 
73 |   /**
74 |    * Return the number of sentences
75 |    */
76 |   int NumSentences() { return _numSentences; }
77 | 
78 |   /**
79 |    * Return the number of unrolls in sentence
80 |    */
81 |   int NumUnrolls(int k) { return _numUnrollsInSentence[k]; }
82 | 
83 |   /**
84 |    * Return the number of tokens in unroll of a sentence
85 |    */
86 |   int NumTokens(int k, int j) { return _numTokensInUnrollSentence[k][j]; }
87 | 
88 |   /**
89 |    * Return the index of the current sentence
90 |    */
91 |   int CurrentSentenceIndex() { return _sentenceIndex; }
92 | 
93 |   /**
94 |    * Return the index of the current unroll
95 |    */
96 |   int CurrentUnrollIndex() { return _unrollIndex; }
97 | 
98 |   /**
99 |    * Go to a specific sentence
100 |    */
101 |   bool GoToSentence(int n);
102 | 
103 |   /**
104 |    * Go to the next sentence
105 |    */
106 |   int NextSentence();
107 | 
108 |   /**
109 |    * Go to the next unroll in the sentence
110 |    */
111 |   int NextUnrollInSentence();
112 | 
113 |   /**
114 |    * Go to the next token in the current unroll.
115 |    * Here, we do not loop over but stop (return -1)
116 |    * when the end of the unroll is reached.
117 |    */
118 |   int NextTokenInUnroll();
119 | 
120 |   /**
121 |    * Update the current token
122 |    */
123 |   void UpdateCurrentToken() {
124 |     _currentToken =
125 |       &(_sentences[_sentenceIndex][_unrollIndex][_tokenIndex]);
126 |   }
127 | 
128 |   /**
129 |    * Accessors to the current token's information
130 |    */
131 |   int CurrentTokenNumberInSentence() { return _currentToken->pos; }
132 |   double CurrentTokenDiscount() { return _currentToken->discount; }
133 |   int CurrentTokenWordAsContext() { return _currentToken->wordAsContext; }
134 |   int CurrentTokenWordAsTarget() { return _currentToken->wordAsTarget; }
135 |   int CurrentTokenLabel() { return _currentToken->label; }
136 | 
137 |   /**
138 |    * Reset to the first sentence of the book
139 |    */
140 |   void ResetSentence() {
141 |     _sentenceIndex = 0;
142 |     // Recursively reset the unroll of that first sentence
143 |     ResetUnroll();
144 |   }
145 | 
146 |   /**
147 |    * Reset the unroll in the current sentence
148 |    */
149 |   void ResetUnroll() {
150 |     _unrollIndex = 0;
151 |     // Recursively reset the token of that first unroll
152 |     ResetToken();
153 |   }
154 | 
155 |   /**
156 |    * Reset the token in the current sentence and unroll
157 |    */
158 |   void ResetToken() {
159 |     _tokenIndex = 0;
160 |     UpdateCurrentToken();
161 |   }
162 | 
163 |   /**
164 |    * Number of tokens
165 |    */
166 |   long NumTokens() { return _numTokens; }
167 | 
168 | protected:
169 | 
170 |   // All the sentences of the book
171 |   std::vector<Sentence> _sentences;
172 | 
173 |   // Pointer to the current token
174 |   Token *_currentToken;
175 | 
176 |   // Current sentence, unroll and token index
177 |   int _sentenceIndex;
178 |   int _unrollIndex;
179 |   int _tokenIndex;
180 | 
181 |   // Number of sentences
182 |   int _numSentences;
183 | 
184 |   // Number of unrolls in each sentence
185 |   std::vector<int> _numUnrollsInSentence;
186 | 
187 |   // Number of tokens in each unroll and sentence
188 |   std::vector<std::vector<int> > _numTokensInUnrollSentence;
189 | 
190 |   // Total number of tokens
191 |   long _numTokens;
192 | };
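// Example (illustrative sketch, not part of the original header): walk every
// token of a book with the iterator-style API above, assuming that
// NextTokenInUnroll() returns -1 once the current unroll is exhausted.
static inline void ExampleWalkBook(BookUnrolls &book) {
  book.ResetSentence();
  for (int s = 0; s < book.NumSentences(); s++) {
    for (int u = 0; u < book.NumUnrolls(s); u++) {
      int token = 0;
      while (token != -1) {
        int word = book.CurrentTokenWordAsContext();  // word used as input
        int label = book.CurrentTokenLabel();         // dependency label
        (void)word; (void)label;                      // consume them here
        token = book.NextTokenInUnroll();             // -1 at end of unroll
      }
      book.NextUnrollInSentence();
    }
    book.NextSentence();
  }
}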
193 | 
194 | 
195 | /**
196 |  * CorpusUnrolls: contains all vocabulary and the list of books
197 |  * but stores only one book at a time
198 |  */
199 | class CorpusUnrolls {
200 | public:
201 |   /**
202 |    * Constructor
203 |    */
204 |   CorpusUnrolls() :
205 |   _minWordOccurrence(3),
206 |   _oov(0),
207 |   _vocabSizeWords(0),
208 |   _vocabSizeLabels(0),
209 |   _currentBookIndex(-1) {
210 |     // Insert OOV and EOS tokens
211 |     InsertWord("<unk>", 1.0);
212 |     InsertWord("</s>", 1.0);
213 |     // Insert ROOT label
214 |     InsertLabel("ROOT");
215 |   }
216 | 
217 |   /**
218 |    * Destructor
219 |    */
220 |   ~CorpusUnrolls() { }
221 | 
222 | public:
223 |   /**
224 |    * Number of books
225 |    */
226 |   int NumBooks() { return (int)(_bookFilenames.size()); }
227 | 
228 |   /**
229 |    * Size of the vocabulary
230 |    */
231 |   int NumWords() { return _vocabSizeWords; }
232 | 
233 |   /**
234 |    * Number of labels
235 |    */
236 |   int NumLabels() { return _vocabSizeLabels; }
237 | 
238 |   /**
239 |    * Look-up a word in the vocabulary
240 |    */
241 |   int LookUpWord(const std::string &word);
242 | 
243 |   /**
244 |    * Look-up a label in the vocabulary
245 |    */
246 |   int LookUpLabel(const std::string &label);
247 | 
248 | public:
249 |   /**
250 |    * Set minimum number of word occurrences
251 |    */
252 |   void SetMinWordOccurrence(int val) { _minWordOccurrence = val; }
253 | 
254 |   /**
255 |    * Insert a word into the vocabulary, if new
256 |    */
257 |   int InsertWord(const std::string &word, double discount);
258 | 
259 |   /**
260 |    * Insert a label into the vocabulary, if new
261 |    */
262 |   int InsertLabel(const std::string &label);
263 | 
264 |   /**
265 |    * Read vocabulary from all books and return the number of tokens
266 |    */
267 |   long ReadVocabulary(bool mergeLabel);
268 | 
269 |   /**
270 |    * Filter and sort the vocabulary from another corpus
271 |    */
272 |   void FilterSortVocabulary(CorpusUnrolls &other);
273 | 
274 |   /**
275 |    * Copy the vocabulary from another corpus
276 |    */
277 |   void CopyVocabulary(CorpusUnrolls &other);
278 | 
279 |   /**
280 |    * Export the vocabulary to a text file
281 |    */
282 |   void ExportVocabulary(const std::string &filename);
283 | 
284 |   /**
285 |    * Import the vocabulary from a text file
286 |    */
287 |   void ImportVocabulary(const std::string &filename);
288 | 
289 |   /**
290 |    * Add a book
291 |    */
292 |   void AddBookFilename(const std::string &filename) {
293 |     _bookFilenames.push_back(filename);
294 |     NextBook();
295 |   }
296 | 
297 |   /**
298 |    * Go to next book
299 |    */
300 |   int NextBook() {
301 |     _currentBookIndex++;
302 |     if (_currentBookIndex == NumBooks()) { _currentBookIndex = 0; }
303 |     return _currentBookIndex;
304 |   }
305 | 
306 |   /**
307 |    * Shuffle the order of the books
308 |    */
309 |   void ShuffleBooks() {
310 |     std::random_shuffle(_bookFilenames.begin(), _bookFilenames.end());
311 |   }
312 | 
313 |   /**
314 |    * Read the current book into memory
315 |    */
316 |   void ReadBook(bool mergeLabel);
317 | 
318 | protected:
319 | 
320 |   // Minimum number of word occurrences not to be OOV
321 |   int _minWordOccurrence;
322 | 
323 |   // Out-of-vocabulary token
324 |   int _oov;
325 | 
326 |   // Number of words and labels in the vocabulary
327 |   int _vocabSizeWords;
328 |   int _vocabSizeLabels;
329 | 
330 |   // Current book
331 |   int _currentBookIndex;
332 | 
333 |   // List of books (filenames)
334 |   std::vector<std::string> _bookFilenames;
335 | 
336 | public:
337 | 
338 |   // Vocabulary: map between a string of text and an integer
339 |   std::unordered_map<std::string, int> vocabulary;
340 |   std::unordered_map<int, std::string> vocabularyReverse;
341 | 
342 |   // Discounted word counts
343 |   std::unordered_map<std::string, double> wordCountsDiscounted;
344 | 
345 |   // Labels: map between a string of text and an integer
346 |   std::unordered_map<std::string, int> labels;
347 |   std::unordered_map<int, std::string> labelsReverse;
348 | 
349 |   // Current book
350 |   BookUnrolls m_currentBook;
351 | };
352 | 
353 | #endif /* defined(__DependencyTreeRNN____corpus__) */
354 | 
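A sketch of the intended corpus workflow (hypothetical driver code; the member functions and signatures are the ones declared above, but the file names and the interpretation of the mergeLabel flag are assumptions):

    #include "CorpusUnrollsReader.h"

    int main() {
      CorpusUnrolls corpus;
      corpus.SetMinWordOccurrence(5);
      corpus.AddBookFilename("books/04TOM10.TXT.json.unrolls.json");
      corpus.AddBookFilename("books/AGENT10.TXT.json.unrolls.json");
      // Scan all books to build the word and label vocabularies
      long numTokens = corpus.ReadVocabulary(false);
      corpus.ExportVocabulary("models/vocabulary.txt");
      // Load the current book into corpus.m_currentBook
      corpus.ReadBook(false);
      return numTokens > 0 ? 0 : 1;
    }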
--------------------------------------------------------------------------------
/DependencyTreeRNN++/RnnState.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2014-2015 Piotr Mirowski
2 | //
3 | // Piotr Mirowski, Andreas Vlachos
4 | // "Dependency Recurrent Neural Language Models for Sentence Completion"
5 | // ACL 2015
6 | 
7 | // Based on code by Geoffrey Zweig and Tomas Mikolov
8 | // for the Feature-Augmented RNN Tool Kit
9 | // http://research.microsoft.com/en-us/projects/rnn/
10 | 
11 | /*
12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code").
13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code,
14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you
15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication,
16 | estoppel or otherwise.
17 | 
18 | RNNLM 0.3e by Tomas Mikolov
19 | 
20 | Provided for Informational Purposes Only
21 | 
22 | BSD License
23 | All rights reserved.
24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
26 | 
27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other
28 | materials provided with the distribution.
29 | 
30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */
36 | 
37 | #ifndef DependencyTreeRNN___RnnState_h
38 | #define DependencyTreeRNN___RnnState_h
39 | 
40 | #include <algorithm>
41 | #include <vector>
42 | 
43 | 
44 | /**
45 |  * Max n-gram order, used for word history and direct connections
46 |  * from the word history to the word output
47 |  */
48 | const int c_maxNGramOrder = 20;
49 | 
50 | 
51 | /**
52 |  * State vectors in the RNN model, storing per-word and per-class activations
53 |  */
54 | class RnnState {
55 | public:
56 | 
57 |   /**
58 |    * Constructor
59 |    */
60 |   RnnState(int sizeVocabulary,
61 |            int sizeHidden,
62 |            int sizeFeature,
63 |            int sizeClasses,
64 |            int sizeCompress,
65 |            long long sizeDirectConnection,
66 |            int orderDirectConnection)
67 |   : m_orderDirectConnection(orderDirectConnection) {
68 |     int sizeInput = sizeVocabulary;
69 |     int sizeOutput = sizeVocabulary + sizeClasses;
70 |     WordHistory.assign(c_maxNGramOrder, 0);
71 |     InputLayer.assign(sizeInput, 0.0);
72 |     InputGradient.assign(sizeInput, 0.0);
73 |     RecurrentLayer.assign(sizeHidden, 0.0);
74 |     RecurrentGradient.assign(sizeHidden, 0.0);
75 |     HiddenLayer.assign(sizeHidden, 0.0);
76 |     HiddenGradient.assign(sizeHidden, 0.0);
77 |     FeatureLayer.assign(sizeFeature, 0.0);
78 |     FeatureGradient.assign(sizeFeature, 0.0);
79 |     OutputLayer.assign(sizeOutput, 0.0);
80 |     OutputGradient.assign(sizeOutput, 0.0);
81 |     CompressLayer.assign(sizeCompress, 0.0);
82 |     CompressGradient.assign(sizeCompress, 0.0);
83 |   }
84 | 
85 |   // Input layer (i.e., words)
86 |   std::vector<double> InputLayer;
87 |   // Input feature layer (e.g., topics)
88 |   std::vector<double> FeatureLayer;
89 |   // Hidden layer at previous time step
90 |   std::vector<double> RecurrentLayer;
91 |   // Hidden layer
92 |   std::vector<double> HiddenLayer;
93 |   // Second (compression) hidden layer
94 |   std::vector<double> CompressLayer;
95 |   // Output layer
96 |   std::vector<double> OutputLayer;
97 | 
98 |   // Gradient to the words in input layer
99 |   std::vector<double> InputGradient;
100 |   // Gradient to the features in input layer
101 |   std::vector<double> FeatureGradient;
102 |   // Gradient to the hidden state at previous time step
103 |   std::vector<double> RecurrentGradient;
104 |   // Gradient to the hidden layer
105 |   std::vector<double> HiddenGradient;
106 |   // Gradient to the second (compression) hidden layer
107 |   std::vector<double> CompressGradient;
108 |   // Gradient to the output layer
109 |   std::vector<double> OutputGradient;
110 | 
111 |   // Word history
112 |   std::vector<int> WordHistory;
113 | 
114 | 
115 |   /**
116 |    * Return the number of units in the input (word) layer.
117 |    */
118 |   int GetInputSize() const {
119 |     return static_cast<int>(InputLayer.size());
120 |   }
121 | 
122 | 
123 |   /**
124 |    * Return the number of units in the hidden layer.
125 |    */
126 |   int GetHiddenSize() const {
127 |     return static_cast<int>(HiddenLayer.size());
128 |   }
129 | 
130 | 
131 |   /**
132 |    * Return the number of units in the optional hidden compression layer.
133 |    */
134 |   int GetCompressSize() const {
135 |     return static_cast<int>(CompressLayer.size());
136 |   }
137 | 
138 | 
139 |   /**
140 |    * Return the number of units in the feature (e.g., topic) layer.
141 |    */
142 |   int GetFeatureSize() const {
143 |     return static_cast<int>(FeatureLayer.size());
144 |   }
145 | 
146 | 
147 |   /**
148 |    * Return the number of units in the output layer.
149 |    */
150 |   int GetOutputSize() const {
151 |     return static_cast<int>(OutputLayer.size());
152 |   }
153 | 
154 | 
155 |   /**
156 |    * Return the n-gram order of the direct input->output connections.
157 |    */
158 |   int GetOrderDirectConnection() const { return m_orderDirectConnection; }
159 | 
160 | protected:
161 |   int m_orderDirectConnection;
162 | };
163 | 
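// Example (illustrative, not from the original header): state buffers for a
// 10k-word vocabulary, 200 hidden units, no input features, 250 word classes,
// no compression layer, and 4-gram direct connections hashed into 2M weights.
static inline RnnState ExampleMakeState() {
  return RnnState(10000, 200, 0, 250, 0, 2000000, 4);
}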
164 | 
165 | class RnnBptt {
166 | public:
167 | 
168 |   /**
169 |    * Constructor
170 |    */
171 |   RnnBptt(int sizeVocabulary, int sizeHidden, int sizeFeature,
172 |           int numBpttSteps, int bpttBlockSize)
173 |   : m_bpttSteps(numBpttSteps), m_bpttBlock(bpttBlockSize),
174 |   m_sizeHidden(sizeHidden), m_sizeFeature(sizeFeature),
175 |   m_steps(0) {
176 |     Reset();
177 |     WeightsInput2Hidden.assign(sizeVocabulary * sizeHidden, 0);
178 |     WeightsRecurrent2Hidden.assign(sizeHidden * sizeHidden, 0);
179 |     WeightsFeature2Hidden.assign(sizeFeature * sizeHidden, 0);
180 |   }
181 | 
182 | 
183 |   /**
184 |    * Number of BPTT steps that can be considered
185 |    */
186 |   int NumSteps() { return m_steps; }
187 | 
188 | 
189 |   /**
190 |    * Reset the BPTT memory
191 |    */
192 |   void Reset() {
193 |     m_steps = 0;
194 |     History.assign(m_bpttSteps + m_bpttBlock + 10, -1);
195 |     FeatureLayer.assign((m_bpttSteps + m_bpttBlock + 2) * m_sizeFeature, 0);
196 |     HiddenLayer.assign((m_bpttSteps + m_bpttBlock + 1) * m_sizeHidden, 0);
197 |     HiddenGradient.assign((m_bpttSteps + m_bpttBlock + 1) * m_sizeHidden, 0);
198 |   }
199 | 
200 | 
201 |   /**
202 |    * Shift the BPTT memory by one
203 |    */
204 |   void Shift(int lastWord) {
205 |     if (m_bpttSteps > 0) {
206 |       // Shift the history of words
207 |       for (int a = m_bpttSteps + m_bpttBlock - 1; a > 0; a--) {
208 |         History[a] = History[a - 1];
209 |       }
210 |       History[0] = lastWord;
211 | 
212 |       // Shift the history of hidden layer activations
213 |       for (int a = m_bpttSteps + m_bpttBlock - 1; a > 0; a--) {
214 |         for (int b = 0; b < m_sizeHidden; b++) {
215 |           HiddenLayer[a * m_sizeHidden + b] =
216 |             HiddenLayer[(a - 1) * m_sizeHidden + b];
217 |           HiddenGradient[a * m_sizeHidden + b] =
218 |             HiddenGradient[(a - 1) * m_sizeHidden + b];
219 |         }
220 |       }
221 | 
222 |       // Shift the history of feature activations
223 |       for (int a = m_bpttSteps + m_bpttBlock - 1; a > 0; a--) {
224 |         for (int b = 0; b < m_sizeFeature; b++) {
225 |           FeatureLayer[a * m_sizeFeature + b] =
226 |             FeatureLayer[(a - 1) * m_sizeFeature + b];
227 |         }
228 |       }
229 |     }
230 |     // Keep track of the number of steps that can be considered for BPTT
231 |     m_steps++;
232 |     m_steps = std::min(m_steps, m_bpttSteps + m_bpttBlock);
233 |   }
234 | 
235 | 
236 |   // Word history
237 |   std::vector<int> History;
238 |   // History of feature inputs
239 |   std::vector<double> FeatureLayer;
240 |   // History of hidden layer inputs
241 |   std::vector<double> HiddenLayer;
242 |   // History of gradients to the hidden layer
243 |   std::vector<double> HiddenGradient;
244 |   // Gradients to the weights, to be added to the SGD gradients
245 |   std::vector<double> WeightsInput2Hidden;
246 |   std::vector<double> WeightsRecurrent2Hidden;
247 |   std::vector<double> WeightsFeature2Hidden;
248 | 
249 | 
250 | protected:
251 |   // Number of steps gradients are back-propagated through time
252 |   int m_bpttSteps;
253 |   // How many steps (words) do we wait between consecutive BPTT?
254 |   int m_bpttBlock;
255 |   // How many steps have been stored since the last reset?
256 | int m_steps; 257 | // Number of hidden nodes 258 | int m_sizeHidden; 259 | // Number of features 260 | int m_sizeFeature; 261 | }; 262 | 263 | #endif 264 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnWeights.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 | */
36 | 
37 | #include <cassert>
38 | #include <cstdio>
39 | #include <iostream>
40 | #include <string>
41 | #include <vector>
42 | #include "Utils.h"
43 | #include "RnnWeights.h"
44 | 
45 | using namespace std;
46 | 
47 | /**
48 |  * Constructor
49 |  */
50 | RnnWeights::RnnWeights(int sizeVocabulary,
51 |                        int sizeHidden,
52 |                        int sizeFeature,
53 |                        int sizeClasses,
54 |                        int sizeCompress,
55 |                        long long sizeDirectConnection)
56 | : m_sizeVocabulary(sizeVocabulary),
57 |   m_sizeHidden(sizeHidden),
58 |   m_sizeFeature(sizeFeature),
59 |   m_sizeClasses(sizeClasses),
60 |   m_sizeCompress(sizeCompress),
61 |   m_sizeDirectConnection(sizeDirectConnection),
62 |   m_sizeInput(sizeVocabulary),
63 |   m_sizeOutput(sizeVocabulary + sizeClasses) {
64 | 
65 |   // Sanity check
66 |   assert(sizeClasses <= sizeVocabulary);
67 |   cout << "RnnWeights: allocate " << m_sizeInput << " inputs ("
68 |        << sizeVocabulary << " words), "
69 |        << m_sizeClasses << " classes, "
70 |        << m_sizeHidden << " hiddens, "
71 |        << m_sizeFeature << " features, "
72 |        << m_sizeCompress << " compressed, "
73 |        << m_sizeDirectConnection << " n-grams\n";
74 | 
75 |   // Allocate the weights connecting those layers
76 |   // (will be assigned random values later)
77 |   Input2Hidden.resize(m_sizeInput * m_sizeHidden);
78 |   Recurrent2Hidden.resize(m_sizeHidden * m_sizeHidden);
79 |   Features2Hidden.resize(m_sizeFeature * m_sizeHidden);
80 |   Features2Output.resize(m_sizeFeature * m_sizeOutput);
81 |   if (sizeCompress == 0) {
82 |     Hidden2Output.resize(m_sizeHidden * m_sizeOutput);
83 |   } else {
84 |     // Add a compression layer between hidden nodes and outputs
85 |     Hidden2Output.resize(m_sizeHidden * m_sizeCompress);
86 |     Compress2Output.resize(m_sizeCompress * m_sizeOutput);
87 |   }
88 |   // TODO: change that to a proper normal distribution
89 |   // http://en.cppreference.com/w/cpp/numeric/random/normal_distribution
90 |   RandomizeVector(Input2Hidden);
91 |   RandomizeVector(Recurrent2Hidden);
92 |   if (sizeFeature > 0) {
93 |     RandomizeVector(Features2Hidden);
94 |     RandomizeVector(Features2Output);
95 |   }
96 |   if (sizeCompress > 0) {
97 |     RandomizeVector(Compress2Output);
98 |   }
99 |   RandomizeVector(Hidden2Output);
100 | 
101 |   // Initialize the direct n-gram connections
102 |   DirectNGram.assign(m_sizeDirectConnection, 0.0);
103 | } // RnnWeights()
104 | 
105 | 
106 | /**
107 |  * Clear all the weights (before loading a new copy), to save memory
108 |  */
109 | void RnnWeights::Clear() {
110 |   Input2Hidden.clear();
111 |   Recurrent2Hidden.clear();
112 |   Features2Hidden.clear();
113 |   Features2Output.clear();
114 |   if (m_sizeCompress == 0) {
115 |     Hidden2Output.clear();
116 |   } else {
117 |     Hidden2Output.clear();
118 |     Compress2Output.clear();
119 |   }
120 |   DirectNGram.clear();
121 | }
122 | 
123 | 
124 | /**
125 |  * Load the weights matrices from a file
126 |  */
127 | void RnnWeights::Load(FILE *fi) {
128 |   // Read the weights of input -> hidden connections
129 |   Log("Reading " + ConvString(m_sizeHidden) +
130 |       "x" + ConvString(m_sizeInput) + " input->hidden weights...\n");
131 |   ReadBinaryMatrix(fi, m_sizeInput, m_sizeHidden, Input2Hidden);
132 |   // Read the weights of recurrent hidden -> hidden connections
133 |   Log("Reading " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeHidden) +
134 |       " recurrent hidden->hidden weights...\n");
135 |   ReadBinaryMatrix(fi, m_sizeHidden, m_sizeHidden, Recurrent2Hidden);
136 |   // Read the weights of feature -> hidden connections
137 |   Log("Reading " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeFeature) +
138 |       " feature->hidden weights...\n");
139 |   ReadBinaryMatrix(fi,
m_sizeFeature, m_sizeHidden, Features2Hidden); 140 | // Read the weights of feature -> output connections 141 | Log("Reading " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeFeature) + 142 | " feature->output weights...\n"); 143 | ReadBinaryMatrix(fi, m_sizeFeature, m_sizeOutput, Features2Output); 144 | if (m_sizeCompress == 0) { 145 | // Read the weights of hidden -> output connections 146 | Log("Reading " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeHidden) + 147 | " hidden->output weights...\n"); 148 | ReadBinaryMatrix(fi, m_sizeHidden, m_sizeOutput, Hidden2Output); 149 | } else { 150 | // Read the weights of hidden -> compression connections 151 | Log("Reading " + ConvString(m_sizeCompress) + "x" + ConvString(m_sizeHidden) + 152 | " hidden->compress weights...\n"); 153 | ReadBinaryMatrix(fi, m_sizeHidden, m_sizeCompress, Hidden2Output); 154 | // Read the weights of compression -> output connections 155 | Log("Reading " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeCompress) + 156 | " compress->output weights...\n"); 157 | ReadBinaryMatrix(fi, m_sizeCompress, m_sizeOutput, Compress2Output); 158 | } 159 | if (m_sizeDirectConnection > 0) { 160 | Log("Reading " + ConvString(m_sizeDirectConnection) + 161 | " n-gram connections...\n"); 162 | // Read the direct connections 163 | ReadBinaryVector(fi, m_sizeDirectConnection, DirectNGram); 164 | } 165 | } // void Load() 166 | 167 | 168 | /** 169 | * Save the weights matrices to a file 170 | */ 171 | void RnnWeights::Save(FILE *fo) { 172 | string logFilename = "log_saving.txt"; 173 | // Save the weights U: input -> hidden (i.e., the word embeddings) 174 | Log("Saving " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeInput) + 175 | " input->hidden weights...\n", logFilename); 176 | SaveBinaryMatrix(fo, m_sizeInput, m_sizeHidden, Input2Hidden); 177 | // Save the weights W: recurrent hidden -> hidden (i.e., the time-delay) 178 | Log("Saving " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeHidden) + 179 | " recurrent hidden->hidden weights...\n", logFilename); 180 | SaveBinaryMatrix(fo, m_sizeHidden, m_sizeHidden, Recurrent2Hidden); 181 | // Save the weights feature -> hidden 182 | Log("Saving " + ConvString(m_sizeHidden) + "x" + ConvString(m_sizeFeature) + 183 | " feature->hidden weights...\n", logFilename); 184 | SaveBinaryMatrix(fo, m_sizeFeature, m_sizeHidden, Features2Hidden); 185 | // Save the weights G: feature -> output 186 | Log("Saving " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeFeature) + 187 | " feature->output weights...\n", logFilename); 188 | SaveBinaryMatrix(fo, m_sizeFeature, m_sizeOutput, Features2Output); 189 | // Save the weights hidden -> compress and compress -> output 190 | // or simply the weights V: hidden -> output 191 | if (m_sizeCompress > 0) { 192 | Log("Saving " + ConvString(m_sizeCompress) + "x" + ConvString(m_sizeHidden) + 193 | " hidden->compress weights...\n", logFilename); 194 | SaveBinaryMatrix(fo, m_sizeHidden, m_sizeCompress, Hidden2Output); 195 | Log("Saving " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeCompress) + 196 | " compress->output weights...\n", logFilename); 197 | SaveBinaryMatrix(fo, m_sizeCompress, m_sizeOutput, Compress2Output); 198 | } else { 199 | Log("Saving " + ConvString(m_sizeOutput) + "x" + ConvString(m_sizeHidden) + 200 | " hidden->output weights...\n", logFilename); 201 | SaveBinaryMatrix(fo, m_sizeHidden, m_sizeOutput, Hidden2Output); 202 | } 203 | if (m_sizeDirectConnection > 0) { 204 | // Save the direct connections 205 | Log("Saving " + 
ConvString(m_sizeDirectConnection) + 206 | " n-gram connections...\n", logFilename); 207 | for (long long aa = 0; aa < m_sizeDirectConnection; aa++) { 208 | float fl = (float)(DirectNGram[aa]); 209 | fwrite(&fl, 4, 1, fo); 210 | } 211 | } 212 | } // void Save() 213 | 214 | 215 | /** 216 | * Debug function 217 | */ 218 | void RnnWeights::Debug() { 219 | Log("input2hidden: " + ConvString(m_sizeInput) + "x" + 220 | ConvString(m_sizeHidden) + " " + 221 | ConvString(Input2Hidden[(m_sizeInput-1)*(m_sizeHidden-1)]) + "\n"); 222 | Log("recurrent2hidden: " + ConvString(m_sizeHidden) + "x" + 223 | ConvString(m_sizeHidden) + " " + 224 | ConvString(Recurrent2Hidden[(m_sizeHidden-1)*(m_sizeHidden-1)]) + "\n"); 225 | Log("hidden2output: " + ConvString(m_sizeHidden) + "x" + 226 | ConvString(m_sizeOutput) + " " + 227 | ConvString(Hidden2Output[(m_sizeOutput-1)*(m_sizeHidden-1)]) + "\n"); 228 | if (m_sizeFeature > 0) { 229 | Log("features2hidden: " + ConvString(m_sizeFeature) + "x" + 230 | ConvString(m_sizeHidden) + " " + 231 | ConvString(Features2Hidden[(m_sizeFeature-1)*(m_sizeHidden-1)]) + "\n"); 232 | Log("features2output: " + ConvString(m_sizeFeature) + "x" + 233 | ConvString(m_sizeOutput) + " " + 234 | ConvString(Features2Output[(m_sizeFeature-1)*(m_sizeOutput-1)]) + "\n"); 235 | } 236 | if (m_sizeDirectConnection > 0) 237 | Log("direct: " + ConvString(m_sizeDirectConnection) + " " + 238 | ConvString(DirectNGram[m_sizeDirectConnection-1]) + "\n"); 239 | } // void Debug() 240 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnTraining.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 
29 | 
30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 | */
36 | 
37 | #ifndef __DependencyTreeRNN____RnnTraining__
38 | #define __DependencyTreeRNN____RnnTraining__
39 | 
40 | #include <cstdio>
41 | #include <iostream>
42 | #include <string>
43 | #include <vector>
44 | #include "CorpusWordReader.h"
45 | #include "Utils.h"
46 | #include "RnnLib.h"
47 | #include "RnnState.h"
48 | 
49 | 
50 | /**
51 |  * Main class for training and testing the RNN model;
52 |  * not meant to run in a production online environment
53 |  * (not thread-safe).
54 |  */
55 | class RnnLMTraining : public RnnLM {
56 | public:
57 | 
58 |   /**
59 |    * Constructor for training the model
60 |    */
61 |   RnnLMTraining(const std::string &filename, bool doLoadModel, bool debugMode)
62 |   // We load the RNN or not, depending on whether the model file is present;
63 |   // otherwise simply set its filename
64 |   : RnnLM(filename, doLoadModel),
65 |   m_debugMode(debugMode),
66 |   m_wordCounter(0),
67 |   m_minWordOccurrences(5),
68 |   m_oov(1),
69 |   m_eof(-2),
70 |   m_fileCorrectSentenceLabels("") {
71 |     Log("RnnLMTraining: debug mode is " + ConvString(debugMode) + "\n");
72 |   }
73 | 
74 |   void SetTrainFile(const std::string &str) { m_trainFile = str; }
75 | 
76 |   void SetValidFile(const std::string &str) { m_validationFile = str; }
77 | 
78 |   void SetSentenceLabelsFile(const std::string &str) {
79 |     m_fileCorrectSentenceLabels = str;
80 |   }
81 | 
82 |   void SetFeatureTrainOrTestFile(const std::string &str) {
83 |     m_featureFile = str;
84 |   }
85 | 
86 |   void SetFeatureValidationFile(const std::string &str) {
87 |     m_featureValidationFile = str;
88 |   }
89 | 
90 |   void SetFeatureMatrixFile(const std::string &str) {
91 |     m_featureMatrixFile = str;
92 |   }
93 | 
94 |   void SetUnkPenalty(double penalty) { m_logProbabilityPenaltyUnk = penalty; }
95 | 
96 |   void SetGradientCutoff(double newGradient) {
97 |     m_gradientCutoff = newGradient;
98 |   }
99 | 
100 |   void SetIndependent(bool newVal) { m_areSentencesIndependent = newVal; }
101 | 
102 |   void SetLearningRate(double newAlpha) {
103 |     m_learningRate = newAlpha;
104 |     m_initialLearningRate = newAlpha;
105 |   }
106 | 
107 |   void SetRegularization(double newBeta) { m_regularizationRate = newBeta; }
108 | 
109 |   void SetMinImprovement(double newMinImprovement) {
110 |     m_minLogProbaImprovement = newMinImprovement;
111 |   }
112 | 
113 |   /**
114 |    * (Re)set the number of steps of BPTT
115 |    */
116 |   void SetNumStepsBPTT(int val) {
117 |     m_numBpttSteps = val;
118 |     m_bpttVectors = RnnBptt(GetVocabularySize(), GetHiddenSize(),
119 |                             GetFeatureSize(),
120 |                             m_numBpttSteps, m_bpttBlockSize);
121 |   }
122 | 
123 |   /**
124 |    * (Re)set the number of steps/words when BPTT is called
125 |    */
126 |   void SetBPTTBlock(int val) {
127 |     m_bpttBlockSize = val;
128 |     m_bpttVectors = RnnBptt(GetVocabularySize(), GetHiddenSize(),
129 |                             GetFeatureSize(),
130 |                             m_numBpttSteps, m_bpttBlockSize);
131 |   }
132 | 
133 |   void SetDebugMode(bool mode) { m_debugMode = mode; }
134 | 
135 |   void SetFeatureGamma(double val) { m_featureGammaCoeff = val; }
136 | 
137 | public:
138 | 
139 |   /**
140 |    * Main function to train the RNN model
141 |    */
142 |   virtual bool TrainRnnModel();
143 | 
144 |   /**
145 |    * Before learning the RNN model, we need to learn the vocabulary
146 |    * from the corpus. Note that the word classes may have been initialized
147 |    * beforehand using ReadClasses. Computes the unigram distribution
148 |    * of words from a training file, assuming that the existing vocabulary
149 |    * is empty.
150 |    */
151 |   virtual bool LearnVocabularyFromTrainFile(int numClasses);
152 | 
153 | 
154 |   /**
155 |    * Set the minimum number of word occurrences
156 |    */
157 |   virtual void SetMinWordOccurrence(int val) {
158 |     m_minWordOccurrences = val;
159 |   }
160 | 
161 |   /**
162 |    * Read the classes from a file in the following format:
163 |    * word [TAB] class_index
164 |    * where class index is between 0 and n-1 and there are n classes.
165 |    */
166 |   bool ReadClasses(const std::string &filename) {
167 |     m_usesClassFile = m_vocab.ReadClasses(filename);
168 |     return m_usesClassFile;
169 |   }
170 | 
171 |   /**
172 |    * Once the RNN model is trained, it can be saved to a text or binary file
173 |    */
174 |   bool SaveRnnModelToFile();
175 | 
176 |   /**
177 |    * Simply write the word projections/embeddings to a text file.
178 |    */
179 |   void SaveWordEmbeddings(const std::string &filename);
180 | 
181 |   /**
182 |    * Main function to test the RNN model
183 |    */
184 |   virtual bool TestRnnModel(const std::string &testFile,
185 |                             const std::string &featureFile,
186 |                             std::vector<double> &sentenceScores,
187 |                             double &logProbability,
188 |                             double &perplexity,
189 |                             double &entropy,
190 |                             double &accuracy);
191 | 
192 |   /**
193 |    * Load a file containing the classification labels
194 |    */
195 |   void LoadCorrectSentenceLabels(const std::string &labelFile);
196 | 
197 | protected:
198 | 
199 |   /**
200 |    * Get the next token (word or multi-word entity) from a text file
201 |    * and return it as an integer in the vocabulary vector.
202 |    * Returns -1 for OOV words and -2 for end of file.
203 |    */
204 |   int ReadWordIndexFromFile(WordReader &reader);
205 | 
206 |   /**
207 |    * Sort the vocabulary by decreasing count of words in the corpus
208 |    * (used for frequency-based word classes, where class 0 contains
209 |    * </s>, class 1 contains {the} or another most frequent token,
210 |    * class 2 contains a few very frequent tokens, etc...)
211 |    */
212 |   void SortVocabularyByFrequency();
213 | 
214 |   /**
215 |    * Sort the words by class, in increasing class order
216 |    * (used when the classes are provided by an external tool,
217 |    * e.g., based on maximum entropy features on word bigrams)
218 |    */
219 |   void SortVocabularyByClass();
220 | 
221 |   /**
222 |    * One step of backpropagation of the errors through the RNN
223 |    * (optionally, backpropagation through time, BPTT) and of gradient descent.
224 | */ 225 | void BackPropagateErrorsThenOneStepGradientDescent(int last_word, int word); 226 | 227 | /** 228 | * Read the feature vector for the current word 229 | * in the train/test/valid file and update the feature vector 230 | * in the state. 231 | * TODO: convert to ifstream 232 | */ 233 | bool LoadFeatureVectorAtCurrentWord(FILE *f, RnnState &state); 234 | 235 | /** 236 | * Compute the accuracy of selecting the top candidate (based on score) 237 | * among n-best lists 238 | */ 239 | double AccuracyNBestList(std::vector<double> scores, 240 | std::vector<int> &correctClasses) const; 241 | 242 | /** 243 | * Cleans all activations and error vectors, in the input, hidden, 244 | * compression, feature and output layers, and resets word history 245 | */ 246 | void ResetAllRnnActivations(RnnState &state) const; 247 | 248 | /** 249 | * Matrix-vector multiplication routine, accelerated using BLAS. 250 | * Computes x <- x + A' * y, 251 | * i.e., the "inverse" operation to y = A * x (adding the result to x) 252 | * where A is of size N x M, x is of length M and y is of length N. 253 | * The operation can be done on a contiguous subset of indices 254 | * j in [idxYFrom, idxYTo[ of vector y. 255 | */ 256 | void GradientMatrixXvectorBlas(std::vector<double> &vectorX, 257 | std::vector<double> &vectorY, 258 | std::vector<double> &matrixA, 259 | int widthMatrix, 260 | int idxYFrom, 261 | int idxYTo) const; 262 | 263 | /** 264 | * Matrix-matrix multiplication routine, accelerated using BLAS. 265 | * Computes C <- alpha * A * B + beta * C. 266 | * The operation can be done on a contiguous subset of row indices 267 | * j in [idxRowCFrom, idxRowCTo[ in matrix A and C. 268 | */ 269 | void MultiplyMatrixXmatrixBlas(std::vector<double> &matrixA, 270 | std::vector<double> &matrixB, 271 | std::vector<double> &matrixC, 272 | double alpha, 273 | double beta, 274 | int numRowsA, 275 | int numRowsB, 276 | int numColsC, 277 | int idxRowCFrom, 278 | int idxRowCTo) const; 279 | 280 | /** 281 | * Matrix-matrix or vector-vector addition routine using BLAS. 282 | * Computes Y <- alpha * X + beta * Y. 283 | */ 284 | void AddMatrixToMatrixBlas(std::vector<double> &matrixX, 285 | std::vector<double> &matrixY, 286 | double alpha, 287 | double beta, 288 | int numRows, 289 | int numCols) const; 290 | 291 | protected: 292 | 293 | // Are we in debug mode? 294 | bool m_debugMode; 295 | 296 | // Word counter 297 | long m_wordCounter; 298 | 299 | // Index of the OOV (<unk>) word 300 | int m_oov; 301 | 302 | // Index of the EOF token 303 | int m_eof; 304 | 305 | // Minimum number of word occurrences 306 | int m_minWordOccurrences; 307 | 308 | // Classification labels 309 | std::vector<int> m_correctSentenceLabels; 310 | 311 | // File containing the correct classification labels 312 | std::string m_fileCorrectSentenceLabels; 313 | }; 314 | 315 | #endif /* defined(__DependencyTreeRNN____RnnTraining__) */ 316 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/Vocabulary.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code").
13 | Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | */ 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include "Vocabulary.h" 45 | 46 | 47 | /** 48 | * Constructor that reads the vocabulary and classes from the model file. 
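 * Each vocabulary line holds four whitespace-separated columns,
 * index count word class, e.g. (hypothetical values):
 *   42   1337   holmes   7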
49 | */ 50 | Vocabulary::Vocabulary(FILE *fi, int sizeVocabulary, int numClasses) { 51 | // Read the vocabulary, stored in text format as follows: 52 | // index_number count word_token class_number 53 | // There are tabs and spaces separating the 4 columns 54 | m_vocabularyStorage.resize(sizeVocabulary); 55 | for (int a = 0; a < sizeVocabulary; a++) { 56 | 57 | // Read the word index and the word count 58 | int wordIndex; 59 | int wordCount; 60 | fscanf(fi, "%d%d", &wordIndex, &wordCount); 61 | assert(wordIndex == a); 62 | m_vocabularyStorage[a].cn = wordCount; 63 | m_vocabularyStorage[a].prob = 0; 64 | 65 | // Read the word token 66 | char buffer[2048] = {0}; 67 | if (fscanf(fi, "%s", buffer)) 68 | m_vocabularyStorage[a].word = buffer; 69 | std::string word = m_vocabularyStorage[a].word; 70 | 71 | // Read the class index 72 | int classIndex; 73 | fscanf(fi, "%d", &classIndex); 74 | 75 | // Store the class information 76 | m_vocabularyStorage[a].classIndex = classIndex; 77 | m_mapWord2Class[word] = classIndex; 78 | 79 | // Associate the word (string) to the word token number using two maps 80 | m_mapWord2Index[word] = wordIndex; 81 | m_mapIndex2Word[wordIndex] = word; 82 | } 83 | 84 | // Store which words are in which class, using a vector 85 | // (length number of classes) of vectors (num words in that class) 86 | m_numClasses = numClasses; 87 | StoreClassAssociations(); 88 | 89 | m_useClassFile = false; 90 | } 91 | 92 | 93 | /** 94 | * Save the vocabulary to a model file 95 | */ 96 | void Vocabulary::Save(FILE *fo) { 97 | // Save the vocabulary, one word per line 98 | int sizeVocabulary = GetVocabularySize(); 99 | fprintf(fo, "\nVocabulary:\n"); 100 | for (int wordIndex = 0; wordIndex < sizeVocabulary; wordIndex++) { 101 | int wordCount = m_vocabularyStorage[wordIndex].cn; 102 | std::string word = m_vocabularyStorage[wordIndex].word; 103 | int wordClass = m_vocabularyStorage[wordIndex].classIndex; 104 | fprintf(fo, "%6d\t%10d\t%s\t%d\n", 105 | wordIndex, wordCount, word.c_str(), wordClass); 106 | } 107 | } 108 | 109 | 110 | 111 | /** 112 | * Add a token (word or multi-word entity) to the vocabulary vector 113 | * and store it in the map from word string to word index 114 | * and in the map from word index to word string. 115 | */ 116 | int Vocabulary::AddWordToVocabulary(const std::string& word) 117 | { 118 | int index = SearchWordInVocabulary(word); 119 | // When a word is unknown, add it to the vocabulary 120 | if (index == -1) { 121 | // Initialize the new word with a count of 1 and a probability of 0 122 | VocabWord w = VocabWord(); 123 | w.word = word; 124 | w.prob = 0.0; 125 | w.cn = 1; 126 | index = static_cast<int>(m_vocabularyStorage.size()); 127 | m_vocabularyStorage.push_back(std::move(w)); 128 | // We need to store the word - index pair in the hash table word -> index 129 | // but we will rewrite that map later after sorting the vocabulary by frequency 130 | m_mapWord2Index[word] = index; 131 | m_mapIndex2Word[index] = word; 132 | } else { 133 | // ... otherwise simply increase its count 134 | m_vocabularyStorage[index].cn++; 135 | } 136 | return (index); 137 | } 138 | 139 | 140 | /** 141 | * Manually set the word count.
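 * Hypothetical usage: SetWordCount("holmes", 42) overwrites the stored
 * count and returns true; for a word that is not in the vocabulary,
 * it returns false and adds nothing.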
142 | */ 143 | bool Vocabulary::SetWordCount(std::string word, int count) { 144 | int index = SearchWordInVocabulary(word); 145 | // Update the count only if the word is already in the vocabulary 146 | if (index > -1) { 147 | m_vocabularyStorage[index].cn = count; 148 | return true; 149 | } else 150 | return false; 151 | } 152 | 153 | 154 | /** 155 | * Sort the vocabulary by decreasing count of words in the corpus 156 | * (used for frequency-based word classes, where class 0 contains 157 | * </s>, class 1 contains {the} or another most frequent token, 158 | * class 2 contains a few very frequent tokens, etc.) 159 | */ 160 | bool OrderWordCounts(const VocabWord& a, const VocabWord& b) { 161 | return a.cn > b.cn; 162 | } 163 | void Vocabulary::SortVocabularyByFrequency() { 164 | // Simply sort the words by frequency, making sure that </s> is first 165 | int indexEos = SearchWordInVocabulary("</s>"); 166 | int countEos = m_vocabularyStorage[indexEos].cn; 167 | m_vocabularyStorage[indexEos].cn = INT_MAX; 168 | std::sort(m_vocabularyStorage.begin(), 169 | m_vocabularyStorage.end(), 170 | OrderWordCounts); 171 | m_vocabularyStorage[indexEos].cn = countEos; 172 | 173 | // Rebuild the maps of word <-> word index 174 | m_mapWord2Index.clear(); 175 | m_mapIndex2Word.clear(); 176 | for (int index = 0; index < GetVocabularySize(); index++) { 177 | std::string word = m_vocabularyStorage[index].word; 178 | // Add the word to the hash table word -> index 179 | m_mapWord2Index[word] = index; 180 | // Add the word to the hash table index -> word 181 | m_mapIndex2Word[index] = word; 182 | } 183 | } 184 | 185 | 186 | /** 187 | * Return the index of a word in the vocabulary, or -1 if OOV. 188 | */ 189 | int Vocabulary::SearchWordInVocabulary(const std::string& word) const { 190 | auto i = m_mapWord2Index.find(word); 191 | if (i == m_mapWord2Index.end()) { 192 | return -1; 193 | } else { 194 | return (i->second); 195 | } 196 | } 197 | 198 | 199 | /** 200 | * Read the classes from a file in the following format: 201 | * word [TAB] class_index 202 | * where class index is between 0 and n-1 and there are n classes. 203 | */ 204 | bool Vocabulary::ReadClasses(const std::string &filename) 205 | { 206 | FILE *fin = fopen(filename.c_str(), "r"); 207 | if (!fin) { 208 | printf("Error: unable to open %s\n", filename.c_str()); 209 | return false; 210 | } 211 | 212 | char w[8192]; 213 | int clnum; 214 | int eos_class = -1; 215 | int max_class = -1; 216 | std::set<std::string> words; 217 | while (fscanf(fin, "%s%d", w, &clnum) != EOF) { 218 | if (!strcmp(w, "<s>")) { 219 | printf("Error: <s> should not be in vocab\n"); 220 | return false; 221 | } 222 | 223 | m_mapWord2Class[w] = clnum; 224 | words.insert(w); 225 | 226 | max_class = (clnum > max_class) ? (clnum) : (max_class); 227 | eos_class = (std::string(w) == "</s>") ? (clnum) : (eos_class); 228 | } 229 | 230 | if (eos_class == -1) { 231 | printf("Error: </s> must be present in the vocabulary\n"); 232 | return false; 233 | } 234 | 235 | if (m_mapWord2Class.size() == 0) { 236 | printf("Error: Empty class file!\n"); 237 | return false; 238 | } 239 | 240 | // </s> needs to have the highest class index because it needs to come first in the vocabulary...
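  // Example with hypothetical indices: if the class file assigned </s> to
  // class 3 while the largest class index read was 9, the loop below swaps
  // the two groups, so every word of class 3 becomes class 9 and vice
  // versa, which moves </s> (and any word sharing its class) to the
  // highest class index.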
241 | for (auto si=words.begin(); si!=words.end(); si++) { 242 | if (m_mapWord2Class[*si] == eos_class) { 243 | m_mapWord2Class[*si] = max_class; 244 | } else { 245 | if (m_mapWord2Class[*si] == max_class) { 246 | m_mapWord2Class[*si] = eos_class; 247 | } 248 | } 249 | } 250 | return true; 251 | } 252 | 253 | 254 | 255 | /** 256 | * Assign words in vocabulary to classes (for hierarchical softmax). 257 | */ 258 | void Vocabulary::AssignWordsToClasses() { 259 | int sizeVocabulary = GetVocabularySize(); 260 | if (m_useClassFile) { 261 | // Custom-specified classes, provided in a file, were used 262 | // at training time. There is nothing to do at this point, 263 | // just copy the class index for each word. 264 | int cnum = -1; 265 | int last = -1; 266 | for (int i = 0; i < sizeVocabulary; i++) { 267 | if (m_vocabularyStorage[i].classIndex != last) { 268 | last = m_vocabularyStorage[i].classIndex; 269 | m_vocabularyStorage[i].classIndex = ++cnum; 270 | } else { 271 | m_vocabularyStorage[i].classIndex = cnum; 272 | } 273 | // Unused 274 | m_vocabularyStorage[i].prob = 0.0; 275 | } 276 | } else { 277 | // Frequency-based classes (povey-style) 278 | // Re-assign classes based on the sqrt(word_count / total_word_count) 279 | // so that the classes contain equal weight of word occurrences. 280 | int b = 0; 281 | for (int i = 0; i < sizeVocabulary; i++) { 282 | b += m_vocabularyStorage[i].cn; 283 | } 284 | double dd = 0; 285 | for (int i = 0; i < sizeVocabulary; i++) { 286 | dd += sqrt(m_vocabularyStorage[i].cn/ (double)b); 287 | } 288 | double df = 0; 289 | int a = 0; 290 | for (int i = 0; i < sizeVocabulary; i++) { 291 | df += sqrt(m_vocabularyStorage[i].cn / (double)b)/dd; 292 | if (df > 1) { 293 | df = 1; 294 | } 295 | if (df > (a + 1) / (double)m_numClasses) { 296 | m_vocabularyStorage[i].classIndex = a; 297 | if (a < m_numClasses - 1) { 298 | a++; 299 | } 300 | } else { 301 | m_vocabularyStorage[i].classIndex = a; 302 | } 303 | // Unused 304 | m_vocabularyStorage[i].prob = 0.0; 305 | } 306 | } 307 | 308 | // Store which words are in which class, using a vector 309 | // (length number of classes) of vectors (num words in that class) 310 | StoreClassAssociations(); 311 | } 312 | 313 | 314 | /** 315 | * Store information on which word is in which class 316 | */ 317 | void Vocabulary::StoreClassAssociations() { 318 | // Store which words are in which class, 319 | // using a vector (length number of classes) of vectors (num words in that class) 320 | m_classWords.resize(m_numClasses); 321 | for (int i = 0; i < m_numClasses; i++) { 322 | m_classWords[i].clear(); 323 | } 324 | for (int i = 0; i < GetVocabularySize(); i++) { 325 | // Assign each word into its class 326 | int wordClass = m_vocabularyStorage[i].classIndex; 327 | m_classWords[wordClass].push_back(i); 328 | } 329 | 330 | // Check that there is no empty class 331 | for (int i = 0; i < m_numClasses; i++) { 332 | assert(!(m_classWords[i].empty())); 333 | } 334 | } 335 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/ReadJson.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "ReadJson.h" 14 | #include "CorpusUnrollsReader.h" 15 | 16 | using namespace std; 17 | 18 | 19 | /** 
20 | * Trim a word 21 | */ 22 | string const ReadJson::Trim(const string &word) const { 23 | assert(word.length() > 1); 24 | string res(word); 25 | if (res[0] == '"') { 26 | res = res.substr(1, res.length()-1); 27 | } 28 | if (res[res.length()-1] == '"') { 29 | res = res.substr(0, res.length()-1); 30 | } 31 | return res; 32 | } 33 | 34 | 35 | /** 36 | * Parse a token 37 | */ 38 | size_t const ReadJson::ParseToken(const string &json_element, JsonToken &tok) const { 39 | 40 | //cout << "parseToken: " << json_element << endl; 41 | 42 | size_t len = json_element.length(); 43 | if (len < 14) { return 0; } 44 | size_t begin = 0; 45 | 46 | // Avoid situations with empty tokens [] 47 | if ((json_element[0] == '[') && (json_element[1] == ']')) 48 | return 2; 49 | 50 | // Consume the [ 51 | if (json_element[0] == '[') { begin++; } 52 | // Parse the token number 53 | size_t end = json_element.find(",", begin); 54 | assert(end != string::npos); 55 | string pos_string = json_element.substr(begin, end - begin); 56 | int token_pos = stoi(pos_string); 57 | begin = end + 1; 58 | assert(begin < len); 59 | 60 | // Consume the space and the first " 61 | if (json_element[begin] == ' ') { begin++; } 62 | if (json_element[begin] == '"') { begin++; } 63 | // Parse the word and trim the " 64 | end = json_element.find("\", ", begin); 65 | assert(end != string::npos); 66 | end = end + 1; 67 | string token_word = json_element.substr(begin, end - begin); 68 | if (token_word.length() <= 1) { 69 | cout << json_element << endl; 70 | } 71 | assert(token_word.length() > 1); 72 | token_word = Trim(token_word); 73 | begin = end + 1; 74 | assert(begin < len); 75 | 76 | // Parse the discount 77 | end = json_element.find(",", begin); 78 | assert(end != string::npos); 79 | string discount_string = json_element.substr(begin, end - begin); 80 | double token_discount = stod(discount_string); 81 | begin = end + 1; 82 | assert(begin < len); 83 | 84 | // Consume the space and the first " 85 | if (json_element[begin] == ' ') { begin++; } 86 | if (json_element[begin] == '"') { begin++; } 87 | // Parse the label 88 | end = json_element.find("]", begin); 89 | assert(end != string::npos); 90 | string token_label = json_element.substr(begin, end - begin); 91 | assert(token_label.length() > 2); 92 | token_label = Trim(token_label); 93 | 94 | // Fill the token 95 | tok.pos = token_pos; 96 | tok.word = token_word; 97 | tok.discount = token_discount; 98 | tok.label = token_label; 99 | 100 | //cout << "token: " << token_pos << " " << token_word 101 | // << " " << token_discount << " " << token_label << endl; 102 | return end; 103 | } 104 | 105 | 106 | /** 107 | * Parse an unroll 108 | */ 109 | size_t const ReadJson::ParseUnroll(const string &json_unrolls, 110 | vector &unroll) const { 111 | 112 | //cout << "parseUnroll: " << json_unrolls << endl; 113 | 114 | size_t end_unroll = json_unrolls.find("]]", 0); 115 | assert(end_unroll != string::npos); 116 | 117 | // Avoid situations with empty unrolls [] 118 | if ((json_unrolls[0] == '[') && (json_unrolls[1] == ']')) 119 | return 2; 120 | assert(json_unrolls[0] == '['); 121 | assert(json_unrolls[1] == '['); 122 | string json_tokens(json_unrolls.substr(0, end_unroll + 1)); 123 | size_t begin = 1; 124 | size_t end = end_unroll + 1; 125 | 126 | while (begin < end_unroll + 1) { 127 | // Find the next end of the token 128 | //cout << "parseToken[" << begin << ", " << end << "]\n" << flush; 129 | JsonToken tok; 130 | end = ParseToken(json_tokens.substr(begin, end - begin), tok); 131 | if (end > 0) { 132 | 
// Store the token in the unroll 133 | unroll.push_back(tok); 134 | // Go to next token 135 | // Consume the ], comma and space 136 | begin += end; 137 | if (json_tokens[begin] == ']') { begin++; } 138 | if (json_tokens[begin] == ',') { begin++; } 139 | if (json_tokens[begin] == ' ') { begin++; } 140 | end = end_unroll + 1; 141 | } else 142 | break; 143 | } 144 | 145 | return end_unroll; 146 | } 147 | 148 | 149 | /** 150 | * Parse a sentence 151 | */ 152 | size_t const ReadJson::ParseSentence(const string &json_sentences, 153 | vector<vector<JsonToken>> &sentence) const { 154 | 155 | //cout << "parseSentence: " << json_sentences << endl; 156 | assert(json_sentences.length() >= 6); 157 | size_t end_sentence = json_sentences.find("]]]", 0); 158 | assert(end_sentence != string::npos); 159 | 160 | // Avoid situations with empty sentences [] 161 | if ((json_sentences[0] == '[') && (json_sentences[1] == ']')) 162 | return 2; 163 | assert(json_sentences[0] == '['); 164 | assert(json_sentences[1] == '['); 165 | assert(json_sentences[2] == '['); 166 | string json_unrolls(json_sentences.substr(0, end_sentence + 2)); 167 | size_t begin = 1; 168 | size_t end = end_sentence + 2; 169 | 170 | while (begin < end_sentence + 2) { 171 | // Find the next end of the unroll 172 | //cout << "parseUnroll[" << begin << ", " << end << "]\n" << flush; 173 | vector<JsonToken> unroll; 174 | end = ParseUnroll(json_unrolls.substr(begin, end - begin), 175 | unroll); 176 | if (end > 2) { 177 | // Store the unroll in the sentence 178 | sentence.push_back(unroll); 179 | } 180 | // Go to the next unroll 181 | begin += end; 182 | // Consume the ], comma and space 183 | if (json_unrolls[begin] == ']') { begin++; } 184 | if (json_unrolls[begin] == ']') { begin++; } 185 | if (json_unrolls[begin] == ',') { begin++; } 186 | if (json_unrolls[begin] == ' ') { begin++; } 187 | end = end_sentence + 2; 188 | } 189 | 190 | return end_sentence; 191 | } 192 | 193 | 194 | /** 195 | * Parse a book 196 | */ 197 | size_t const ReadJson::ParseBook(const string &json_book, 198 | vector<vector<vector<JsonToken>>> &book) const { 199 | 200 | //cout << "parseBook: " << json_book << endl; 201 | assert(json_book.length() >= 8); 202 | size_t end_book = json_book.find("]]]]", 0); 203 | if (end_book == string::npos) { 204 | end_book = json_book.find("]]], []]", 0); 205 | if (end_book == string::npos) { 206 | end_book = json_book.find("]]], [], []]", 0); 207 | assert(end_book != string::npos); 208 | } 209 | } 210 | 211 | assert(json_book[0] == '['); 212 | size_t begin = 1; 213 | if ((json_book[begin] == '[') && (json_book[begin+1] == ']') && 214 | (json_book[begin+2] == ',') && (json_book[begin+3] == ' ')) { 215 | begin += 4; 216 | } 217 | if ((json_book[begin] == '[') && (json_book[begin+1] == ']') && 218 | (json_book[begin+2] == ',') && (json_book[begin+3] == ' ')) { 219 | begin += 4; 220 | } 221 | assert(json_book[begin] == '['); 222 | assert(json_book[begin + 1] == '['); 223 | assert(json_book[begin + 2] == '['); 224 | string json_sentences(json_book.substr(0, end_book + 3)); 225 | size_t end = end_book + 3; 226 | 227 | while (begin < end_book + 3) { 228 | // Find the next end of the sentence 229 | //cout << "parseSentence[" << begin << ", " << end << "]\n" << flush; 230 | vector<vector<JsonToken>> sentence; 231 | end = ParseSentence(json_sentences.substr(begin, end - begin), 232 | sentence); 233 | if (end > 2) { 234 | // Store the sentence 235 | book.push_back(sentence); 236 | } 237 | // Go to next sentence 238 | begin += end; 239 | // Consume the ], ], ], comma and space 240 | if (json_sentences[begin] == ']') { begin++; }
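      // (A book is a list of sentences, a sentence is a list of unrolls,
      // and an unroll is a list of tokens [pos, "word", discount, "label"],
      // so a sentence boundary closes up to three nested ']', consumed one
      // by one in the checks surrounding this comment.)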
241 | if (json_sentences[begin] == ']') { begin++; } 242 | if (json_sentences[begin] == ']') { begin++; } 243 | if (json_sentences[begin] == ',') { begin++; } 244 | if (json_sentences[begin] == ' ') { begin++; } 245 | end = end_book + 3; 246 | } 247 | 248 | return end_book; 249 | } 250 | 251 | 252 | /** 253 | * Constructor: read a text file in JSON format. 254 | * If required, insert words and labels to the vocabulary. 255 | * If required, insert tokens into the current book. 256 | */ 257 | ReadJson::ReadJson(const string &filename, 258 | CorpusUnrolls &corpus, 259 | bool insert_vocab, 260 | bool read_book, 261 | bool merge_label_with_word) { 262 | 263 | cout << "Reading book " << filename << "..." << endl; 264 | ifstream t(filename); 265 | string book_text((istreambuf_iterator<char>(t)), 266 | istreambuf_iterator<char>()); 267 | 268 | vector<vector<vector<JsonToken>>> sentences; 269 | cout << "Parsing book " << filename << "..." << endl; 270 | ParseBook(book_text, sentences); 271 | cout << "Parsing done.\n"; 272 | 273 | // Pointer to the current book 274 | BookUnrolls *book = &(corpus.m_currentBook); 275 | 276 | // First, iterate over sentences 277 | int numSentences = 0; 278 | 279 | for (int idx_sentence = 0; idx_sentence < sentences.size(); idx_sentence++) { 280 | 281 | int numUnrollsInThatSentence = 0; 282 | bool isNewSentence = true; 283 | 284 | // Second, iterate over unrolls in each sentence 285 | vector<vector<JsonToken>> unrolls = sentences[idx_sentence]; 286 | for (int idx_unroll = 0; idx_unroll < unrolls.size(); idx_unroll++) { 287 | bool isNewUnroll = true; 288 | 289 | // Third, iterate over tokens in each unroll 290 | vector<JsonToken> tokens = unrolls[idx_unroll]; 291 | for (int idx_token = 0; idx_token < tokens.size(); idx_token++) { 292 | 293 | // Process the token to get: 294 | // its position in sentence, 295 | // word, discount and label 296 | string tokenWordAsTarget = tokens[idx_token].word; 297 | string tokenLabel = tokens[idx_token].label; 298 | int tokenPos = tokens[idx_token].pos; 299 | double tokenDiscount = 1.0 / (tokens[idx_token].discount); 300 | string tokenWordAsContext(tokenWordAsTarget); 301 | 302 | // Concatenate word with label, when it is used as context? 303 | if (merge_label_with_word) { 304 | tokenWordAsContext += ":" + tokenLabel; 305 | } 306 | 307 | // Shall we insert new words/labels 308 | // into the vocabulary?
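        // (Sketch of the two modes on a hypothetical token with word "the"
        // and label "det": with merge_label_with_word, the context form
        // enters the word vocabulary as "the:det"; otherwise "the" goes to
        // the word vocabulary and "det" to the separate label vocabulary.)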
309 | if (insert_vocab) { 310 | if (merge_label_with_word) { 311 | if (tokenLabel == "LEAF") { 312 | // Insert target word to vocabulary 313 | corpus.InsertWord(tokenWordAsTarget, tokenDiscount); 314 | } else { 315 | // Insert concatenated context word and label to vocabulary 316 | corpus.InsertWord(tokenWordAsContext, tokenDiscount); 317 | } 318 | } else { 319 | // Insert word and label to two different vocabularies 320 | corpus.InsertWord(tokenWordAsContext, tokenDiscount); 321 | if (tokenLabel != "LEAF") { 322 | corpus.InsertLabel(tokenLabel); 323 | } 324 | } 325 | } 326 | // Insert new words to the book 327 | int wordIndexAsContext = 0, wordIndexAsTarget = 0, labelIndex = 0; 328 | if (merge_label_with_word) { 329 | wordIndexAsContext = corpus.LookUpWord(tokenWordAsContext); 330 | wordIndexAsTarget = corpus.LookUpWord(tokenWordAsTarget); 331 | } else { 332 | wordIndexAsContext = corpus.LookUpWord(tokenWordAsContext); 333 | wordIndexAsTarget = wordIndexAsContext; 334 | labelIndex = corpus.LookUpLabel(tokenLabel); 335 | } 336 | book->AddToken(isNewSentence, isNewUnroll, 337 | tokenPos, wordIndexAsContext, wordIndexAsTarget, 338 | tokenDiscount, labelIndex); 339 | // We are no longer at beginning of a sentence or unroll 340 | isNewSentence = false; 341 | isNewUnroll = false; 342 | } 343 | tokens.clear(); 344 | numUnrollsInThatSentence++; 345 | } 346 | unrolls.clear(); 347 | numSentences++; 348 | } 349 | sentences.clear(); 350 | book_text.clear(); 351 | cout << "ReadJSON: " << filename << endl; 352 | cout << " (" << numSentences << " sentences, including empty ones; "; 353 | cout << book->NumTokens() << " tokens)\n"; 354 | if (insert_vocab) { 355 | cout << " Corpus now contains " << corpus.NumWords() 356 | << " words and " << corpus.NumLabels() << " labels\n"; 357 | } 358 | } 359 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/CorpusUnrollsReader.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "CorpusUnrollsReader.h" 16 | #include "ReadJson.h" 17 | 18 | using namespace std; 19 | 20 | /** 21 | * Add a token to the book 22 | */ 23 | void BookUnrolls::AddToken(bool isNewSentence, bool isNewUnroll, 24 | int pos, int wordAsContext, int wordAsTarget, 25 | double discount, int label) { 26 | 27 | // Add a new sentence? 28 | if (isNewSentence) { 29 | Sentence s; 30 | _sentences.push_back(s); 31 | _numUnrollsInSentence.push_back(0); 32 | vector v; 33 | _numTokensInUnrollSentence.push_back(v); 34 | // Bookkeeping of sentences and unrolls 35 | _numSentences++; 36 | _sentenceIndex = _numSentences - 1; 37 | _unrollIndex = 0; 38 | _tokenIndex = 0; 39 | } 40 | // Add a new unroll? 
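  // (This mirrors the new-sentence branch above: it grows the current
  // sentence by one empty Unroll and resets the per-unroll bookkeeping.)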
41 | if (isNewUnroll) { 42 | Unroll u; 43 | _sentences[_sentenceIndex].push_back(u); 44 | // Bookkeeping of unrolls 45 | _numUnrollsInSentence[_sentenceIndex]++; 46 | _unrollIndex = _numUnrollsInSentence[_sentenceIndex] - 1; 47 | _numTokensInUnrollSentence[_sentenceIndex].push_back(0); 48 | _tokenIndex = 0; 49 | } 50 | // Add a new token 51 | Token newToken; 52 | newToken.pos = pos; 53 | newToken.wordAsContext = wordAsContext; 54 | newToken.wordAsTarget = wordAsTarget; 55 | newToken.discount = discount; 56 | newToken.label = label; 57 | _sentences[_sentenceIndex][_unrollIndex].push_back(newToken); 58 | _numTokensInUnrollSentence[_sentenceIndex][_unrollIndex]++; 59 | _numTokens++; 60 | } 61 | 62 | 63 | /** 64 | * Go to a specific sentence 65 | */ 66 | bool BookUnrolls::GoToSentence(int n) { 67 | // Sanity check 68 | if ((n < 0) || (n >= _numSentences)) { 69 | return false; 70 | } 71 | // Set the new sentence 72 | _sentenceIndex = n; 73 | // Reset the index of the unroll 74 | ResetUnroll(); 75 | return true; 76 | } 77 | 78 | 79 | /** 80 | * Go to the next sentence 81 | */ 82 | int BookUnrolls::NextSentence() { 83 | // Set the new sentence by incrementing its index 84 | if (_sentenceIndex >= (_numSentences - 1)) { 85 | // Return to sentence 0... 86 | ResetSentence(); 87 | } else { 88 | // ... or simply go to the next sentence 89 | _sentenceIndex++; 90 | // Reset the index of the unroll 91 | ResetUnroll(); 92 | } 93 | return _sentenceIndex; 94 | } 95 | 96 | 97 | /** 98 | * Go to the next unroll in the sentence 99 | */ 100 | int BookUnrolls::NextUnrollInSentence() { 101 | int n_unrolls = _numUnrollsInSentence[_sentenceIndex]; 102 | if (_unrollIndex >= (n_unrolls - 1)) { 103 | // Return to unroll 0 in the current sentence... 104 | ResetUnroll(); 105 | } else { 106 | // ... or simply go to the next unroll 107 | _unrollIndex++; 108 | // Reset the token in that unroll 109 | ResetToken(); 110 | } 111 | return _unrollIndex; 112 | } 113 | 114 | 115 | /** 116 | * Go to the next token in the current unroll. 117 | * Unlike the iterators above, this does not loop back to the start 118 | * but stops (returns -1) when the end of the unroll is reached.
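 * Sketch of a typical consumer loop (ProcessToken is a hypothetical
 * caller-side function; the first token of the unroll is assumed to be
 * current on entry):
 *   do { ProcessToken(...); } while (book.NextTokenInUnroll() >= 0);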
119 | */ 120 | int BookUnrolls::NextTokenInUnroll() { 121 | // If we have reached the end of the unroll 122 | if (_tokenIndex < 0) 123 | return -1; 124 | // Number of tokens in the current unroll 125 | int numTokens = _numTokensInUnrollSentence[_sentenceIndex][_unrollIndex]; 126 | // Go to the next token or stop 127 | if (_tokenIndex < (numTokens - 1)) { 128 | _tokenIndex++; 129 | UpdateCurrentToken(); 130 | } else { 131 | _tokenIndex = -1; 132 | } 133 | return _tokenIndex; 134 | } 135 | 136 | 137 | /** 138 | * Custom comparator for sorting a vector<pair<string, double>> 139 | * by decreasing value 140 | */ 141 | struct reverseSortByValue { 142 | bool operator() (const pair<string, double> &left, 143 | const pair<string, double> &right) { 144 | return (left.second > right.second); 145 | } 146 | }; 147 | 148 | 149 | /** 150 | * Filter and sort the vocabulary from another corpus 151 | */ 152 | void CorpusUnrolls::FilterSortVocabulary(CorpusUnrolls &other) { 153 | 154 | // Copy the labels as they are 155 | for (int k = 0; k < other.NumLabels(); k++) { 156 | InsertLabel(other.labelsReverse[k]); 157 | } 158 | 159 | // Initialize a vector of filtered word counts 160 | // that contains OOV and EOS 161 | vector<pair<string, double>> filteredWords; 162 | filteredWords.push_back(pair<string, double>("</s>", 0.0)); 163 | filteredWords.push_back(pair<string, double>("<unk>", 0.0)); 164 | double freqOOV = 0.0; 165 | double countWords = 0; 166 | 167 | // Copy only words with at least _minWordOccurrence occurrences 168 | // into that vector and keep statistics about OOV words. 169 | // Note that we start the indexing at 2 because we already stored 170 | // </s> and <unk> 171 | for (int k = 2; k < other.NumWords(); k++) { 172 | string word = other.vocabularyReverse[k]; 173 | double wordFreq = ceil(other.wordCountsDiscounted[k]); 174 | if (wordFreq >= _minWordOccurrence) { 175 | pair<string, double> p(word, wordFreq); 176 | filteredWords.push_back(p); 177 | } else { 178 | freqOOV += wordFreq; 179 | } 180 | countWords += wordFreq; 181 | } 182 | // Set the number of </s> tokens to a large value 183 | filteredWords[0].second = INT_MAX; 184 | // Count the number of <unk> 185 | filteredWords[1].second = freqOOV; 186 | 187 | // Sort that vector by value 188 | // The sorting should keep </s> at position 0 189 | sort(filteredWords.begin(), 190 | filteredWords.end(), 191 | reverseSortByValue()); 192 | 193 | // Completely clear the corpus word vocabulary 194 | // (not the labels) 195 | vocabulary.clear(); 196 | vocabularyReverse.clear(); 197 | wordCountsDiscounted.clear(); 198 | _vocabSizeWords = 0; 199 | 200 | // Now we can set the number of </s> tokens to 0 201 | // (</s> never actually occurs, because of the tree parsing) 202 | filteredWords[0].second = 0.0; 203 | 204 | // Copy the content of that vector 205 | for (int k = 0; k < filteredWords.size(); k++) { 206 | string word = filteredWords[k].first; 207 | double wordFreq = filteredWords[k].second; 208 | InsertWord(word, wordFreq); 209 | } 210 | // Note the OOV tag 211 | _oov = vocabulary["<unk>"]; 212 | } 213 | 214 | 215 | /** 216 | * Copy the vocabulary from another corpus 217 | */ 218 | void CorpusUnrolls::CopyVocabulary(CorpusUnrolls &other) { 219 | 220 | // Completely clear the corpus word vocabulary and labels 221 | labels.clear(); 222 | labelsReverse.clear(); 223 | vocabulary.clear(); 224 | vocabularyReverse.clear(); 225 | wordCountsDiscounted.clear(); 226 | _vocabSizeWords = 0; 227 | _vocabSizeLabels = 0; 228 | 229 | // Copy the labels as they are 230 | for (int k = 0; k < other.NumLabels(); k++) { 231 | InsertLabel(other.labelsReverse[k]); 232 | } 233 | 234 | // Insert the words from the other corpus into the vocabulary 235 | for (int k = 0; k < other.NumWords();
k++) { 236 | string word = other.vocabularyReverse[k]; 237 | double wordFreq = other.wordCountsDiscounted[k]; 238 | InsertWord(word, wordFreq); 239 | } 240 | 241 | // Note the OOV tag 242 | _oov = vocabulary["<unk>"]; 243 | } 244 | 245 | 246 | /** 247 | * Export the vocabulary to a text file 248 | */ 249 | void CorpusUnrolls::ExportVocabulary(const string &filename) { 250 | // Write the header 251 | ofstream vocabFile(filename); 252 | vocabFile << NumWords() << "\t" << NumLabels() << "\n"; 253 | // Write the labels 254 | for (int k = 0; k < NumLabels(); k++) { 255 | vocabFile << k << "\t" << labelsReverse[k] << "\n"; 256 | } 257 | // Write the words and their discount factors 258 | for (int k = 0; k < NumWords(); k++) { 259 | vocabFile << k << "\t" << vocabularyReverse[k] 260 | << "\t" << wordCountsDiscounted[k] << "\n"; 261 | } 262 | vocabFile.close(); 263 | } 264 | 265 | 266 | /** 267 | * Import the vocabulary from a text file 268 | */ 269 | void CorpusUnrolls::ImportVocabulary(const string &filename) { 270 | 271 | // Read the header 272 | ifstream vocabFile(filename); 273 | cout << "Reading vocabulary file " << filename << endl; 274 | assert(vocabFile.is_open()); 275 | 276 | // Completely clear the corpus word vocabulary and labels 277 | labels.clear(); 278 | labelsReverse.clear(); 279 | vocabulary.clear(); 280 | vocabularyReverse.clear(); 281 | wordCountsDiscounted.clear(); 282 | _vocabSizeWords = 0; 283 | _vocabSizeLabels = 0; 284 | 285 | // Read the header line 286 | string line; 287 | getline(vocabFile, line); 288 | stringstream lineStream(line); 289 | string strNumWords; 290 | string strNumLabels; 291 | getline(lineStream, strNumWords, '\t'); 292 | getline(lineStream, strNumLabels); 293 | int numWords = stoi(strNumWords); 294 | int numLabels = stoi(strNumLabels); 295 | cout << "Vocabulary file contains " << numWords << " words and " 296 | << numLabels << " labels\n"; 297 | 298 | // Read the labels one by one 299 | for (int k = 0; k < numLabels; k++) { 300 | getline(vocabFile, line); 301 | stringstream lineStream(line); 302 | string strIdx; 303 | string label; 304 | getline(lineStream, strIdx, '\t'); 305 | getline(lineStream, label); 306 | InsertLabel(label); 307 | } 308 | 309 | // Read the words one by one 310 | for (int k = 0; k < numWords; k++) { 311 | getline(vocabFile, line); 312 | stringstream lineStream(line); 313 | string strIdx; 314 | string word; 315 | string strWordFreq; 316 | getline(lineStream, strIdx, '\t'); 317 | getline(lineStream, word, '\t'); 318 | getline(lineStream, strWordFreq); 319 | double wordFreq = stof(strWordFreq); 320 | InsertWord(word, wordFreq); 321 | } 322 | 323 | vocabFile.close(); 324 | 325 | // Note the OOV tag 326 | _oov = vocabulary["<unk>"]; 327 | 328 | printf("Vocab size: %d\n", NumWords()); 329 | printf("Unknown tag at: %d\n", _oov); 330 | printf("Label vocab size: %d\n", NumLabels()); 331 | } 332 | 333 | 334 | /** 335 | * Read vocabulary from all books and return the number of tokens 336 | */ 337 | long CorpusUnrolls::ReadVocabulary(bool mergeLabel) { 338 | 339 | long nTokens = 0; 340 | // Loop over the books 341 | for (int k = 0; k < NumBooks(); k++) { 342 | // Open the training file, load it to a JSON structure 343 | // and add words to the corpus 344 | ReadJson *train_json = 345 | new ReadJson(_bookFilenames[k], *this, true, false, mergeLabel); 346 | nTokens = m_currentBook.NumTokens(); 347 | // Free the memory 348 | delete train_json; 349 | } 350 | return nTokens; 351 | } 352 | 353 | 354 | /** 355 | * Read the current book into memory 356 | */ 357 |
void CorpusUnrolls::ReadBook(bool mergeLabel) { 358 | 359 | // "Burn" the previous book, if any, to initialize it 360 | m_currentBook.Burn(); 361 | // Open the training file, load it to a JSON structure 362 | // and add words to the corpus 363 | ReadJson *train_json = 364 | new ReadJson(_bookFilenames[_currentBookIndex], *this, false, true, mergeLabel); 365 | // Free the memory 366 | delete train_json; 367 | } 368 | 369 | 370 | /** 371 | * Insert a word into the vocabulary, if new 372 | */ 373 | int CorpusUnrolls::InsertWord(const string &word, double discount) { 374 | 375 | // Try to find the word 376 | int wordIndex = LookUpWord(word); 377 | if (wordIndex == _oov) { 378 | // Could not find the word: insert it into the vocabulary 379 | wordIndex = _vocabSizeWords; 380 | pair<string, int> kv(word, wordIndex); 381 | vocabulary.insert(kv); 382 | pair<int, string> kv2(wordIndex, word); 383 | vocabularyReverse.insert(kv2); 384 | _vocabSizeWords++; 385 | } else { 386 | wordIndex = vocabulary[word]; 387 | } 388 | 389 | // Find the current (dis)count of the word 390 | unordered_map<int, double>::iterator it2 = 391 | wordCountsDiscounted.find(wordIndex); 392 | if (it2 == wordCountsDiscounted.end()) { 393 | pair<int, double> kv(wordIndex, discount); 394 | wordCountsDiscounted.insert(kv); 395 | } else { 396 | wordCountsDiscounted[wordIndex] += discount; 397 | } 398 | 399 | // Simply return the word index 400 | return wordIndex; 401 | } 402 | 403 | 404 | /** 405 | * Insert a label into the vocabulary, if new 406 | */ 407 | int CorpusUnrolls::InsertLabel(const string &label) { 408 | 409 | // Try to find the label 410 | int labelIndex = LookUpLabel(label); 411 | if (labelIndex == -1) { 412 | // Could not find the label: insert it into the vocabulary 413 | labelIndex = _vocabSizeLabels; 414 | pair<string, int> kv(label, labelIndex); 415 | labels.insert(kv); 416 | pair<int, string> kv2(labelIndex, label); 417 | labelsReverse.insert(kv2); 418 | _vocabSizeLabels++; 419 | } else { 420 | labelIndex = labels[label]; 421 | } 422 | 423 | // Simply return the label index 424 | return labelIndex; 425 | } 426 | 427 | 428 | /** 429 | * Look-up a word in the vocabulary 430 | */ 431 | int CorpusUnrolls::LookUpWord(const string &word) { 432 | 433 | // Try to find the word 434 | int wordIndex = _oov; 435 | unordered_map<string, int>::iterator it = 436 | vocabulary.find(word); 437 | if (it != vocabulary.end()) { 438 | wordIndex = vocabulary[word]; 439 | } 440 | return wordIndex; 441 | } 442 | 443 | 444 | /** 445 | * Look-up a label in the vocabulary 446 | */ 447 | int CorpusUnrolls::LookUpLabel(const string &label) { 448 | 449 | // Try to find the label 450 | int labelIndex = -1; 451 | unordered_map<string, int>::iterator it = 452 | labels.find(label); 453 | if (it != labels.end()) { 454 | labelIndex = labels[label]; 455 | } 456 | return labelIndex; 457 | } 458 | -------------------------------------------------------------------------------- /DependencyTreeRNN++/RnnLib.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014-2015 Piotr Mirowski 2 | // 3 | // Piotr Mirowski, Andreas Vlachos 4 | // "Dependency Recurrent Neural Language Models for Sentence Completion" 5 | // ACL 2015 6 | 7 | // Based on code by Geoffrey Zweig and Tomas Mikolov 8 | // for the Feature-Augmented RNN Tool Kit 9 | // http://research.microsoft.com/en-us/projects/rnn/ 10 | 11 | /* 12 | This file is based on or incorporates material from the projects listed below (collectively, "Third Party Code"). 13 | Microsoft is not the original author of the Third Party Code.
The original copyright notice and the license under which Microsoft received such Third Party Code, 14 | are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft, not the third party, licenses the Third Party Code to you 15 | under the terms set forth in the EULA for the Microsoft Product. Microsoft reserves all rights not expressly granted under this agreement, whether by implication, 16 | estoppel or otherwise. 17 | 18 | RNNLM 0.3e by Tomas Mikolov 19 | 20 | Provided for Informational Purposes Only 21 | 22 | BSD License 23 | All rights reserved. 24 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 25 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 26 | 27 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 28 | materials provided with the distribution. 29 | 30 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 31 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 32 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 33 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | */ 36 | 37 | #ifndef __DependencyTreeRNN____rnnlmlib__ 38 | #define __DependencyTreeRNN____rnnlmlib__ 39 | 40 | #include <math.h> // NB: the bracketed header names were stripped by the HTML rendering; these are plausible standard headers reconstructed from the code below 41 | #include <stdio.h> 42 | #include <stdlib.h> 43 | #include <string.h> 44 | #include <string> 45 | #include <unordered_map> 46 | #include <vector> 47 | #include "RnnState.h" 48 | #include "RnnWeights.h" 49 | #include "CorpusWordReader.h" 50 | #include "Vocabulary.h" 51 | 52 | 53 | /** 54 | * Main class storing the RNN model 55 | */ 56 | class RnnLM 57 | { 58 | public: 59 | 60 | /** 61 | * Constructor 62 | */ 63 | RnnLM(const std::string &filename, 64 | bool doLoadModel); 65 | 66 | /** 67 | * Load the model. 68 | */ 69 | void LoadRnnModelFromFile(); 70 | 71 | /** 72 | * Return the number of words/entity tokens in the vocabulary. 73 | */ 74 | int GetVocabularySize() const { return m_vocab.GetVocabularySize(); } 75 | 76 | /** 77 | * Return the number of units in the input (word) layer. 78 | */ 79 | int GetInputSize() const { return m_state.GetInputSize(); } 80 | 81 | /** 82 | * Return the number of units in the hidden layer. 83 | */ 84 | int GetHiddenSize() const { return m_state.GetHiddenSize(); } 85 | 86 | /** 87 | * Return the number of units in the optional hidden compression layer. 88 | */ 89 | int GetCompressSize() const { return m_state.GetCompressSize(); } 90 | 91 | /** 92 | * Return the number of units in the feature (e.g., topic) layer. 93 | */ 94 | int GetFeatureSize() const { return m_state.GetFeatureSize(); } 95 | 96 | /** 97 | * Return the number of units in the output layer.
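 * (In class-factored models of this family, this is typically the
 * vocabulary size plus the number of classes, since the output layer
 * holds both softmax distributions.)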
98 | */ 99 | int GetOutputSize() const { return m_state.GetOutputSize(); } 100 | 101 | /** 102 | * Return the number of direct connections between input words 103 | * and the output word (i.e., n-gram features) 104 | */ 105 | int GetNumDirectConnection() const { return m_weights.GetNumDirectConnection(); } 106 | 107 | /** 108 | * Return the number of direct connections between input words 109 | * and the output word (i.e., n-gram features) 110 | */ 111 | int GetOrderDirectConnection() const { return m_state.GetOrderDirectConnection(); } 112 | 113 | /** 114 | * Return the number of vocabulary classes. These are specified 115 | * at training time and can be frequency-based or rely on more 116 | * complex max-entropy features of the word bigrams. 117 | */ 118 | int GetNumClasses() const { return m_weights.GetNumClasses(); } 119 | 120 | protected: 121 | 122 | /** 123 | * Exponentiates x. 124 | */ 125 | double SafeExponentiate(double val) const 126 | { 127 | // for numerical stability 128 | val = (val > 50) ? 50 : ((val < -50) ? -50 : val); 129 | return (exp(val)); 130 | } 131 | 132 | /** 133 | * Exponentiates x in base 10. 134 | */ 135 | double ExponentiateBase10(double num) const 136 | { 137 | return exp(num * 2.302585093); 138 | } 139 | 140 | /** 141 | * Apply the logistic sigmoid function to x. 142 | */ 143 | double LogisticSigmoid(double val) const 144 | { 145 | return (1 / (1 + SafeExponentiate(-val))); 146 | } 147 | 148 | /** 149 | * Matrix-vector multiplication routine, somewhat accelerated using loop 150 | * unrolling over 8 registers. Computes y <- y + A * x, (i.e. adds A * x to y) 151 | * where A is of size N x M, x is of length M and y is of length N. 152 | * The operation can done on a contiguous subset of indices 153 | * i in [idxYFrom, idxYTo[ of vector y 154 | * and on a contiguous subset of indices j in [idxXFrom, idxXTo[ of vector x. 155 | */ 156 | void MultiplyMatrixXvectorBlas(std::vector &vectorY, 157 | std::vector &vectorX, 158 | std::vector &matrixA, 159 | int widthMatrix, 160 | int idxYFrom, 161 | int idxYTo) const; 162 | 163 | public: 164 | 165 | /** 166 | * Return the index of a word in the vocabulary, or -1 if OOV. 167 | */ 168 | int SearchWordInVocabulary(const std::string& word) const; 169 | 170 | /** 171 | * Go to the next char delim when reading a file. 172 | */ 173 | bool GoToDelimiterInFile(int delim, FILE *fi) const; 174 | 175 | /** 176 | * Function used to initialize the RNN model to the specified dimensions 177 | * of the layers and weight vectors. This is done at construction 178 | * of the RNN model object and also during training time (not at runtime). 179 | * It is not thread safe yet because there is this file (m_featureMatrixFile) 180 | * that contains the topic model for the words (LDA-style, see the paper), 181 | * that is loaded by the function. It also modifies the vocabulary hash tables. 182 | */ 183 | bool InitializeRnnModel(int sizeInput, 184 | int sizeHidden, 185 | int sizeFeature, 186 | int sizeClasses, 187 | int sizeCompress, 188 | long long sizeDirectConnection, 189 | int orderDirectConnection); 190 | 191 | /** 192 | * Erase the hidden layer state and the word history. 193 | * Needed when processing sentences/queries in independent mode. 194 | * Updates the RnnState object. 195 | */ 196 | void ResetHiddenRnnStateAndWordHistory(RnnState &state) const; 197 | void ResetHiddenRnnStateAndWordHistory(RnnState &state, 198 | RnnBptt &bpttState) const; 199 | 200 | /** 201 | * Erases only the word history. 
202 | * Needed when processing sentences/queries in independent mode. 203 | * Updates the RnnState object. 204 | */ 205 | void ResetWordHistory(RnnState &state) const; 206 | void ResetWordHistory(RnnState &state, 207 | RnnBptt &bpttState) const; 208 | 209 | /** 210 | * Forward-propagate the RNN through one full step, starting from 211 | * the lastWord w(t) and the previous hidden state activation s(t-1), 212 | * as well as optional feature vector f(t) 213 | * and direct n-gram connections to the word history, 214 | * computing the new hidden state activation s(t) 215 | * s(t) = sigmoid(W * s(t-1) + U * w(t) + F * f(t)) 216 | * x = V * s(t) + G * f(t) + n-gram_connections 217 | * y(t) = softmax_class(x) * softmax_word_given_class(x) 218 | * Updates the RnnState object (but not the weights). 219 | */ 220 | void ForwardPropagateOneStep(int lastWord, 221 | int word, 222 | RnnState &state); 223 | 224 | /** 225 | * Given a target word class, compute the conditional distribution 226 | * of all words within that class. The hidden state activation s(t) 227 | * is assumed to be already computed. Essentially, computes: 228 | * x = V * s(t) + G * f(t) + n-gram_connections 229 | * y(t) = softmax_class(x) * softmax_word_given_class(x) 230 | * but for a specific targetClass. 231 | * Updates the RnnState object (but not the weights). 232 | */ 233 | void ComputeRnnOutputsForGivenClass(const int targetClass, 234 | RnnState &state); 235 | 236 | /** 237 | * Copies the hidden layer activation s(t) to the recurrent connections. 238 | * That copy will become s(t-1) at the next call of ForwardPropagateOneStep 239 | */ 240 | void ForwardPropagateRecurrentConnectionOnly(RnnState &state) const; 241 | 242 | /** 243 | * Shift the word history by one and update last word. 244 | */ 245 | void ForwardPropagateWordHistory(RnnState &state, 246 | int &lastWord, 247 | const int word) const; 248 | 249 | /** 250 | * One way of having additional features to the RNN is to fit a topic 251 | * model to the past history of words. This can be achieved in a simple 252 | * way if such a topic matrix (words vs. topics) has been computed. 253 | * The feature vector f(t) is then simply an autoregressive 254 | * (exponentially decaying) function of the topic model vectors 255 | * for each word in the history. 256 | * This works well when processing sentence in English but might not 257 | * be appropriate for short queries, since the topic feature 258 | * will be continuously reset. 259 | */ 260 | void UpdateFeatureVectorUsingTopicModel(int word, RnnState &state) const; 261 | 262 | /** 263 | * This is currently unused, and we might not use topic model features at all. 264 | * The idea is to load a matrix of size W * T, where W is the number of words 265 | * and T is the number of topics. Each word is embedding into a topic vector. 266 | * The algorithm for word embedding can be Latent Dirichlet Allocation, 267 | * Latent Semantic Indexing, DSSM, etc... 268 | * It however assumes that the topic of the sentence changes with each word 269 | * and is based on longer word history, which is more appropriate for 270 | * long English sentences than for queries. 271 | * The function that needs to be called at runtime or during training is 272 | * UpdateFeatureVectorUsingTopicModel 273 | */ 274 | bool LoadTopicModelFeatureMatrix(); 275 | 276 | // Simply copy the hidden activations and gradients, as well as 277 | // the word history, from one state object to another state object. 
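// A sketch of the intended use (hypothetical caller code): when scoring
// several candidate completions of the same left context, checkpoint the
// state once and restore it before each candidate:
//   RnnState saved(model.m_state);                   // copy with the right sizes
//   model.SaveHiddenRnnState(model.m_state, saved);  // checkpoint
//   // ... score one candidate ...
//   model.SaveHiddenRnnState(saved, model.m_state);  // restore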
278 | void SaveHiddenRnnState(const RnnState &stateFrom, 279 | RnnState &stateTo) const; 280 | 281 | public: 282 | 283 | // Log-probability of unknown words 284 | double m_logProbabilityPenaltyUnk; 285 | 286 | // Vocabulary hashtables 287 | Vocabulary m_vocab; 288 | 289 | // State variable representing all the input/feature/hidden/output layer 290 | // activations of the RNN. This specific variable is just an initial 291 | // value that is created when the RNN model is loaded or initialized. 292 | // The training/testing functions do not modify it, simply make 293 | // a copy of it (convenient way to initialize the state vectors 294 | // of the right sizes). 295 | RnnState m_state; 296 | 297 | // The RNN model weights are stored in this object. Once loaded, 298 | // they will not be updated if the RNN is simply run on new data 299 | // (e.g., NextWord). Of course, the training algorithm will change them. 300 | RnnWeights m_weights; 301 | 302 | // These BPTT data are not used when the RNN model is run, 303 | // only during training, but it was easier to store them here. 304 | RnnBptt m_bpttVectors; 305 | 306 | protected: 307 | 308 | /** 309 | * Is the training file set? 310 | */ 311 | bool m_isTrainFileSet; 312 | 313 | /** 314 | * Is the model loaded? 315 | */ 316 | bool m_isModelLoaded; 317 | 318 | /** 319 | * Training and validation files 320 | */ 321 | std::string m_trainFile; 322 | std::string m_validationFile; 323 | 324 | /** 325 | * RNN model file, version and type 326 | */ 327 | std::string m_rnnModelFile; 328 | int m_rnnModelVersion; 329 | 330 | /** 331 | * Topic features 332 | */ 333 | std::string m_featureFile; 334 | std::string m_featureValidationFile; 335 | std::string m_featureMatrixFile; 336 | double m_featureGammaCoeff; 337 | int m_featureMatrixUsed; 338 | bool m_useFeatures2Output; 339 | 340 | /** 341 | * This is used for the second way of adding features 342 | * to the RNN: only the matrix W * T is specified, 343 | * where W = number of words (m_vocabSize) 344 | * and T = number of topics (m_featureSize) 345 | */ 346 | std::vector<double> m_featureMatrix; 347 | 348 | /** 349 | * RNN model learning parameters. All this information will simply 350 | * be loaded from the model file and not used when the RNN is run. 351 | */ 352 | double m_learningRate; 353 | double m_initialLearningRate; 354 | bool m_doStartReducingLearningRate; 355 | double m_regularizationRate; 356 | double m_minLogProbaImprovement; 357 | double m_gradientCutoff; 358 | int m_numBpttSteps; 359 | int m_bpttBlockSize; 360 | 361 | /** 362 | * Information relative to the training of the RNN 363 | */ 364 | int m_iteration; 365 | long m_numTrainWords; 366 | long m_currentPosTrainFile; 367 | 368 | /** 369 | * Information relative to the classes 370 | */ 371 | bool m_usesClassFile; 372 | 373 | /** 374 | * Are the sentences independent? 375 | */ 376 | bool m_areSentencesIndependent; 377 | }; 378 | 379 | #endif /* defined(__DependencyTreeRNN____rnnlmlib__) */ 380 | --------------------------------------------------------------------------------