├── LICENSE ├── README.md ├── tree-kernel ├── Makefile ├── libtree_kernel.pro ├── node.cpp ├── node.h ├── notes.txt ├── sentence.cpp ├── sentence.h ├── subtree-gen.py ├── test-tree-kernel.cpp ├── tree-kernel.cpp └── tree-kernel.h └── tree-parser ├── Makefile ├── lexer.h ├── parser.h ├── test-tree-parser.cpp ├── token.cpp ├── token.h ├── tree.cpp ├── tree.h ├── tree_lexer.h ├── tree_parser.cpp └── tree_parser.h /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011,2015 Jeff Donner 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation files 5 | (the "Software"), to deal in the Software without restriction, 6 | including without limitation the rights to use, copy, modify, merge, 7 | publish, distribute, sublicense, and/or sell copies of the Software, 8 | and to permit persons to whom the Software is furnished to do so, 9 | subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A C++ implementation of tree kernels per Alessandro Moschitti's 2006 2 | paper, "Making Tree Kernels Practical for Natural Language Learning" 3 | 4 | This consists of two directories, tree-parser and tree-kernel itself. 5 | tree-parser parses Penn Treebank forms of sentences as produced by 6 | the [Standford Parser](http://nlp.stanford.edu/software/lex-parser.shtml), 7 | [(example)](http://nlp.stanford.edu:8080/parser) among others. Eg: 8 | 9 | (ROOT 10 | (S 11 | (NP (PRP$ My) (NN dog)) 12 | (ADVP (RB also)) 13 | (VP (VBZ likes) 14 | (S 15 | (VP (VBG eating) 16 | (NP (NN sausage))))) 17 | (. .))) 18 | 19 | 20 | This TreeKernel source is MIT licensed (see LICENSE), and commercially 21 | useable. If you need a different one just ask. 22 | 23 | Though one person has used this without an SVM, I intended to 24 | integrate this into an SVM library, though I never got around to 25 | it. The two libraries I had in mind were: 26 | libSVM - [github](https://github.com/cjlin1/libsvm), 27 | [home](http://www.csie.ntu.edu.tw/~cjlin/libsvm) 28 | and: 29 | Dlib - [home](http://dlib.net/ml.html) 30 | 31 | Feel free to integrate it yourself, I'd love to hear about it if you 32 | do. 33 | 34 | Building 35 | -------- 36 | 37 | This is GNU Makefile, and GCC based, but nothing is OS-dependent so it 38 | should be easily tweakable for other systems. 39 | 40 | cd tree-parser 41 | make 42 | 43 | cd ../tree-kernel 44 | make 45 | 46 | TODO 47 | ---- 48 | 49 | There are several optimizations possible - templating out sigma, 50 | making NodePairsDeltaTable an unordered_map, but most of all, storing 51 | the Nodes in Sentence in a vector and using indices as pointers to 52 | them (to save memory and cache). C bindings would be nice, too. 53 | 54 | If you make any improvements I'll be happy to take them. Of course, 55 | you can just do what you want with it. 56 | -------------------------------------------------------------------------------- /tree-kernel/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS=-O2 -Wall -Wextra 2 | CXX=g++ 3 | INCLUDE=-I/usr/include -I../tree-parser 4 | LIB_SOURCES=\ 5 | node.cpp \ 6 | sentence.cpp \ 7 | tree-kernel.cpp 8 | TEST_SOURCES=\ 9 | test-tree-kernel.cpp 10 | OBJDIR=.objs 11 | LIB_OBJECTS=$(addprefix $(OBJDIR)/, $(LIB_SOURCES:.cpp=.o)) 12 | TEST_OBJECTS=$(addprefix $(OBJDIR)/, $(TEST_SOURCES:.cpp=.o)) 13 | LIBSDIR=../tree-parser 14 | LIBS=tree_parser 15 | 16 | TEST_TARGET=test-tree-kernel 17 | LIB_TARGET=libtree_kernel.a 18 | 19 | all: $(LIB_TARGET) $(TEST_TARGET) 20 | 21 | $(OBJDIR): 22 | mkdir $(OBJDIR) 23 | 24 | # $@ is the target (objs/foo.o), and $< is the input foo.cpp 25 | $(OBJDIR)/%.o: %.cpp $(OBJDIR) 26 | $(CXX) -c $(CXXFLAGS) $(INCLUDE) -o $@ $^ 27 | 28 | .cpp.o: 29 | $(CXX) $(CXXFLAGS) $(INCLUDE) -c $< -o $@ 30 | 31 | $(LIB_TARGET): $(LIB_OBJECTS) 32 | $(AR) $(ARFLAGS) $@ $^ 33 | 34 | $(TEST_TARGET): $(LIB_OBJECTS) $(TEST_OBJECTS) 35 | $(CXX) $(LIB_OBJECTS) $(TEST_OBJECTS) -L$(LIBSDIR) -l$(LIBS) -o $@ 36 | 37 | clean: 38 | rm -f $(LIB_TARGET) 39 | rm -f $(TEST_TARGET) 40 | rm -f $(LIB_OBJECTS) $(TEST_OBJECTS) 41 | -------------------------------------------------------------------------------- /tree-kernel/libtree_kernel.pro: -------------------------------------------------------------------------------- 1 | TEMPLATE = lib 2 | CONFIG -= qt 3 | CONFIG += debug 4 | SOURCES = sentence.cpp tree-kernel.cpp 5 | OBJECTS_DIR=.objs 6 | TARGET = tree-kernel 7 | INCLUDEPATH=../tree-parser 8 | -------------------------------------------------------------------------------- /tree-kernel/node.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | #include 3 | #include 4 | #include 5 | 6 | /* 7 | Copyright (c) 2011,2015,2019 Jeff Donner 8 | 9 | Permission is hereby granted, free of charge, to any person 10 | obtaining a copy of this software and associated documentation files 11 | (the "Software"), to deal in the Software without restriction, 12 | including without limitation the rights to use, copy, modify, merge, 13 | publish, distribute, sublicense, and/or sell copies of the Software, 14 | and to permit persons to whom the Software is furnished to do so, 15 | subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be 18 | included in all copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 23 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 24 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 25 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 26 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 | */ 28 | 29 | using namespace std; 30 | 31 | 32 | bool Node::is_preterminal() const 33 | { 34 | return children_.size() == 1 and children_[0].is_text(); 35 | } 36 | 37 | int Node::productions_cmp(Node const& one, Node const& two) 38 | { 39 | for (Node::Children::const_iterator i1 = one.children_.begin(), e1 = one.children_.end(), 40 | i2 = two.children_.begin(), e2 = two.children_.end(); 41 | i1 != e1 and i2 != e2; ++i1, ++i2) { 42 | int cmp = Child::production_component_cmp(*i1, *i2); 43 | if (cmp) 44 | return cmp; 45 | } 46 | return one.children_.size() - two.children_.size(); 47 | } 48 | 49 | bool Node::productions_equal(Node const* one, Node const* two) 50 | { 51 | // cout << "nodes=?" << one->id_string() << "; " << two->id_string(); 52 | // hoping for short-circuit speed, obviously 53 | bool equal = one->children_.size() == two->children_.size() and 54 | productions_cmp(*one, *two) == 0; 55 | // cout << equal << endl; 56 | return equal; 57 | } 58 | 59 | bool Node::Child::production_component_is_less(Node::Child const& one, 60 | Node::Child const& two) 61 | { 62 | if (one.is_text() and two.is_text()) { 63 | return one.text() < two.text(); 64 | } 65 | else if (one.is_text() != two.is_text()) { 66 | return one.is_text() and not two.is_text(); 67 | } 68 | else { 69 | // both full nodes 70 | return productions_cmp(*one.node(), *two.node()) < 0; 71 | } 72 | } 73 | 74 | std::string Node::productions() const 75 | { 76 | if (is_preterminal()) { 77 | return children_[0].text(); 78 | } 79 | else { 80 | ostringstream oss; 81 | for (Node::Children::const_iterator it = children_.begin(), end = children_.end(); 82 | it != end; ++it) { 83 | assert((*it).is_node()); 84 | oss << (*it).node()->tag_ << " "; 85 | } 86 | return oss.str(); 87 | } 88 | } 89 | 90 | string Node::id_string() const 91 | { 92 | ostringstream oss; 93 | oss << this->tag_ << ":" << this->productions(); 94 | return oss.str(); 95 | } 96 | // 97 | Node::Nodes Node::children() const 98 | { 99 | Nodes childs; 100 | for (Children::const_iterator it = children_.begin(), end = children_.end(); 101 | it != end; ++it) { 102 | if (it->is_node()) { 103 | assert(it->node()); 104 | childs.push_back(it->node()); 105 | } 106 | else { 107 | assert(false); 108 | } 109 | } 110 | return childs; 111 | } 112 | 113 | bool Node::production_is_less(Node const* one, Node const* two) 114 | { 115 | for (Node::Children::const_iterator i1 = one->children_.begin(), e1 = one->children_.end(), 116 | i2 = two->children_.begin(), e2 = two->children_.end(); 117 | i1 != e1 and i2 != e2; ++i1, ++i2) { 118 | int cmp = Child::production_component_cmp(*i1, *i2); 119 | if (cmp != 0) { 120 | return cmp < 0; 121 | } 122 | } 123 | return ((int)one->children_.size() - (int)two->children_.size()) < 0; 124 | } 125 | 126 | 127 | void Node::pretty_print(std::ostream& os, int level) const 128 | { 129 | os << std::endl; 130 | indent(os, level); 131 | os << '(' << this->tag_; 132 | for (Children::const_iterator it = this->child_links().begin(), end = this->child_links().end(); 133 | it != end; ++it) { 134 | os << ' '; 135 | if (it->is_text()) 136 | os << it->text(); 137 | else 138 | it->node()->pretty_print(os, level+1); 139 | } 140 | 141 | os << ')'; 142 | if (level == 0) 143 | os << std::endl; 144 | } 145 | 146 | void Node::indent(std::ostream& os, int level) 147 | { 148 | for (int i = 0; i < level; ++i) 149 | os << " "; 150 | } 151 | -------------------------------------------------------------------------------- /tree-kernel/node.h: -------------------------------------------------------------------------------- 1 | #ifndef SENTENCE_NODE_H 2 | #define SENTENCE_NODE_H 3 | 4 | /* 5 | Copyright (c) 2011,2015 Jeff Donner 6 | 7 | Permission is hereby granted, free of charge, to any person 8 | obtaining a copy of this software and associated documentation files 9 | (the "Software"), to deal in the Software without restriction, 10 | including without limitation the rights to use, copy, modify, merge, 11 | publish, distribute, sublicense, and/or sell copies of the Software, 12 | and to permit persons to whom the Software is furnished to do so, 13 | subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | // Improvements: 34 | // * Make the tag types enums; leaf text is never a tag, at least not 35 | // the way we implement it here, so it's not a conflict. 36 | // * 'cache' productions; probably worth it. 37 | // 38 | // Both are tree / node.... 39 | class Node 40 | { 41 | public: 42 | typedef std::vector Nodes; 43 | public: 44 | Node(std::string const& tag) 45 | : tag_(tag) 46 | {} 47 | 48 | /// pre-terminal is a node with a pure piece of text as only child 49 | /// (each piece of text gets a unique parent). 50 | bool is_preterminal() const; 51 | 52 | /// Typical 3-way comparison 53 | static int productions_cmp(Node const& one, Node const& two); 54 | 55 | /// Slightly optimized for eq / ne 56 | static bool productions_equal(Node const* one, Node const* two); 57 | 58 | /// Yes by value (NVRO). Of course that's not actually the problem, 59 | /// it's the content that gets created every time we call this. 60 | std::string productions() const; 61 | 62 | static bool production_is_less(Node const* one, Node const* two); 63 | 64 | void add_child(Node* child) { 65 | /// &&& other checks, as to whether we're over a maximum 66 | Child elt(child); 67 | children_.push_back(elt); 68 | } 69 | 70 | void add_child(std::string const& word) { 71 | Child elt(word); 72 | children_.push_back(elt); 73 | } 74 | Nodes children() const; 75 | 76 | /// tag and production, for debugging 77 | std::string id_string() const; 78 | 79 | void pretty_print(std::ostream& os, int level) const; 80 | 81 | private: 82 | // helper to pretty_print 83 | static void indent(std::ostream& os, int level); 84 | 85 | private: 86 | struct Child; 87 | friend struct Child; 88 | /// 'discriminated union', a link to a child. Can embed a string 89 | /// though, for a terminal child. 90 | struct Child 91 | { 92 | Child(Node* tree) 93 | : node_(tree) 94 | {} 95 | 96 | Child(std::string const& text) 97 | : tok_(text) 98 | , node_(0) 99 | {} 100 | /// Tag, type detecter 101 | bool is_node() const { return this->node_ != 0; } 102 | /// Tag, type detecter 103 | bool is_text() const { return !is_node(); } 104 | 105 | /// Immediate text, if it's a 'text child'; not the full production. 106 | std::string const& text() const { return this->tok_; } 107 | 108 | /// Full Node* child; unsafe, you must first check the type 109 | Node* node() const { return this->node_; } 110 | 111 | /// Specialized; saves a bit of time over full 3-way 112 | static bool production_component_is_less( 113 | Child const& one, Child const& two); 114 | 115 | /// standard 3-way 116 | static int production_component_cmp(Child const& one, 117 | Child const& two) { 118 | return ::strcmp(one.production_component(), 119 | two.production_component()); 120 | } 121 | /// This child's part of the parent's production. Ok for both 122 | /// full node and embedded string. 123 | char const* production_component() const { 124 | if (is_text()) return text().c_str(); 125 | else return node_->tag_.c_str(); 126 | } 127 | 128 | public: 129 | // wants to be a union, but std::string has a ctor 130 | // &&& Implement these as a Variant (or boost::any), to save 131 | // memory and cache pressure. Alexandrescu has one (as does 132 | // Boost, which Christopher Diggens claims to have a faster version of). 133 | // http://www.codeproject.com/KB/architecture/union_list.aspx 134 | // 135 | // Meh - not sure we'd get any space saving from a discriminated 136 | // union; node_* doubles as a type tag (I doubt any valid string 137 | // is pure 0). 138 | std::string tok_; 139 | Node* node_; 140 | }; 141 | 142 | typedef std::vector Children; 143 | 144 | Children const& child_links() const { return children_; } 145 | 146 | private: 147 | const std::string tag_; 148 | Children children_; 149 | }; 150 | 151 | #endif // SENTENCE_NODE_H 152 | -------------------------------------------------------------------------------- /tree-kernel/notes.txt: -------------------------------------------------------------------------------- 1 | todo: make sentence implementation be statically polymorphic, via CRTP. 2 | 3 | Of course, work out an example by hand (ulp!) 4 | 5 | short phrase with a dup: 'a dog eats a dog'? 'a dog ate a dog.' 6 | 7 | test against Moschitti's own implementation. 8 | -------------------------------------------------------------------------------- /tree-kernel/sentence.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2011,2015 Jeff Donner 3 | 4 | Permission is hereby granted, free of charge, to any person 5 | obtaining a copy of this software and associated documentation files 6 | (the "Software"), to deal in the Software without restriction, 7 | including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, 9 | and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include "sentence.h" 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | 31 | using namespace std; 32 | 33 | Node* Sentence::copy_from(Tree const* tree) 34 | { 35 | Node* node = new Node(tree->tag()); 36 | grouped_nodes_.push_back(node); 37 | string production; 38 | if (not tree->children().empty()) { 39 | for (Tree::Elements::const_iterator it = tree->children().begin(), 40 | end = tree->children().end(); 41 | it != end; ++it) { 42 | Tree::Element const& elt = *it; 43 | if (elt.is_text()) { 44 | node->add_child(elt.text()); 45 | } 46 | else { 47 | Node* child = Sentence::copy_from(elt.tree()); 48 | node->add_child(child); 49 | } 50 | } 51 | } 52 | return node; 53 | } 54 | 55 | Sentence::Sentence(Tree const* tree) 56 | { 57 | root_ = copy_from(tree); 58 | 59 | group_nodes(); 60 | } 61 | 62 | void Sentence::group_nodes() 63 | { 64 | sort(grouped_nodes_.begin(), grouped_nodes_.end(), 65 | Node::production_is_less); 66 | } 67 | 68 | std::ostream& operator<<(std::ostream& os, Sentence const& s) 69 | { 70 | os << "printing sentence:" << endl; 71 | s.root_->pretty_print(os, 0); 72 | os << "# grouped nodes: " << s.grouped_nodes_.size(); 73 | return os; 74 | } 75 | -------------------------------------------------------------------------------- /tree-kernel/sentence.h: -------------------------------------------------------------------------------- 1 | #ifndef SENTENCE_H 2 | #define SENTENCE_H 3 | 4 | /* 5 | Copyright (c) 2011,2015 Jeff Donner 6 | 7 | Permission is hereby granted, free of charge, to any person 8 | obtaining a copy of this software and associated documentation files 9 | (the "Software"), to deal in the Software without restriction, 10 | including without limitation the rights to use, copy, modify, merge, 11 | publish, distribute, sublicense, and/or sell copies of the Software, 12 | and to permit persons to whom the Software is furnished to do so, 13 | subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include "node.h" 34 | 35 | #include "tree.h" 36 | 37 | // The style of node this uses. I think it's Penn Treebank - 38 | // whatever the Stanford Parser uses. 39 | 40 | // Must have Java. 41 | // (S 42 | // (VP (MD Must) 43 | // (VP (VB have) 44 | // (NP (NN java)))) 45 | // (. .)) 46 | 47 | // Never has the term 'massaging the media' seemed so accurate. 48 | // (S 49 | // (NP (NNP Never)) 50 | // (VP (VBZ has) 51 | // (NP 52 | // (NP (DT the) (NN term)) 53 | // (SBAR 54 | // (S ("" "") 55 | // (S 56 | // (VP (VBG massaging) 57 | // (NP (DT the) (NNS media)))) ('' '') 58 | // (VP (VBD seemed) 59 | // (ADJP (RB so) (JJ accurate))))))) (. .)) 60 | 61 | // Improvements: 62 | // * Put Nodes in-place in a vector, and use indices as pointers. 63 | // * Templatize that choice. 64 | 65 | class Sentence 66 | { 67 | friend std::ostream& operator<<(std::ostream& os, Sentence const& s); 68 | 69 | public: 70 | Sentence(std::string const& sentence) 71 | : text_(sentence) 72 | , root_(parse(sentence)) 73 | {} 74 | 75 | Sentence(Tree const* tree); 76 | 77 | Node* copy_from(Tree const* tree); 78 | 79 | typedef std::vector Nodes; 80 | 81 | Nodes const& grouped_nodes() const { return grouped_nodes_; } 82 | 83 | private: 84 | void group_nodes(); 85 | 86 | // &&& we do parsing elsewhere, and use copy_from 87 | static Node const* parse(std::string const& text); 88 | 89 | private: 90 | /// original text, for no good reason 91 | std::string text_; 92 | 93 | /// top of tree of nodes 94 | Node const* root_; 95 | 96 | /// Sorted, grouping same-production nodes. It's the grouped-ness 97 | /// that matters. 98 | Nodes grouped_nodes_; 99 | }; 100 | 101 | std::ostream& operator<<(std::ostream& os, Sentence const& s); 102 | 103 | #endif // SENTENCE_H 104 | -------------------------------------------------------------------------------- /tree-kernel/subtree-gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import nltk 5 | 6 | dog = """ 7 | (ROOT 8 | (S 9 | (NP (NN dog)) 10 | (VP (VBP eat) 11 | (NP (NN dog))))) 12 | """ 13 | 14 | 15 | def main(args): 16 | t = nltk.Tree.parse(dog) 17 | for subt in t.subtrees(): 18 | print subt 19 | 20 | 21 | if __name__ == '__main__': 22 | main(sys.argv[1:]) 23 | -------------------------------------------------------------------------------- /tree-kernel/test-tree-kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2011,2015 Jeff Donner 3 | 4 | Permission is hereby granted, free of charge, to any person 5 | obtaining a copy of this software and associated documentation files 6 | (the "Software"), to deal in the Software without restriction, 7 | including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, 9 | and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include 25 | #include "tree_parser.h" 26 | #include "tree-kernel.h" 27 | #include "sentence.h" 28 | 29 | using namespace std; 30 | 31 | // g++ -g .objs/node.o .objs/sentence.o .objs/tree-kernel.o .objs/test-tree-kernel.o -L../tree-parser -ltree_parser -o test-tree-kernel 32 | 33 | string g_real = 34 | "(ROOT (S (VP (MD Must) (VP (VB have) (NP (NN java)))) (. .)))"; 35 | 36 | // oversimple parse 37 | string g_dog_eat_dog = 38 | "(S" 39 | " (NN dog)" 40 | " (VBP eat)" 41 | " (NN dog))"; 42 | 43 | // oversimple parse 44 | string g_dog_eat_fish = 45 | "(S" 46 | " (NN dog)" 47 | " (VBP eat)" 48 | " (NN fish))"; 49 | 50 | string g_brought_a_cat = 51 | " (VP (V brought)" 52 | " (NP (D a)" 53 | " (N cat)))" 54 | ; 55 | 56 | Tree const* make_tree(string const& tree_text) 57 | { 58 | TreeLexer lexer(tree_text); 59 | TreeParser parser(lexer); 60 | Tree const* t = 0; 61 | try { 62 | t = parser.match_and_eat_tree(); 63 | } 64 | catch (string& ex) { 65 | cout << "failed parse: " << ex << endl; 66 | } 67 | 68 | // t->pretty_print(cout, 0); 69 | 70 | return t; 71 | } 72 | 73 | double kernel_value(string one, string two, bool want_sst_not_st) 74 | { 75 | Tree const* t1 = make_tree(one); 76 | Tree const* t2 = make_tree(two); 77 | 78 | cout << "s1" << endl; 79 | Sentence s1(t1); 80 | cout << "s2" << endl; 81 | Sentence s2(t2); 82 | 83 | // sigma == 1 is SSTs (fragments) 84 | // sigma == 0 is STs (whole sub-trees, down to leaves) 85 | int sigma = want_sst_not_st ? 1 : 0; 86 | double value = kernel_value(s1, s2, sigma); 87 | return value; 88 | } 89 | 90 | void test_sst(string test_name, string one, string two, double expected) 91 | { 92 | cout << test_name << endl; 93 | double value = kernel_value(one, two, true); 94 | if (value != expected) { 95 | cout << "failed - got: " << value << " instead of: " << expected << endl; 96 | } 97 | else 98 | cout << " ok" << endl; 99 | } 100 | 101 | void test_st(string test_name, string one, string two, double expected) 102 | { 103 | cout << test_name << endl; 104 | double value = kernel_value(one, two, false); 105 | if (value != expected) { 106 | cout << "failed - got: " << value << " instead of: " << expected << endl; 107 | } 108 | else 109 | cout << " ok" << endl; 110 | } 111 | 112 | /* Yo - to test better, we need more sub-tree'd sentences. 113 | 114 | def generate_all_connected_subtrees(node, path-so-far): 115 | generate powerset of all children of node 116 | with the empty subset, add path-so-far as a complete subtree 117 | 118 | generate_all_connected_subtrees(child, path-so-far + child) 119 | 120 | def generate all subtrees(root): 121 | for node in all root: 122 | generate_all)connected_subtrees(node, null) 123 | 124 | */ 125 | 126 | int main() 127 | { 128 | // SSTs are like STs, except that you can 129 | // 'break a single leg off of' a pre-terminal, and all off of 130 | // an ordinary non-terminal (because it's really just acting as 131 | // a child of its parent's rule). So if you include a production 132 | // you must include it all. Pre-terminals only /have/ one 'leg', 133 | // so they're included automatically. 134 | // 135 | // '17' is from the paper 136 | test_sst("brought a cat", g_brought_a_cat, g_brought_a_cat, 17); 137 | // hand-counted; number of non-terminal nodes 138 | test_st("brought a cat", g_brought_a_cat, g_brought_a_cat, 5); 139 | 140 | return 0; 141 | } 142 | -------------------------------------------------------------------------------- /tree-kernel/tree-kernel.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2011,2015 Jeff Donner 3 | 4 | Permission is hereby granted, free of charge, to any person 5 | obtaining a copy of this software and associated documentation files 6 | (the "Software"), to deal in the Software without restriction, 7 | including without limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of the Software, 9 | and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #include "node.h" 29 | #include "sentence.h" 30 | #include "tree-kernel.h" 31 | 32 | using namespace std; 33 | 34 | typedef std::pair NodePair; 35 | typedef std::vector NodePairs; 36 | 37 | // Per this: 38 | // http://stackoverflow.com/questions/21166675/boostflat-map-and-its-performance-compared-to-map-and-unordered-map 39 | // unordered_map isn't much faster than map, so I'll leave this! 40 | typedef std::map NodePairsDeltaTable; 41 | 42 | static 43 | double preterminal_leaves_value(bool include_leaves, double lambda) 44 | { 45 | return include_leaves ? lambda * 2 : lambda; 46 | } 47 | 48 | static 49 | double& delta_ref_at(NodePairsDeltaTable& delta_table, 50 | Node const* n1, Node const* n2) 51 | { 52 | return delta_table[make_pair(n1, n2)]; 53 | } 54 | 55 | 56 | /// Memo-ized dynamic programming 57 | static 58 | double get_delta(NodePairsDeltaTable& delta_table, 59 | Node const* n1, Node const* n2, 60 | int sigma, 61 | bool include_leaves, 62 | // use a lambda = 1.0 to give it no effect 63 | double lambda) 64 | { 65 | assert((sigma == 0 or sigma == 1) or 66 | !"sigma isn't tunable, it's got to be 0 or 1; it's a choice of algorithm"); 67 | double& ref_delta = delta_ref_at(delta_table, n1, n2); 68 | if (ref_delta == 0.0) { 69 | // needs calculating - all nodes we'll see are > 0. 70 | if (n1->is_preterminal() and n2->is_preterminal()) { 71 | // We're treating the tree purely structurally, or by 72 | // part-of-speech by doing this... 73 | // wait a minute - 'include_leaves' is done ahead of time.. 74 | ref_delta = preterminal_leaves_value(include_leaves, lambda); 75 | } else { 76 | ref_delta = lambda; 77 | Node::Nodes n1_children = n1->children(); 78 | Node::Nodes n2_children = n2->children(); 79 | // non-pre-terminals 80 | for (Node::Nodes::const_iterator 81 | it1 = n1_children.begin(), end1 = n1_children.end(), 82 | it2 = n2_children.begin(), end2 = n2_children.end(); 83 | it1 != end1; ++it1, ++it2) { 84 | ref_delta *= sigma + get_delta(delta_table, *it1, *it2, 85 | sigma, include_leaves, lambda); 86 | } 87 | } 88 | } 89 | return ref_delta; 90 | } 91 | 92 | 93 | /// Pick matches out of sparse cross-product 94 | static 95 | NodePairs find_non_zero_delta_pairs( 96 | Sentence const& t1, Sentence const& t2, 97 | // Starts filling this in, too, since it's iterating across everything already 98 | NodePairsDeltaTable& node_pair_deltas, 99 | bool include_leaves, 100 | double decay_lambda) 101 | { 102 | // These want to be topologically sorted, but it's too expensive, 103 | // as we do them pair-by-pair; doing it ahead of time is pointless. 104 | NodePairs node_pairs; 105 | 106 | // Sorted so that the same-production nodes are together. 107 | Sentence::Nodes::const_iterator i1 = t1.grouped_nodes().begin(), 108 | end1 = t1.grouped_nodes().end(); 109 | Sentence::Nodes::const_iterator i2 = t2.grouped_nodes().begin(), 110 | end2 = t2.grouped_nodes().end(); 111 | 112 | while (i1 != end1 and i2 != end2) { 113 | int cmp = Node::productions_cmp(**i1, **i2); 114 | if (0 < cmp) { 115 | ++i2; 116 | } else if (cmp < 0) { 117 | ++i1; 118 | } else { 119 | // they're equal; the interesting part 120 | Sentence::Nodes::const_iterator run2_start = i2; 121 | Node const* n1 = *i1; 122 | Node const* n2 = *i2; 123 | assert(n1); 124 | assert(n2); 125 | // run along the 'runs' 126 | while (i2 != end2 and Node::productions_equal(*i1, *i2)) { 127 | assert(*i1); 128 | assert(*i2); 129 | node_pairs.push_back(make_pair(*i1, *i2)); 130 | 131 | // Fill in table of pre-terminals while we're here 132 | if ((*i1)->is_preterminal() and (*i2)->is_preterminal()) 133 | delta_ref_at(node_pair_deltas, *i1, *i2) = 134 | preterminal_leaves_value(include_leaves, decay_lambda); 135 | else 136 | delta_ref_at(node_pair_deltas, *i1, *i2) = 0.0; 137 | 138 | ++i2; 139 | } 140 | i2 = run2_start; 141 | ++i1; 142 | } 143 | } 144 | 145 | // Would sort anti-topologically but it'd have to be done for each 146 | // /pair/ and so can't be done ahead of time. Although, if we 147 | // sorted them topo-wise /per tree/ ahead of time, maybe it'd 148 | // make the pairwise topo-sort fast enough to be worth it. Hmmm. 149 | return node_pairs; 150 | } 151 | 152 | /// See Alessandro Moschitti, "Making Tree Kernels Practical for 153 | /// Natural Language Learning" (2006) 154 | /// With sigma = 0, this calculates the subtree (ST) kernel (always 155 | /// includes all the way down to the leaves) 156 | /// With sigma = 1, calculates SSTs - includes every connected fragment 157 | /// of the tree, non-leaf nodes included. 158 | double kernel_value(Sentence const& t1, Sentence const& t2, 159 | bool want_sst_not_st, bool include_leaves, 160 | // use decay_lambda = 1.0 for effectively no lambda 161 | double decay_lambda) 162 | { 163 | int sigma = want_sst_not_st ? 1 : 0; 164 | NodePairsDeltaTable delta_table; 165 | NodePairs node_pairs = find_non_zero_delta_pairs( 166 | t1, t2, 167 | // for some opportunistic premature optimization. 168 | delta_table, include_leaves, decay_lambda); 169 | 170 | double kernel = 0.0; 171 | for (NodePairs::const_iterator it = node_pairs.begin(), end = node_pairs.end(); 172 | it != end; ++it) { 173 | Node const* n1 = it->first; 174 | Node const* n2 = it->second; 175 | double delta = get_delta(delta_table, n1, n2, sigma, 176 | include_leaves, decay_lambda); 177 | kernel += delta; 178 | } 179 | 180 | return kernel; 181 | } 182 | -------------------------------------------------------------------------------- /tree-kernel/tree-kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef TREE_KERNEL_H 2 | #define TREE_KERNEL_H 3 | 4 | /* 5 | Copyright (c) 2011,2015 Jeff Donner 6 | 7 | Permission is hereby granted, free of charge, to any person 8 | obtaining a copy of this software and associated documentation files 9 | (the "Software"), to deal in the Software without restriction, 10 | including without limitation the rights to use, copy, modify, merge, 11 | publish, distribute, sublicense, and/or sell copies of the Software, 12 | and to permit persons to whom the Software is furnished to do so, 13 | subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be 16 | included in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | #include "node.h" 28 | 29 | class Sentence; 30 | 31 | double kernel_value(Sentence const& t1, Sentence const& t2, 32 | bool want_sst_not_st, bool include_leaves = false, 33 | double decay_lambda = 1.0); 34 | 35 | #endif // TREE_KERNEL_H 36 | -------------------------------------------------------------------------------- /tree-parser/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS=-O2 -Wall -Wextra 2 | CXX=g++ 3 | INCLUDE=-I/usr/include 4 | LIB_SOURCES=\ 5 | token.cpp \ 6 | tree.cpp \ 7 | tree_parser.cpp 8 | TEST_SOURCES=test-tree-parser.cpp 9 | OBJDIR=.objs 10 | LIB_OBJECTS=$(addprefix $(OBJDIR)/, $(LIB_SOURCES:.cpp=.o)) 11 | TEST_OBJECTS=$(addprefix $(OBJDIR)/, $(TEST_SOURCES:.cpp=.o)) 12 | 13 | TEST_TARGET=test-tree-parser 14 | LIB_TARGET=libtree_parser.a 15 | 16 | all: $(LIB_TARGET) $(TEST_TARGET) 17 | 18 | $(OBJDIR): 19 | mkdir $(OBJDIR) 20 | 21 | # $@ is the target (objs/foo.o), and $< is the input foo.cpp 22 | $(OBJDIR)/%.o: %.cpp $(OBJDIR) 23 | $(CXX) -c $(CXXFLAGS) $(INCLUDE) -o $@ $< 24 | 25 | .cpp.o: 26 | $(CXX) $(CXXFLAGS) $(INCLUDE) -c $< -o $@ 27 | 28 | $(LIB_TARGET): $(LIB_OBJECTS) 29 | $(AR) $(ARFLAGS) $@ $^ 30 | 31 | $(TEST_TARGET): $(LIB_OBJECTS) $(TEST_OBJECTS) 32 | $(CXX) $(LIB_OBJECTS) $(TEST_OBJECTS) -o $@ 33 | 34 | clean: 35 | rm -f $(LIB_TARGET) 36 | rm -f $(TEST_TARGET) 37 | rm -f $(LIB_OBJECTS) $(TEST_OBJECTS) 38 | -------------------------------------------------------------------------------- /tree-parser/lexer.h: -------------------------------------------------------------------------------- 1 | #ifndef LEXER_H 2 | #define LEXER_H 3 | 4 | /*** 5 | * Excerpted from "Language Implementation Patterns", 6 | * published by The Pragmatic Bookshelf. 7 | * Copyrights apply to this code. It may not be used to create training material, 8 | * courses, books, articles, and the like. Contact us if you are in doubt. 9 | * We make no guarantees that this code is fit for any purpose. 10 | * Visit http://www.pragmaticprogrammer.com/titles/tpdsl for more book information. 11 | ***/ 12 | 13 | #include 14 | #include 15 | 16 | #include "token.h" 17 | 18 | 19 | class Lexer { 20 | public: 21 | enum { 22 | EOS = -1, // end-of-stream; EOF was taken 23 | EOS_TYPE = 0 24 | }; 25 | 26 | Lexer(std::string input) 27 | : input(input) 28 | , p(0) 29 | , c(input[p]) 30 | { } 31 | 32 | /** Move one character; detect "end of file" */ 33 | void consume() { 34 | ++p; 35 | 36 | if (input.length() <= p) { 37 | c = EOS; 38 | } else { 39 | c = input[p]; 40 | } 41 | } 42 | 43 | /** Ensure x is next character on the input stream */ 44 | void match(char x) { 45 | if (c == x) 46 | consume(); 47 | else 48 | throw std::string("expecting ") + x + "; found " + c; 49 | } 50 | 51 | virtual Token next_token() = 0; 52 | 53 | std::string input; 54 | unsigned p; // index into input of current character 55 | char c; // current character 56 | }; 57 | 58 | #endif // LEXER_H 59 | -------------------------------------------------------------------------------- /tree-parser/parser.h: -------------------------------------------------------------------------------- 1 | #ifndef PARSER_H 2 | #define PARSER_H 3 | 4 | /*** 5 | * Excerpted from "Language Implementation Patterns", 6 | * published by The Pragmatic Bookshelf. 7 | * Copyrights apply to this code. It may not be used to create training material, 8 | * courses, books, articles, and the like. Contact us if you are in doubt. 9 | * We make no guarantees that this code is fit for any purpose. 10 | * Visit http://www.pragmaticprogrammer.com/titles/tpdsl for more book information. 11 | ***/ 12 | 13 | #include "token.h" 14 | #include "lexer.h" 15 | 16 | 17 | class Parser { 18 | public: 19 | Parser(Lexer& lexer) 20 | : lexer(lexer) 21 | { 22 | consume(); 23 | } 24 | /** If lookahead token type matches x, consume & return else error */ 25 | void match(int x) { 26 | if ( lookahead.type == x ) 27 | consume(); 28 | else 29 | throw std::string("expecting ") + "<>" /*+ lexer.get_token_name(x)*/ + 30 | "; found " + lookahead.text; 31 | } 32 | void consume() { 33 | lookahead = lexer.next_token(); 34 | } 35 | 36 | protected: 37 | // source of tokens 38 | Lexer& lexer; 39 | // the current lookahead token 40 | Token lookahead; 41 | }; 42 | 43 | #endif // PARSER_H 44 | -------------------------------------------------------------------------------- /tree-parser/test-tree-parser.cpp: -------------------------------------------------------------------------------- 1 | #include "tree_parser.h" 2 | 3 | using namespace std; 4 | 5 | string g_pretty = 6 | "(ROOT" 7 | " (S" 8 | " (NP (NNP Never))" 9 | " (VP (VBZ has)" 10 | " (NP" 11 | " (NP (DT the) (NN term))" 12 | " (SBAR" 13 | " (S (`` ``)" 14 | " (S" 15 | " (VP (VBG massaging)" 16 | " (NP (DT the) (NNS media))))" 17 | " ('' '')" 18 | " (VP (VBD seemed)" 19 | " (ADJP (RB so) (JJ accurate)))))))" 20 | " (. .)))" 21 | ; 22 | 23 | string g_simple = 24 | "(tag elt)"; 25 | 26 | string g_flat = 27 | "(a b c d)"; 28 | 29 | string g_small = 30 | "(a b (c (d (e f))))"; 31 | 32 | 33 | string g_real = 34 | "(ROOT (S (VP (MD must) (VP (VB have) (NP (NN java)))) (. .)))"; 35 | 36 | void lex_only(string tree_text, string tree_name) 37 | { 38 | cout << "lexing..." << tree_name << endl; 39 | TreeLexer lexer(tree_text); 40 | Token tok = lexer.next_token(); 41 | int x = 0; 42 | while (tok.type != TreeLexer::EOS_TYPE and x < 10) { 43 | cout << "[" << tok.text << "] "; 44 | tok = lexer.next_token(); 45 | ++x; 46 | } 47 | cout << endl; 48 | } 49 | 50 | void test_tree(string tree_text, string tree_name) 51 | { 52 | cout << '\n' << tree_name << endl; 53 | // lex_only(tree_text, tree_name); 54 | TreeLexer lexer(tree_text); 55 | TreeParser parser(lexer); 56 | Tree const* t = 0; 57 | try { 58 | t = parser.match_and_eat_tree(); 59 | } 60 | catch (string& ex) { 61 | cout << "failed parse: " << ex << endl; 62 | } 63 | 64 | if (t) { 65 | // cout << "printing in order:" << endl; 66 | // t->print_inorder(cout); 67 | cout << "printing nice:" << endl; 68 | t->print_nice(cout, 0); 69 | } 70 | } 71 | 72 | int main(int argc, char* argv[]) 73 | { 74 | (void)argc; 75 | (void)argv; 76 | // test_tree(g_simple, "test simple"); 77 | test_tree(g_flat, "test flat"); 78 | test_tree(g_small, "test small"); 79 | test_tree(g_real, "test real"); 80 | test_tree(g_pretty, "test pretty"); 81 | 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /tree-parser/token.cpp: -------------------------------------------------------------------------------- 1 | #include "token.h" 2 | 3 | using namespace std; 4 | 5 | std::ostream& operator<<(std::ostream& os, Token const& tok) 6 | { 7 | // std::string const& tname = s_token_names[tok.type]; 8 | os << "<'" << tok.text << "',";// << tname << ">"; 9 | return os; 10 | } 11 | -------------------------------------------------------------------------------- /tree-parser/token.h: -------------------------------------------------------------------------------- 1 | #ifndef TOKEN_H 2 | #define TOKEN_H 3 | 4 | /*** 5 | * Excerpted from "Language Implementation Patterns", 6 | * published by The Pragmatic Bookshelf. 7 | * Copyrights apply to this code. It may not be used to create training material, 8 | * courses, books, articles, and the like. Contact us if you are in doubt. 9 | * We make no guarantees that this code is fit for any purpose. 10 | * Visit http://www.pragmaticprogrammer.com/titles/tpdsl for more book information. 11 | ***/ 12 | 13 | #include 14 | #include 15 | 16 | 17 | struct Token { 18 | Token(int type, std::string const& text) 19 | : type(type) 20 | , text(text) 21 | {} 22 | 23 | Token() 24 | : type(-1) 25 | , text("UNINITIALIZED") 26 | {} 27 | 28 | int type; 29 | std::string text; 30 | }; 31 | 32 | std::ostream& operator<<(std::ostream& os, Token const& tok); 33 | 34 | #endif // TOKEN_H 35 | -------------------------------------------------------------------------------- /tree-parser/tree.cpp: -------------------------------------------------------------------------------- 1 | #include "tree.h" 2 | // for strcmp 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | char const* Tree::s_tags_to_indent[] = { 9 | // must be kept sorted 10 | ",", 11 | "." 12 | ":", 13 | "ADJP", 14 | "CC", 15 | "NP", 16 | "PP", 17 | "S", 18 | "SBAR", 19 | "VP" 20 | }; 21 | 22 | 23 | static bool text_is_less(char const* one, char const* two) 24 | { 25 | return strcmp(one, two) < 0; 26 | } 27 | 28 | bool Tree::should_indent_tag(char const* tag) 29 | { 30 | return binary_search (s_tags_to_indent, 31 | s_tags_to_indent + sizeof(Tree::s_tags_to_indent) / sizeof(Tree::s_tags_to_indent[0]), 32 | tag, text_is_less); 33 | } 34 | 35 | 36 | //Tree::IndentTagsIniter(char const* strings, unsigned n_strings) { 37 | // sort(s_tags_to_indent, s_tags_to_indent + n_strings, text_is_less); 38 | //} 39 | 40 | //Tree::IndentTagsIniter init(Tree::s_tags_to_indent, 41 | // sizeof(Tree::s_tags_to_indent) / sizeof(Tree::s_tags_to_indent[0])); 42 | 43 | void Tree::print_inorder(std::ostream& os) const 44 | { 45 | os << this->tag() << ": "; 46 | for (Elements::const_iterator it = this->children().begin(), end = this->children().end(); 47 | it != end; ++it) { 48 | if (it->is_text()) 49 | os << it->text() << " "; 50 | else 51 | it->tree()->print_inorder(os); 52 | } 53 | os << std::endl; 54 | } 55 | 56 | 57 | void Tree::print_nice(std::ostream& os, int level) const 58 | { 59 | if (should_indent_tag(this->tag().c_str())) { 60 | os << std::endl; 61 | indent(os, level); 62 | } 63 | os << '(' << this->tag(); 64 | for (Elements::const_iterator it = this->children().begin(), end = this->children().end(); 65 | it != end; ++it) { 66 | os << ' '; 67 | if (it->is_text()) 68 | os << it->text(); 69 | else 70 | it->tree()->print_nice(os, level+1); 71 | } 72 | 73 | os << ')'; 74 | if (level == 0) 75 | os << std::endl; 76 | } 77 | -------------------------------------------------------------------------------- /tree-parser/tree.h: -------------------------------------------------------------------------------- 1 | #ifndef TREE_H 2 | #define TREE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | class Tree { 11 | public: 12 | typedef std::string TagType; 13 | 14 | public: 15 | Tree(std::string const& tag) 16 | : tag_(tag) 17 | {} 18 | 19 | struct Element { 20 | Element(Tree* tree) 21 | : tree_(tree) 22 | {} 23 | 24 | Element(std::string const& text) 25 | : tok_(text) 26 | , tree_(0) 27 | {} 28 | 29 | enum TreeType { 30 | eTree, 31 | eText, 32 | eUnknown 33 | }; 34 | 35 | bool is_tree() const { return tree_ != 0; } 36 | bool is_text() const { return !is_tree(); } 37 | 38 | std::string const& text() const { return this->tok_; } 39 | Tree* tree() const { return this->tree_; } 40 | 41 | public: 42 | // wants to be a union, but std::string has a ctor 43 | std::string tok_; 44 | Tree* tree_; 45 | }; 46 | typedef std::vector Elements; 47 | 48 | Elements const& children() const { return children_; } 49 | 50 | void add_child(Tree::Element const& child) { 51 | children_.push_back(child); 52 | } 53 | 54 | TagType const& tag() const { return this->tag_; } 55 | 56 | void print_inorder(std::ostream& os) const; 57 | 58 | // wait a minnit, I don't need to print it nice, it'll come across the wire 59 | // that way... -- no it doesn't. 60 | // break at every NP, PP, VP, S, :, ,, CC 61 | virtual void print_nice(std::ostream& os, int level) const; 62 | 63 | private: 64 | static void indent(std::ostream& os, int level) { 65 | for (int i = 0; i < level; ++i) 66 | os << " "; 67 | } 68 | 69 | static bool should_indent_tag(char const* tag); 70 | 71 | private: 72 | TagType tag_; 73 | Elements children_; 74 | static char const* s_tags_to_indent[]; 75 | // static std::set s_set_tags_to_indent; 76 | }; 77 | 78 | #endif // TREE_H 79 | -------------------------------------------------------------------------------- /tree-parser/tree_lexer.h: -------------------------------------------------------------------------------- 1 | #ifndef TREE_LEXER_H 2 | #define TREE_LEXER_H 3 | 4 | /*** 5 | * Excerpted from "Language Implementation Patterns", 6 | * published by The Pragmatic Bookshelf. 7 | * Copyrights apply to this code. It may not be used to create training material, 8 | * courses, books, articles, and the like. Contact us if you are in doubt. 9 | * We make no guarantees that this code is fit for any purpose. 10 | * Visit http://www.pragmaticprogrammer.com/titles/tpdsl for more book information. 11 | ***/ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "lexer.h" 20 | 21 | 22 | class TreeLexer : public Lexer 23 | { 24 | public: 25 | enum { 26 | ERROR_TYPE =-1, 27 | // EOS_TYPE defined in Lexer 28 | NAME = 1, 29 | LPAREN = 2, 30 | RPAREN = 3, 31 | UNKNOWN 32 | }; 33 | 34 | TreeLexer(std::string const& input) 35 | : Lexer(input) 36 | { 37 | // cout << "lexer, starting:" << this->c << endl; 38 | } 39 | 40 | bool isLETTER() { 41 | return isalpha(c) or c == '.' or c == '\'' or c == '`'; 42 | } 43 | 44 | // virtual std::string const& getTokenName(int x) const { return Lexer::s_token_names[x]; } 45 | 46 | virtual Token next_token() { 47 | // cout << "lexer::next_token c:[" << this->c << "]" << endl; 48 | while ( this->c != Lexer::EOS ) { 49 | if (isspace(this->c)) { 50 | this->WS(); 51 | continue; 52 | } 53 | 54 | switch ( this->c ) { 55 | case '(' : consume(); return Token(LPAREN, "("); 56 | case ')' : consume(); return Token(RPAREN, ")"); 57 | default: 58 | if ( isLETTER() ) { 59 | return name(); 60 | } else { 61 | std::ostringstream oss; 62 | oss << "invalid character: [" << this->c << "]"; 63 | throw oss.str(); 64 | } 65 | } 66 | } 67 | return Token(EOS_TYPE, ""); 68 | } 69 | 70 | /** NAME : LETTER+ ; // NAME is sequence of >=1 letter */ 71 | // cleans up rest of name, detected by an alpha char. 72 | // Like '(', but it takes more characters 73 | Token name() { 74 | std::ostringstream oss; 75 | do { 76 | oss << this->c; letter(); 77 | } 78 | while ( isLETTER() ); 79 | Token tok(NAME, oss.str()); 80 | return tok; 81 | } 82 | 83 | /** LETTER : 'a'..'z'|'A'..'Z'; // define what a letter is (\w) */ 84 | void letter() { 85 | if (isLETTER()) 86 | consume(); 87 | else { 88 | std::ostringstream oss; 89 | oss << "expecting LETTER; found [" << this->c << "]"; 90 | throw oss.str(); 91 | } 92 | } 93 | 94 | /** WS : (' '|'\t'|'\n'|'\r')* ; // ignore any whitespace */ 95 | void WS() { 96 | while ( isspace(this->c) ) 97 | consume(); 98 | } 99 | 100 | static std::string s_token_names[]; 101 | }; 102 | 103 | #endif // TREE_LEXER_H 104 | -------------------------------------------------------------------------------- /tree-parser/tree_parser.cpp: -------------------------------------------------------------------------------- 1 | #include "tree_parser.h" 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | 8 | std::string TreeParser::match_and_eat_name() 9 | { 10 | Token tok = lookahead; 11 | consume(); 12 | return tok.text; 13 | } 14 | 15 | 16 | Tree::Element TreeParser::match_and_eat_element() 17 | { 18 | if (lookahead.type == TreeLexer::NAME) { 19 | Tree::Element name = Tree::Element(match_and_eat_name()); 20 | return name; 21 | } 22 | else if (lookahead.type == TreeLexer::LPAREN) { 23 | Tree::Element tree = Tree::Element(match_and_eat_tree()); 24 | return tree; 25 | } 26 | else { 27 | throw std::string("expecting tok or tree; found:") + "<>"; //Lexer.names_by_type[lookahead.type]; 28 | // wtf gcc, requiring this 29 | // invalid, but will never happen 30 | return Tree::Element(0); 31 | } 32 | } 33 | 34 | void TreeParser::match_and_eat_elements() 35 | { 36 | Tree::Element tok_or_tree = match_and_eat_element(); 37 | current_open_tree().add_child(tok_or_tree); 38 | while (lookahead.type != TreeLexer::RPAREN) { 39 | tok_or_tree = match_and_eat_element(); 40 | current_open_tree().add_child(tok_or_tree); 41 | } 42 | } 43 | 44 | Tree* TreeParser::match_and_eat_tree() 45 | { 46 | match(TreeLexer::LPAREN); 47 | // open tree; 48 | Tree::TagType tag = match_and_eat_name(); 49 | open_node(tag); 50 | // push onto open tree stack 51 | 52 | match_and_eat_elements(); 53 | // append to open tree 54 | // close tree 55 | match(TreeLexer::RPAREN); 56 | return close_node(); 57 | } 58 | 59 | 60 | void TreeParser::match(int token_type) 61 | { 62 | if ( this->lookahead.type == token_type ) 63 | consume(); 64 | else 65 | throw std::string("expecting ") + "<>" /*+ lexer.get_token_name(x)*/ + 66 | "; found " + lookahead.text; 67 | } 68 | -------------------------------------------------------------------------------- /tree-parser/tree_parser.h: -------------------------------------------------------------------------------- 1 | #ifndef TREE_PARSER_H 2 | #define TREE_PARSER_H 3 | 4 | /*** 5 | * Excerpted from "Language Implementation Patterns", 6 | * published by The Pragmatic Bookshelf. 7 | * Copyrights apply to this code. It may not be used to create training material, 8 | * courses, books, articles, and the like. Contact us if you are in doubt. 9 | * We make no guarantees that this code is fit for any purpose. 10 | * Visit http://www.pragmaticprogrammer.com/titles/tpdsl for more book information. 11 | ***/ 12 | 13 | #include "tree_lexer.h" 14 | #include "tree.h" 15 | 16 | #include 17 | #include 18 | using namespace std; 19 | 20 | 21 | class TreeParser { 22 | public: 23 | TreeParser(TreeLexer& input) 24 | : lexer(input) 25 | { consume(); } 26 | 27 | Tree* match_and_eat_tree(); 28 | 29 | private: 30 | Tree::Element match_and_eat_element(); 31 | 32 | Tree& current_open_tree() { 33 | return *open_nodes.back(); 34 | } 35 | 36 | void match_and_eat_elements(); 37 | 38 | std::string match_and_eat_name(); 39 | 40 | 41 | /// Upon a '(', start a new [sub]tree 42 | void open_node(Tree::TagType tag) { 43 | Tree* t = new Tree(tag); 44 | open_nodes.push_back(t); 45 | } 46 | 47 | /// Upon a ')', close most-open tree 48 | Tree* close_node() { 49 | Tree* t = open_nodes.back(); 50 | open_nodes.pop_back(); 51 | return t; 52 | } 53 | 54 | // 'advance' might be a better name 55 | void consume() { 56 | this->lookahead = lexer.next_token(); 57 | } 58 | 59 | /// insist on, but discard, the next token 60 | void match(int token_type); 61 | 62 | private: 63 | std::vector open_nodes; 64 | TreeLexer& lexer; 65 | Token lookahead; 66 | }; 67 | 68 | #endif // TREE_PARSER_H 69 | --------------------------------------------------------------------------------