├── README ├── parse.py └── parse.py~ /README: -------------------------------------------------------------------------------- 1 | 2 | A Jython interface for simple Subject-Verb-Object Extraction from Stanford Phrase Structure tree 3 | 4 | Running Instructions 5 | 1) install jython - Eg: sudo apt-get install jython 6 | 2) Install stanford Parser :(http://nlp.stanford.edu/software/stanford-parser-2012-07-09.tgz) 7 | Extract and unzip to a location -- tar -xzf stanford-parser-2012-07-09.tgz 8 | 9 | 3) Add Stanford Parser to Environment Variable 10 | echo "export STANFORD_PARSER_HOME=./stanford-parser-2012-07-09" >> ~/.bashrc 11 | 12 | 4) Run jython parse.py 13 | 14 | -------------------------------------------------------------------------------- /parse.py: -------------------------------------------------------------------------------- 1 | """ 2 | A Jython Interface to Stanford Parser for Extracting Subject-Verb-Object Relationships 3 | """ 4 | import sys 5 | import os 6 | import pdb 7 | 8 | from java.io import CharArrayReader 9 | assert os.getenv("STANFORD_PARSER_HOME")!=None 10 | 11 | sys.path.append(os.getenv("STANFORD_PARSER_HOME")+os.sep+"stanford-parser.jar") 12 | 13 | from edu.stanford.nlp import * 14 | 15 | class SVO(object): 16 | """ 17 | Class Methods to Extract Subject Verb Object Tuples from a Sentence 18 | """ 19 | def __init__(self): 20 | """ 21 | Initialize the SVO Methods 22 | """ 23 | self.parser = parser.lexparser.LexicalizedParser.loadModel() 24 | self.tlp = trees.PennTreebankLanguagePack() 25 | self.parser.setOptionFlags(["-maxLength", "80", "-retainTmpSubcategories"]) 26 | self.noun_types = ["NN", "NNP", "NNPS","NNS","PRP"] 27 | self.verb_types = ["VB","VBD","VBG","VBN", "VBP", "VBZ"] 28 | self.adjective_types = ["JJ","JJR"] 29 | self.pred_verb_phrase_siblings = None 30 | 31 | def get_attributes(self,node,parent_node, parent_node_siblings): 32 | """ 33 | returns the Attributes for a Node 34 | """ 35 | 36 | def get_subject(self,sub_tree): 37 | """ 38 | Returns the Subject and all attributes for a subject, sub_tree is a Noun Phrase 39 | """ 40 | 41 | sub_nodes = [] 42 | sub_nodes = sub_tree.subTreeList() 43 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 44 | subject = None 45 | 46 | for each in sub_nodes: 47 | if each.value() in self.noun_types: 48 | subject = each.getChildrenAsList()[0].value() 49 | break 50 | 51 | return {'subject':subject, 'attributes' : None} 52 | 53 | def get_object(self,sub_tree): 54 | """ 55 | Returns an Object with all attributes of an object 56 | """ 57 | siblings = self.pred_verb_phrase_siblings 58 | Object = None 59 | for each_tree in siblings: 60 | if each_tree.value() in ["NP","PP"]: 61 | sub_nodes = each_tree.subTreeList() 62 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 63 | for each in sub_nodes: 64 | if each.value() in self.noun_types: 65 | Object = each.getChildrenAsList()[0].value() 66 | break 67 | break 68 | else: 69 | sub_nodes = each_tree.subTreeList() 70 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 71 | for each in sub_nodes: 72 | if each.value() in self.adjective_types: 73 | Object = each.getChildrenAsList()[0].value() 74 | break 75 | # Get first noun in the tree 76 | self.pred_verb_phrase_siblings = None 77 | return {'object':Object, 'attributes' : None} 78 | 79 | def get_predicate(self, sub_tree): 80 | """ 81 | Returns the Verb along with its attributes, Also returns a Verb Phrase 82 | """ 83 | sub_nodes = [] 84 | sub_nodes = sub_tree.subTreeList() 85 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 86 | predicate = None 87 | pred_verb_phrase_siblings = [] 88 | 89 | for each in sub_nodes: 90 | if each.value() in self.verb_types: 91 | sub_tree = each 92 | predicate = each.getChildrenAsList()[0].value() 93 | 94 | if predicate: 95 | pred_verb_phrase_siblings = sub_tree.siblings(sub_tree.parent(self.tree_root)) 96 | pred_verb_phrase_siblings = [each for each in pred_verb_phrase_siblings if each.value() in ["NP","PP","ADJP"]] 97 | self.pred_verb_phrase_siblings = pred_verb_phrase_siblings 98 | return {'predicate':predicate, 'attributes':None} 99 | 100 | def process_parse_tree(self,parse_tree): 101 | """ 102 | Returns the Subject-Verb-Object Representation of a Parse Tree. 103 | Can Vary depending on number of 'sub-sentences' in a Parse Tree 104 | """ 105 | self.tree_root = parse_tree 106 | # Step 1 - Extract all the parse trees that start with 'S' 107 | candidate_trees = parse_tree.subTreeList() 108 | svo_list = [] # A List of SVO pairs extracted 109 | output_list = [] 110 | output_dict ={} 111 | i=0 112 | 113 | for each in candidate_trees: 114 | subject =None 115 | predicate = None 116 | Object = None 117 | if each.value() in ["S", "SQ", "SBAR", "SBARQ", "SINV", "FRAG"]: 118 | children_list = each.getChildrenAsList() 119 | children_values = [each_child.value() for each_child in children_list] 120 | children_dict = dict(zip(children_values,children_list)) 121 | 122 | # Extract Subject, Verb-Phrase, Objects from Sentence sub-trees 123 | if children_dict.get("NP") is not None: 124 | subject = self.get_subject(children_dict["NP"]) 125 | 126 | if children_dict.get("VP") is not None: 127 | # Extract Verb and Object 128 | i+=1 129 | """ 130 | if i==1: 131 | pdb.set_trace() 132 | """ 133 | predicate = self.get_predicate(children_dict["VP"]) 134 | Object = self.get_object(children_dict["VP"]) 135 | 136 | if subject['subject'] and predicate['predicate'] and Object['object']: 137 | output_dict['subject_info'] = subject 138 | output_dict['predicate_info'] = predicate 139 | output_dict['object_info'] = Object 140 | output_list.append(output_dict) 141 | 142 | return output_list 143 | 144 | def tree_print(self,parse_tree): 145 | """ 146 | returns the Pretty PRinting version for Stanford Parse Tree 147 | """ 148 | parse_tree.pennPrint() 149 | 150 | def get_parse_tree(self,sentence): 151 | """ 152 | returns the Parse Tree of a Sample 153 | """ 154 | self.toke = self.tlp.getTokenizerFactory().getTokenizer(CharArrayReader(sentence)); 155 | wordlist = self.toke.tokenize() 156 | parse_tree = self.parser.parseTree(wordlist) 157 | return parse_tree 158 | 159 | 160 | if __name__=="__main__": 161 | svo = SVO() 162 | tree = svo.get_parse_tree("A rare black squirrel has become a regular visitor to a suburban garden") 163 | svo.tree_print(tree) 164 | val = svo.process_parse_tree(tree) 165 | print val 166 | svo.tree_print(tree) 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /parse.py~: -------------------------------------------------------------------------------- 1 | """ 2 | A Jython Interface to Stanford Parser for Extracting Subject-Verb-Object Relationships 3 | """ 4 | import sys 5 | import os 6 | import pdb 7 | 8 | from java.io import CharArrayReader 9 | assert os.getenv("STANFORD_PARSER_HOME")!=None 10 | 11 | sys.path.append(os.getenv("STANFORD_PARSER_HOME")+os.sep+"stanford-parser.jar") 12 | 13 | from edu.stanford.nlp import * 14 | 15 | class SVO(object): 16 | """ 17 | Class Methods to Extract Subject Verb Object Tuples from a Sentence 18 | """ 19 | def __init__(self): 20 | """ 21 | Initialize the SVO Methods 22 | """ 23 | self.parser = parser.lexparser.LexicalizedParser.loadModel() 24 | self.tlp = trees.PennTreebankLanguagePack() 25 | self.parser.setOptionFlags(["-maxLength", "80", "-retainTmpSubcategories"]) 26 | self.noun_types = ["NN", "NNP", "NNPS","NNS","PRP"] 27 | self.verb_types = ["VB","VBD","VBG","VBN", "VBP", "VBZ"] 28 | self.adjective_types = ["JJ","JJR"] 29 | self.pred_verb_phrase_siblings = None 30 | 31 | def get_attributes(self,node,parent_node, parent_node_siblings): 32 | """ 33 | returns the Attributes for a Node 34 | """ 35 | 36 | def get_subject(self,sub_tree): 37 | """ 38 | Returns the Subject and all attributes for a subject, sub_tree is a Noun Phrase 39 | """ 40 | 41 | sub_nodes = [] 42 | sub_nodes = sub_tree.subTreeList() 43 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 44 | subject = None 45 | 46 | for each in sub_nodes: 47 | if each.value() in self.noun_types: 48 | subject = each.getChildrenAsList()[0].value() 49 | break 50 | 51 | return {'subject':subject, 'attributes' : None} 52 | 53 | def get_object(self,sub_tree): 54 | """ 55 | Returns an Object with all attributes of an object 56 | """ 57 | siblings = self.pred_verb_phrase_siblings 58 | Object = None 59 | for each_tree in siblings: 60 | if each_tree.value() in ["NP","PP"]: 61 | sub_nodes = each_tree.subTreeList() 62 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 63 | for each in sub_nodes: 64 | if each.value() in self.noun_types: 65 | Object = each.getChildrenAsList()[0].value() 66 | break 67 | break 68 | else: 69 | sub_nodes = each_tree.subTreeList() 70 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 71 | for each in sub_nodes: 72 | if each.value() in self.adjective_types: 73 | Object = each.getChildrenAsList()[0].value() 74 | break 75 | # Get first noun in the tree 76 | self.pred_verb_phrase_siblings = None 77 | return {'object':Object, 'attributes' : None} 78 | 79 | def get_predicate(self, sub_tree): 80 | """ 81 | Returns the Verb along with its attributes, Also returns a Verb Phrase 82 | """ 83 | sub_nodes = [] 84 | sub_nodes = sub_tree.subTreeList() 85 | sub_nodes = [each for each in sub_nodes if each.isPreTerminal()] 86 | predicate = None 87 | pred_verb_phrase_siblings = [] 88 | 89 | for each in sub_nodes: 90 | if each.value() in self.verb_types: 91 | sub_tree = each 92 | predicate = each.getChildrenAsList()[0].value() 93 | 94 | if predicate: 95 | pred_verb_phrase_siblings = sub_tree.siblings(sub_tree.parent(self.tree_root)) 96 | pred_verb_phrase_siblings = [each for each in pred_verb_phrase_siblings if each.value() in ["NP","PP","ADJP"]] 97 | self.pred_verb_phrase_siblings = pred_verb_phrase_siblings 98 | return {'predicate':predicate, 'attributes':None} 99 | 100 | def process_parse_tree(self,parse_tree): 101 | """ 102 | Returns the Subject-Verb-Object Representation of a Parse Tree. 103 | Can Vary depending on number of 'sub-sentences' in a Parse Tree 104 | """ 105 | self.tree_root = parse_tree 106 | # Step 1 - Extract all the parse trees that start with 'S' 107 | candidate_trees = parse_tree.subTreeList() 108 | svo_list = [] # A List of SVO pairs extracted 109 | output_dict ={} 110 | i=0 111 | 112 | for each in candidate_trees: 113 | subject =None 114 | predicate = None 115 | Object = None 116 | if each.value() in ["S", "SQ", "SBAR", "SBARQ", "SINV", "FRAG"]: 117 | children_list = each.getChildrenAsList() 118 | children_values = [each_child.value() for each_child in children_list] 119 | children_dict = dict(zip(children_values,children_list)) 120 | 121 | # Extract Subject, Verb-Phrase, Objects from Sentence sub-trees 122 | if children_dict.get("NP") is not None: 123 | subject = self.get_subject(children_dict["NP"]) 124 | 125 | if children_dict.get("VP") is not None: 126 | # Extract Verb and Object 127 | i+=1 128 | """ 129 | if i==1: 130 | pdb.set_trace() 131 | """ 132 | predicate = self.get_predicate(children_dict["VP"]) 133 | Object = self.get_object(children_dict["VP"]) 134 | 135 | if subject['subject'] and predicate['predicate'] and Object['object']: 136 | output_dict['subject_info'] = subject 137 | output_dict['predicate_info'] = predicate 138 | output_dict['object_info'] = Object 139 | 140 | pdb.set_trace() 141 | 142 | return output_dict 143 | 144 | def tree_print(self,parse_tree): 145 | """ 146 | returns the Pretty PRinting version for Stanford Parse Tree 147 | """ 148 | parse_tree.pennPrint() 149 | 150 | def get_parse_tree(self,sentence): 151 | """ 152 | returns the Parse Tree of a Sample 153 | """ 154 | self.toke = self.tlp.getTokenizerFactory().getTokenizer(CharArrayReader(sentence)); 155 | wordlist = self.toke.tokenize() 156 | parse_tree = self.parser.parseTree(wordlist) 157 | return parse_tree 158 | 159 | 160 | if __name__=="__main__": 161 | svo = SVO() 162 | tree = svo.get_parse_tree("A rare black squirrel has become a regular visitor to a suburban garden") 163 | svo.tree_print(tree) 164 | val = svo.process_parse_tree(tree) 165 | print val 166 | svo.tree_print(tree) 167 | 168 | 169 | 170 | 171 | --------------------------------------------------------------------------------