├── LICENSE.md ├── sentences.txt ├── README.md └── sent_to_clauses.py /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rahul Kumar Gond 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sentences.txt: -------------------------------------------------------------------------------- 1 | ///////////////////////////////// works for following sentences //////////////////////////////////// 2 | The dog went to the county fair. 3 | he plays cricket but does not play hockey. 4 | Joe waited for the train, but the train was late. 5 | I can’t believe how fast the dog ran to the county fair. 6 | Joe realized that the train was late while he waited at the train station. 7 | Mary and Samantha arrived at the bus station early but waited until noon for the bus. 
8 | Mary and Samantha left on the bus before I arrived, so I did not see them at the bus station. 9 | Because Mary and Samantha arrived at the bus station before noon, I did not see them at the station. 10 | Mary and Samantha realized that Joe was waiting at the train station after they left on the bus. 11 | I looked for Mary and Samantha at the bus station, but they arrived at the station before noon and left on the bus before I arrived. 12 | 13 | 14 | //////////////////////////////// fails for following sentences /////////////////////////////////////// 15 | Every night the office is vacuumed and dusted by the cleaning crew. 16 | Born in Haverfordwest, Wales to English parents, he first caught the public eye at the age of 13, when he was cast in the starring role of Steven Spielberg's Empire of the Sun. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentence-to-Clauses 2 | A python implementation of extracting clauses from a sentence. 3 | 4 | ## Example 5 | Example of sentences and their clauses. 6 | 7 | Sentence | Clauses 8 | ------------- | ------------- 9 | The dog went to the county fair. | ['The dog went to the county fair'] 10 | He plays cricket but does not play hockey. | ['He plays cricket', 'He does not play hockey'] 11 | Joe waited for the train, but the train was late. | ['Joe waited for the train', 'the train was late'] 12 | I can’t believe how fast the dog ran to the county fair. | ["I ca n't believe", 'the dog ran to the county fair'] 13 | Joe realized that the train was late while he waited at the train station. | ['Joe realized', 'the train was late', 'he waited at the train station'] 14 | Mary and Samantha arrived at the bus station early but waited until noon for the bus. 
"""Split an English sentence into simple clauses using a constituency parse.

The sentence is parsed by a CoreNLP server, the parse tree is cut into
clause-level subtrees, and every verb phrase of a clause is paired with the
remaining words of that clause (mainly its subject).

Example (from the project README)::

    Mary and Samantha arrived at the bus station early but waited until
    noon for the bus.
    -> ['Mary and Samantha arrived at the bus station early',
        'Mary and Samantha waited until noon for the bus']

Dependencies
    Python 3, NLTK, pycorenlp and a CoreNLP server reachable at
    http://localhost:9000/.

Limitations and future work
    This is a small experiment, not a full-fledged project: it works for
    simple sentences and cannot handle complex sentence structures.  Other
    parsers (for example the Berkeley parser) could be tried and compared,
    and more patterns could be added to cover complex sentences.

License
    MIT License - Copyright (c) 2018 Rahul Kumar Gond
"""

import re

import nltk
from pycorenlp import StanfordCoreNLP

# Penn Treebank clause-level labels used to cut the parse tree into clauses.
CLAUSE_LABELS = ("S", "SBAR", "SBARQ", "SINV", "SQ")

# Client for the CoreNLP server that produces the constituency parse.
nlp = StanfordCoreNLP("http://localhost:9000/")


def get_verb_phrases(t):
    """Return the verb phrases found in the subtree ``t`` as strings.

    A "VP" node that has two or more "VP" children is treated as a
    coordination, and each "VP" child is searched on its own, because one
    clause may carry several verb phrases.  Any other "VP" node is returned
    whole.  For example, for

        (ROOT (S (NP (PRP he))
                 (VP (VP (VBZ plays) (NP (NN cricket)))
                     (CC but)
                     (VP (VBZ does) (RB not)
                         (VP (VB play) (NP (NN hockey)))))))

    the verb phrases are "plays cricket" and "does not play hockey".

    Args:
        t: a tree node exposing ``label()``, ``height()``, ``leaves()`` and
            iteration over child subtrees (an NLTK tree in this module).

    Returns:
        A list of verb phrases, each one the space-joined leaves of a "VP".
    """
    if t.label() != "VP":
        # Not inside a verb phrase yet: keep descending through all children.
        candidates = list(t)
    else:
        vp_children = [child for child in t if child.label() == "VP"]
        if len(vp_children) < 2:
            # Zero or one nested "VP": this node is one complete verb phrase.
            return [" ".join(t.leaves())]
        # Coordinated verb phrases: only the "VP" children are of interest.
        candidates = vp_children

    verb_phrases = []
    for child in candidates:
        # height() <= 2 is a POS tag over a single word; nothing to descend into.
        if child.height() > 2:
            verb_phrases.extend(get_verb_phrases(child))
    return verb_phrases


def get_pos(t):
    """Find the nodes to delete from clause ``t`` so only the subject remains.

    Walking from the top of ``t`` downwards, this collects the position of
    the first "VP" nodes that are met and the position of every non-clause
    node that sits next to a nested clause (subordinating conjunctions such
    as after, as, before, if, since, while).  In the "he plays cricket but
    does not play hockey" example, deleting the returned nodes leaves "he".

    Args:
        t: an NLTK ``ParentedTree`` node; ``treeposition()`` is called on its
            descendants.

    Returns:
        A tuple ``(vp_pos, sub_conj_pos)`` of two lists of tree positions.
    """
    labels = [child.label() for child in t]
    # Labels are compared for equality.  A substring search would also match
    # tags that merely contain an "S" (NNS, NNPS, POS, JJS, ...) and wrongly
    # treat ordinary words as nested clauses.
    has_clause_child = any(label in CLAUSE_LABELS for label in labels)

    vp_pos = []
    sub_conj_pos = []

    if has_clause_child:
        # Drop this whole branch to keep subordinating conjunctions in the
        # output.
        for child in t:
            if child.label() in CLAUSE_LABELS:
                nested_vp, nested_conj = get_pos(child)
                vp_pos.extend(nested_vp)
                sub_conj_pos.extend(nested_conj)
            else:
                sub_conj_pos.append(child.treeposition())
    elif "VP" in labels:
        vp_pos.extend(
            child.treeposition() for child in t if child.label() == "VP"
        )
    else:
        # No verb phrase at this level: look one level deeper.
        for child in t:
            if child.height() > 2:
                nested_vp, nested_conj = get_pos(child)
                vp_pos.extend(nested_vp)
                sub_conj_pos.extend(nested_conj)

    return (vp_pos, sub_conj_pos)


def get_clause_list(sent):
    """Return the simple clauses contained in the sentence ``sent``.

    Only the first sentence reported by the parser is processed.

    Args:
        sent: the sentence text sent to the CoreNLP server.

    Returns:
        A list of strings, one per verb phrase, each made of the clause's
        remaining words followed by that verb phrase.  The list is empty
        when the parser reports no sentence.
    """
    parser = nlp.annotate(
        sent, properties={"annotators": "parse", "outputFormat": "json"}
    )
    sentences = parser["sentences"]
    if not sentences:
        # Nothing was parsed (e.g. blank input), so there are no clauses.
        return []
    sent_tree = nltk.tree.ParentedTree.fromstring(sentences[0]["parse"])

    # Cut the tree into clause subtrees.  Reversed pre-order visits
    # descendants before their ancestors, so inner clauses are detached
    # first.
    sub_trees = []
    for sub_tree in reversed(list(sent_tree.subtrees())):
        if sub_tree.label() not in CLAUSE_LABELS:
            continue

        parent = sub_tree.parent()  # None when sub_tree is the root itself
        if parent is not None and parent.label() in CLAUSE_LABELS:
            # A clause nested directly in another clause stays with it.
            continue

        if (len(sub_tree) == 1 and sub_tree.label() == "S"
                and sub_tree[0].label() == "VP"):
            # An "S" that only wraps a verb phrase is not a clause of its own.
            continue

        sub_trees.append(sub_tree)
        if parent is not None:
            del sent_tree[sub_tree.treeposition()]

    # For each clause subtree, extract the relevant simple sentences.
    clause_list = []
    for t in sub_trees:
        verb_phrases = get_verb_phrases(t)

        # Remove the verb phrases and subordinating conjunctions to obtain
        # the rest of the clause (mainly the subject).  Positions are deleted
        # from the largest to the smallest so that removing one node never
        # shifts the index of a node that still has to be removed.
        vp_pos, sub_conj_pos = get_pos(t)
        for position in sorted(set(vp_pos) | set(sub_conj_pos), reverse=True):
            del t[position]

        subject_phrase = " ".join(t.leaves())
        clause_list.extend(
            subject_phrase + " " + verb_phrase for verb_phrase in verb_phrases
        )

    return clause_list


if __name__ == "__main__":
    while True:
        try:
            sent = input("sentence : \n ")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the prompt loop without a traceback.
            break
        # Punctuation and brackets are replaced by spaces before parsing.
        sent = re.sub(r"(\.|,|\?|\(|\)|\[|\])", " ", sent)
        if not sent.strip():
            continue
        print(get_clause_list(sent))