├── .gitignore ├── CitationSentences.csv ├── LICENSE.txt ├── OriginalMinIE.csv ├── README.md ├── SVM_model ├── Data anlysis.py ├── Data │ ├── annotated_sentences.CSV │ ├── annotated_sentences.txt │ ├── annotated_sentences.xlsx │ └── example.CSV ├── Data_preprocessing.py ├── Evaluation_Data │ └── oa_200randsents.txt ├── Example_pipeline.py ├── Pickle_Data │ ├── citation_with_context.pk │ ├── citation_with_context_vec.pk │ ├── polarities.pk │ ├── purposes.pk │ ├── svm_polarity.pk │ └── svm_purpose.pk ├── SVM.py ├── Word_Embedding.py ├── citation_context_analysis.py ├── save_GloVe_model_To_Local.py ├── save_svm_models.py ├── stopwords.txt ├── test.py ├── tfidf.py └── word_embedding_new.py ├── pom.xml └── src └── main ├── java ├── de │ └── uni_mannheim │ │ ├── clausie │ │ ├── ClausIE.java │ │ ├── Options.java │ │ ├── clause │ │ │ ├── Clause.java │ │ │ └── ClauseDetector.java │ │ ├── conjunction │ │ │ └── ProcessConjunctions.java │ │ ├── constituent │ │ │ ├── Constituent.java │ │ │ ├── IndexedConstituent.java │ │ │ ├── PhraseConstituent.java │ │ │ └── XcompConstituent.java │ │ ├── phrase │ │ │ └── Phrase.java │ │ └── proposition │ │ │ ├── DefaultPropositionGenerator.java │ │ │ ├── Proposition.java │ │ │ └── PropositionGenerator.java │ │ ├── constant │ │ ├── CHARACTER.java │ │ ├── CLAUSE_TYPE.java │ │ ├── NE_TYPE.java │ │ ├── POS_TAG.java │ │ ├── REGEX.java │ │ ├── SEPARATOR.java │ │ └── WORDS.java │ │ ├── minie │ │ ├── MinIE.java │ │ ├── annotation │ │ │ ├── AnnotatedPhrase.java │ │ │ ├── AnnotatedProposition.java │ │ │ ├── Attribution.java │ │ │ ├── Modality.java │ │ │ ├── Polarity.java │ │ │ └── Quantity.java │ │ ├── main │ │ │ ├── Extractor.java │ │ │ └── Main.java │ │ ├── minimize │ │ │ ├── Minimization.java │ │ │ ├── object │ │ │ │ ├── ObjAggressiveMinimization.java │ │ │ │ ├── ObjDictionaryMinimization.java │ │ │ │ └── ObjSafeMinimization.java │ │ │ ├── relation │ │ │ │ ├── RelAggressiveMinimization.java │ │ │ │ ├── RelDictionaryMinimization.java │ │ │ │ └── RelSafeMinimization.java │ │ │ └── subject │ │ │ │ ├── SubjAggressiveMinimization.java │ │ │ │ ├── SubjDictionaryMinimization.java │ │ │ │ └── SubjSafeMinimization.java │ │ ├── proposition │ │ │ └── ImplicitExtractions.java │ │ └── subconstituent │ │ │ ├── FrequencyCandidates.java │ │ │ └── SubConstituent.java │ │ └── utils │ │ ├── Dictionary.java │ │ ├── coreNLP │ │ ├── CoreNLPUtils.java │ │ └── DpUtils.java │ │ ├── fastutils │ │ └── FastUtil.java │ │ ├── minie │ │ └── Utils.java │ │ └── phrase │ │ └── PhraseUtils.java ├── tests │ └── minie │ │ ├── Demo.java │ │ ├── DetectCitationDemo.java │ │ └── OriginalMinIE.java └── uk │ └── ac │ └── ucl │ └── cs │ └── mr │ ├── Fact.java │ ├── FactsBean.java │ ├── FactsResource.java │ ├── Main.java │ └── MinIEService.java └── resources ├── clausie-resources ├── clausie.conf ├── dict-adverbs-conj.txt ├── dict-adverbs-ignore.txt ├── dict-adverbs-include.txt ├── dict-complex-transitive.txt ├── dict-copular.txt ├── dict-ext-copular.txt └── dict-not-ext-copular.txt └── minie-resources ├── certainty-verbs.dict ├── certainty-words.dict ├── neg-adverbs.dict ├── neg-determiners.dict ├── neg-words.dict ├── non-subsective-adjectives-cf.dict ├── non-subsective-adjectives-modal.dict ├── non-subsective-adjectives-temp.dict ├── poss-adj.dict ├── poss-adverbs.dict ├── poss-modal.dict ├── poss-neg-words.dict ├── poss-verbs.dict ├── poss-words.dict ├── quantities-adjectives.dict ├── quantities-determiners.dict ├── wiktionary-mw-titles.txt └── wn-mwe.txt /.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/* 3 | .idea/* 4 | *.iml 5 | *~ 6 | /target/ 7 | .classpath 8 | .project 9 | .settings/* 10 | -------------------------------------------------------------------------------- /CitationSentences.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/CitationSentences.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # MinScIE: Citation-centered Open Information Extraction 4 | 5 | An Open Information Extraction (OIE) system which provides structured knowledge enriched with semantic information about citations. This system is based upon the OIE system [MinIE](https://github.com/gkiril/minie). 6 | 7 | ## Open Information Extraction (OIE) 8 | Open Information Extraction (OIE) systems aim to extract previously unseen relations and their arguments from unstructured text in an unsupervised manner. In its simplest form, given a natural language sentence, they extract information in the form of a triple consisting of a subject (S), a relation (R), and an object (O). 9 | 10 | Suppose we have the following input sentence: 11 | ``` 12 | AMD, which is based in U.S., is a technology company. 13 | ``` 14 | 15 | An OIE system aims to make the following extractions: 16 | 17 | ``` 18 | ("AMD"; "is based in"; "U.S.") 19 | ("AMD"; "is"; "technology company") 20 | ``` 21 | 22 | ## Demo 23 | 24 | For the demos, please refer to the classes `tests.minie.Demo.java` and `tests.minie.DetectCitationDemo.java`. 25 | 26 | ## Citing 27 | If you use MinScIE in your work, please cite our [paper](https://madoc.bib.uni-mannheim.de/49216/1/_JCDL19Demo__MinScIE%20%284%29.pdf): 28 | 29 | ``` 30 | @inproceedings{lauscher2019minscie, 31 | title={MinScIE: Citation-centered Open Information Extraction}, 32 | author={Lauscher, Anne and Song, Yide and Gashteovski, Kiril}, 33 | booktitle={Proceedings of ACM/IEEE Joint Conference on Digital Libraries}, 34 | year={2019} 35 | } 36 | ``` 37 | -------------------------------------------------------------------------------- /SVM_model/Data anlysis.py: -------------------------------------------------------------------------------- 1 | # Data analysis, majority/random baselines, and evaluation of those baselines 2 | 3 | import time 4 | import codecs 5 | import random 6 | import numpy as np 7 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix 8 | from nltk import word_tokenize 9 | from nltk.stem import WordNetLemmatizer 10 | 11 | 12 | def main(): 13 | texts_polarities = [] 14 | texts_purposes = [] 15 | texts = [] 16 | polarities = [] 17 | purposes = [] 18 | polarities2 = [] 19 | purposes2 = [] 20 | data_number = 0 21 | polarity_information = {"positive": 0, "neutral": 0, "negative": 0} 22 | purpose_information = {"Criticizing": 0, "Comparison": 0, "Use": 0, "Substantiating": 0, "Basis": 0, "Neutral": 0} 23 | 24 | # import data 25 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 26 | data_number = data_number + 1 27 | parts = line.split('\t') 28 | if parts[12].strip() != "0": 29 | texts_polarities.append(parts[5]) 30 | polarities.append(parts[12].strip()) 31 | if parts[11].strip() != "0": 32 | texts_purposes.append(parts[5]) 33 | purposes.append(parts[11].strip())
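# Label encoding used by the counters below:
#   polarity (column 12): 1 = neutral, 2 = positive, 3 = negative
#   purpose  (column 11): 1 = Criticizing, 2 = Comparison, 3 = Use,
#                         4 = Substantiating, 5 = Basis, 6 = Neutral
#   a value of "0" in either column marks an unannotated sentence and is skipped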
34 | if parts[11].strip() != "0" and parts[12].strip() != "0": 35 | texts.append(parts[5]) 36 | purposes2.append(int(parts[11].strip())) 37 | polarities2.append(int(parts[12].strip())) 38 | if parts[12].strip() == "1": 39 | polarity_information["neutral"] += 1 40 | if parts[12].strip() == "2": 41 | polarity_information["positive"] += 1 42 | if parts[12].strip() == "3": 43 | polarity_information["negative"] += 1 44 | if parts[11].strip() == "1": 45 | purpose_information["Criticizing"] += 1 46 | if parts[11].strip() == "2": 47 | purpose_information["Comparison"] += 1 48 | if parts[11].strip() == "3": 49 | purpose_information["Use"] += 1 50 | if parts[11].strip() == "4": 51 | purpose_information["Substantiating"] += 1 52 | if parts[11].strip() == "5": 53 | purpose_information["Basis"] += 1 54 | if parts[11].strip() == "6": 55 | purpose_information["Neutral"] += 1 56 | print("-------------------------------statistic on data----------------------------------------") 57 | print("[INFO] Total data Number: %s" % data_number) 58 | print("[INFO] Data contains %s citations and %s polarities." % (len(texts_polarities), len(polarities))) 59 | print("[INFO] Data contains %s citations and %s purposes." % (len(texts_purposes), len(purposes))) 60 | print("[INFO] Data contains %s citation contexts and %s polarities and %s Purposes." % (len(texts), len(polarities2), len(purposes2))) 61 | print("[INFO] statistic on polarity %s" % polarity_information) 62 | print("[INFO] statistic on purpose %s" % purpose_information) 63 | print("-------------------------------Example----------------------------------------") 64 | print("[INFO] Example context:\n %s" % (texts[0])) 65 | print("[INFO] Has a polarity value of %s" % (polarities2[0])) 66 | 67 | citation_X = texts 68 | polarity_y = polarities2 69 | purpose_y = purposes2 70 | 71 | 72 | print("-------------------------------Baseline Majority-----------------------------") 73 | y1_result = [] 74 | for i in polarity_y: 75 | y1_result.append(1) 76 | print("y1_result %s" % y1_result) 77 | 78 | y2_result = [] 79 | for i in purpose_y: 80 | y2_result.append(6) 81 | print("y2_result %s" % y2_result) 82 | 83 | #print("purpose_y %s" %purpose_y) 84 | polarity_y = np.asarray(polarity_y) 85 | purpose_y = np.asarray(purpose_y) 86 | y1_result = np.asarray(y1_result) 87 | y2_result = np.asarray(y2_result) 88 | 89 | print("-------------------------------Evaluation on Majority-----------------------------") 90 | print("[INFO] Accuracy score for polarity: %s " % accuracy_score(polarity_y, y1_result)) 91 | print("[INFO] Precision score for polarity: %s " % precision_score(polarity_y, y1_result, average="macro")) 92 | print("[INFO] Recall score for polarity: %s " % recall_score(polarity_y, y1_result,average="macro")) 93 | print("[INFO] F1 score for polarity: %s " % f1_score(polarity_y, y1_result, average="macro")) 94 | 95 | print("[INFO] Accuracy score for purpose: %s " % accuracy_score(purpose_y, y2_result)) 96 | print("[INFO] Precision score for purpose: %s " % precision_score(purpose_y, y2_result, average="macro")) 97 | print("[INFO] Recall score for purpose: %s " % recall_score(purpose_y, y2_result,average="macro")) 98 | print("[INFO] F1 score for purpose: %s " % f1_score(purpose_y, y2_result, average="macro")) 99 | 100 | 101 | 102 | print("-------------------------------Baseline Random-----------------------------") 103 | y1_result = [] 104 | for i in polarity_y: 105 | y1_result.append(random.randint(1,3)) 106 | print("y1_result %s" % y1_result) 107 | 108 | y2_result = [] 109 | for i 
in purpose_y: 110 | y2_result.append(random.randint(1,6)) 111 | print("y2_result %s" % y2_result) 112 | 113 | y1_result = np.asarray(y1_result) 114 | y2_result = np.asarray(y2_result) 115 | 116 | print("-------------------------------Evaluation Random-----------------------------") 117 | print("[INFO] Accuracy score for polarity: %s " % accuracy_score(polarity_y, y1_result)) 118 | print("[INFO] Precision score for polarity: %s " % precision_score(polarity_y, y1_result, average="macro")) 119 | print("[INFO] Recall score for polarity: %s " % recall_score(polarity_y, y1_result, average="macro")) 120 | print("[INFO] F1 score for polarity: %s " % f1_score(polarity_y, y1_result, average="macro")) 121 | 122 | print("[INFO] Accuracy score for purpose: %s " % accuracy_score(purpose_y, y2_result)) 123 | print("[INFO] Precision score for purpose: %s " % precision_score(purpose_y, y2_result, average="macro")) 124 | print("[INFO] Recall score for purpose: %s " % recall_score(purpose_y, y2_result, average="macro")) 125 | print("[INFO] F1 score for purpose: %s " % f1_score(purpose_y, y2_result, average="macro")) 126 | 127 | 128 | 129 | 130 | if __name__ == "__main__": 131 | print("[INFO] Pipeline started") 132 | start_time = time.time() 133 | main() 134 | print("[INFO] Total processing time: %s seconds" % (time.time() - start_time)) -------------------------------------------------------------------------------- /SVM_model/Data/annotated_sentences.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Data/annotated_sentences.xlsx -------------------------------------------------------------------------------- /SVM_model/Data/example.CSV: -------------------------------------------------------------------------------- 1 | W99-0621,A88-1019,1999,The second instantiation finds the borders of phrases beginning and end and then pairs them in an optimal way into different phrases,0,"These problems formulations are similar to those studied in Ramshaw and Marcus, 1995 and Church, 1988; Argamon et al , 1998, respectively",1,"The experimental results presented using the SNoW based approach compare favorably with previously published results, both for NPs and SV phrases",0,"A s important, we present a few experiments that shed light on some of the issues involved in using learned predictors that interact to produce the desired inference",0,6,1, 2 | W99-0621,A88-1019,1999,Our earlier example would be marked for base NPs as: I wont to California last May,0,"This approach has been studied in Church, 1988; Argamon et al , 1998",1,331 Architecture The architecture used for the Open/Close predictors is shown in Figure 2,0,"Two SNoW predictors are used, one to predict if the word currently in consideration is the first in the phrase an open bracket, and the other to predict if it is the last a close bracket",0,6,1, 3 | W99-0621,A88-1019,1999,A lot of the work on shallow parsing over the past years has concentrated on manual construction of rules,0,"The observation that shallow syntactic information can be extracted using local information by examining the pattern itself, its nearby context and the local part-of-speech information has motivated the use of learning methods to recognize these patterns Church, 1988; Ramshaw and Marcus, 1995; Argamon et al , 1998; Cardie and Pierce, 1998",1, Research supported by NSF grants IIS-9801638 and SBR-9873450,0,t Research supported by NSF grant 
CCR-9502540,0,6,1, 4 | -------------------------------------------------------------------------------- /SVM_model/Data_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Preprocessing of the data 2 | 3 | import codecs 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support 6 | from sklearn import svm 7 | from sklearn.model_selection import KFold 8 | from sklearn.multiclass import OneVsRestClassifier 9 | from sklearn.svm import LinearSVC 10 | from sklearn.model_selection import train_test_split 11 | from sklearn import preprocessing 12 | import pickle 13 | import re 14 | from nltk.corpus import stopwords 15 | from nltk.tokenize import word_tokenize 16 | from nltk.stem import WordNetLemmatizer 17 | 18 | citation = "" 19 | citation_with_context = "" 20 | texts = [] 21 | texts_with_context = [] 22 | polarities = [] 23 | purposes = [] 24 | 25 | # import data 26 | # Preprocessing: remove uppercase markup tags, emails, URLs etc. from the original texts. 27 | email_regex = r'[0-9a-zA-Z_]{0,19}@[0-9a-zA-Z]{1,13}\.(?:com|cn|net)' 28 | url_regex = r"\"?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\"?" 29 | 30 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 31 | parts = line.split('\t') 32 | if parts[11].strip() != "0" and parts[12].strip() != "0": 33 | citation_with_context = parts[3] + " " + parts[5] + " " + parts[7] + " " + parts[9] 34 | citation_with_context = re.sub(r'<[A-Z]+>.*?</[A-Z]+>', "", citation_with_context) 35 | citation_with_context = re.sub(url_regex, "", citation_with_context) 36 | texts_with_context.append(citation_with_context) 37 | parts[5] = re.sub(r'<[A-Z]+>.*?</[A-Z]+>', "", parts[5]) 38 | parts[5] = re.sub(url_regex, "", parts[5]) 39 | texts.append(parts[5]) 40 | purposes.append(int(parts[11].strip())) 41 | polarities.append(int(parts[12].strip())) 42 | 43 | preprocessed_text_with_context = [] 44 | # Preprocessing: lemmatization 45 | # Preprocessing: remove stopwords 46 | lemmatizer = WordNetLemmatizer() 47 | stop_words = set(stopwords.words('english')) 48 | print(stop_words) 49 | for sen in texts_with_context: 50 | word_tokens = word_tokenize(sen) 51 | # rebuild the sentence token by token, keeping only informative tokens 52 | filtered_sentence = "" 53 | for w in word_tokens: 54 | w = lemmatizer.lemmatize(w) 55 | w = w.lower() 56 | if w not in stop_words and len(w)>1 and len(w)<40: 57 | if filtered_sentence == "": 58 | filtered_sentence = w 59 | else: 60 | filtered_sentence = filtered_sentence + " " + w 61 | print(filtered_sentence) 62 | preprocessed_text_with_context.append(filtered_sentence) 63 | 64 | print(texts[0]) 65 | print(len(texts_with_context)) 66 | print(len(preprocessed_text_with_context)) 67 | print(len(polarities)) 68 | print(len(purposes)) 69 | 70 | #save 71 | with open('./Pickle_Data/citation_with_context.pk', 'wb') as f: 72 | pickle.dump(texts_with_context, f) 73 | 74 | with open('./Pickle_Data/citation.pk', 'wb') as f: 75 | pickle.dump(texts, f) 76 | 77 | with open('./Pickle_Data/pre_citation_with_context.pk', 'wb') as f: 78 | pickle.dump(preprocessed_text_with_context, f) 79 | 80 | with open('./Pickle_Data/polarities.pk', 'wb') as f: 81 | pickle.dump(polarities, f) 82 | 83 | with open('./Pickle_Data/purposes.pk', 'wb') as f: 84 | pickle.dump(purposes, f) 85 | 86 | # Check whether data are saved 87 | # with open('C:/Users/songi/PycharmProjects/MasterThesis/purposes.pk', 'rb') as f: 88 | # data = pickle.load(f) 89 | # print(data)
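# A minimal sanity-check sketch of the preprocessing above on one made-up
# sentence (illustrative only, not taken from the corpus); it reuses the
# lemmatizer and stop_words defined earlier and assumes the NLTK 'punkt',
# 'wordnet' and 'stopwords' resources are installed:
example = "The proposed algorithms were evaluated on 200 annotated sentences"
example_tokens = [lemmatizer.lemmatize(w).lower() for w in word_tokenize(example)]
example_clean = " ".join(w for w in example_tokens if w not in stop_words and 1 < len(w) < 40)
print("[INFO] Before preprocessing: %s" % example)
print("[INFO] After preprocessing: %s" % example_clean)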
-------------------------------------------------------------------------------- /SVM_model/Example_pipeline.py: -------------------------------------------------------------------------------- 1 | import time 2 | import codecs 3 | from sklearn import svm 4 | from sklearn.model_selection import KFold 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn import metrics 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | import numpy as np 10 | from sklearn.metrics import precision_recall_fscore_support 11 | 12 | from nltk import word_tokenize 13 | from nltk.stem import WordNetLemmatizer 14 | 15 | class LemmaTokenizer(object): 16 | def __init__(self): 17 | self.wnl = WordNetLemmatizer() 18 | def __call__(self, doc): 19 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 20 | 21 | def main(): 22 | texts = [] 23 | polarities = [] 24 | 25 | # import data 26 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 27 | parts = line.split('\t') 28 | if parts[12].strip() !="0": 29 | texts.append(parts[5]) 30 | polarities.append(parts[12].strip()) 31 | print("[INFO] Imported %s citation contexts and %s polarities." % (len(texts), len(polarities))) 32 | print("[INFO] Example context:\n %s" % (texts[0])) 33 | print("[INFO] Has a polarity value of %s" % (polarities[0])) 34 | print(set(polarities)) 35 | 36 | # extract features 37 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer()) 38 | x_counts = count_vect.fit_transform(texts) 39 | print(x_counts) 40 | tfidf_transformer = TfidfTransformer() 41 | x_tfidf = tfidf_transformer.fit_transform(x_counts) 42 | 43 | # convert to numpy structures 44 | x = x_tfidf.toarray() 45 | y = np.asarray(polarities) 46 | 47 | # train classifier 48 | kf = KFold(n_splits=10, shuffle=True) 49 | clf = svm.LinearSVC() 50 | for k, (train, test) in enumerate(kf.split(x, y)): 51 | clf.fit(x[train], y[train]) 52 | #print("[INFO] fold %s, score: %s " % (k, clf.score(x[test], y[test]))) 53 | #print(train) 54 | #print(test) 55 | result = clf.predict(x[test]) 56 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(y[test], result, average="macro" ))) 57 | 58 | 59 | 60 | if __name__ == "__main__": 61 | print("[INFO] Pipeline started") 62 | start_time = time.time() 63 | main() 64 | print("[INFO] Total processing time: %s seconds" % (time.time() - start_time)) -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/citation_with_context.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/citation_with_context.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/citation_with_context_vec.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/citation_with_context_vec.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/polarities.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/polarities.pk 
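The pickled artifacts in this folder are consumed as index-aligned lists: `citation_with_context_vec.pk` holds one averaged 300-dimensional GloVe vector per citation context (built in `word_embedding_new.py`), while `polarities.pk` and `purposes.pk` (below) hold the corresponding integer labels (built in `Data_preprocessing.py`). A minimal sketch of loading and sanity-checking them, assuming the working directory is `SVM_model/`:

import pickle

with open('./Pickle_Data/citation_with_context_vec.pk', 'rb') as f:
    vectors = pickle.load(f)
with open('./Pickle_Data/polarities.pk', 'rb') as f:
    polarities = pickle.load(f)

# vector i and label i describe the same citation context
assert len(vectors) == len(polarities)
print("%d contexts, vector dimension %d" % (len(vectors), len(vectors[0])))
print("polarity labels:", sorted(set(polarities)))  # expected {1, 2, 3}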
-------------------------------------------------------------------------------- /SVM_model/Pickle_Data/purposes.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/purposes.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/svm_polarity.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/svm_polarity.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/svm_purpose.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/svm_purpose.pk -------------------------------------------------------------------------------- /SVM_model/Word_Embedding.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import nltk 3 | from sklearn import svm 4 | from sklearn.model_selection import KFold 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn import metrics 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | import numpy as np 10 | from sklearn.metrics import precision_recall_fscore_support 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from gensim.test.utils import common_texts, get_tmpfile 14 | from gensim.models import Word2Vec, KeyedVectors 15 | import gensim 16 | from nltk import word_tokenize 17 | from nltk.stem import WordNetLemmatizer 18 | import nltk 19 | 20 | 21 | class LemmaTokenizer(object): 22 | def __init__(self): 23 | self.wnl = WordNetLemmatizer() 24 | 25 | def __call__(self, doc): 26 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 27 | 28 | 29 | def main(): 30 | texts_polarities = [] 31 | texts_purposes = [] 32 | texts = [] 33 | polarities = [] 34 | purposes = [] 35 | polarities2 = [] 36 | purposes2 = [] 37 | vector_citation_x = [] 38 | data_number = 0 39 | polarity_information = {"positive": 0, "neutral": 0, "negative": 0} 40 | purpose_information = {"Criticizing": 0, "Comparison": 0, "Use": 0, "Substantiating": 0, "Basis": 0, "Neutral": 0} 41 | 42 | # import data 43 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 44 | data_number = data_number + 1 45 | parts = line.split('\t') 46 | if parts[12].strip() != "0": 47 | texts_polarities.append(parts[5]) 48 | polarities.append(parts[12].strip()) 49 | if parts[11].strip() != "0": 50 | texts_purposes.append(parts[5]) 51 | purposes.append(parts[11].strip()) 52 | if parts[11].strip() != "0" and parts[12].strip() != "0": 53 | texts.append(parts[5]) 54 | purposes2.append(int(parts[11].strip())) 55 | polarities2.append(int(parts[12].strip())) 56 | 57 | citation_X = texts 58 | polarity_y = polarities2 59 | purpose_y = purposes2 60 | 61 | citation_X = np.asarray(citation_X) 62 | print(citation_X) 63 | 64 | #tok_corp=[nltk.word_tokenize(sent) for sent in citation_X] 65 | #model = gensim.models.Word2Vec(tok_corp, min_count=1, size=32) 66 | 67 | #model = 
gensim.models.KeyedVectors.load_word2vec_format('C:/Users/songi/PycharmProjects/MasterThesis/GoogleNews-vectors-negative300.bin', binary=True) 68 | #model.save('word2vec.model') 69 | model = KeyedVectors.load('word2vec.model') 70 | print(model.most_similar('algorithms')) 71 | print(len(model['algorithms'])) 72 | print(model.most_similar('cat')) 73 | 74 | 75 | for sen in citation_X: 76 | token_sen = nltk.word_tokenize(sen) 77 | sen_len = 0 78 | a = np.zeros(300) 79 | for token in token_sen: 80 | if token in model.wv.vocab: 81 | sen_len = sen_len + 1 82 | a = a + model[token] 83 | print(len(a)) 84 | a = a / sen_len 85 | print(a) 86 | vector_citation_x.append(a) 87 | 88 | print(vector_citation_x) 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | if __name__ == "__main__": 97 | print("[INFO] Pipeline started") 98 | main() -------------------------------------------------------------------------------- /SVM_model/citation_context_analysis.py: -------------------------------------------------------------------------------- 1 | import time 2 | import csv 3 | import numpy as np 4 | from nltk import word_tokenize 5 | from nltk.stem import WordNetLemmatizer 6 | 7 | 8 | class LemmaTokenizer(object): 9 | def __init__(self): 10 | self.wnl = WordNetLemmatizer() 11 | 12 | def __call__(self, doc): 13 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 14 | 15 | 16 | def main(): 17 | texts = [] 18 | purpose = [] 19 | polarities = [] 20 | 21 | # import data 22 | i = 0 23 | with open("./example.csv") as f: 24 | reader = csv.reader(f,delimiter=',') 25 | for row in reader: 26 | texts.append([row[3], row[5], row[7], row[9]]) 27 | # row[5] is the citation sentence; row[3], row[7] and row[9] are its 28 | # surrounding context sentences; row[11] and row[12] carry the 29 | # purpose and polarity labels 30 | purpose.append(row[11]) 31 | polarities.append(row[12]) 32 | i = i + 1 33 | print(texts[0]) 34 | 35 | for line in open("./annotated_sentences.csv"): 36 | csv_row = line.split() 37 | #print(line) 38 | 39 | #parts = line.split('\t') 40 | #if parts[12].strip() != "0": 41 | #texts.append(parts[5]) 42 | #polarities.append(parts[12].strip()) 43 | #print("[INFO] Imported %s citation contexts and %s polarities."
% (len(texts), len(polarities))) 44 | #print("[INFO] Example context:\n %s" % (texts[0])) 45 | #print("[INFO] Has a polarity value of %s" % (polarities[0])) 46 | #print(set(polarities)) 47 | 48 | 49 | 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | print("[INFO] Pipeline started") 55 | start_time = time.time() 56 | main() 57 | print("[INFO] Total processing time: %s seconds" % (time.time() - start_time)) -------------------------------------------------------------------------------- /SVM_model/save_GloVe_model_To_Local.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from gensim.models import Word2Vec, KeyedVectors 3 | import nltk 4 | import pickle 5 | import numpy as np 6 | 7 | # load the Stanford GloVe model 8 | filename = 'C:/Users/songi/PycharmProjects/Model/acl_vectors_glove_300d.txt.word2vec' 9 | print('loading model, model file: ', filename) 10 | model = KeyedVectors.load_word2vec_format(filename, binary=False) 11 | 12 | #Some example of word embeding 13 | print('Examples:') 14 | print(model.most_similar('cat')) 15 | 16 | with open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/local_Model.pk', 'wb') as f: 17 | pickle.dump(model, f) 18 | print("--------Vectors saved in local------------") -------------------------------------------------------------------------------- /SVM_model/save_svm_models.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import numpy as np 3 | from sklearn.metrics import precision_recall_fscore_support 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.metrics import precision_score 6 | from sklearn.metrics import recall_score 7 | from sklearn.metrics import f1_score 8 | from sklearn import svm 9 | from sklearn.model_selection import KFold 10 | from sklearn.multiclass import OneVsRestClassifier 11 | from sklearn.svm import LinearSVC 12 | from sklearn.model_selection import train_test_split 13 | from sklearn import preprocessing 14 | from sklearn.linear_model import Ridge 15 | from sklearn.model_selection import cross_val_score 16 | from sklearn.model_selection import GridSearchCV 17 | import pickle 18 | from sklearn.metrics import classification_report 19 | from sklearn.svm import SVC 20 | import itertools 21 | from sklearn.utils import shuffle 22 | 23 | 24 | 25 | #read preprocessed data from local 26 | with open('./Pickle_Data/citation_with_context_vec.pk', 'rb') as f: 27 | vec_texts_with_context = pickle.load(f) 28 | # print(texts_with_context) 29 | with open('./Pickle_Data/pre_citation_with_context_vec.pk', 'rb') as f: 30 | vec_pre_texts_with_context = pickle.load(f) 31 | # 32 | with open('./Pickle_Data/citation_vec.pk', 'rb') as f: 33 | vec_texts = pickle.load(f) 34 | # print(texts) 35 | with open('./Pickle_Data/polarities.pk', 'rb') as f: 36 | polarities = pickle.load(f) 37 | # print(polarities) 38 | with open('./Pickle_Data/purposes.pk', 'rb') as f: 39 | purposes = pickle.load(f) 40 | # print(purposes) 41 | 42 | citation_X = vec_texts 43 | citation_with_context_X = vec_texts_with_context 44 | pre_citation_with_context_X = vec_pre_texts_with_context 45 | polarity_Y = polarities 46 | purpose_Y = purposes 47 | 48 | #change to array 49 | citation_X = np.asarray(citation_X) 50 | citation_with_context_X = np.asarray(citation_with_context_X) 51 | pre_citation_with_context_X = np.asarray(pre_citation_with_context_X) 52 | polarity_Y = np.asarray(polarity_Y) 53 | purpose_Y = np.asarray(purpose_Y) 54 | 
citation_with_context_X.reshape(1,-1) 55 | #print(citation_X) 56 | #print("------------Example of citations and its length:---------------") 57 | #print(len(citation_X)) 58 | #print(citation_X[0]) 59 | #print("------------Example of citations with contexts and its length:------------") 60 | #print(len(citation_with_context_X)) 61 | 62 | 63 | #change NaN element to 0 64 | nan_element = [] 65 | #remove nan in data 66 | for i in range(len(citation_X)): 67 | sample=citation_X[i] 68 | for j in range(len(sample)): 69 | if np.isnan(sample[j]): 70 | sample[j]=0 71 | nan_element.append(i) 72 | #break 73 | # print(nan_element) 74 | # for i in nan_element: 75 | # citation_X = np.delete(citation_X,i,axis = 0) 76 | # polarity_Y = np.delete(polarity_Y,i,axis = 0) 77 | # purpose_Y = np.delete(purpose_Y,i,axis = 0) 78 | 79 | 80 | for i in range(len(citation_with_context_X)): 81 | sample=citation_with_context_X[i] 82 | for j in range(len(sample)): 83 | if np.isnan(sample[j]): 84 | sample[j]=0 85 | 86 | for i in range(len(pre_citation_with_context_X)): 87 | sample = pre_citation_with_context_X[i] 88 | for j in range(len(sample)): 89 | if np.isnan(sample[j]): 90 | sample[j] = 0 91 | 92 | 93 | #shuffle the data 94 | citation_with_context_X, polarity_Y, purpose_Y= shuffle(citation_with_context_X, polarity_Y,purpose_Y, random_state=0) 95 | 96 | 97 | # Use cross validation to evaluate the model on all data (Train and test) 98 | kf = KFold(n_splits=10, shuffle=False) 99 | clf = svm.SVC(kernel='rbf', C=80, gamma=0.4) 100 | accuracy_scores = [] 101 | precision_scores = [] 102 | recall_scores =[] 103 | fscores = [] 104 | for k, (train, test) in enumerate(kf.split(citation_with_context_X, polarity_Y)): 105 | clf.fit(citation_with_context_X[train], polarity_Y[train]) 106 | result = clf.predict(citation_with_context_X[test]) 107 | accuracy_scores.append(accuracy_score(polarity_Y[test], result)) 108 | precision_scores.append(precision_score(polarity_Y[test], result, average="macro")) 109 | recall_scores.append(recall_score(polarity_Y[test], result, average="macro")) 110 | fscores.append(f1_score(polarity_Y[test], result, average="macro")) 111 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(polarity_Y[test], result, average="macro" ))) 112 | 113 | print("Accuracy mean: %s, std. deviation: %s" %(np.mean(accuracy_scores)*100.0,np.std(accuracy_scores)*100.0)) 114 | print("precision_scores mean: %s, std. deviation: %s" %(np.mean(precision_scores)*100.0,np.std(precision_scores)*100.0)) 115 | print("recall_scores mean: %s, std. deviation: %s" %(np.mean(recall_scores)*100.0,np.std(recall_scores)*100.0)) 116 | print("fscores mean: %s, std. 
deviation: %s" %(np.mean(fscores)*100.0,np.std(fscores)*100.0)) 117 | 118 | f = open('Pickle_Data/svm_polarity.pk','wb') 119 | clf.fit(citation_with_context_X, polarity_Y) 120 | print(citation_with_context_X[1].reshape(1,-1)) 121 | result = clf.predict(citation_with_context_X[13].reshape(1,-1)) 122 | print(result) 123 | pickle.dump(clf,f) 124 | f.close() 125 | 126 | 127 | 128 | kf = KFold(n_splits=10, shuffle=False) 129 | clf = svm.SVC(kernel='rbf', C=75, gamma=1.1) 130 | accuracy_scores = [] 131 | precision_scores = [] 132 | recall_scores =[] 133 | fscores = [] 134 | for k, (train, test) in enumerate(kf.split(citation_with_context_X, purpose_Y)): 135 | clf.fit(citation_with_context_X[train], purpose_Y[train]) 136 | result = clf.predict(citation_with_context_X[test]) 137 | accuracy_scores.append(accuracy_score(purpose_Y[test], result)) 138 | precision_scores.append(precision_score(purpose_Y[test], result, average="macro")) 139 | recall_scores.append(recall_score(purpose_Y[test], result, average="macro")) 140 | fscores.append(f1_score(purpose_Y[test], result, average="macro")) 141 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(purpose_Y[test], result, average="macro" ))) 142 | 143 | print("Accuracy mean: %s, std. deviation: %s" %(np.mean(accuracy_scores)*100.0,np.std(accuracy_scores)*100.0)) 144 | print("precision_scores mean: %s, std. deviation: %s" %(np.mean(precision_scores)*100.0,np.std(precision_scores)*100.0)) 145 | print("recall_scores mean: %s, std. deviation: %s" %(np.mean(recall_scores)*100.0,np.std(recall_scores)*100.0)) 146 | print("fscores mean: %s, std. deviation: %s" %(np.mean(fscores)*100.0,np.std(fscores)*100.0)) 147 | 148 | 149 | f = open('Pickle_Data/svm_purpose.pk','wb') 150 | clf.fit(citation_with_context_X, purpose_Y) 151 | result = clf.predict(citation_with_context_X[13].reshape(1,-1)) 152 | print(result) 153 | pickle.dump(clf,f) 154 | f.close() 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /SVM_model/stopwords.txt: -------------------------------------------------------------------------------- 1 | ('the', 'herself', 'our', 'my','yours', 'm', 'your', 'which', 'o', 'shan', 'his', 2 | 'such', 'ain', 'that', 's', 'are', 'was', 'their', 'he', 'being', 3 | 'an', 'there', 'him', 'having', 're', 'it', 'or', 'll', 'ourselves', 4 | 'theirs', 'whom', 'did', 'me', 'than', 'she', 'we', 'd', 5 | 'they', 'themselves', 'itself', 'her', 'those', 'myself', 6 | 'himself', 'a', 'i', 'them', 'this', 'were', 7 | 'is', 'ours', 'be', 'am', 'then', 'to', 'been', 'yourself', 'have', 'so', 8 | 'of', 'same', 'ma', 'by', 'hers', 9 | 'yourselves', 'just', 'you', 't', 'now', 'any', 'y', 'its','A', 'B', 'C', 'D', 'E', 'F', 'G','H', 10 | 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'b', 'c', 'e', 11 | 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o','p','q','r','u','v','w','x','z') -------------------------------------------------------------------------------- /SVM_model/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from gensim.models import Word2Vec, KeyedVectors 3 | import nltk 4 | import pickle 5 | import numpy as np 6 | 7 | # load the Stanford GloVe model 8 | #filename = 'C:/Users/songi/PycharmProjects/Model/glove.6B.300d.txt.word2vec' 9 | #print('loading model, model file: ', filename) 10 | #model = KeyedVectors.load_word2vec_format(filename, binary=False) 11 | 12 | with 
open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/local_Model.pk', 'rb') as f: 13 | model = pickle.load(f) 14 | 15 | 16 | 17 | # Calculate a vector for the user input text 18 | user_input_text = sys.argv[1] 19 | vector_user_input_text = np.zeros(300) 20 | token_sen = nltk.word_tokenize(user_input_text) 21 | print(token_sen) 22 | 23 | sen_len = 0 24 | for token in token_sen: 25 | if token in model.wv.vocab: 26 | sen_len = sen_len + 1 27 | vector_user_input_text = vector_user_input_text + model[token] 28 | #print(model[token][0]) 29 | 30 | vector_user_input_text = vector_user_input_text/sen_len 31 | #print("vector: ", vector_user_input_text) 32 | 33 | 34 | polarity_information = {"positive": 0, "neutral": 0, "negative": 0} 35 | purpose_information = {"Criticizing": 0, "Comparison": 0, "Use": 0, "Substantiating": 0, "Basis": 0, "Neutral": 0} 36 | 37 | f = open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/svm_polarity.pk','rb') 38 | svm_model = pickle.load(f) 39 | f.close() 40 | 41 | result = svm_model.predict(vector_user_input_text.reshape(1,-1)) 42 | polarity = "" 43 | if result == 1: 44 | polarity = "Neutral" 45 | if result == 2: 46 | polarity = "Positive" 47 | if result == 3: 48 | polarity = "Negative" 49 | print(polarity) 50 | 51 | f = open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/svm_purpose.pk','rb') 52 | svm_model = pickle.load(f) 53 | f.close() 54 | 55 | result2 = svm_model.predict(vector_user_input_text.reshape(1,-1)) 56 | purpose = "" 57 | if result2 == 1: 58 | purpose = "Criticizing" 59 | if result2 == 2: 60 | purpose = "Comparison" 61 | if result2 == 3: 62 | purpose = "Use" 63 | if result2 == 4: 64 | purpose = "Substantiating" 65 | if result2 == 5: 66 | purpose = "Basis" 67 | if result2 == 6: 68 | purpose = "Neutral" 69 | print(purpose) 70 | 71 | print ('Number of arguments:', len(sys.argv), 'arguments.') 72 | print ('Argument List:', str(sys.argv[1])) -------------------------------------------------------------------------------- /SVM_model/tfidf.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 2 | import codecs 3 | import numpy as np 4 | from sklearn.metrics import precision_recall_fscore_support 5 | from sklearn import svm 6 | from sklearn.model_selection import KFold 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from sklearn.svm import LinearSVC 9 | from sklearn.model_selection import train_test_split 10 | from sklearn import preprocessing 11 | import pickle 12 | from sklearn.metrics import accuracy_score 13 | from sklearn.metrics import precision_score 14 | from sklearn.metrics import recall_score 15 | from sklearn.metrics import f1_score 16 | 17 | #read preprocessed data from local 18 | with open('./Pickle_Data/citation_with_context.pk', 'rb') as f: 19 | texts_with_context = pickle.load(f) 20 | print(texts_with_context[1]) 21 | with open('./Pickle_Data/pre_citation_with_context.pk', 'rb') as f: 22 | pre_texts_with_context = pickle.load(f) 23 | print(pre_texts_with_context[1]) 24 | with open('./Pickle_Data/citation.pk', 'rb') as f: 25 | texts = pickle.load(f) 26 | print(texts[1]) 27 | with open('./Pickle_Data/polarities.pk', 'rb') as f: 28 | polarities = pickle.load(f) 29 | # print(polarities) 30 | with open('./Pickle_Data/purposes.pk', 'rb') as f: 31 | purposes = pickle.load(f) 32 | # print(purposes) 33 | 34 | citation_X = texts 35 | citation_with_context_X = texts_with_context 36 |
polarity_Y = polarities 37 | purpose_Y = purposes 38 | 39 | citation_X = np.asarray(citation_X) 40 | citation_with_context_X = np.asarray(citation_with_context_X) 41 | polarity_Y = np.asarray(polarity_Y) 42 | purpose_Y = np.asarray(purpose_Y) 43 | 44 | example_document = ["I have an pen dog cat box.", 45 | "I have an apple."] 46 | 47 | print("---------create tfidf matrix for citations with contexts-------------------") 48 | vectorizer = CountVectorizer() 49 | count = vectorizer.fit_transform(citation_with_context_X) 50 | #print(vectorizer.get_feature_names()) 51 | # if "yide" in vectorizer.get_feature_names(): 52 | # print(vectorizer.vocabulary_["yide"]) 53 | print(count.toarray()) 54 | transformer = TfidfTransformer() 55 | tfidf_matrix = transformer.fit_transform(count) 56 | #print(tfidf_matrix.toarray()) 57 | #print(tfidf_matrix.toarray()) 58 | 59 | 60 | polarity_Y = np.asarray(polarities) 61 | purpose_Y = np.asarray(purposes) 62 | tfidf_matrix = tfidf_matrix.toarray() 63 | 64 | # train classifier normal svm 65 | kf = KFold(n_splits=10, shuffle=False) 66 | clf = svm.LinearSVC() 67 | accuracy_scores = [] 68 | precision_scores = [] 69 | recall_scores =[] 70 | fscores = [] 71 | 72 | for k, (train, test) in enumerate(kf.split(tfidf_matrix, polarity_Y)): 73 | clf.fit(tfidf_matrix[train], polarity_Y[train]) 74 | result = clf.predict(tfidf_matrix[test]) 75 | accuracy_scores.append(accuracy_score(polarity_Y[test], result)) 76 | precision_scores.append(precision_score(polarity_Y[test], result, average="macro")) 77 | recall_scores.append(recall_score(polarity_Y[test], result, average="macro")) 78 | fscores.append(f1_score(polarity_Y[test], result, average="macro")) 79 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(polarity_Y[test], result, average="macro" ))) 80 | 81 | print("Accuracy mean: %s, std. deviation: %s" % (np.mean(accuracy_scores) * 100.0, np.std(accuracy_scores) * 100.0)) 82 | print("precision_scores mean: %s, std. deviation: %s" % (np.mean(precision_scores) * 100.0, np.std(precision_scores) * 100.0)) 83 | print("recall_scores mean: %s, std. deviation: %s" % (np.mean(recall_scores) * 100.0, np.std(recall_scores) * 100.0)) 84 | print("fscores mean: %s, std. deviation: %s" % (np.mean(fscores) * 100.0, np.std(fscores) * 100.0)) 85 | 86 | 87 | # train classifier normal svm 88 | kf = KFold(n_splits=10, shuffle=False) 89 | clf = svm.LinearSVC() 90 | accuracy_scores = [] 91 | precision_scores = [] 92 | recall_scores =[] 93 | fscores = [] 94 | 95 | for k, (train, test) in enumerate(kf.split(tfidf_matrix, purpose_Y)): 96 | clf.fit(tfidf_matrix[train], purpose_Y[train]) 97 | result = clf.predict(tfidf_matrix[test]) 98 | accuracy_scores.append(accuracy_score(purpose_Y[test], result)) 99 | precision_scores.append(precision_score(purpose_Y[test], result, average="macro")) 100 | recall_scores.append(recall_score(purpose_Y[test], result, average="macro")) 101 | fscores.append(f1_score(purpose_Y[test], result, average="macro")) 102 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(purpose_Y[test], result, average="macro" ))) 103 | 104 | print("Accuracy mean: %s, std. deviation: %s" % (np.mean(accuracy_scores) * 100.0, np.std(accuracy_scores) * 100.0)) 105 | print("precision_scores mean: %s, std. deviation: %s" % (np.mean(precision_scores) * 100.0, np.std(precision_scores) * 100.0)) 106 | print("recall_scores mean: %s, std. deviation: %s" % (np.mean(recall_scores) * 100.0, np.std(recall_scores) * 100.0)) 107 | print("fscores mean: %s, std.
deviation: %s" % (np.mean(fscores) * 100.0, np.std(fscores) * 100.0)) 108 | # 109 | # 110 | # # x_train1, x_test1, y_train1, y_test1 = train_test_split(tfidf_matrix, polarity_Y, random_state=0, train_size=0.8) 111 | # # print("------SVM model: svm.LinearSVC(). input: vector of each citation, label: polarities-------") 112 | # # clf.fit(x_train1,y_train1) 113 | # # result = clf.predict(x_test1) 114 | # # print(precision_recall_fscore_support(y_test1, result, average="macro")) 115 | # # 116 | # # # one vs the rest 117 | # # print("------SVM model: OneVsRestClassifier. input: vector of each citation, label: polarities-------") 118 | # # result = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(x_train1,y_train1).predict(x_test1) 119 | # # print(precision_recall_fscore_support(y_test1, result, average="macro")) 120 | # # #print(result2) 121 | # 122 | # 123 | # # print("---------create tfidf matrix for citations with contexts-------------------") 124 | # # vectorizer = CountVectorizer() 125 | # # count = vectorizer.fit_transform(texts_with_context) 126 | # # transformer = TfidfTransformer() 127 | # # tfidf_matrix2 = transformer.fit_transform(count) 128 | # # print(len(tfidf_matrix2.toarray())) 129 | # # print(len(tfidf_matrix2.toarray()[2])) 130 | # # tfidf_matrix2 = tfidf_matrix2.toarray() 131 | # # 132 | # # # train classifier svm 133 | # # clf = svm.LinearSVC() 134 | # # x_train2, x_test2, y_train2, y_test2 = train_test_split(tfidf_matrix2, polarity_Y, random_state=0, train_size=0.8) 135 | # # print("------SVM model: svm.LinearSVC(). input: vector of each citation with context, label: polarities-------") 136 | # # clf.fit(x_train2,y_train2) 137 | # # result = clf.predict(x_test2) 138 | # # print(precision_recall_fscore_support(y_test2, result, average="macro")) 139 | # # 140 | # # # one vs the rest 141 | # # print("------SVM model: OneVsRestClassifier. input: vector of each citation with context, label: polarities-------") 142 | # # result = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(x_train2,y_train2).predict(x_test2) 143 | # # print(precision_recall_fscore_support(y_test2, result, average="macro")) 144 | # 145 | # 146 | # 147 | # # print("---------create tfidf matrix for citations with contexts and preprocessing-------------------") 148 | # # vectorizer = CountVectorizer() 149 | # # count = vectorizer.fit_transform(pre_texts_with_context) 150 | # # transformer = TfidfTransformer() 151 | # # tfidf_matrix3 = transformer.fit_transform(count) 152 | # # print(len(tfidf_matrix3.toarray())) 153 | # # print(len(tfidf_matrix3.toarray()[2])) 154 | # # tfidf_matrix3 = tfidf_matrix3.toarray() 155 | # # 156 | # # # train classifier svm 157 | # # clf = svm.LinearSVC() 158 | # # x_train3, x_test3, y_train3, y_test3 = train_test_split(tfidf_matrix3, polarity_Y, random_state=0, train_size=0.8) 159 | # # print("------SVM model: svm.LinearSVC(). input: vector of each citation with context, label: polarities-------") 160 | # # clf.fit(x_train3,y_train3) 161 | # # result = clf.predict(x_test3) 162 | # # print(precision_recall_fscore_support(y_test3, result, average="macro")) 163 | # # 164 | # # # one vs the rest 165 | # # print("------SVM model: OneVsRestClassifier. 
input: vector of each citation with context, label: polarities-------") 166 | # # result = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(x_train3,y_train3).predict(x_test3) 167 | # # print(precision_recall_fscore_support(y_test3, result, average="macro")) -------------------------------------------------------------------------------- /SVM_model/word_embedding_new.py: -------------------------------------------------------------------------------- 1 | #Word Embedding: convert sentences to vectors 2 | from gensim.scripts.glove2word2vec import glove2word2vec 3 | from gensim.models import KeyedVectors 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from gensim.test.utils import common_texts, get_tmpfile 9 | from gensim.models import Word2Vec, KeyedVectors 10 | import gensim 11 | from nltk import word_tokenize 12 | from nltk.stem import WordNetLemmatizer 13 | import nltk 14 | import pickle 15 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 16 | 17 | # glove_input_file = 'glove.6B.300d.txt' 18 | # word2vec_output_file = 'glove.6B.300d.txt.word2vec' 19 | # glove2word2vec(glove_input_file, word2vec_output_file) 20 | 21 | # load the Stanford GloVe model 22 | filename = '../Model/acl_vectors_glove_300d.txt.word2vec' 23 | model = KeyedVectors.load_word2vec_format(filename, binary=False) 24 | 25 | #Some example of word embeding 26 | print(model.most_similar('algorithms')) 27 | print(len(model['algorithms'])) 28 | print(model.most_similar('cat')) 29 | 30 | # Calculate vector for a example sentence 31 | example_sentence = "example sentence with word embeding" 32 | print("Calculate vector for a example sentence: " + example_sentence) 33 | token_sen = nltk.word_tokenize(example_sentence) 34 | sen_len = 0 35 | a = np.zeros(300) 36 | for token in token_sen: 37 | if token in model.wv.vocab: 38 | sen_len = sen_len + 1 39 | a = a + model[token] 40 | print(model[token][0]) 41 | print(len(a)) 42 | a =a/sen_len 43 | print(a[0]) 44 | 45 | 46 | #read preprocessed data from local 47 | with open('./Pickle_Data/citation_with_context.pk', 'rb') as f: 48 | texts_with_context = pickle.load(f) 49 | # print(texts_with_context) 50 | with open('./Pickle_Data/pre_citation_with_context.pk', 'rb') as f: 51 | pre_texts_with_context = pickle.load(f) 52 | # print(pre_texts_with_context) 53 | with open('./Pickle_Data/citation.pk', 'rb') as f: 54 | texts = pickle.load(f) 55 | # print(texts) 56 | with open('./Pickle_Data/polarities.pk', 'rb') as f: 57 | polarities = pickle.load(f) 58 | # print(polarities) 59 | with open('./Pickle_Data/purposes.pk', 'rb') as f: 60 | purposes = pickle.load(f) 61 | # print(purposes) 62 | 63 | 64 | citation_X = texts 65 | citation_with_context_X = texts_with_context 66 | pre_citation_with_context_X = pre_texts_with_context 67 | polarity_Y = polarities 68 | purpose_Y = purposes 69 | 70 | citation_X = np.asarray(citation_X) 71 | citation_with_context_X = np.asarray(citation_with_context_X) 72 | pre_citation_with_context_X = np.asarray(pre_citation_with_context_X) 73 | #print(citation_X) 74 | print("------------Example of citations and its length:---------------") 75 | print(len(citation_X)) 76 | print(citation_X[0]) 77 | print("------------Example of citations with contexts and its length:------------") 78 | print(len(citation_with_context_X)) 79 | print(citation_with_context_X[0]) 80 | 
print("------------Example of citations with contexts and preprocessing and its length:------------") 81 | print(len(pre_citation_with_context_X)) 82 | print(pre_citation_with_context_X[0]) 83 | 84 | print("---------create tfidf matrix-------------------") 85 | vectorizer = CountVectorizer() 86 | count = vectorizer.fit_transform(citation_with_context_X) 87 | #print(vectorizer.get_feature_names()) 88 | #print(vectorizer.vocabulary_) 89 | #print(count.toarray()) 90 | transformer = TfidfTransformer() 91 | tfidf_matrix = transformer.fit_transform(count) 92 | tfidf_matrix = tfidf_matrix.toarray() 93 | print(len(tfidf_matrix)) 94 | 95 | # Calculate for each citation (whiout contexts) a Vector, save those vectors in a List -> vector_citations_X 96 | # vector_citations_X = [] 97 | # sen_index = 0 98 | # tfidf = 0 99 | # vocabulary_index = 0 100 | # for sen in citation_X: 101 | # token_sen = nltk.word_tokenize(sen) 102 | # sum_tfidt = 0 103 | # vec_sen = np.zeros(300) 104 | # for token in token_sen: 105 | # if token in model.wv.vocab and token in vectorizer.get_feature_names(): 106 | # vocabulary_index = vectorizer.vocabulary_[token] 107 | # tfidf = tfidf_matrix[sen_index][vocabulary_index] 108 | # sum_tfidt = sum_tfidt + tfidf 109 | # vec_sen = vec_sen + model[token] * tfidf #each vectors weighted by the tfidf value 110 | # #print(len(vec_sen)) 111 | # vec_sen = vec_sen / sum_tfidt 112 | # vector_citations_X.append(vec_sen) 113 | # sen_index =sen_index + 1 114 | # print("--------Number of citations converted to vectors: ------------") 115 | # print(len(vector_citations_X)) 116 | # #print("-------Example vector for first citation: --------------------") 117 | # #print(vector_citation_with_contexts_X[0]) 118 | 119 | # Calculate for each citations (with contexts) a Vector, save those vectors in a List -> vector_citation_with_contexts_X 120 | vector_citation_with_contexts_X = [] 121 | sen_index = 0 122 | vocabulary_index = 0 123 | for sen in citation_with_context_X: 124 | token_sen = nltk.word_tokenize(sen) 125 | vec_sen = np.zeros(300) 126 | sen_len=0 127 | for token in token_sen: 128 | if token in model.wv.vocab: 129 | sen_len=sen_len+1 130 | vec_sen = vec_sen + model[token] 131 | #print(len(vec_sen)) 132 | vec_sen = vec_sen / sen_len 133 | vector_citation_with_contexts_X.append(vec_sen) 134 | sen_index = sen_index + 1 135 | print("--------Number of citations converted to vectors: ------------") 136 | print(len(vector_citation_with_contexts_X)) 137 | #print("-------Example vector for first citation: --------------------") 138 | #print(vector_citation_with_contexts_X[0]) 139 | 140 | # # Calculate for each citations (with contexts and preprocessing) a Vector, save those vectors in a List -> vector_citation_with_contexts_X 141 | # vector_pre_citation_with_contexts_X = [] 142 | # sen_index = 0 143 | # tfidf = 0 144 | # vocabulary_index = 0 145 | # for sen in pre_citation_with_context_X: 146 | # token_sen = nltk.word_tokenize(sen) 147 | # sum_tfidf = 0 148 | # vec_sen = np.zeros(300) 149 | # for token in token_sen: 150 | # if token in model.wv.vocab and token in vectorizer.get_feature_names(): 151 | # vocabulary_index = vectorizer.vocabulary_[token] 152 | # tfidf = tfidf_matrix[sen_index][vocabulary_index] 153 | # sum_tfidf = sum_tfidf + tfidf 154 | # vec_sen = vec_sen + model[token] * tfidf 155 | # #print(len(vec_sen)) 156 | # vec_sen = vec_sen / sum_tfidf 157 | # vector_pre_citation_with_contexts_X.append(vec_sen) 158 | # sen_index =sen_index + 1 159 | # print("--------Number of citations converted to 
vectors: ------------") 160 | # print(len(vector_pre_citation_with_contexts_X)) 161 | # #print("-------Example vector for first citation: --------------------") 162 | # #print(vector_citation_with_contexts_X[0]) 163 | 164 | # # Save the vector of each citation to local disk. 165 | # with open('./Pickle_Data/citation_vec.pk', 'wb') as f: 166 | # pickle.dump(vector_citations_X, f) 167 | # print("--------Vectors saved in local------------") 168 | 169 | # Save the vector of each citation (with context) to local disk. 170 | with open('./Pickle_Data/citation_with_context_vec.pk', 'wb') as f: 171 | pickle.dump(vector_citation_with_contexts_X, f) 172 | print("--------Vectors saved in local------------") 173 | 174 | # # Save the vector of each preprocessed citation (with context) to local disk. 175 | # with open('./Pickle_Data/pre_citation_with_context_vec.pk', 'wb') as f: 176 | # pickle.dump(vector_pre_citation_with_contexts_X, f) 177 | # # with open('C:/Users/songi/PycharmProjects/MasterThesis/citation_with_context_vec.pk', 'rb') as f: 178 | # # data = pickle.load(f) 179 | # # print(data[0]) 180 | # print("--------Vectors saved in local------------") -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>de.uni_mannheim</groupId> 6 | <artifactId>minscie</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <packaging>jar</packaging> 9 | 10 | <name>minie</name> 11 | <url>http://maven.apache.org</url> 12 | 13 | <properties> 14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 | <maven.compiler.source>1.8</maven.compiler.source> 16 | <maven.compiler.target>1.8</maven.compiler.target> 17 | </properties> 18 | 19 | <dependencies> 20 | <dependency><groupId>it.unimi.dsi</groupId><artifactId>fastutil</artifactId><version>8.1.0</version></dependency> 21 | <dependency><groupId>org.python</groupId><artifactId>jython-standalone</artifactId><version>2.7.0</version></dependency> 22 | <dependency><groupId>edu.stanford.nlp</groupId><artifactId>stanford-corenlp</artifactId><version>3.8.0</version></dependency> 23 | <dependency><groupId>edu.stanford.nlp</groupId><artifactId>stanford-corenlp</artifactId><version>3.8.0</version><classifier>models</classifier></dependency> 24 | <dependency><groupId>org.glassfish.jersey.containers</groupId><artifactId>jersey-container-grizzly2-http</artifactId><version>2.26</version></dependency> 25 | <dependency><groupId>org.glassfish.jersey.inject</groupId><artifactId>jersey-hk2</artifactId><version>2.26</version></dependency> 26 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-json-jackson</artifactId><version>2.26</version></dependency> 27 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-json-processing</artifactId><version>2.26</version></dependency> 28 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-multipart</artifactId><version>2.26</version></dependency> 29 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-sse</artifactId><version>2.26</version></dependency> 30 | <dependency><groupId>net.sf.jopt-simple</groupId><artifactId>jopt-simple</artifactId><version>6.0-alpha-1</version></dependency> 31 | </dependencies> 32 | 33 | <build> 34 | <resources><resource><directory>${basedir}/src/main/resources</directory></resource></resources> 35 | <plugins> 36 | <plugin> 37 | <groupId>org.apache.maven.plugins</groupId> 38 | <artifactId>maven-assembly-plugin</artifactId> 39 | <configuration> 40 | <archive><manifest><mainClass>de.uni_mannheim.minie.main.Main</mainClass></manifest></archive> 41 | <appendAssemblyId>false</appendAssemblyId> 42 | <descriptorRefs><descriptorRef>jar-with-dependencies</descriptorRef></descriptorRefs> 43 | </configuration> 44 | </plugin> 45 | </plugins> 46 | </build> 47 | </project> -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/Options.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.OutputStream; 9 | import java.io.PrintStream; 10 | import java.net.URL; 11 | import java.util.Arrays; 12 | import java.util.Iterator; 13 | import java.util.Properties; 14 | import java.util.Set; 15 | 16 | import de.uni_mannheim.utils.Dictionary; 17 | import edu.stanford.nlp.ling.IndexedWord; 18 | 19 | /** Options handles the ClausIE settings which should be loaded out of a configuration file.
20 | * 21 | * @author Luciano del Corro 22 | * @author Kiril Gashteovski 23 | */ 24 | public class Options { 25 | // information 26 | public Dictionary dictCopular; 27 | public Dictionary dictExtCopular; 28 | public Dictionary dictNotExtCopular; 29 | public Dictionary dictComplexTransitive; 30 | public Dictionary dictAdverbsConj; 31 | public Dictionary dictAdverbsIgnore; 32 | public Dictionary dictAdverbsInclude; 33 | public boolean conservativeSVA; 34 | public boolean conservativeSVOA; 35 | 36 | /** 37 | * Process coordinating conjunctions with common components. All other verbal coordinating 38 | * conjunctions will always be processed. 39 | * 40 | * Example: some sentence 41 | * Option on: ... 42 | * Option off: 43 | * 44 | * default value 45 | * 46 | * Example sentence that is not affected 47 | */ 48 | public boolean processCcAllVerbs; 49 | public boolean processCcNonVerbs; 50 | public boolean processAppositions; 51 | public boolean processPossessives; 52 | public boolean processPartmods; 53 | //public boolean processPassive = false; // NOT SUPPORTED FOR NOW (collapsed semantic graph needed but less stable) 54 | // added for possessives 55 | 56 | // representation 57 | public boolean nary; 58 | public int minOptionalArgs; // only when nary=false 59 | public int maxOptionalArgs; // only when nary=false 60 | public boolean lemmatize; 61 | public String appositionVerb; 62 | public String possessiveVerb; 63 | 64 | // helpers 65 | 66 | /**Constructs the set of options out of a conf file (clausie.conf)*/ 67 | public Options() { 68 | try { 69 | InputStream in = getClass().getResource("/clausie-resources/clausie.conf").openStream(); 70 | setOptions(in); 71 | in.close(); 72 | } catch (IOException e) { 73 | // should not happen 74 | throw new RuntimeException(e); 75 | } 76 | } 77 | 78 | /**Constructs the set of options out of a conf file (fileOrResourceName)*/ 79 | public Options(String fileOrResourceName) throws IOException { 80 | InputStream in = openFileOrResource(fileOrResourceName); 81 | setOptions(in); 82 | in.close(); 83 | } 84 | 85 | private InputStream openFileOrResource(String name) throws IOException { 86 | try { 87 | File file = new File(name); 88 | return new FileInputStream(file); 89 | } catch (FileNotFoundException e) { 90 | } 91 | URL url = getClass().getResource(name); 92 | if (url == null) { 93 | throw new IOException("File or resource '" + name + "' not found."); 94 | } 95 | return url.openStream(); 96 | } 97 | 98 | /** Load options from the configuration file*/ 99 | public void setOptions(InputStream optionsStream) throws IOException { 100 | Properties prop = new Properties(); 101 | prop.load(optionsStream); 102 | 103 | // load the required options 104 | conservativeSVA = Boolean.parseBoolean(getProperty(prop, "conservativeSVA")); 105 | conservativeSVOA = Boolean.parseBoolean(getProperty(prop, "conservativeSVOA")); 106 | processCcAllVerbs = Boolean.parseBoolean(getProperty(prop, "processCcAllVerbs")); 107 | processCcNonVerbs = Boolean.parseBoolean(getProperty(prop, "processCcNonVerbs")); 108 | processAppositions = Boolean.parseBoolean(getProperty(prop, "processAppositions")); 109 | appositionVerb = getProperty(prop, "appositionVerb"); 110 | processPossessives = Boolean.parseBoolean(getProperty(prop, "processPossessives")); 111 | possessiveVerb = getProperty(prop, "possessiveVerb"); 112 | processPartmods = Boolean.parseBoolean(getProperty(prop, "processPartmods")); 113 | lemmatize = Boolean.parseBoolean(getProperty(prop, "lemmatize")); 114 | nary = 
Boolean.parseBoolean(getProperty(prop, "nary")); 115 | minOptionalArgs = Integer.parseInt(getProperty(prop, "minOptionalArgs")); 116 | maxOptionalArgs = Integer.parseInt(getProperty(prop, "maxOptionalArgs")); 117 | 118 | // get dictionaries 119 | dictCopular = getDictionary(prop, "dictCopular"); 120 | dictExtCopular = getDictionary(prop, "dictExtCopular"); 121 | dictNotExtCopular = getDictionary(prop, "dictNotExtCopular"); 122 | dictComplexTransitive = getDictionary(prop, "dictComplexTransitive"); 123 | dictAdverbsConj = getDictionary(prop, "dictAdverbsConj"); 124 | dictAdverbsIgnore = getDictionary(prop, "dictAdverbsIgnore"); 125 | dictAdverbsInclude = getDictionary(prop, "dictAdverbsInclude"); 126 | 127 | // check for unused properties 128 | if (!prop.isEmpty()) { 129 | System.err.println( "Unknown option(s): " 130 | + Arrays.toString( prop.keySet().toArray() )); 131 | } 132 | } 133 | 134 | /** Returns a required option (key) */ 135 | private String getProperty(Properties prop, String key) throws IOException { 136 | String result = prop.getProperty(key); 137 | if (result == null) { 138 | throw new IOException("Missing option: " + key); 139 | } 140 | prop.remove(key); 141 | return result; 142 | } 143 | 144 | /**Loads a dictionary (key) */ 145 | private Dictionary getDictionary(Properties prop, String key) throws IOException { 146 | String name = getProperty(prop, key); 147 | InputStream in = openFileOrResource(name); 148 | Dictionary dict = new Dictionary(); 149 | dict.load(in); 150 | in.close(); 151 | return dict; 152 | } 153 | 154 | /**Checks if the copular dictionary contains a given word*/ 155 | public boolean isCop(IndexedWord word) { 156 | return dictCopular.containsLemmatized(word); 157 | } 158 | 159 | /**Checks if the extended copular dictionary contains a given word*/ 160 | public boolean isExtCop(IndexedWord word) { 161 | return dictExtCopular.containsLemmatized(word); 162 | } 163 | 164 | /**Checks if the non-extended copular dictionary contains a given word*/ 165 | public boolean isNotExtCop(IndexedWord word) { 166 | return dictNotExtCopular.containsLemmatized(word); 167 | } 168 | 169 | /**Checks if the complex transitive dictionary contains a given word*/ 170 | public boolean isComTran(IndexedWord word) { 171 | return dictComplexTransitive.containsLemmatized(word); 172 | } 173 | 174 | /**Returns a string with some initial words of a given dictionary*/ 175 | private String someWords(Set dict) { 176 | if (dict.isEmpty()) return ""; 177 | StringBuffer result = new StringBuffer(); 178 | Iterator it = dict.iterator(); 179 | String sep = ""; 180 | result.append(" ("); 181 | for(int i=0; i<3 && it.hasNext(); i++) { 182 | result.append(sep); 183 | result.append(it.next()); 184 | sep = ", "; 185 | } 186 | if (it.hasNext()) result.append(", ..."); 187 | result.append(")"); 188 | return result.toString(); 189 | } 190 | 191 | public void print(OutputStream out) { 192 | print(out, ""); 193 | } 194 | 195 | /**Print settings*/ 196 | public void print(OutputStream out, String prefix) { 197 | PrintStream pout = new PrintStream(out); 198 | 199 | pout.println(prefix + "CLAUSE DETECTION"); 200 | pout.println(prefix + " Dict. copular : " + dictCopular.size() + someWords(dictCopular.words)); 201 | pout.println(prefix + " Dict. ext-copular : " + dictExtCopular.size() + someWords(dictExtCopular.words)); 202 | pout.println(prefix + " Dict. not ext.-cop. : " + dictNotExtCopular.size() + someWords(dictNotExtCopular.words)); 203 | pout.println(prefix + " Dict. complex trans. 
: " + dictComplexTransitive.size() + someWords(dictComplexTransitive.words)); 204 | pout.println(prefix + " Dict. ignored adverb : " + dictAdverbsIgnore.size() + someWords(dictAdverbsIgnore.words)); 205 | pout.println(prefix + " Dict. included adverb: " + dictAdverbsInclude.size() + someWords(dictAdverbsInclude.words)); 206 | pout.println(prefix + " Dict. conj adverbs : " + dictAdverbsConj.size() + someWords(dictAdverbsConj.words)); 207 | pout.println(prefix + " Conservative SVA : " + conservativeSVA); 208 | pout.println(prefix + " Conservative SVOA : " + conservativeSVOA); 209 | pout.println(prefix + " Process all verb CCs : " + processCcAllVerbs); 210 | pout.println(prefix + " Process non-verb CCs : " + processCcNonVerbs); 211 | pout.println(prefix + " Process appositions : " + processAppositions); 212 | pout.println(prefix + " Process possessives : " + processPossessives); 213 | pout.println(prefix + " Process partmods : " + processPartmods); 214 | 215 | pout.println(prefix + ""); 216 | pout.println(prefix + "REPRESENTATION"); 217 | pout.println(prefix + " n-ary propositions : " + nary); 218 | pout.println(prefix + " Min. opt. args : " + minOptionalArgs); 219 | pout.println(prefix + " Max. opt. args : " + maxOptionalArgs); 220 | pout.println(prefix + " Lemmatize : " + lemmatize); 221 | pout.println(prefix + " Appositions verb : \"" + appositionVerb + "\""); 222 | pout.println(prefix + " Possessive verb : \"" + possessiveVerb + "\""); 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/Constituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import edu.stanford.nlp.ling.IndexedWord; 4 | 5 | /** 6 | * A constituent of a clause. 7 | * 8 | * @author Luciano del Corro 9 | * @author Kiril Gashteovski 10 | * 11 | */ 12 | public abstract class Constituent { 13 | 14 | // -- types ----------------------------------------------------------------------------------- 15 | 16 | /** Constituent types */ 17 | public enum Type { 18 | SUBJECT, VERB, DOBJ, IOBJ, COMPLEMENT, CCOMP, XCOMP, ACOMP, ADVERBIAL, UNKOWN 19 | }; 20 | 21 | /** Constituent status (could be one of the three: required, optional or ignore) */ 22 | public enum Status { 23 | REQUIRED, OPTIONAL, IGNORE 24 | }; 25 | 26 | /** The root vertex of this constituent in {@link #semanticGraph}. This vertex and all its 27 | * descendants are part of the constituent (unless they appear in {@link #excludedVertexes}). */ 28 | protected IndexedWord root; 29 | 30 | // -- member variables ------------------------------------------------------------------------ 31 | 32 | /** Type of this constituent */ 33 | protected Type type; 34 | 35 | 36 | // -- construction ---------------------------------------------------------------------------- 37 | 38 | /** Constructs a constituent of the specified type. */ 39 | protected Constituent(Type type) { 40 | this.type = type; 41 | } 42 | 43 | /** Constructs a constituent of unknown type. */ 44 | protected Constituent() { 45 | this.type = Type.UNKOWN; 46 | } 47 | 48 | // -- getters/setters ------------------------------------------------------------------------- 49 | 50 | /** Returns the type of this constituent. 
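* The type is one of the {@link Type} values (e.g. SUBJECT, VERB, DOBJ or ADVERBIAL).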
*/ 51 | public Type getType() { 52 | return type; 53 | } 54 | 55 | public IndexedWord getRoot() { 56 | return this.root; 57 | } 58 | 59 | // -- utility methods ------------------------------------------------------------------------- 60 | 61 | /** Returns a textual representation of the root word of this constituent. */ 62 | public abstract String rootString(); 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/IndexedConstituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import java.util.List; 4 | import java.util.Set; 5 | import java.util.TreeSet; 6 | 7 | import de.uni_mannheim.utils.coreNLP.DpUtils; 8 | import edu.stanford.nlp.ling.IndexedWord; 9 | import edu.stanford.nlp.semgraph.SemanticGraph; 10 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 11 | 12 | /** A constituent of a clause described by a {@link SemanticGraph}. 13 | * 14 | * Each constituent has a root vertex. The root together with its descendants forms the 15 | * constituent. In some cases, additional vertexes need to be included or excluded; 16 | * these vertexes are also recorded within this class. 17 | * 18 | * Note that the {@link SemanticGraph} may or may not match the graph of the input sentences or the 19 | * other constituents of the same clause. For example, the semantic graphs are modified when 20 | * processing coordinating conjunctions. 21 | * 22 | * @author Luciano del Corro 23 | * @author Kiril Gashteovski 24 | */ 25 | public class IndexedConstituent extends Constituent { 26 | 27 | // -- member variables ------------------------------------------------------------------------ 28 | 29 | /** Semantic graph for this sentence */ 30 | //protected static SemanticGraph sentSemanticGraph; 31 | //protected SemanticGraph sentSemanticGraph; 32 | 33 | /** Semantic graph for this constituent */ 34 | private SemanticGraph semanticGraph; 35 | 36 | /** Additional root vertexes that form this constituent. These vertexes and all their descendants 37 | * are part of the constituent (unless they appear in {@link #excludedVertexes}). */ 38 | private Set<IndexedWord> additionalVertexes; 39 | 40 | /** Vertexes that are excluded from this constituent. All descendants are excluded as well 41 | * (unless they appear in {@link #root} or {@link #additionalVertexes}). */ 42 | public Set<IndexedWord> excludedVertexes; 43 | 44 | // -- construction ---------------------------------------------------------------------------- 45 | 46 | protected IndexedConstituent() { 47 | } 48 | 49 | /** Constructs a new indexed constituent. 
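* The constituent covers the root vertex and its descendants, together with any additional vertexes and minus the excluded vertexes.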
50 | * 51 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 52 | * @param root The root vertex of this constituent ({@see {@link #root}) 53 | * @param additionalVertexes Additional root vertexes that form this constituent ({@see 54 | * {@link #additionalVertexes}) 55 | * @param excludedVertexes Vertexes that are excluded from this constituent ({@see 56 | * {@link #excludedVertexes}) 57 | * @param type type of this constituent 58 | */ 59 | public IndexedConstituent(SemanticGraph semanticGraph, IndexedWord root, Set<IndexedWord> additionalVertexes, 60 | Set<IndexedWord> excludedVertexes, Type type) { 61 | super(type); 62 | this.semanticGraph = semanticGraph; 63 | this.root = root; 64 | this.additionalVertexes = new TreeSet<IndexedWord>(additionalVertexes); 65 | this.excludedVertexes = new TreeSet<IndexedWord>(excludedVertexes); 66 | } 67 | 68 | /** Constructs a simple indexed constituent without additional or excluded vertexes. 69 | * 70 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 71 | * @param root The root vertex of this constituent ({@see {@link #root}) 72 | * @param type type of this constituent 73 | */ 74 | public IndexedConstituent(SemanticGraph semanticGraph, IndexedWord root, Type type) { 75 | this(semanticGraph, root, new TreeSet<IndexedWord>(), new TreeSet<IndexedWord>(), type); 76 | } 77 | 78 | /** Creates a deep copy of this indexed constituent. */ 79 | @Override 80 | public IndexedConstituent clone() { 81 | IndexedConstituent clone = new IndexedConstituent(); 82 | clone.type = type; 83 | clone.semanticGraph = new SemanticGraph(semanticGraph); 84 | clone.root = this.root; 85 | clone.additionalVertexes = new TreeSet<IndexedWord>(this.additionalVertexes); 86 | clone.excludedVertexes = new TreeSet<IndexedWord>(this.excludedVertexes); 87 | return clone; 88 | } 89 | 90 | // -- getters/setters ------------------------------------------------------------------------- 91 | 92 | /** Returns the semantic graph for this constituent ({@see #semanticGraph}). */ 93 | public SemanticGraph getSemanticGraph() { 94 | return semanticGraph; 95 | } 96 | 97 | /** Returns the semantic graph for this sentence ({@see #sentSemanticGraph}). */ 98 | /*public SemanticGraph getSentSemanticGraph() { 99 | return sentSemanticGraph; 100 | }*/ 101 | 102 | /** Sets the semantic graph for this constituent ({@see #semanticGraph}). */ 103 | public void setSemanticGraph(SemanticGraph newSemanticGraph) { 104 | this.semanticGraph = newSemanticGraph; 105 | } 106 | 107 | /** Returns the root vertex of this constituent ({@see {@link #root}). */ 108 | public IndexedWord getRoot() { 109 | return root; 110 | } 111 | 112 | /** Sets the root vertex of this constituent ({@see {@link #root}). */ 113 | public void setRoot(IndexedWord newRoot) { 114 | root = newRoot; 115 | } 116 | 117 | /** Returns additional root vertexes that form this constituent ({@see 118 | * {@link #additionalVertexes}). */ 119 | public Set<IndexedWord> getAdditionalVertexes() { 120 | return additionalVertexes; 121 | } 122 | 123 | /** Returns vertexes that are excluded from this constituent ({@see {@link #excludedVertexes}). */ 124 | public Set<IndexedWord> getExcludedVertexes() { 125 | return excludedVertexes; 126 | } 127 | 128 | /** Checks whether this constituent is a prepositional phrase (i.e., starts with a preposition). */ 129 | public boolean isPrepositionalPhrase(SemanticGraph sentSemanticGraph) { // This is a mess, find another way of fixing it. This is purely heuristic. 
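// Heuristic used below: the constituent counts as prepositional if its root hangs off a 'rel' edge, or off a 'prep' edge whose governor is itself attached through an 'rcmod' relation.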
130 | // It needs to know the semantic graph for the sentence; after this is fixed, the member variable sentSemanticGraph 131 | // can be removed 132 | List<IndexedWord> parents = semanticGraph.getParentList(root); // This is not the cleanest way; the semantics are messed up. 133 | // Especially with the rel we cannot just check if 134 | // the head is a preposition 135 | // (return root.tag().equals("IN")) because the 136 | // parser sometimes includes a preposition in the 137 | // verbal phrase "He is about to win" 138 | for(IndexedWord parent: parents) { 139 | SemanticGraphEdge edge = semanticGraph.getEdge(parent, root); 140 | if(DpUtils.isRel(edge)) 141 | return true; 142 | if(DpUtils.isAnyPrep(edge)) { 143 | List<IndexedWord> ancestors = sentSemanticGraph.getParentList(parent); 144 | 145 | for(IndexedWord ancestor: ancestors) { 146 | SemanticGraphEdge ed = sentSemanticGraph.getEdge(ancestor, parent); 147 | if(DpUtils.isRcmod(ed)) 148 | return true; 149 | } 150 | 151 | } 152 | } 153 | return false; 154 | //return root.tag().equals("IN"); 155 | } 156 | 157 | // -- utility methods ------------------------------------------------------------------------- 158 | 159 | /** Returns a textual representation of the root word of this constituent. */ 160 | public String rootString() { 161 | return root.originalText(); 162 | } 163 | 164 | /** Returns a copy of the semantic graph of this constituent in which all edges (from any 165 | * included vertex) to excluded vertexes have been removed. Useful for proposition generation. */ 166 | public SemanticGraph createReducedSemanticGraph() { 167 | SemanticGraph result = new SemanticGraph(semanticGraph); 168 | DpUtils.removeEdges(result, root, excludedVertexes); 169 | for (IndexedWord v : additionalVertexes) { 170 | DpUtils.removeEdges(result, v, excludedVertexes); 171 | } 172 | return result; 173 | } 174 | 175 | public void setAdditionalVertexes(Set<IndexedWord> aVertexes){ 176 | this.additionalVertexes = aVertexes; 177 | } 178 | public void addVertexToAdditionalVertexes(IndexedWord w){ 179 | this.additionalVertexes.add(w); 180 | } 181 | 182 | public Type getConstituentType(){ 183 | return this.type; 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/PhraseConstituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 4 | import de.uni_mannheim.clausie.phrase.Phrase; 5 | import edu.stanford.nlp.ling.IndexedWord; 6 | 7 | /** 8 | * A phrase expression of a constituent. The constituent is represented as a Phrase. 9 | * 10 | * @author Kiril Gashteovski 11 | * 12 | */ 13 | public class PhraseConstituent extends Constituent { 14 | /** The constituent as a phrase **/ 15 | private Phrase phrase; 16 | 17 | /** Constructs a constituent with a specified textual representation and type. */ 18 | public PhraseConstituent(Phrase p, Type type) { 19 | super(type); 20 | this.phrase = p; 21 | this.root = p.getRoot(); 22 | } 23 | 24 | /** Returns a textual representation of the constituent. 
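* (Here this is the full word list of the underlying phrase, not only the root word.)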
*/ 25 | public String rootString() { 26 | return this.phrase.getWords(); 27 | } 28 | 29 | /** Adding a word to the list of words of the phrase **/ 30 | public void addWordToList(IndexedWord word){ 31 | this.phrase.addWordToList(word); 32 | } 33 | /** Adding all the elements from a list of indexed words to the list of indexed words of the phrase **/ 34 | public void addWordsToList(ObjectArrayList words){ 35 | this.phrase.addWordsToList(words); 36 | } 37 | public Phrase getPhrase(){ 38 | return this.phrase; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/XcompConstituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import java.util.Set; 4 | import java.util.TreeSet; 5 | 6 | import de.uni_mannheim.clausie.clause.Clause; 7 | import edu.stanford.nlp.ling.IndexedWord; 8 | import edu.stanford.nlp.semgraph.SemanticGraph; 9 | 10 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 11 | 12 | /** An {@code XcompConstituent} of a clause formed out of an xcomp. 13 | * 14 | * Note that the xcomp relation refers to a clause with an external subject. 15 | * The constituent stores the set of clauses that can be derived from the xcomp 16 | * clause. 17 | * 18 | * @author Luciano del Corro 19 | * @author Kiril Gashteovski 20 | * 21 | */ 22 | public class XcompConstituent extends IndexedConstituent { 23 | 24 | /** Clauses derived from this constituent */ 25 | private ObjectArrayList clauses; 26 | 27 | private XcompConstituent() { 28 | } 29 | 30 | /** Constructs a new constituent for the xcomp relation. 31 | * 32 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 33 | * @param root The root vertex of this constituent ({@see {@link #root}) 34 | * @param type type of this constituent 35 | * @param clauses derived from this constituent 36 | */ 37 | public XcompConstituent(SemanticGraph semanticGraph, IndexedWord root, Type type, ObjectArrayList clauses) { 38 | super(semanticGraph, root, type); 39 | this.setClauses(clauses); 40 | } 41 | 42 | /** Constructs a new indexed constituent for the xcomp relation. 43 | * 44 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 45 | * @param root The root vertex of this constituent ({@see {@link #root}) 46 | * @param additionalVertexes Additional root vertexes that form this constituent ({@see 47 | * {@link #additionalVertexes}) 48 | * @param excludedVertexes Vertexes that are excluded from this constituent ({@see 49 | * {@link #excludedVertexes}) 50 | * @param type type of this constituent 51 | * @param clauses derived from this constituent 52 | */ 53 | public XcompConstituent(SemanticGraph semanticGraph, IndexedWord root, Set additionalVertexes, 54 | Set excludedVertexes, Type type, ObjectArrayList clauses) { 55 | super(semanticGraph, root, additionalVertexes, excludedVertexes, type); 56 | this.setClauses(clauses); 57 | } 58 | 59 | /** Returns the clauses derived from the constituent. */ 60 | public ObjectArrayList getClauses() { 61 | return clauses; 62 | } 63 | 64 | /** Sets the clauses derived from the constituent. 
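* @param clauses the clauses derived from this xcomp constituent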
*/ 65 | public void setClauses(ObjectArrayList clauses) { 66 | this.clauses = clauses; 67 | } 68 | 69 | @Override 70 | public XcompConstituent clone() { 71 | XcompConstituent clone = new XcompConstituent(); 72 | clone.type = type; 73 | clone.setSemanticGraph(new SemanticGraph(this.getSemanticGraph())); 74 | clone.root = this.getRoot(); 75 | clone.setAdditionalVertexes(new TreeSet(this.getAdditionalVertexes())); 76 | clone.excludedVertexes = new TreeSet(this.excludedVertexes); 77 | clone.clauses = new ObjectArrayList(clauses); 78 | return clone; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/proposition/DefaultPropositionGenerator.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.proposition; 2 | 3 | import java.util.SortedSet; 4 | import java.util.TreeSet; 5 | 6 | import de.uni_mannheim.clausie.ClausIE; 7 | import de.uni_mannheim.clausie.clause.Clause; 8 | import de.uni_mannheim.clausie.constituent.Constituent; 9 | import de.uni_mannheim.clausie.constituent.IndexedConstituent; 10 | import de.uni_mannheim.clausie.constituent.PhraseConstituent; 11 | import de.uni_mannheim.clausie.constituent.Constituent.Status; 12 | import de.uni_mannheim.clausie.phrase.Phrase; 13 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 14 | 15 | import edu.stanford.nlp.semgraph.SemanticGraph; 16 | 17 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 18 | 19 | 20 | /** 21 | * Currently the default proposition generator generates 3-ary propositions out of a clause 22 | * 23 | * @author Luciano del Corro 24 | * @author Kiril Gashteovski 25 | * 26 | * */ 27 | public class DefaultPropositionGenerator extends PropositionGenerator { 28 | public DefaultPropositionGenerator(ClausIE clausIE) { 29 | super(clausIE); 30 | } 31 | 32 | /** 33 | * @param clause: the clause in which the proposition is generated (and added to the list of propositions in 'clause') 34 | * @param sGraph: semantic graph of the sentence 35 | */ 36 | @Override 37 | public void generate(Clause clause, SemanticGraph sGraph) { 38 | Proposition proposition = new Proposition(); 39 | ObjectArrayList constTypes = new ObjectArrayList(); 40 | 41 | // Process subject 42 | if (clause.getSubject() > -1 && clause.getIncludedConstitsInds().getBoolean(clause.getSubject())) { // subject is -1 when there is an xcomp 43 | Phrase subjPhrase = generate(clause, clause.getSubject(), sGraph); 44 | Constituent subjConstituent = clause.getConstituents().get(clause.getSubject()); 45 | subjPhrase.setRoot(subjConstituent.getRoot()); 46 | proposition.addPhrase(new Phrase(subjPhrase)); 47 | constTypes.add(Constituent.Type.SUBJECT); 48 | } else { 49 | //throw new IllegalArgumentException(); 50 | } 51 | 52 | // Process verb 53 | if (clause.getIncludedConstitsInds().getBoolean(clause.getVerbInd())) { 54 | Phrase relation = generate(clause, clause.getVerbInd(), sGraph); 55 | Constituent verb = clause.getConstituents().get(clause.getVerbInd()); 56 | relation.setRoot(verb.getRoot()); 57 | proposition.addPhrase(new Phrase(relation)); 58 | constTypes.add(Constituent.Type.VERB); 59 | } else { 60 | throw new IllegalArgumentException(); 61 | } 62 | 63 | // Process arguments 64 | SortedSet sortedIndexes = new TreeSet(); 65 | sortedIndexes.addAll(clause.getIobjectsInds()); 66 | sortedIndexes.addAll(clause.getDobjectsInds()); 67 | sortedIndexes.addAll(clause.getXcompsInds()); 68 | sortedIndexes.addAll(clause.getCcompsInds()); 
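// The sorted set collects the indices of all argument constituents (objects, complements, adverbials) so that they are emitted in sentence order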
69 | sortedIndexes.addAll(clause.getAcompsInds()); 70 | sortedIndexes.addAll(clause.getAdverbialInds()); 71 | if (clause.getComplementInd() >= 0) 72 | sortedIndexes.add(clause.getComplementInd()); 73 | for (int index: sortedIndexes) { 74 | Constituent verbConstituent = clause.getConstituents().get(clause.getVerbInd()); 75 | Constituent indexConstituent = clause.getConstituents().get(index); 76 | boolean isVerbIndexedConstituent = verbConstituent instanceof IndexedConstituent; 77 | boolean adverbialsContainIndex = clause.getAdverbialInds().contains(index); 78 | if (isVerbIndexedConstituent && adverbialsContainIndex && 79 | indexConstituent.getRoot().index() < verbConstituent.getRoot().index()) 80 | continue; 81 | 82 | if (clause.getIncludedConstitsInds().getBoolean(index)) { 83 | Phrase argument = generate(clause, index, sGraph); 84 | argument.setRoot(clause.getConstituents().get(index).getRoot()); 85 | proposition.addPhrase(new Phrase(argument)); 86 | constTypes.add(clause.getConstituents().get(index).getType()); 87 | } 88 | } 89 | 90 | // Process adverbials before verb 91 | sortedIndexes.clear(); 92 | sortedIndexes.addAll(clause.getAdverbialInds()); 93 | for (Integer index : sortedIndexes) { 94 | Constituent verbConstituent = clause.getConstituents().get(clause.getVerbInd()); 95 | Constituent indexConstituent = clause.getConstituents().get(index); 96 | boolean isVerbPhraseConstituent = verbConstituent instanceof PhraseConstituent; 97 | // If the verb is a PhraseConstituent or the current constituent's root index is greater than the 98 | // verb constituent's root index -> break 99 | if (isVerbPhraseConstituent || (indexConstituent.getRoot().index() > verbConstituent.getRoot().index())) 100 | break; 101 | if (clause.getIncludedConstitsInds().getBoolean(index)) { 102 | Phrase argument = generate(clause, index, sGraph); 103 | argument.setRoot(clause.getConstituents().get(index).getRoot()); 104 | proposition.getPhrases().add(new Phrase(argument)); 105 | constTypes.add(clause.getConstituents().get(index).getType()); 106 | 107 | if (clause.getConstituentStatus(index, clausIE.getOptions()).equals(Status.OPTIONAL)) { 108 | proposition.addOptionalConstituentIndex(proposition.getPhrases().size()); 109 | } 110 | } 111 | } 112 | 113 | // Make triple if specified + push necessary constituents to the relation 114 | if (!clausIE.getOptions().nary) { 115 | proposition.clearOptionalConstituentIndicesSet(); 116 | if (proposition.getPhrases().size() > 3) { 117 | // Push the necessary constituents to the relation 118 | pushConstituentsToRelation(proposition, constTypes); 119 | 120 | // Merge the rest of the n-ary tuple to the 3rd constituent (making it a triple) 121 | Phrase argPhrase = new Phrase(); 122 | argPhrase.setRoot(proposition.getPhrases().get(2).getRoot()); 123 | for (int i = 2; i < proposition.getPhrases().size(); i++) { 124 | argPhrase.addWordsToList(proposition.getPhrases().get(i).getWordList().clone()); 125 | } 126 | proposition.setPhrase(2, argPhrase); 127 | for (int i = proposition.getPhrases().size() - 1; i > 2; i--) { 128 | proposition.getPhrases().remove(i); 129 | } 130 | } 131 | } 132 | 133 | // We are done 134 | clause.addProposition(proposition); 135 | } 136 | 137 | /** 138 | * Given a constituent index i, push it to the relation of the proposition p 139 | * @param p: the proposition 140 | * @param i: push the i-th phrase to the relation of the proposition 141 | */ 142 | private static void pushConstituentToRelation(Proposition p, int i){ 143 | // New relational phrase. 
The root of the relational phrase is the verb by default 144 | Phrase relation = new Phrase(); 145 | relation.setRoot(p.getPhrases().get(1).getRoot()); 146 | 147 | // Push 148 | relation.addWordsToList(p.getPhrases().get(1).getWordList().clone()); 149 | relation.addWordsToList(p.getPhrases().get(i).getWordList().clone()); 150 | p.setRelation(relation); 151 | 152 | // Clean the i-th constituent 153 | p.getPhrases().get(i).getWordList().clear(); 154 | } 155 | 156 | /** 157 | * Given a proposition and a list of constituency types (corresponding to the phrases of the proposition), 158 | * push the constituents to the relation if needed 159 | * @param p: the proposition 160 | * @param types: the list of constituency types 161 | */ 162 | private static void pushConstituentsToRelation(Proposition p, ObjectArrayList<Constituent.Type> types){ 163 | // Push constituents to the relation if the 4th constituent is an adverbial 164 | // (for SVA(A), SVC(A), SVO(A), SVOA) 165 | if (types.get(3) == Constituent.Type.ADVERBIAL){ 166 | // If the adverbial consists of more than one word, don't push the previous constituent 167 | if (p.getPhrases().get(3).getWordList().size() > 1) { 168 | // If CCOMP don't push it 169 | if (types.get(2) == Constituent.Type.CCOMP) { 170 | return; 171 | } 172 | pushConstituentToRelation(p, 2); 173 | } 174 | // If the adverbial consists of one adverb, push the adverb to the relation 175 | else if (p.getPhrases().get(3).getWordList().size() == 1){ 176 | if (CoreNLPUtils.isAdverb(p.getPhrases().get(3).getWordList().get(0).tag())) 177 | pushConstituentToRelation(p, 3); 178 | else 179 | pushConstituentToRelation(p, 2); 180 | } 181 | } 182 | // If the 3rd constituent is an indirect/direct object or an adverbial (for SVOO/SVOC, SVOA) 183 | else if (types.get(2) == Constituent.Type.IOBJ || types.get(2) == Constituent.Type.DOBJ || 184 | types.get(2) == Constituent.Type.ADVERBIAL){ 185 | pushConstituentToRelation(p, 2); 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/proposition/Proposition.java: -------------------------------------------------------------------------------- 1 | 2 | package de.uni_mannheim.clausie.proposition; 3 | 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import de.uni_mannheim.clausie.phrase.Phrase; 8 | import de.uni_mannheim.constant.SEPARATOR; 9 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 10 | 11 | /** Stores a proposition. 12 | * 13 | * @author Luciano del Corro 14 | * @author Kiril Gashteovski 15 | * 16 | */ 17 | public class Proposition { 18 | 19 | /** Constituents of the proposition */ 20 | private ObjectArrayList<Phrase> phrases = new ObjectArrayList<Phrase>(); 21 | 22 | /** Position of optional constituents */ 23 | private Set<Integer> optional = new HashSet<Integer>(); 24 | 25 | // TODO: types of constituents (e.g., optionality) sentence ID etc. 
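// Phrase layout convention: phrases.get(0) holds the subject, phrases.get(1) the relation, and phrases.get(2) onwards the arguments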
26 | 27 | public Proposition() { 28 | } 29 | 30 | /** 31 | * Removes a word from a constituent 32 | * @param i: the constituent index 33 | * @param j: the word index within the constituent 34 | */ 35 | public void removeWordFromConstituent(int i, int j){ 36 | this.phrases.get(i).removeWordFromList(j); 37 | } 38 | 39 | /** Returns a list of constituents of the proposition */ 40 | public ObjectArrayList getConstituents(){ 41 | return this.phrases; 42 | } 43 | 44 | /** Returns the subject of the proposition */ 45 | public Phrase subject() { 46 | return this.phrases.get(0); 47 | } 48 | 49 | /** Returns the relation of the proposition */ 50 | public Phrase relation() { 51 | return phrases.get(1); 52 | } 53 | 54 | /** Returns the object of the proposition (should be used when working with triples only!) */ 55 | public Phrase object(){ 56 | return phrases.get(2); 57 | } 58 | 59 | /** Sets the relation of the proposition */ 60 | public void setRelation(Phrase rel){ 61 | phrases.set(1, rel); 62 | } 63 | 64 | /** Returns a constituent in a given position */ 65 | public Phrase argument(int i) { 66 | return phrases.get(i + 2); 67 | } 68 | 69 | /** Returns the number of arguments */ 70 | public int noArguments() { 71 | return phrases.size() - 2; 72 | } 73 | 74 | /** Checks if an argument is optional */ 75 | public boolean isOptionalArgument(int i) { 76 | return optional.contains(i + 2); 77 | } 78 | 79 | /** 80 | * Given a proposition, this function turns it into a "sentence" by concatenating the constituents' strings 81 | * @return 82 | */ 83 | public String propositionToString(){ 84 | StringBuffer sb = new StringBuffer(); 85 | for (int i = 0; i < phrases.size(); i++){ 86 | sb.append(phrases.get(i).getWords()); 87 | sb.append(SEPARATOR.SPACE); 88 | } 89 | return sb.toString().trim(); 90 | } 91 | 92 | public ObjectArrayList getPhrases(){ 93 | return this.phrases; 94 | } 95 | public void addPhrase (Phrase p){ 96 | this.phrases.add(p); 97 | } 98 | public void setPhrase(int i, Phrase p){ 99 | this.phrases.set(i, p); 100 | } 101 | /** Get optional constituents' indices **/ 102 | public Set getOptinoalConstituentsIndices(){ 103 | return this.optional; 104 | } 105 | /** Add index of an optional constituent **/ 106 | public void addOptionalConstituentIndex(int i){ 107 | this.optional.add(i); 108 | } 109 | /** Clear the set of optional constituent indices **/ 110 | public void clearOptionalConstituentIndicesSet(){ 111 | this.optional.clear(); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | StringBuffer sb = new StringBuffer(); 117 | String sep = "("; 118 | 119 | for (int i=0; i < phrases.size(); i++) { 120 | String constituent = phrases.get(i).getWords(); 121 | sb.append(sep); 122 | sep = ", "; 123 | sb.append("\""); 124 | sb.append(constituent); 125 | sb.append("\""); 126 | if (optional.contains(i)) { 127 | sb.append("?"); 128 | } 129 | } 130 | sb.append(")"); 131 | return sb.toString(); 132 | } 133 | 134 | @Override 135 | public Proposition clone() { 136 | Proposition clone = new Proposition(); 137 | clone.phrases = new ObjectArrayList(this.phrases.clone()); 138 | clone.optional = new HashSet(this.optional); 139 | return clone; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/proposition/PropositionGenerator.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.proposition; 2 | 3 | import java.util.Collection; 4 | import 
java.util.Collections; 5 | import java.util.HashSet; 6 | import java.util.Set; 7 | import java.util.TreeSet; 8 | 9 | import de.uni_mannheim.clausie.ClausIE; 10 | import de.uni_mannheim.clausie.clause.Clause; 11 | import de.uni_mannheim.clausie.constituent.Constituent; 12 | import de.uni_mannheim.clausie.constituent.IndexedConstituent; 13 | import de.uni_mannheim.clausie.constituent.PhraseConstituent; 14 | import de.uni_mannheim.clausie.phrase.Phrase; 15 | import de.uni_mannheim.utils.coreNLP.DpUtils; 16 | import edu.stanford.nlp.ling.IndexedWord; 17 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 18 | import edu.stanford.nlp.trees.GrammaticalRelation; 19 | import edu.stanford.nlp.semgraph.SemanticGraph; 20 | 21 | /** 22 | * Handles the generation of propositions out of a given clause 23 | * 24 | * @author Luciano del Corro 25 | * @author Kiril Gashteovski 26 | * 27 | */ 28 | public abstract class PropositionGenerator { 29 | 30 | ClausIE clausIE; 31 | 32 | /** Relations to be excluded in every constituent of a clause except the verb */ 33 | protected static final Set<GrammaticalRelation> EXCLUDE_RELATIONS; 34 | 35 | /** Relations to be excluded in the verb */ 36 | protected static final Set<GrammaticalRelation> EXCLUDE_RELATIONS_VERB; 37 | 38 | static { 39 | EXCLUDE_RELATIONS = new HashSet<GrammaticalRelation>(); 40 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER); 41 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.APPOSITIONAL_MODIFIER); 42 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.PARATAXIS); 43 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.valueOf("dep")); 44 | 45 | EXCLUDE_RELATIONS_VERB = new HashSet<GrammaticalRelation>(); 46 | EXCLUDE_RELATIONS_VERB.addAll(EXCLUDE_RELATIONS); 47 | EXCLUDE_RELATIONS_VERB.add(EnglishGrammaticalRelations.valueOf("dep")); // without this, some adverbs or auxiliaries will end up in the relation 48 | } 49 | 50 | /** Constructs a proposition generator*/ 51 | public PropositionGenerator(ClausIE clausIE) { 52 | this.clausIE = clausIE; 53 | } 54 | 55 | /** Generates propositions for a given clause*/ 56 | public abstract void generate(Clause clause, SemanticGraph sGraph); 57 | 58 | /** Generates a textual representation of a given constituent plus a set of words*/ 59 | private Phrase generatePhrase(IndexedConstituent constituent, Collection<IndexedWord> words, SemanticGraph sGraph) { 60 | Phrase phrase = new Phrase(); 61 | 62 | if (constituent.isPrepositionalPhrase(sGraph)) { 63 | // TODO: before, it was: constituent.getRoot().originalText(). For some reason, in the case for 64 | // "in Los Angeles", the word "in" returns empty string for originalText(), and the actual word for word(). 
65 | // Check if this compromises the code in some way 66 | // TODO: see if you could find a faster way to make this check (not to go through the list of all excluded 67 | // words, for instance: use a flag as an input parameter) 68 | if (!constituent.excludedVertexes.contains(constituent.getRoot())){ 69 | phrase.addWordToList(constituent.getRoot()); 70 | } 71 | } 72 | 73 | for (IndexedWord word : words) { 74 | if (DpUtils.filterTokens(word)) 75 | continue; 76 | phrase.addWordToList(word); 77 | } 78 | 79 | return phrase; 80 | } 81 | 82 | /** Generates a textual representation of a given constituent in a given clause*/ 83 | public Phrase generate(Clause clause, int constituentIndex, SemanticGraph sGraph) { 84 | Set excludeRelations = EXCLUDE_RELATIONS; 85 | if (clause.getVerbInd() == constituentIndex) { 86 | excludeRelations = EXCLUDE_RELATIONS_VERB; 87 | } 88 | 89 | return generate(clause, constituentIndex, excludeRelations, Collections. emptySet(), sGraph); 90 | } 91 | 92 | /** Generates a textual representation of a given constituent in a given clause **/ 93 | public Phrase generate(Clause clause, int constituentIndex, Collection excludeRelations, 94 | Collection excludeRelationsTop, SemanticGraph sGraph) { 95 | 96 | Constituent constituent = clause.getConstituents().get(constituentIndex); 97 | 98 | if (constituent instanceof PhraseConstituent) { 99 | PhraseConstituent tConstituent = ((PhraseConstituent) constituent); 100 | return tConstituent.getPhrase(); 101 | } else if (constituent instanceof IndexedConstituent) { 102 | IndexedConstituent iconstituent = (IndexedConstituent) constituent; 103 | SemanticGraph subgraph = iconstituent.createReducedSemanticGraph(); 104 | DpUtils.removeEdges(subgraph, iconstituent.getRoot(), excludeRelations, excludeRelationsTop); 105 | Set words = new TreeSet(subgraph.descendants(iconstituent.getRoot())); 106 | 107 | for (IndexedWord v : iconstituent.getAdditionalVertexes()) { 108 | words.addAll(subgraph.descendants(v)); 109 | } 110 | if (iconstituent.isPrepositionalPhrase(sGraph)) 111 | words.remove(iconstituent.getRoot()); 112 | 113 | Phrase phrase = generatePhrase(iconstituent, words, sGraph); 114 | return phrase; 115 | } else { 116 | throw new IllegalArgumentException(); 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/CHARACTER.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class CHARACTER { 7 | public static String VBAR = "|"; 8 | public static String QMARK = "?"; 9 | public static String ASTERISK = "*"; 10 | public static String PLUS = "+"; 11 | public static String LBRACE = "{"; 12 | public static String RBRACE = "}"; 13 | public static String LBRACKET = "["; 14 | public static String RBRACKET = "]"; 15 | public static String LPARENTHESIS = "("; 16 | public static String RPARENTHESIS = ")"; 17 | public static String CARET = "^"; 18 | public static String EQUAL = "="; 19 | public static String DOT = "."; 20 | public static String COMMA = ","; 21 | public static String TAB = "\t"; 22 | public static String NEW_LINE = "\n"; 23 | public static String CRETURN = "\r"; 24 | public static String SPACE = " "; 25 | public static String MINUS = "-"; 26 | public static String QUOTATION_MARK = "\""; 27 | public static String SEMI_COLON = ";"; 28 | public static String COLON = ":"; 29 | public static String UNDERSCORE = "_"; 
30 | public static String EMPTY_STRING = ""; 31 | public static String LESS = "<"; 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/CLAUSE_TYPE.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class CLAUSE_TYPE { 7 | public static final String ST_SV = "SV"; 8 | public static final String ST_SVA = "SVA"; 9 | public static final String ST_SVO = "SVO"; 10 | public static final String ST_SVC = "SVC"; 11 | public static final String ST_SVOA = "SVOA"; 12 | public static final String ST_SVOO = "SVOO"; 13 | public static final String ST_SVOC = "SVOC"; 14 | public static final String ST_EXISTENTIAL = "EXISTENTIAL"; 15 | public static final String ST_UNKNOWN = "UNKNOWN"; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/NE_TYPE.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class NE_TYPE { 7 | public static final String MISC = "MISC"; 8 | public static final String PERSON = "PERSON"; 9 | public static final String LOCATION = "LOCATION"; 10 | public static final String ORGANIZATION = "ORGANIZATION"; 11 | public static final String DATE = "DATE"; 12 | public static final String DURATION = "DURATION"; 13 | public static final String TIME = "TIME"; 14 | public static final String MONEY = "MONEY"; 15 | public static final String NUMBER = "NUMBER"; 16 | public static final String ORDINAL = "ORDINAL"; 17 | public static final String NO_NER = "O"; 18 | public static final String ENTITY = "ENTITY"; 19 | public static final String SET = "SET"; 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/POS_TAG.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class POS_TAG { 7 | public static final String CD = "CD"; 8 | public static final String DT = "DT"; 9 | public static final String VB = "VB"; 10 | public static final String VBD = "VBD"; 11 | public static final String VBG = "VBG"; 12 | public static final String VBN = "VBN"; 13 | public static final String VBP = "VBP"; 14 | public static final String VBZ = "VBZ"; 15 | public static final String MD = "MD"; 16 | public static final String NN = "NN"; 17 | public static final String NNS = "NNS"; 18 | public static final String NNP = "NNP"; 19 | public static final String NNPS = "NNPS"; 20 | public static final String JJ = "JJ"; 21 | public static final String JJR = "JJR"; 22 | public static final String JJS = "JJS"; 23 | public static final String RB = "RB"; 24 | public static final String RBR = "RBR"; 25 | public static final String RBS = "RBS"; 26 | public static final String RP = "RP"; 27 | public static final String PR = "PR"; // used for both PRP and PRP$ 28 | public static final String PRP = "PRP"; 29 | public static final String PRP_P = "PRP$"; 30 | public static final String WP = "WP"; 31 | public static final String WP_P = "WP$"; 32 | public static final String WDT = "WDT"; 33 | public static final String WRB = "WRB"; 34 | public static final String POS = "POS"; 35 | public static final String SYM = "SYM"; 
36 | public static final String IN = "IN"; 37 | public static final String TO = "TO"; 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/SEPARATOR.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class SEPARATOR { 7 | public static final String SPACE = " "; 8 | public static final String TAB = "\t"; 9 | public static final String COMMA = ","; 10 | public static final String MINUS = "-"; 11 | public static final String NEW_LINE = "\n"; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/WORDS.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | import java.io.IOException; 4 | 5 | import de.uni_mannheim.utils.Dictionary; 6 | 7 | /** 8 | * @author Kiril Gashteovski 9 | */ 10 | public class WORDS { 11 | // A set of non-subsective modal adjectives 12 | public static Dictionary NON_SUBSECTIVE_JJ_MODAL; 13 | static { 14 | try { 15 | NON_SUBSECTIVE_JJ_MODAL = new Dictionary("/minie-resources/non-subsective-adjectives-modal.dict"); 16 | } catch (IOException e) { 17 | throw new Error(e); 18 | } 19 | } 20 | 21 | // A set of non-subsective cf. adjectives 22 | public static Dictionary NON_SUBSECTIVE_JJ_CF; 23 | static { 24 | try { 25 | NON_SUBSECTIVE_JJ_CF = new Dictionary("/minie-resources/non-subsective-adjectives-cf.dict"); 26 | } catch (IOException e) { 27 | throw new Error(e); 28 | } 29 | } 30 | 31 | // A set of non-subsective temp. adjectives 32 | public static Dictionary NON_SUBSECTIVE_JJ_TEMP; 33 | static { 34 | try { 35 | NON_SUBSECTIVE_JJ_TEMP = new Dictionary("/minie-resources/non-subsective-adjectives-temp.dict"); 36 | } catch (IOException e) { 37 | throw new Error(e); 38 | } 39 | } 40 | 41 | public static String word = "word"; 42 | public static String idx = "idx"; 43 | public static String factuality = "Factuality"; 44 | public static String attribution = "Attribution"; 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/annotation/Attribution.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.annotation; 2 | 3 | import de.uni_mannheim.constant.CHARACTER; 4 | import de.uni_mannheim.constant.SEPARATOR; 5 | 6 | /** 7 | * A class representing the attribution 8 | * @param attributionPhrase: a phrase containing the words for the attribution 9 | * @param modality: the modality of the attribution (possibility or certainty) 10 | * @param polarity: the polarity of the attribution (positive or negative) 11 | * @param predicateVerb: the predicate verb (as a string in its lemma version) 12 | * 13 | * @author Kiril Gashteovski 14 | */ 15 | 16 | public class Attribution { 17 | private AnnotatedPhrase attributionPhrase; 18 | private Modality.Type modality; 19 | private Polarity.Type polarity; 20 | private String predicateVerb; 21 | 22 | 23 | /** Some string constants necessary for detecting the attribution **/ 24 | public static String ACCORDING = "according"; 25 | 26 | /** Default constructor: modality == certainty, polarity == positive, attributionPhrase == null */ 27 | public Attribution(){ 28 | this.attributionPhrase = null; 29 | this.modality = Modality.Type.CERTAINTY; 30 | 
this.polarity = Polarity.Type.POSITIVE; 31 | this.predicateVerb = CHARACTER.EMPTY_STRING; 32 | } 33 | 34 | /** Constructor with a given attribution phrase. The modality and polarity are by default 'certainty' and 'positive' 35 | * respectively 36 | * 37 | * @param: attributionPhrase: the attribution phrase 38 | * @param: pVerb: the predicate verb (a string) 39 | */ 40 | public Attribution(AnnotatedPhrase attributionPhrase, String pVerb){ 41 | this.attributionPhrase = attributionPhrase; 42 | this.modality = Modality.Type.CERTAINTY; 43 | this.polarity = Polarity.Type.POSITIVE; 44 | this.predicateVerb = pVerb; 45 | } 46 | 47 | /** 48 | * Fully parameterized constructor 49 | * @param attributionPhrase: the attribution phrase 50 | * @param pol: polarity type 51 | * @param mod: modality type 52 | * @param pVerb: predicate verb 53 | */ 54 | public Attribution(AnnotatedPhrase attributionPhrase, Polarity.Type pol, Modality.Type mod, String pVerb){ 55 | this.attributionPhrase = attributionPhrase; 56 | this.modality = mod; 57 | this.polarity = pol; 58 | this.predicateVerb = pVerb; 59 | } 60 | /** Copy constructor **/ 61 | public Attribution(Attribution s){ 62 | this.attributionPhrase = s.getAttributionPhrase(); 63 | this.modality = s.getModalityType(); 64 | this.polarity = s.getPolarityType(); 65 | this.predicateVerb = s.getPredicateVerb(); 66 | } 67 | 68 | // Getters 69 | public AnnotatedPhrase getAttributionPhrase(){ 70 | return this.attributionPhrase; 71 | } 72 | public Modality.Type getModalityType(){ 73 | return this.modality; 74 | } 75 | public Polarity.Type getPolarityType(){ 76 | return this.polarity; 77 | } 78 | public String getPredicateVerb(){ 79 | return this.predicateVerb; 80 | } 81 | 82 | // Setters 83 | public void setAttributionPhrase(AnnotatedPhrase s){ 84 | this.attributionPhrase = s; 85 | } 86 | public void setModalityType(Modality.Type t){ 87 | this.modality = t; 88 | } 89 | public void setPolarityType(Polarity.Type t){ 90 | this.polarity = t; 91 | } 92 | public void setPredicateVerb(String pVerb){ 93 | this.predicateVerb = pVerb; 94 | } 95 | 96 | // Clear the attribution 97 | public void clear(){ 98 | this.attributionPhrase = null; 99 | this.modality = Modality.Type.CERTAINTY; 100 | this.polarity = Polarity.Type.POSITIVE; 101 | this.predicateVerb = CHARACTER.EMPTY_STRING; 102 | } 103 | 104 | // Write down the attribution in the format (attribution_phrase, predicate, polarity, modality) 105 | @Override 106 | public String toString(){ 107 | StringBuffer sb = new StringBuffer(); 108 | sb.append(CHARACTER.LPARENTHESIS); 109 | 110 | // Append the attribution phrase 111 | for (int i = 0; i < this.attributionPhrase.getWordList().size(); i++) { 112 | sb.append(this.attributionPhrase.getWordList().get(i).word()); 113 | if (i < this.attributionPhrase.getWordList().size() - 1) 114 | sb.append(SEPARATOR.SPACE); 115 | } 116 | 117 | sb.append(SEPARATOR.COMMA); 118 | sb.append(SEPARATOR.SPACE); 119 | 120 | // Append the predicate verb 121 | sb.append("Predicate: "); 122 | sb.append(this.predicateVerb); 123 | sb.append(SEPARATOR.COMMA); 124 | sb.append(SEPARATOR.SPACE); 125 | 126 | // Append the polarity 127 | sb.append("POLARITY: "); 128 | if (this.polarity == Polarity.Type.POSITIVE) 129 | sb.append(Polarity.ST_POSITIVE); 130 | else 131 | sb.append(Polarity.ST_NEGATIVE); 132 | sb.append(SEPARATOR.SPACE); 133 | sb.append(SEPARATOR.COMMA); 134 | sb.append(SEPARATOR.SPACE); 135 | 136 | // Append the modality 137 | sb.append("MODALITY: "); 138 | if (this.modality == Modality.Type.CERTAINTY) 139 | 
sb.append(Modality.ST_CERTAINTY); 140 | else 141 | sb.append(Modality.ST_POSSIBILITY); 142 | sb.append(CHARACTER.RPARENTHESIS); 143 | sb.append(SEPARATOR.SPACE); 144 | sb.append(SEPARATOR.COMMA); 145 | sb.append(SEPARATOR.SPACE); 146 | 147 | sb.append("POLARITY: "); 148 | if (this.polarity == Polarity.Type.POSITIVE) 149 | sb.append(Polarity.ST_POSITIVE); 150 | else 151 | sb.append(Polarity.ST_NEGATIVE); 152 | 153 | return sb.toString().trim(); 154 | } 155 | 156 | /** Return the attribution as a string in format "(Attribution Phrase, (POLARITY, MODALITY))" **/ 157 | public String toStringCompact() { 158 | StringBuffer sb = new StringBuffer(); 159 | sb.append(CHARACTER.LPARENTHESIS); 160 | 161 | // Append the attribution phrase 162 | for (int i = 0; i < this.attributionPhrase.getWordList().size(); i++) { 163 | sb.append(this.attributionPhrase.getWordList().get(i).word()); 164 | if (i < this.attributionPhrase.getWordList().size() - 1) 165 | sb.append(SEPARATOR.SPACE); 166 | } 167 | 168 | sb.append(SEPARATOR.COMMA); 169 | sb.append(SEPARATOR.SPACE); 170 | 171 | // Append the factuality 172 | sb.append(CHARACTER.LPARENTHESIS); 173 | if (this.polarity == Polarity.Type.POSITIVE) 174 | sb.append(Polarity.ST_PLUS); 175 | else 176 | sb.append(Polarity.ST_MINUS); 177 | sb.append(SEPARATOR.COMMA); 178 | if (this.modality == Modality.Type.CERTAINTY) 179 | sb.append(Modality.ST_CT); 180 | else 181 | sb.append(Modality.ST_PS); 182 | sb.append(CHARACTER.RPARENTHESIS); 183 | sb.append(CHARACTER.RPARENTHESIS); 184 | 185 | return sb.toString(); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/annotation/Polarity.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.annotation; 2 | 3 | import edu.stanford.nlp.ling.IndexedWord; 4 | import edu.stanford.nlp.semgraph.SemanticGraph; 5 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 6 | 7 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 8 | 9 | import java.io.IOException; 10 | 11 | import de.uni_mannheim.constant.CHARACTER; 12 | import de.uni_mannheim.constant.POS_TAG; 13 | import de.uni_mannheim.constant.SEPARATOR; 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | import de.uni_mannheim.utils.Dictionary; 16 | 17 | /** 18 | * Annotation for polarity 19 | * 20 | * @author Kiril Gashteovski 21 | */ 22 | public class Polarity { 23 | /** Annotations for polarity, can be just "POSITIVE" or "NEGATIVE" */ 24 | public static enum Type {POSITIVE, NEGATIVE}; 25 | 26 | /** Static strings for polarity **/ 27 | public static String ST_POSITIVE = "POSITIVE"; 28 | public static String ST_PLUS = "+"; 29 | public static String ST_NEGATIVE = "NEGATIVE"; 30 | public static String ST_MINUS = "-"; 31 | 32 | /** List of negative words and edges (if any found) **/ 33 | private ObjectArrayList<IndexedWord> negativeWords; 34 | private ObjectArrayList<SemanticGraphEdge> negativeEdges; 35 | 36 | /** Polarity type **/ 37 | private Polarity.Type polarityType; 38 | 39 | /** A set of all negative words **/ 40 | public static Dictionary NEG_WORDS; 41 | static { 42 | try { 43 | NEG_WORDS = new Dictionary("/minie-resources/neg-words.dict"); 44 | } catch (IOException e) { 45 | throw new Error(e); 46 | } 47 | } 48 | 49 | /** A set of negative adverbs **/ 50 | public static Dictionary NEG_ADVERBS; 51 | static { 52 | try { 53 | NEG_ADVERBS = new Dictionary("/minie-resources/neg-adverbs.dict"); 54 | } catch (IOException e) { 55 | throw new Error(e); 56 | }
57 | } 58 | 59 | /** Set of negative determiners **/ 60 | public static Dictionary NEG_DETERMINERS; 61 | static { 62 | try { 63 | NEG_DETERMINERS = new Dictionary("/minie-resources/neg-determiners.dict"); 64 | } catch (IOException e) { 65 | throw new Error(e); 66 | } 67 | } 68 | 69 | /** Default constructor. Assumes positive polarity type by default **/ 70 | public Polarity(){ 71 | this.polarityType = Type.POSITIVE; 72 | this.negativeEdges = new ObjectArrayList(); 73 | this.negativeWords = new ObjectArrayList(); 74 | } 75 | 76 | /** 77 | * Constructor given the polarity type. Creates empty lists for negative words and edges 78 | * @param t: polarity type 79 | */ 80 | public Polarity(Polarity.Type t){ 81 | this.polarityType = t; 82 | this.negativeEdges = new ObjectArrayList(); 83 | this.negativeWords = new ObjectArrayList(); 84 | } 85 | /** 86 | * Copy constructor 87 | * @param p: polarity object 88 | */ 89 | public Polarity(Polarity p){ 90 | this.polarityType = p.getType(); 91 | this.negativeEdges = p.getNegativeEdges(); 92 | this.negativeWords = p.getNegativeWords(); 93 | } 94 | /** 95 | * Parametric constructor, given the polarity types, negative words, negative edges 96 | * @param t: polarity type 97 | * @param negWords: list of negative words 98 | * @param negEdges: list of negative edges 99 | */ 100 | public Polarity(Polarity.Type t, ObjectArrayList negWords, ObjectArrayList negEdges){ 101 | this.polarityType = t; 102 | this.negativeEdges = negEdges; 103 | this.negativeWords = negWords; 104 | } 105 | 106 | /** Getters **/ 107 | public Polarity.Type getType(){ 108 | return this.polarityType; 109 | } 110 | public ObjectArrayList getNegativeWords(){ 111 | return this.negativeWords; 112 | } 113 | public ObjectArrayList getNegativeEdges(){ 114 | return this.negativeEdges; 115 | } 116 | 117 | /** Setters **/ 118 | public void setType(Polarity.Type t){ 119 | this.polarityType = t; 120 | } 121 | public void setNegativeWords(ObjectArrayList negWords){ 122 | this.negativeWords = negWords; 123 | } 124 | public void setNegativeEdges(ObjectArrayList negEdges){ 125 | this.negativeEdges = negEdges; 126 | } 127 | 128 | /** Adding elements to lists **/ 129 | public void addNegativeEdge(SemanticGraphEdge e){ 130 | this.negativeEdges.add(e); 131 | } 132 | public void addNegativeWord(IndexedWord w){ 133 | this.negativeWords.add(w); 134 | } 135 | 136 | /** Clear the polarity object, i.e. set its default values (type = positive, neg. words and edges are empty lists) */ 137 | public void clear(){ 138 | this.polarityType = Type.POSITIVE; 139 | this.negativeEdges = new ObjectArrayList(); 140 | this.negativeWords = new ObjectArrayList(); 141 | } 142 | 143 | /** 144 | * Given a phrase and its sentence semantic graph, detect the polarity type. If negative polarity is found, add the 145 | * negative words and edges to their appropriate lists from the Polarity class. 
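* Only negative adverbs (from NEG_ADVERBS) and negative determiners (from NEG_DETERMINERS) are checked here.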
146 | * 147 | * @param phrase: phrase (essentially, list of words, which are part of some sentence) 148 | * @param sentenceSemGraph: the semantic graph of the phrase's sentence 149 | * @return polarity object 150 | */ 151 | public static Polarity getPolarity(AnnotatedPhrase phrase, SemanticGraph sentenceSemGraph){ 152 | Polarity pol = new Polarity(); 153 | 154 | for (int i = 0; i < phrase.getWordList().size(); i++){ 155 | // Check for negative adverbs 156 | if (CoreNLPUtils.isAdverb(phrase.getWordList().get(i).tag())){ 157 | if (Polarity.NEG_ADVERBS.contains(phrase.getWordList().get(i).lemma())){ 158 | Polarity.setNegPol(pol, phrase.getWordList().get(i), sentenceSemGraph.getEdge( 159 | sentenceSemGraph.getParent(phrase.getWordList().get(i)), 160 | phrase.getWordList().get(i))); 161 | } 162 | } 163 | // Check for negative determiners 164 | else if (phrase.getWordList().get(i).tag().equals(POS_TAG.DT)){ 165 | if (Polarity.NEG_DETERMINERS.contains(phrase.getWordList().get(i).lemma())){ 166 | Polarity.setNegPol(pol, phrase.getWordList().get(i), sentenceSemGraph.getEdge( 167 | sentenceSemGraph.getParent(phrase.getWordList().get(i)), 168 | phrase.getWordList().get(i))); 169 | } 170 | } 171 | } 172 | 173 | return pol; 174 | } 175 | 176 | /** 177 | * Given a polarity object, negative word and a negative edge, set the polarity type to "negative" and add the 178 | * negative words and edges to their appropriate lists 179 | * 180 | * @param pol: polarity object 181 | * @param negWord: negative word 182 | * @param negEdge: negative edge 183 | */ 184 | private static void setNegPol(Polarity pol, IndexedWord negWord, SemanticGraphEdge negEdge){ 185 | pol.setType(Polarity.Type.NEGATIVE); 186 | pol.addNegativeWord(negWord); 187 | pol.addNegativeEdge(negEdge); 188 | } 189 | 190 | /** Given a polarity object, convert it into a string */ 191 | @Override 192 | public String toString(){ 193 | StringBuffer sb = new StringBuffer(); 194 | sb.append(CHARACTER.LPARENTHESIS); 195 | if (this.polarityType == Polarity.Type.POSITIVE) 196 | sb.append(CHARACTER.PLUS); 197 | else { 198 | sb.append(CHARACTER.MINUS); 199 | sb.append(CHARACTER.COMMA); 200 | sb.append(SEPARATOR.SPACE); 201 | for (SemanticGraphEdge edge: this.negativeEdges){ 202 | sb.append(edge.toString()); 203 | sb.append(CHARACTER.COMMA); 204 | sb.append(SEPARATOR.SPACE); 205 | } 206 | } 207 | 208 | sb.append(CHARACTER.RPARENTHESIS); 209 | return sb.toString().trim(); 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/annotation/Quantity.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.annotation; 2 | 3 | import java.io.IOException; 4 | 5 | import de.uni_mannheim.constant.CHARACTER; 6 | import de.uni_mannheim.utils.Dictionary; 7 | import edu.stanford.nlp.ling.IndexedWord; 8 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 9 | 10 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 11 | 12 | /** 13 | * Annotation for quantity 14 | * 15 | * @author Kiril Gashteovski 16 | */ 17 | public class Quantity { 18 | /** The quantity words */ 19 | private ObjectArrayList qWords; 20 | /** The quantity edges **/ 21 | private ObjectArrayList qEdges; 22 | /** The quantity ID */ 23 | private String id; 24 | 25 | /** A set of quantity determiners **/ 26 | public static Dictionary DT_QUANTITIES; 27 | static { 28 | try { 29 | DT_QUANTITIES = new Dictionary("/minie-resources/quantities-determiners.dict"); 30 | } 
catch (IOException e) { 31 | throw new Error(e); 32 | } 33 | } 34 | 35 | /** A set of quantity adjectives **/ 36 | public static Dictionary JJ_QUANTITIES; 37 | static { 38 | try { 39 | JJ_QUANTITIES = new Dictionary("/minie-resources/quantities-adjectives.dict"); 40 | } catch (IOException e) { 41 | throw new Error(e); 42 | } 43 | } 44 | 45 | /** Static strings used for quantities **/ 46 | public static String ST_QUANTITY = "QUANTITY"; 47 | public static String ST_QUANT = "QUANT"; 48 | 49 | /** Strings used for IDs for quantities **/ 50 | public static String SUBJECT_ID = "S"; 51 | public static String RELATION_ID = "R"; 52 | public static String OBJECT_ID = "O"; 53 | 54 | /** Default constructor **/ 55 | public Quantity() { 56 | this.qWords = new ObjectArrayList(); 57 | this.qEdges = new ObjectArrayList(); 58 | this.id = CHARACTER.EMPTY_STRING; 59 | } 60 | /** Copy constructor **/ 61 | public Quantity(Quantity q){ 62 | this.qWords = q.getQuantityWords(); 63 | this.qEdges = q.getQuantityEdges(); 64 | this.id = q.getId(); 65 | } 66 | /** 67 | * Given a list of indexed words and a list of semantic graph edges, create a quantity object which will have 68 | * qWords as quantity words and qEdges as quantity edges (no ID = empty string) 69 | * @param qWords: quantity words 70 | * @param qEdges: quantity edges 71 | */ 72 | public Quantity(ObjectArrayList qWords, ObjectArrayList qEdges){ 73 | this.qWords = qWords.clone(); 74 | this.qEdges = qEdges.clone(); 75 | this.id = CHARACTER.EMPTY_STRING; 76 | } 77 | /** 78 | * Given a list of indexed words, a list of semantic graph edges and an ID, create a quantity object which will have 79 | * qWords as quantity words and qEdges as quantity edges and ID as an id 80 | * @param qWords: quantity words 81 | * @param qEdges: quantity edges 82 | * @param id: the ID of the quantity 83 | */ 84 | public Quantity(ObjectArrayList qWords, ObjectArrayList qEdges, String id){ 85 | this.qWords = qWords.clone(); 86 | this.qEdges = qEdges.clone(); 87 | this.id = id; 88 | } 89 | 90 | /** Get the quantity words **/ 91 | public ObjectArrayList getQuantityWords(){ 92 | return this.qWords; 93 | } 94 | /** Get the quantity edges **/ 95 | public ObjectArrayList getQuantityEdges(){ 96 | return this.qEdges; 97 | } 98 | /** Get the quantity ID **/ 99 | public String getId(){ 100 | return this.id; 101 | } 102 | 103 | /** Set the quantity words **/ 104 | public void setWords(ObjectArrayList words){ 105 | this.qWords = words; 106 | } 107 | /** Set the quantity edges **/ 108 | public void setEdges(ObjectArrayList edges){ 109 | this.qEdges = edges; 110 | } 111 | /** Set the quantity ID **/ 112 | public void setId(String id){ 113 | this.id = id; 114 | } 115 | 116 | /** Add word to the word list of quantities **/ 117 | public void addWord(IndexedWord w) { 118 | this.qWords.add(w); 119 | } 120 | 121 | /** Given a quantity object, convert it into a string */ 122 | @Override 123 | public String toString(){ 124 | StringBuffer sb = new StringBuffer(); 125 | 126 | // Write the words in the format (WORD_1 WORD_2 ... 
WORD_n) 127 | //sb.append(CHARACTER.LPARENTHESIS); 128 | //sb.append(Quantity.ST_QUANT); 129 | //sb.append(CHARACTER.UNDERSCORE); 130 | sb.append(this.id); 131 | sb.append(CHARACTER.EQUAL); 132 | for (int i = 0; i < this.qWords.size(); i++){ 133 | sb.append(this.qWords.get(i).word()); 134 | if (i < this.qWords.size() - 1) 135 | sb.append(CHARACTER.SPACE); 136 | } 137 | //sb.append(CHARACTER.RPARENTHESIS); 138 | 139 | return sb.toString().trim(); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/main/Extractor.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.main; 2 | 3 | import de.uni_mannheim.clausie.ClausIE; 4 | import de.uni_mannheim.minie.MinIE; 5 | import de.uni_mannheim.utils.Dictionary; 6 | import de.uni_mannheim.utils.minie.Utils; 7 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 8 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * This class acts as a generic interface to the MinIE system 14 | * 15 | * @author Martin Achenbach 16 | * @author Kiril Gashteovski 17 | */ 18 | public class Extractor { 19 | private StanfordCoreNLP parser; 20 | private ClausIE clausIE; 21 | private MinIE minIE; 22 | private Dictionary dictionary; 23 | 24 | /** 25 | * default constructor 26 | */ 27 | public Extractor() { 28 | // initialize the parser 29 | this.parser = CoreNLPUtils.StanfordDepNNParser(); 30 | 31 | // initialize ClausIE 32 | this.clausIE = new ClausIE(); 33 | 34 | // initialize MinIE 35 | this.minIE = new MinIE(); 36 | 37 | // set up default dictionary 38 | try { 39 | this.setDictionary(new Dictionary(Utils.DEFAULT_DICTIONARIES)); 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | 45 | /** 46 | * constructor with dictionary, helpful when running in dictionary mode 47 | * @param dictionary: dictionary 48 | */ 49 | public Extractor(Dictionary dictionary) { 50 | // initialize the parser 51 | this.parser = CoreNLPUtils.StanfordDepNNParser(); 52 | 53 | // initialize ClausIE 54 | this.clausIE = new ClausIE(); 55 | 56 | // initialize MinIE 57 | this.minIE = new MinIE(); 58 | 59 | // set dictionary 60 | this.setDictionary(dictionary); 61 | } 62 | 63 | /** 64 | * set the dictionary for dictionary mode 65 | * @param dictionary: dictionary to use 66 | */ 67 | public void setDictionary(Dictionary dictionary) { 68 | this.dictionary = dictionary; 69 | } 70 | 71 | /** 72 | * analyze a sentence using a specific mode 73 | * @param sentence: sentence to analyze 74 | * @param mode: MinIE mode 75 | * @return the results of MinIE 76 | */ 77 | public MinIE analyzeSentence(String sentence, MinIE.Mode mode) { 78 | // first reset objects 79 | this.clausIE.clear(); 80 | this.minIE.clear(); 81 | 82 | // parse the sentence 83 | this.clausIE.setSemanticGraph(CoreNLPUtils.parse(this.parser, sentence)); 84 | // detect clauses 85 | this.clausIE.detectClauses(); 86 | // generate propositions 87 | this.clausIE.generatePropositions(this.clausIE.getSemanticGraph()); 88 | 89 | // start minimizing 90 | this.minIE.setSemanticGraph(this.clausIE.getSemanticGraph()); 91 | this.minIE.setPropositions(this.clausIE); 92 | this.minIE.setPolarity(); 93 | this.minIE.setModality(); 94 | 95 | // minimize in given mode 96 | switch (mode) { 97 | case AGGRESSIVE: 98 | this.minIE.minimizeAggressiveMode(); 99 | break; 100 | case DICTIONARY: 101 | 
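// In dictionary mode, phrases whose lemmatized, lowercased form appears in the provided collocation dictionary are kept intact, while all others are pruned (see the Subj/Rel/ObjDictionaryMinimization classes below)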
this.minIE.minimizeDictionaryMode(this.dictionary.words()); 102 | break; 103 | case SAFE: 104 | this.minIE.minimizeSafeMode(); 105 | break; 106 | case COMPLETE: 107 | break; 108 | } 109 | // remove duplicates 110 | this.minIE.removeDuplicates(); 111 | return this.minIE; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/main/Main.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.main; 2 | 3 | 4 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 5 | 6 | import joptsimple.OptionException; 7 | import joptsimple.OptionParser; 8 | import joptsimple.OptionSet; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.DataInputStream; 12 | import java.io.FileInputStream; 13 | import java.io.FileNotFoundException; 14 | import java.io.FileOutputStream; 15 | import java.io.IOException; 16 | import java.io.InputStream; 17 | import java.io.InputStreamReader; 18 | import java.io.OutputStream; 19 | import java.io.PrintStream; 20 | 21 | import java.util.logging.Logger; 22 | 23 | import de.uni_mannheim.utils.Dictionary; 24 | import de.uni_mannheim.minie.MinIE; 25 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 26 | import de.uni_mannheim.utils.minie.Utils; 27 | 28 | /** 29 | * Main class that acts as a console interface to the MinIE system 30 | * 31 | * @author Martin Achenbach 32 | * @author Kiril Gashteovski 33 | */ 34 | public class Main { 35 | /** used MinIE mode **/ 36 | private static MinIE.Mode mode; 37 | 38 | /** console logger **/ 39 | private final static Logger logger = Logger.getLogger(String.valueOf(Main.class)); 40 | 41 | /** 42 | * main function to call from console with available options 43 | * @param args: console arguments 44 | * @throws IOException 45 | */ 46 | public static void main(String[] args) throws IOException { 47 | // init the optionParser 48 | OptionParser optionParser = initOptionParser(); 49 | OptionSet options; 50 | // parse options 51 | try { 52 | options = optionParser.parse(args); 53 | } catch (OptionException e) { 54 | System.err.println(e.getMessage()); 55 | System.out.println(""); 56 | optionParser.printHelpOn(System.out); 57 | return; 58 | } 59 | 60 | //print help if need, if yes, break 61 | if (options.has("h")) { 62 | optionParser.printHelpOn(System.out); 63 | return; 64 | } 65 | 66 | // setup input and output 67 | logger.info("Setting up input and output streams..."); 68 | InputStream in = getInputStream(options); 69 | OutputStream out = getOutputStream(options); 70 | BufferedReader din = new BufferedReader(new InputStreamReader(in)); 71 | PrintStream dout = new PrintStream(out, true, "UTF-8"); 72 | 73 | // get mode 74 | mode = Utils.getMode((String) options.valueOf("m")); 75 | logger.info("Mode set to " + mode); 76 | 77 | // initialize extractor 78 | Extractor extractor; 79 | 80 | 81 | if (mode == MinIE.Mode.DICTIONARY) { 82 | // load multi-word dictionaries if in dictionary mode 83 | Dictionary collocationDictionary = Utils.loadDictionary(options); 84 | extractor = new Extractor(collocationDictionary); 85 | } else { 86 | // if not use default constructor 87 | extractor = new Extractor(); 88 | } 89 | logger.info("\n\nSetup finished, ready to take input sentence:"); 90 | 91 | // start analyzing 92 | long start = System.currentTimeMillis(); 93 | String line; 94 | int counter = 0; 95 | while ((line = din.readLine()) != null) { 96 | // skip empty lines 97 | if (line.isEmpty()) continue; 
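// Each non-empty input line is treated as a single sentence and analyzed independently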
98 | 99 | //logger.info("Start analyzing sentence: " + line); 100 | 101 | // parse sentence 102 | MinIE result = extractor.analyzeSentence(line, mode); 103 | 104 | // print results from MinIE 105 | ObjectArrayList propositions = result.getPropositions(); 106 | dout.println("\nOutput:"); 107 | if (options.has("p")) { 108 | result.getSentenceSemanticGraph().prettyPrint(); 109 | dout.print("\n"); 110 | } 111 | 112 | if (propositions.size() < 1) { 113 | dout.println("No extraction found."); 114 | dout.print("\n"); 115 | } else { 116 | for (AnnotatedProposition proposition : result.getPropositions()) { 117 | dout.println(Utils.formatProposition(proposition)); 118 | } 119 | dout.print("\n"); 120 | } 121 | counter++; 122 | } 123 | // finished analyzing 124 | long end = System.currentTimeMillis(); 125 | //logger.info("Analyzing time: " + (end - start) / 1000. + "s"); 126 | // clean up 127 | in.close(); 128 | out.close(); 129 | } 130 | 131 | /** 132 | * initializes and configures the option parser 133 | * @return a configured option parser 134 | */ 135 | private static OptionParser initOptionParser() { 136 | OptionParser optionParser = new OptionParser(); 137 | optionParser 138 | .accepts("f", "input file (if absent, MinIE reads from stdin)") 139 | .withOptionalArg() 140 | .describedAs("file") 141 | .ofType(String.class); 142 | optionParser 143 | .accepts("o", "output file (if absent, MinIE writes to stdout)") 144 | .withRequiredArg() 145 | .describedAs("file") 146 | .ofType(String.class); 147 | optionParser 148 | .accepts("m", "specification mode; allowed values: \"safe\", \"dictionary\", \"aggressive\", \"complete\"; defaults to \"safe\"") 149 | .withRequiredArg() 150 | .describedAs("mode") 151 | .ofType(String.class) 152 | .defaultsTo("safe"); 153 | optionParser 154 | .accepts("dict", "path of the multi-word expression dictionaries (can be several paths separated by ';'); \"dictionary\" mode only") 155 | .withOptionalArg() 156 | .ofType(String.class) 157 | .withValuesSeparatedBy(';'); 158 | optionParser 159 | .accepts("dict-overwrite", "if set, the default dictionary (multi-word expressions from WordNet and Wiktionary), will be overwritten, else new dictionaries will be appended") 160 | .withOptionalArg(); 161 | optionParser 162 | .accepts("p", "print the dependency parse of the input sentence"); 163 | optionParser 164 | .accepts("h", "show help"); 165 | return optionParser; 166 | } 167 | 168 | /** 169 | * returns input stream according to given options 170 | * @param options: option set for option parser 171 | * @return input stream 172 | */ 173 | private static InputStream getInputStream(OptionSet options) { 174 | InputStream in = null; 175 | // check if input file was specified 176 | if (options.has("f")) { 177 | try { 178 | String filename = (String)options.valueOf("f"); 179 | in = new FileInputStream(filename); 180 | logger.info("Reading from file " + filename); 181 | } catch (FileNotFoundException e) { 182 | e.printStackTrace(); 183 | } 184 | } else { 185 | // default to stdin 186 | in = System.in; 187 | logger.info("Reading from stdin"); 188 | } 189 | return new DataInputStream(in); 190 | } 191 | 192 | /** 193 | * returns output stream according to given options 194 | * @param options: option set for option parser 195 | * @return output stream 196 | */ 197 | private static OutputStream getOutputStream(OptionSet options) { 198 | OutputStream out = null; 199 | // check if output file was specified 200 | if (options.has("o")) { 201 | try { 202 | String filename = (String) 
options.valueOf("o"); 203 | out = new FileOutputStream(filename); 204 | logger.info("Writing to file " + filename); 205 | } catch (FileNotFoundException e) { 206 | e.printStackTrace(); 207 | } 208 | } else { 209 | // default to stdout 210 | out = System.out; 211 | logger.info("Writing to stdout"); 212 | } 213 | return new PrintStream(out); 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/object/ObjAggressiveMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.object; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Quantity; 10 | 11 | import edu.stanford.nlp.ling.IndexedWord; 12 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 14 | import edu.stanford.nlp.semgraph.SemanticGraph; 15 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 16 | import edu.stanford.nlp.trees.GrammaticalRelation; 17 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 18 | 19 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 20 | 21 | /** 22 | * @author Kiril Gashteovski 23 | */ 24 | public class ObjAggressiveMinimization { 25 | /** 26 | * Object aggressive minimization 27 | * @param object: object phrase 28 | * @param sg: semantic graph of the sentence 29 | */ 30 | public static void minimizeObject(AnnotatedPhrase object, SemanticGraph sg){ 31 | // Don't minimize if the phrase contains one word or no words (rare cases) 32 | if (object.getWordList() == null || object.getWordList().size() <= 1){ 33 | return; 34 | } 35 | 36 | // Don't minimize if the phrase is a multi word NER or multiple nouns in a sequence 37 | String seqPosNer = CoreNLPUtils.wordsToPosMergedNerSeq(object.getWordList()); 38 | if (seqPosNer.matches(REGEX.MULTI_WORD_ENTITY) || seqPosNer.matches(REGEX.MULTI_WORD_NOUN)){ 39 | return; 40 | } 41 | 42 | // Do safe minimization first 43 | ObjSafeMinimization.minimizeObject(object, sg); 44 | 45 | // List of words to be dropped 46 | ObjectArrayList dropWords = new ObjectArrayList<>(); 47 | 48 | // Drop some type of modifiers 49 | Set excludeRels = new HashSet<>(); 50 | excludeRels.add(EnglishGrammaticalRelations.ADVERBIAL_MODIFIER); 51 | excludeRels.add(EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER); 52 | excludeRels.add(EnglishGrammaticalRelations.DETERMINER); 53 | excludeRels.add(EnglishGrammaticalRelations.PREDETERMINER); 54 | excludeRels.add(EnglishGrammaticalRelations.NUMERIC_MODIFIER); 55 | excludeRels.add(EnglishGrammaticalRelations.NUMBER_MODIFIER); 56 | excludeRels.add(EnglishGrammaticalRelations.POSSESSION_MODIFIER); 57 | excludeRels.add(EnglishGrammaticalRelations.POSSESSIVE_MODIFIER); 58 | excludeRels.add(EnglishGrammaticalRelations.QUANTIFIER_MODIFIER); 59 | excludeRels.add(EnglishGrammaticalRelations.TEMPORAL_MODIFIER); 60 | excludeRels.add(EnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER); 61 | excludeRels.add(EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER); 62 | //excludeRels.add(EnglishGrammaticalRelations.AUX_MODIFIER); 63 | for (IndexedWord w: object.getWordList()) { 64 | // Skip the words that were included afterwards (not part of the DP) 65 | if (w.index() < 0) 66 | continue; 67 | 68 | // Get 
the relevant modifiers to be dropped (their modifiers as well) 69 | Set modifiers = sg.getChildrenWithRelns(w, excludeRels); 70 | for (IndexedWord m: modifiers) { 71 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 72 | for (IndexedWord sm: subModifiers) 73 | //if (!sm.tag().equals(POS_TAG.IN)) 74 | dropWords.add(sm); 75 | } 76 | dropWords.addAll(modifiers); 77 | 78 | // Drop quantities 79 | if (w.ner().equals(Quantity.ST_QUANTITY)) 80 | dropWords.add(w); 81 | } 82 | object.getWordList().removeAll(dropWords); 83 | // add words to dropped word list 84 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 85 | object.addDroppedWords(dropWords); 86 | dropWords.clear(); 87 | 88 | // If [IN|TO] .* [IN|TO] => drop [IN|TO] .*, i.e. -> drop PP attachments 89 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_PREP_ALL_PREP); 90 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(object.getWordCoreLabelList()); 91 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 92 | while (tMatcher.find()){ 93 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 94 | for (int i = 0; i < matchedWords.size(); i++) { 95 | if (matchedWords.get(i).tag().equals(POS_TAG.IN) || matchedWords.get(i).tag().equals(POS_TAG.TO)) { 96 | if (i == 0) { 97 | if (matchedWords.get(i).tag().equals(POS_TAG.TO) && CoreNLPUtils.isVerb(matchedWords.get(i+1).tag())) 98 | break; 99 | dropWords.add(matchedWords.get(i)); 100 | } else break; 101 | } else { 102 | dropWords.add(matchedWords.get(i)); 103 | } 104 | } 105 | } 106 | object.getWordList().removeAll(dropWords); 107 | // add words to dropped word list 108 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 109 | object.addDroppedWords(dropWords); 110 | dropWords.clear(); 111 | 112 | // TODO: if QUANT + NP + IN => drop "QUANT + NP" ? 
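// Illustrative note, not from the original source: for an object like "failed to comply", the VB_1+ TO VB_2 rule below drops "failed to" and keeps only the final verb "comply"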
113 | 114 | // If VB_1+ TO VB_2 => drop VB_1+ TO .* 115 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_TO_VB); 116 | tMatcher = tPattern.getMatcher(object.getWordCoreLabelList()); 117 | matchedWords = new ObjectArrayList<>(); 118 | while (tMatcher.find()){ 119 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 120 | for (int i = 0; i < matchedWords.size(); i++) { 121 | if (matchedWords.get(i).tag().equals(POS_TAG.TO)) { 122 | dropWords.add(matchedWords.get(i)); 123 | break; 124 | } else { 125 | dropWords.add(matchedWords.get(i)); 126 | } 127 | } 128 | } 129 | object.getWordList().removeAll(dropWords); 130 | // add words to dropped word list 131 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 132 | object.addDroppedWords(dropWords); 133 | dropWords.clear(); 134 | 135 | // Drop auxilaries 136 | for (IndexedWord w: object.getWordList()) { 137 | if (w.index() < 0) 138 | continue; 139 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.AUX_MODIFIER); 140 | for (IndexedWord m: modifiers) { 141 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 142 | for (IndexedWord sm: subModifiers) 143 | dropWords.add(sm); 144 | } 145 | dropWords.addAll(modifiers); 146 | } 147 | object.getWordList().removeAll(dropWords); 148 | // add words to dropped word list 149 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 150 | object.addDroppedWords(dropWords); 151 | dropWords.clear(); 152 | 153 | // Drop noun modifiers with different NERs 154 | for (IndexedWord w: object.getWordList()) { 155 | if (w.index() < 0) 156 | continue; 157 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER); 158 | for (IndexedWord mw: modifiers) { 159 | if (!w.ner().equals(mw.ner())) { 160 | dropWords.add(mw); 161 | dropWords.addAll(CoreNLPUtils.getSubTreeSortedNodes(mw, sg, null)); 162 | } 163 | } 164 | } 165 | object.getWordList().removeAll(dropWords); 166 | // add words to dropped word list 167 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 168 | object.addDroppedWords(dropWords); 169 | dropWords.clear(); 170 | } 171 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/object/ObjDictionaryMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.object; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import de.uni_mannheim.minie.minimize.object.ObjSafeMinimization; 9 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 10 | 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | import edu.stanford.nlp.util.CoreMap; 13 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 14 | 15 | /** 16 | * @author Kiril Gashteovski 17 | */ 18 | public class ObjDictionaryMinimization { 19 | /** 20 | * Minimize only the objects that are considered to have "non-frequent patterns" 21 | * @param obj: the object phrase 22 | * @param sg: semantic graph of the sentence 23 | * @param freqObjs: dictionary of multi-word expressions (frequent objects) 24 | */ 25 | public static void minimizeObject(AnnotatedPhrase obj, SemanticGraph sg, ObjectOpenHashSet collocations){ 26 | // Do the safe minimization first 27 
| ObjSafeMinimization.minimizeObject(obj, sg); 28 | 29 | // If the object is frequent, don't minimize anything 30 | if (collocations.contains(CoreNLPUtils.listOfWordsToLemmaString(obj.getWordList()).toLowerCase())){ 31 | return; 32 | } 33 | 34 | // Minimization object 35 | Minimization simp = new Minimization(obj, sg, collocations); 36 | 37 | // remWords: list of words to be removed (reusable variable) 38 | // matchWords: list of matched words from the regex (reusable variable) 39 | List<CoreMap> remWords = new ArrayList<>(); 40 | List<CoreMap> matchWords = new ArrayList<>(); 41 | 42 | // Dictionary minimization on the noun phrases and named entities within the obj. phrase 43 | simp.nounPhraseDictMinimization(remWords, matchWords); 44 | simp.namedEntityDictionaryMinimization(remWords, matchWords); 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/object/ObjSafeMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.object; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import edu.stanford.nlp.semgraph.SemanticGraph; 9 | import edu.stanford.nlp.util.CoreMap; 10 | 11 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 12 | 13 | /** 14 | * @author Kiril Gashteovski 15 | */ 16 | public class ObjSafeMinimization { 17 | /** 18 | * Minimize only the objects that are considered to have "safe patterns" 19 | * @param object: the object phrase 20 | * @param sg: the semantic graph of the whole sentence 21 | */ 22 | public static void minimizeObject(AnnotatedPhrase object, SemanticGraph sg){ 23 | Minimization simp = new Minimization(object, sg, new ObjectOpenHashSet<String>()); 24 | 25 | // remWords: list of words to be removed (reusable variable) 26 | // matchWords: list of matched words from the regex (reusable variable) 27 | List<CoreMap> remWords = new ArrayList<>(); 28 | List<CoreMap> matchWords = new ArrayList<>(); 29 | 30 | // Safe minimization on the noun phrases and named entities 31 | simp.nounPhraseSafeMinimization(remWords, matchWords); 32 | simp.namedEntitySafeMinimization(remWords, matchWords); 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/relation/RelAggressiveMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.relation; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Quantity; 10 | 11 | import edu.stanford.nlp.ling.IndexedWord; 12 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 14 | import edu.stanford.nlp.semgraph.SemanticGraph; 15 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 16 | import edu.stanford.nlp.trees.GrammaticalRelation; 17 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 18 | 19 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 20 | 21 | /** 22 | * @author Kiril Gashteovski 23 | */ 24 | public class RelAggressiveMinimization { 25 | /** 26 | * Always minimize the relation towards the root word 27 |
* @param relation: relation phrase 28 | * @param sg: sentence semantic graph (dependency parse graph) 29 | */ 30 | public static void minimizeRelation(AnnotatedPhrase relation, SemanticGraph sg){ 31 | // Don't minimize if the phrase contains one word or no words (rare cases) 32 | if (relation.getWordList() == null || relation.getWordList().size() <= 1){ 33 | return; 34 | } 35 | 36 | // Do safe minimization first 37 | RelSafeMinimization.minimizeRelation(relation, sg); 38 | 39 | // List of words to be dropped 40 | ObjectArrayList dropWords = new ObjectArrayList<>(); 41 | 42 | // Drop some type of modifiers 43 | Set excludeRels = new HashSet<>(); 44 | excludeRels.add(EnglishGrammaticalRelations.ADVERBIAL_MODIFIER); 45 | excludeRels.add(EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER); 46 | excludeRels.add(EnglishGrammaticalRelations.DETERMINER); 47 | excludeRels.add(EnglishGrammaticalRelations.PREDETERMINER); 48 | excludeRels.add(EnglishGrammaticalRelations.NUMERIC_MODIFIER); 49 | excludeRels.add(EnglishGrammaticalRelations.NUMBER_MODIFIER); 50 | excludeRels.add(EnglishGrammaticalRelations.POSSESSION_MODIFIER); 51 | excludeRels.add(EnglishGrammaticalRelations.POSSESSIVE_MODIFIER); 52 | excludeRels.add(EnglishGrammaticalRelations.QUANTIFIER_MODIFIER); 53 | excludeRels.add(EnglishGrammaticalRelations.TEMPORAL_MODIFIER); 54 | excludeRels.add(EnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER); 55 | //excludeRels.add(EnglishGrammaticalRelations.AUX_MODIFIER); 56 | for (IndexedWord w: relation.getWordList()) { 57 | // Skip the words that were included afterwards (not part of the DP) 58 | if (w.index() < 0) 59 | continue; 60 | 61 | // Get the relevant modifiers to be dropped (their modifiers as well) 62 | Set modifiers = sg.getChildrenWithRelns(w, excludeRels); 63 | for (IndexedWord m: modifiers) { 64 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 65 | for (IndexedWord sm: subModifiers) 66 | if (!sm.tag().equals(POS_TAG.IN)) 67 | dropWords.add(sm); 68 | } 69 | dropWords.addAll(modifiers); 70 | 71 | // Drop quantities 72 | if (w.ner().equals(Quantity.ST_QUANTITY)) 73 | dropWords.add(w); 74 | } 75 | relation.getWordList().removeAll(dropWords); 76 | // add words to dropped word list 77 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 78 | relation.addDroppedWords(dropWords); 79 | dropWords.clear(); 80 | 81 | // If [IN|TO] .* [IN|TO] => drop [IN|TO] .*, i.e. 
-> drop PP attachments 82 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_PREP_ALL_PREP); 83 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(relation.getWordCoreLabelList()); 84 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 85 | while (tMatcher.find()){ 86 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 87 | for (int i = 0; i < matchedWords.size(); i++) { 88 | if (matchedWords.get(i).tag().equals(POS_TAG.IN) || matchedWords.get(i).tag().equals(POS_TAG.TO)) { 89 | if (i == 0) { 90 | if (matchedWords.get(i).tag().equals(POS_TAG.TO) && CoreNLPUtils.isVerb(matchedWords.get(i+1).tag())) 91 | break; 92 | dropWords.add(matchedWords.get(i)); 93 | } else break; 94 | } else { 95 | dropWords.add(matchedWords.get(i)); 96 | } 97 | } 98 | } 99 | relation.getWordList().removeAll(dropWords); 100 | // add words to dropped word list 101 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 102 | relation.addDroppedWords(dropWords); 103 | dropWords.clear(); 104 | 105 | // TODO: if QUANT + NP + IN => drop "QUANT + NP" ? 106 | 107 | // If VB_1+ TO VB_2 => drop VB_1+ TO .* 108 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_TO_VB); 109 | tMatcher = tPattern.getMatcher(relation.getWordCoreLabelList()); 110 | matchedWords = new ObjectArrayList<>(); 111 | while (tMatcher.find()){ 112 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 113 | for (int i = 0; i < matchedWords.size(); i++) { 114 | if (matchedWords.get(i).tag().equals(POS_TAG.TO)) { 115 | dropWords.add(matchedWords.get(i)); 116 | break; 117 | } else { 118 | dropWords.add(matchedWords.get(i)); 119 | } 120 | } 121 | } 122 | relation.getWordList().removeAll(dropWords); 123 | // add words to dropped word list 124 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 125 | relation.addDroppedWords(dropWords); 126 | dropWords.clear(); 127 | 128 | // Drop auxilaries 129 | for (IndexedWord w: relation.getWordList()) { 130 | if (w.index() < 0) 131 | continue; 132 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.AUX_MODIFIER); 133 | for (IndexedWord m: modifiers) { 134 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 135 | for (IndexedWord sm: subModifiers) 136 | dropWords.add(sm); 137 | } 138 | dropWords.addAll(modifiers); 139 | } 140 | relation.getWordList().removeAll(dropWords); 141 | // add words to dropped word list 142 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 143 | relation.addDroppedWords(dropWords); 144 | dropWords.clear(); 145 | 146 | // Drop noun modifiers with different NERs 147 | for (IndexedWord w: relation.getWordList()) { 148 | if (w.index() < 0) 149 | continue; 150 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER); 151 | for (IndexedWord mw: modifiers) { 152 | if (!w.ner().equals(mw.ner())) { 153 | dropWords.add(mw); 154 | dropWords.addAll(CoreNLPUtils.getSubTreeSortedNodes(mw, sg, null)); 155 | } 156 | } 157 | } 158 | relation.getWordList().removeAll(dropWords); 159 | // add words to dropped word list 160 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 161 | relation.addDroppedWords(dropWords); 162 | dropWords.clear(); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- 
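A minimal usage sketch (not part of the repository) showing how the safe and aggressive minimization modes above surface through the `Extractor` interface; the input sentence is an illustrative assumption, and the printed form depends on `AnnotatedProposition.toString()`:

```
import de.uni_mannheim.minie.MinIE;
import de.uni_mannheim.minie.annotation.AnnotatedProposition;
import de.uni_mannheim.minie.main.Extractor;

public class MinimizationModeSketch {
    public static void main(String[] args) {
        // Extractor wires up the CoreNLP parser, ClausIE and MinIE (see Extractor.java)
        Extractor extractor = new Extractor();
        String sentence = "The famous professor quickly published a very detailed study.";

        // SAFE keeps most modifiers; AGGRESSIVE additionally drops adverbial and
        // adjectival modifiers, determiners, quantities and PP attachments
        for (MinIE.Mode mode : new MinIE.Mode[] {MinIE.Mode.SAFE, MinIE.Mode.AGGRESSIVE}) {
            MinIE result = extractor.analyzeSentence(sentence, mode);
            System.out.println("Mode: " + mode);
            for (AnnotatedProposition proposition : result.getPropositions()) {
                System.out.println("\t" + proposition);
            }
        }
    }
}
```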
/src/main/java/de/uni_mannheim/minie/minimize/relation/RelDictionaryMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.relation; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import de.uni_mannheim.minie.minimize.relation.RelSafeMinimization; 9 | 10 | import edu.stanford.nlp.semgraph.SemanticGraph; 11 | import edu.stanford.nlp.util.CoreMap; 12 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 13 | 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | 16 | /** 17 | * @author Kiril Gashteovski 18 | */ 19 | public class RelDictionaryMinimization { 20 | /** 21 | * Minimize only the relations that are considered to have "non-frequent patterns" 22 | * @param rel: the relation phrase 23 | * @param sg: semantic graph of the sentence 24 | * @param collocations: dictionary of multi-word expressions (frequent relations) 25 | */ 26 | public static void minimizeRelation(AnnotatedPhrase rel, SemanticGraph sg, ObjectOpenHashSet<String> collocations){ 27 | // Do the safe minimization first 28 | RelSafeMinimization.minimizeRelation(rel, sg); 29 | 30 | // If the relation is frequent, don't minimize anything 31 | if (collocations.contains(CoreNLPUtils.listOfWordsToLemmaString(rel.getWordList()).toLowerCase())){ 32 | return; 33 | } 34 | 35 | // (Safe minimization was already applied above, so only the 36 | // dictionary-based pruning remains) 37 | 38 | // remWords: list of words to be removed (reusable variable) 39 | // matchWords: list of matched words from the regex (reusable variable) 40 | List<CoreMap> remWords = new ArrayList<>(); 41 | List<CoreMap> matchWords = new ArrayList<>(); 42 | 43 | // Move to the dict.
minimization of the noun phrases within the relation 44 | Minimization simp = new Minimization(rel, sg, collocations); 45 | simp.nounPhraseDictMinimization(remWords, matchWords); 46 | simp.namedEntityDictionaryMinimization(remWords, matchWords); 47 | } 48 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/relation/RelSafeMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.relation; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Polarity; 10 | import de.uni_mannheim.minie.minimize.Minimization; 11 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 12 | 13 | import edu.stanford.nlp.ling.IndexedWord; 14 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 15 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 16 | import edu.stanford.nlp.semgraph.SemanticGraph; 17 | import edu.stanford.nlp.util.CoreMap; 18 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 19 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 20 | 21 | /** 22 | * @author Kiril Gashteovski 23 | */ 24 | public class RelSafeMinimization { 25 | /** 26 | * Minimize only the relations that are considered to have "safe patterns" 27 | * @param rel: the relation phrase 28 | * @param sg: semantic graph of the sentence 29 | */ 30 | public static void minimizeRelation(AnnotatedPhrase rel, SemanticGraph sg){ 31 | // Minimize left/right of the verb 32 | minimizationLeftFromVerb(rel, sg); 33 | minimizationRightFromVerb(rel, sg); 34 | } 35 | 36 | /** 37 | * Minimize the relations considered to have "safe patterns", and occur from the left of the verb 38 | * @param rel: the relation phrase 39 | * @param sg: the semantic graph of the sentence 40 | */ 41 | private static void minimizationLeftFromVerb(AnnotatedPhrase rel, SemanticGraph sg){ 42 | // Minimization object 43 | Minimization simp = new Minimization(rel, sg, new ObjectOpenHashSet()); 44 | 45 | // remWords: list of words to be removed (reusable variable) 46 | // matchWords: list of matched words from the regex (reusable variable) 47 | List remWords = new ArrayList<>(); 48 | List matchWords = new ArrayList<>(); 49 | 50 | simp.verbPhraseSafeMinimization(remWords, matchWords); 51 | } 52 | 53 | /** 54 | * Minimize the relations considered to have "safe patterns", and occur from the right of the verb 55 | * @param rel: the relation phrase 56 | * @param sg: the semantic graph of the sentence 57 | */ 58 | private static void minimizationRightFromVerb(AnnotatedPhrase rel, SemanticGraph sg){ 59 | // Minimization object 60 | Minimization simp = new Minimization(rel, sg, new ObjectOpenHashSet()); 61 | 62 | // remWords: list of words to be removed (reusable variable) 63 | // matchWords: list of matched words from the regex (reusable variable) 64 | List remWords = new ArrayList<>(); 65 | List matchWords = new ArrayList<>(); 66 | 67 | // Safe minimization on the noun phrases and named entities within the rel. 
phrase 68 | simp.nounPhraseSafeMinimization(remWords, matchWords); 69 | simp.namedEntitySafeMinimization(remWords, matchWords); 70 | rel = simp.getPhrase(); 71 | 72 | // Reusable variables 73 | ObjectOpenHashSet droppedWords = new ObjectOpenHashSet(); 74 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 75 | ObjectArrayList verbs = new ObjectArrayList<>(); 76 | List children; 77 | 78 | // Flags for checking certain conditions 79 | boolean containsNEG; 80 | boolean isAdverb; 81 | 82 | // If ^VB+ RB+ VB+ => drop RB+ 83 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_VB_RB_VB); 84 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(rel.getWordCoreLabelList()); 85 | while (tMatcher.find()){ 86 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 87 | // Check if the first word of the matched words is the first word of the relation 88 | if (matchedWords.get(0).index() != rel.getWordList().get(0).index()) 89 | break; 90 | 91 | verbs = CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), 0); 92 | for (int i = 0; i < matchedWords.size(); i++){ 93 | isAdverb = matchedWords.get(i).tag().equals(POS_TAG.RB); 94 | containsNEG = Polarity.NEG_WORDS.contains(matchedWords.get(i).lemma().toLowerCase()); 95 | 96 | if (isAdverb && !containsNEG) { 97 | // If the adverb is the head word, don't drop it 98 | children = sg.getChildList(rel.getWordList().get(i)); 99 | children.retainAll(verbs); 100 | if (children.size() == 0) { 101 | droppedWords.addAll(CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), i)); 102 | } 103 | break; 104 | } 105 | } 106 | 107 | if (droppedWords.size() > 0){ 108 | rel.removeWordsFromList(droppedWords); 109 | // add words to dropped word list 110 | rel.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, droppedWords)); 111 | rel.addDroppedWords(droppedWords); 112 | droppedWords = new ObjectOpenHashSet(); 113 | } 114 | } 115 | 116 | // If ^VB+ RB+ => drop RB+ 117 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_RB); 118 | tMatcher = tPattern.getMatcher(rel.getWordCoreLabelList()); 119 | while (tMatcher.find()){ 120 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 121 | // Check if the first word of the matched words is the first word of the relation 122 | if (matchedWords.get(0).index() != rel.getWordList().get(0).index()) 123 | break; 124 | 125 | verbs = CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), 0); 126 | for (int i = 0; i < matchedWords.size(); i++){ 127 | isAdverb = matchedWords.get(i).tag().equals(POS_TAG.RB); 128 | containsNEG = Polarity.NEG_WORDS.contains(matchedWords.get(i).lemma().toLowerCase()); 129 | 130 | if (isAdverb && !containsNEG) { 131 | // If the adverb is the head word, don't drop it 132 | children = sg.getChildList(rel.getWordList().get(i)); 133 | children.retainAll(verbs); 134 | if (children.size() == 0) { 135 | droppedWords.addAll(CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), i)); 136 | } 137 | break; 138 | } 139 | } 140 | 141 | if (droppedWords.size() > 0){ 142 | rel.removeWordsFromList(droppedWords); 143 | // add words to dropped word list 144 | rel.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, droppedWords)); 145 | rel.addDroppedWords(droppedWords); 146 | droppedWords = new ObjectOpenHashSet(); 147 | } 148 | } 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/subject/SubjAggressiveMinimization.java: 
-------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.subject; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Quantity; 10 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 11 | 12 | import edu.stanford.nlp.ling.IndexedWord; 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 14 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 15 | import edu.stanford.nlp.semgraph.SemanticGraph; 16 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 17 | import edu.stanford.nlp.trees.GrammaticalRelation; 18 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 19 | 20 | public class SubjAggressiveMinimization { 21 | 22 | /** 23 | * Always minimize the subject towards the root word 24 | * @param subject: subject phrase 25 | * @param sg: semantic graph of the sentence 26 | */ 27 | public static void minimizeSubject(AnnotatedPhrase subject, SemanticGraph sg){ 28 | // Don't minimize if the phrase contains one word or no words (rare cases) 29 | if (subject.getWordList() == null || subject.getWordList().size() <= 1){ 30 | return; 31 | } 32 | // Don't minimize if the phrase is a multi word NER or multiple nouns in a sequence 33 | String seqPosNer = CoreNLPUtils.wordsToPosMergedNerSeq(subject.getWordList()); 34 | if (seqPosNer.matches(REGEX.MULTI_WORD_ENTITY) || seqPosNer.matches(REGEX.MULTI_WORD_NOUN)){ 35 | return; 36 | } 37 | 38 | // Do safe minimization first 39 | SubjSafeMinimization.minimizeSubject(subject, sg); 40 | 41 | // List of words to be dropped 42 | ObjectArrayList dropWords = new ObjectArrayList<>(); 43 | 44 | // Drop some type of modifiers 45 | Set excludeRels = new HashSet<>(); 46 | excludeRels.add(EnglishGrammaticalRelations.ADVERBIAL_MODIFIER); 47 | excludeRels.add(EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER); 48 | excludeRels.add(EnglishGrammaticalRelations.DETERMINER); 49 | excludeRels.add(EnglishGrammaticalRelations.PREDETERMINER); 50 | excludeRels.add(EnglishGrammaticalRelations.NUMERIC_MODIFIER); 51 | excludeRels.add(EnglishGrammaticalRelations.NUMBER_MODIFIER); 52 | excludeRels.add(EnglishGrammaticalRelations.POSSESSION_MODIFIER); 53 | excludeRels.add(EnglishGrammaticalRelations.POSSESSIVE_MODIFIER); 54 | excludeRels.add(EnglishGrammaticalRelations.QUANTIFIER_MODIFIER); 55 | excludeRels.add(EnglishGrammaticalRelations.TEMPORAL_MODIFIER); 56 | excludeRels.add(EnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER); 57 | excludeRels.add(EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER); 58 | //excludeRels.add(EnglishGrammaticalRelations.AUX_MODIFIER); 59 | for (IndexedWord w: subject.getWordList()) { 60 | // Skip the words that were included afterwards (not part of the DP) 61 | if (w.index() < 0) 62 | continue; 63 | 64 | // Get the relevant modifiers to be dropped (their modifiers as well) 65 | Set modifiers = sg.getChildrenWithRelns(w, excludeRels); 66 | for (IndexedWord m: modifiers) { 67 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 68 | for (IndexedWord sm: subModifiers) 69 | //if (!sm.tag().equals(POS_TAG.IN)) 70 | dropWords.add(sm); 71 | } 72 | dropWords.addAll(modifiers); 73 | 74 | // Drop quantities 75 | if (w.ner().equals(Quantity.ST_QUANTITY)) 76 | dropWords.add(w); 77 | } 78 | subject.getWordList().removeAll(dropWords); 79 | 
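// Note: unlike the object and relation variants, this first block clears dropWords without recording them via addDroppedEdges/addDroppedWords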
dropWords.clear(); 80 | 81 | // If [IN|TO] .* [IN|TO] => drop [IN|TO] .*, i.e. -> drop PP attachments 82 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_PREP_ALL_PREP); 83 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(subject.getWordCoreLabelList()); 84 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 85 | while (tMatcher.find()){ 86 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 87 | for (int i = 0; i < matchedWords.size(); i++) { 88 | if (matchedWords.get(i).tag().equals(POS_TAG.IN) || matchedWords.get(i).tag().equals(POS_TAG.TO)) { 89 | if (i == 0) { 90 | if (matchedWords.get(i).tag().equals(POS_TAG.TO) && CoreNLPUtils.isVerb(matchedWords.get(i+1).tag())) 91 | break; 92 | dropWords.add(matchedWords.get(i)); 93 | } else break; 94 | } else { 95 | dropWords.add(matchedWords.get(i)); 96 | } 97 | } 98 | } 99 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 100 | subject.addDroppedWords(dropWords); 101 | subject.getWordList().removeAll(dropWords); 102 | dropWords.clear(); 103 | 104 | // TODO: if QUANT + NP + IN => drop "QUANT + NP" ? 105 | 106 | // If VB_1+ TO VB_2 => drop VB_1+ TO .* 107 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_TO_VB); 108 | tMatcher = tPattern.getMatcher(subject.getWordCoreLabelList()); 109 | matchedWords = new ObjectArrayList<>(); 110 | while (tMatcher.find()){ 111 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 112 | for (int i = 0; i < matchedWords.size(); i++) { 113 | if (matchedWords.get(i).tag().equals(POS_TAG.TO)) { 114 | dropWords.add(matchedWords.get(i)); 115 | break; 116 | } else { 117 | dropWords.add(matchedWords.get(i)); 118 | } 119 | } 120 | } 121 | subject.getWordList().removeAll(dropWords); 122 | // add words to dropped word list 123 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 124 | subject.addDroppedWords(dropWords); 125 | dropWords.clear(); 126 | 127 | // Drop auxilaries 128 | for (IndexedWord w: subject.getWordList()) { 129 | if (w.index() < 0) 130 | continue; 131 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.AUX_MODIFIER); 132 | for (IndexedWord m: modifiers) { 133 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 134 | for (IndexedWord sm: subModifiers) 135 | dropWords.add(sm); 136 | } 137 | dropWords.addAll(modifiers); 138 | } 139 | subject.getWordList().removeAll(dropWords); 140 | // add words to dropped word list 141 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 142 | subject.addDroppedWords(dropWords); 143 | dropWords.clear(); 144 | 145 | // Drop noun modifiers with different NERs 146 | for (IndexedWord w: subject.getWordList()) { 147 | if (w.index() < 0) 148 | continue; 149 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER); 150 | for (IndexedWord mw: modifiers) { 151 | if (!w.ner().equals(mw.ner())) { 152 | dropWords.add(mw); 153 | dropWords.addAll(CoreNLPUtils.getSubTreeSortedNodes(mw, sg, null)); 154 | } 155 | } 156 | } 157 | subject.getWordList().removeAll(dropWords); 158 | // add words to dropped word list 159 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 160 | subject.addDroppedWords(dropWords); 161 | dropWords.clear(); 162 | } 163 | } -------------------------------------------------------------------------------- 
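In dictionary mode, the Subj/Rel/ObjDictionaryMinimization classes prune only sub-constituents that are not listed in a collocation dictionary. Below is a sketch of how such a dictionary could be supplied through the `Extractor`; the resource path `/my-collocations.dict` is a hypothetical example (assumed format: one multi-word expression per line, lowercased and lemmatized):

```
import java.io.IOException;

import de.uni_mannheim.minie.MinIE;
import de.uni_mannheim.minie.main.Extractor;
import de.uni_mannheim.utils.Dictionary;

public class DictionaryModeSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical classpath resource with expressions that should survive
        // minimization, e.g. "open information extraction"
        Dictionary collocations = new Dictionary("/my-collocations.dict");
        Extractor extractor = new Extractor(collocations);

        MinIE result = extractor.analyzeSentence(
                "Open information extraction systems extract triples from text.",
                MinIE.Mode.DICTIONARY);
        result.getPropositions().forEach(System.out::println);
    }
}
```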
/src/main/java/de/uni_mannheim/minie/minimize/subject/SubjDictionaryMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.subject; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import de.uni_mannheim.minie.minimize.subject.SubjSafeMinimization; 9 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 10 | 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | import edu.stanford.nlp.util.CoreMap; 13 | 14 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 15 | 16 | public class SubjDictionaryMinimization { 17 | /** 18 | * Minimize the subject according to the 'Dictionary minimization' rules. 19 | * Subjects that are frequent (i.e. found in the collocation dictionary) are left intact. 20 | * @param subject: the subject phrase 21 | * @param sg: semantic graph of the whole sentence 22 | * @param collocations: a multi-word dictionary of frequent subjects (collocations) 23 | **/ 24 | public static void minimizeSubject(AnnotatedPhrase subject, SemanticGraph sg, ObjectOpenHashSet<String> collocations){ 25 | // Do the safe minimization first 26 | SubjSafeMinimization.minimizeSubject(subject, sg); 27 | 28 | // If the subject is frequent, don't minimize anything 29 | if (collocations.contains(CoreNLPUtils.listOfWordsToLemmaString(subject.getWordList()).toLowerCase())){ 30 | return; 31 | } 32 | 33 | // Minimization object 34 | Minimization simp = new Minimization(subject, sg, collocations); 35 | 36 | // remWords: list of words to be removed (reusable variable) 37 | // matchWords: list of matched words from the regex (reusable variable) 38 | List<CoreMap> remWords = new ArrayList<>(); 39 | List<CoreMap> matchWords = new ArrayList<>(); 40 | 41 | // Dictionary minimization on the noun phrases and named entities within the subj.
phrase 42 | simp.nounPhraseDictMinimization(remWords, matchWords); 43 | simp.removeVerbsBeforeNouns(remWords, matchWords); 44 | simp.namedEntityDictionaryMinimization(remWords, matchWords); 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/subject/SubjSafeMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.subject; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import edu.stanford.nlp.semgraph.SemanticGraph; 9 | import edu.stanford.nlp.util.CoreMap; 10 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 11 | 12 | public class SubjSafeMinimization { 13 | /** 14 | * Minimize only the subjects that are considered to have "safe patterns" 15 | * @param subject: the subject phrase 16 | * @param sg: the semantic graph of the whole sentence 17 | */ 18 | public static void minimizeSubject(AnnotatedPhrase subject, SemanticGraph sg){ 19 | Minimization simp = new Minimization(subject, sg, new ObjectOpenHashSet()); 20 | 21 | // remWords: list of words to be removed (reusable variable) 22 | // matchWords: list of matched words from the regex (reusable variable) 23 | List remWords = new ArrayList<>(); 24 | List matchWords = new ArrayList<>(); 25 | 26 | // Safe minimization on the noun phrases and named entities 27 | simp.nounPhraseSafeMinimization(remWords, matchWords); 28 | simp.namedEntitySafeMinimization(remWords, matchWords); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/subconstituent/FrequencyCandidates.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.subconstituent; 2 | 3 | import de.uni_mannheim.clausie.phrase.Phrase; 4 | import de.uni_mannheim.constant.REGEX; 5 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 6 | 7 | import edu.stanford.nlp.ling.IndexedWord; 8 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 9 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 10 | import edu.stanford.nlp.semgraph.SemanticGraph; 11 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 12 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 13 | 14 | /** 15 | * @author Kiril Gashteovski 16 | */ 17 | public class FrequencyCandidates { 18 | /** The phrase from which the frequency candidates are generated from **/ 19 | private Phrase phrase; 20 | /** The sentence semantic graph **/ 21 | private SemanticGraph sg; 22 | /** The sub constituents' candidates from the phrase **/ 23 | private ObjectOpenHashSet candidates; 24 | 25 | /** Default constructor **/ 26 | public FrequencyCandidates(){ 27 | this.phrase = new Phrase(); 28 | this.sg = new SemanticGraph(); 29 | this.candidates = new ObjectOpenHashSet<>(); 30 | } 31 | 32 | /** Parametric constructor **/ 33 | public FrequencyCandidates(Phrase p, SemanticGraph sentenceSg){ 34 | this.phrase = p; 35 | this.sg = sentenceSg; 36 | this.phrase.detectRoot(this.sg); 37 | this.candidates = new ObjectOpenHashSet<>(); 38 | } 39 | 40 | /** Generate the frequency candidates by default: 41 | * 1) the whole phrase itself 42 | * 2) the root word 43 | * 3) the chained words from the root 44 | * 4) the chained sub-constituent candidates 45 | **/ 46 | public void 
generateDefaultFreqCandidates(){ 47 | // Stopping conditions: the phrase is just one word, or there are no words at all (it happens sometimes) 48 | if (this.phrase.getWordList().size() == 0){ 49 | return; 50 | } 51 | else if (this.phrase.getWordList().size() == 1){ 52 | this.candidates.add(this.phrase.getWordList().get(0).lemma().toLowerCase()); 53 | return; 54 | } 55 | 56 | // 1) the whole phrase itself 57 | this.candidates.add(CoreNLPUtils.listOfWordsToLemmaString(this.phrase.getWordList()).toLowerCase()); 58 | 59 | // 2) the root word 60 | this.candidates.add(this.phrase.getRoot().lemma().toLowerCase()); 61 | 62 | // 3) the chained words from the root 63 | ObjectArrayList<IndexedWord> chainedRootWords = 64 | CoreNLPUtils.getChainedWords(this.phrase.getRoot(), this.phrase.getWordList()); 65 | this.candidates.add(CoreNLPUtils.listOfWordsToLemmaString(chainedRootWords).toLowerCase()); 66 | } 67 | 68 | /** Generate candidates for each noun phrase within the phrase **/ 69 | public void generateNounPhraseFreqCandidates(){ 70 | SubConstituent sc = new SubConstituent(this.sg); 71 | 72 | // Generate candidates for [DT|RB|JJ]+ NN+ 73 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_DT_RB_JJ_PR_NN); 74 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(this.phrase.getWordCoreLabelList()); 75 | this.generateCandidatesFromTokenRegexMatch(tMatcher, sc); 76 | } 77 | 78 | /** 79 | * Given a token sequence matcher for regular expressions for sequences over tokens, get the sub-constituents and 80 | * store them in the sub-constituent object sc 81 | * @param tMatcher: token sequence matcher for regular expressions for sequences over tokens 82 | * @param sc: sub-constituent object 83 | */ 84 | public void generateCandidatesFromTokenRegexMatch(TokenSequenceMatcher tMatcher, SubConstituent sc){ 85 | // The matched list of words and their "root" 86 | ObjectArrayList<IndexedWord> matchWords; 87 | IndexedWord matchRoot; 88 | 89 | // Given a match, get the subconstituents 90 | while (tMatcher.find()){ 91 | matchWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 92 | matchRoot = CoreNLPUtils.getRootFromWordList(this.sg, matchWords); 93 | sc.setRoot(matchRoot); 94 | sc.setWords(matchWords); 95 | sc.generateSubConstituentsFromLeft(); 96 | for (String cand: sc.getStringSubConstituents()){ 97 | this.candidates.add(cand); 98 | } 99 | sc.clearSubConstituentsAndCandidates(); 100 | } 101 | } 102 | 103 | // Getters 104 | public Phrase getPhrase(){ 105 | return this.phrase; 106 | } 107 | public SemanticGraph getSentenceSemGraph(){ 108 | return this.sg; 109 | } 110 | public ObjectOpenHashSet<String> getCandidates(){ 111 | return this.candidates; 112 | } 113 | 114 | // Setters 115 | public void setPhrase(Phrase p){ 116 | this.phrase = p; 117 | } 118 | public void setSentenceSemGraph(SemanticGraph sentenceSg){ 119 | this.sg = sentenceSg; 120 | } 121 | public void setCandidates(ObjectOpenHashSet<String> cands){ 122 | this.candidates = cands; 123 | } 124 | 125 | /** Clear the frequency candidates object (empty phrase and semantic graph, and clear the list of candidates) **/ 126 | public void clear(){ 127 | this.phrase = new Phrase(); 128 | this.sg = new SemanticGraph(); 129 | this.candidates.clear(); 130 | } 131 | /** Clear the candidates list **/ 132 | public void clearCandidates(){ 133 | this.candidates.clear(); 134 | } 135 | } 136 | --------------------------------------------------------------------------------
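The candidate-generation flow above is easiest to see end-to-end. A minimal usage sketch (not part of the repository; it assumes `phrase` and `sg` have already been produced by MinIE's parsing step):

```
import de.uni_mannheim.clausie.phrase.Phrase;
import de.uni_mannheim.minie.subconstituent.FrequencyCandidates;
import edu.stanford.nlp.semgraph.SemanticGraph;

// Hypothetical helper, not part of MinSCIE: collects and prints the frequency
// candidates for one phrase of an already-parsed sentence.
class FrequencyCandidatesSketch {
    static void printCandidates(Phrase phrase, SemanticGraph sg) {
        FrequencyCandidates fc = new FrequencyCandidates(phrase, sg); // also detects the phrase root
        fc.generateDefaultFreqCandidates();    // whole phrase, root word, chained root words
        fc.generateNounPhraseFreqCandidates(); // [DT|RB|JJ]+ NN+ sub-constituents
        for (String candidate : fc.getCandidates()) {
            System.out.println(candidate);     // lemmatized, lower-cased candidate strings
        }
        fc.clearCandidates();                  // the object can be reused for the next phrase
    }
}
```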
/src/main/java/de/uni_mannheim/utils/Dictionary.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.utils; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | 8 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 9 | 10 | import edu.stanford.nlp.ling.IndexedWord; 11 | 12 | /** 13 | * A dictionary stores a set of strings. 14 | * 15 | * @author Kiril Gashteovski 16 | */ 17 | public class Dictionary { 18 | 19 | /** Stores the strings */ 20 | public ObjectOpenHashSet<String> words; 21 | 22 | /** Default constructor **/ 23 | public Dictionary() { 24 | this.words = new ObjectOpenHashSet<String>(); 25 | } 26 | 27 | /** Creates an empty set of strings (the dictionary) and then loads the dictionary from the input stream **/ 28 | public Dictionary(InputStream in) throws IOException { 29 | this.words = new ObjectOpenHashSet<String>(); 30 | this.load(in); 31 | } 32 | 33 | /** Creates an empty set of strings (the dictionary) and then loads the dictionary from the resource path **/ 34 | public Dictionary(String resourcePath) throws IOException { 35 | this.words = new ObjectOpenHashSet<String>(); 36 | this.load(resourcePath); 37 | } 38 | 39 | /** Creates an empty set of strings (the dictionary) and then loads the dictionary from multiple resources 40 | * @throws IOException **/ 41 | public Dictionary(String [] resourcePaths) throws IOException { 42 | this.words = new ObjectOpenHashSet<String>(); 43 | this.load(resourcePaths); 44 | } 45 | 46 | /** The size of the dictionary (number of words) **/ 47 | public int size() { 48 | return this.words.size(); 49 | } 50 | 51 | /** Checks if a certain word (as a string) is in the dictionary **/ 52 | public boolean contains(String word) { 53 | return this.words.contains(word); 54 | } 55 | 56 | /** Checks if a certain word (IndexedWord object) is in the dictionary in its lemmatized form **/ 57 | public boolean containsLemmatized(IndexedWord word) { 58 | return this.words.contains(word.lemma()); 59 | } 60 | 61 | private InputStream getInputStreamFromResource(String resourceName) throws IOException { 62 | return this.getClass().getResource(resourceName).openStream(); 63 | } 64 | 65 | /** Loads a dictionary from a resource path 66 | * @throws IOException 67 | **/ 68 | public void load(String resourcePath) throws IOException { 69 | this.load(this.getInputStreamFromResource(resourcePath)); 70 | } 71 | 72 | /** Loads a dictionary from several resource paths 73 | * @throws IOException **/ 74 | public void load(String [] resourcePaths) throws IOException { 75 | for (String path: resourcePaths) { 76 | this.load(path); 77 | } 78 | } 79 | 80 | /** Loads the dictionary out of an {@link InputStream}.
81 | * Each line of the original file should contain one dictionary entry 82 | */ 83 | public void load(InputStream in) throws IOException { 84 | DataInput data = new DataInputStream(in); 85 | String line = data.readLine(); 86 | while (line != null) { 87 | line = line.trim(); 88 | if (line.length() > 0) { 89 | this.words.add(line); 90 | } 91 | line = data.readLine(); 92 | } 93 | } 94 | 95 | /** Get the set of words **/ 96 | public ObjectOpenHashSet<String> words() { 97 | return this.words; 98 | } 99 | 100 | /** Add entries to the dictionary **/ 101 | public void addWords(ObjectOpenHashSet<String> ws) { 102 | this.words.addAll(ws); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/utils/minie/Utils.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.utils.minie; 2 | 3 | import edu.stanford.nlp.ling.IndexedWord; 4 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 5 | import joptsimple.OptionSet; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.Collection; 11 | import java.util.StringJoiner; 12 | 13 | import de.uni_mannheim.minie.MinIE; 14 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 15 | import de.uni_mannheim.minie.annotation.Attribution; 16 | import de.uni_mannheim.minie.annotation.Quantity; 17 | import de.uni_mannheim.utils.Dictionary; 18 | 19 | /** 20 | * Helper class for MinIE 21 | * 22 | * @author Martin Achenbach 23 | * @author Kiril Gashteovski 24 | */ 25 | public class Utils { 26 | /** MinIE default dictionaries **/ 27 | public static String [] DEFAULT_DICTIONARIES = new String [] {"/minie-resources/wn-mwe.txt", 28 | "/minie-resources/wiktionary-mw-titles.txt"}; 29 | 30 | /** 31 | * formats an annotated proposition in Ollie style 32 | * @param proposition: annotated proposition to format 33 | * @return formatted proposition 34 | */ 35 | public static String formatProposition(AnnotatedProposition proposition) { 36 | // First the triple 37 | StringJoiner tripleJoiner = new StringJoiner(";", "(", ")"); 38 | String subject = proposition.getSubject().toString(); 39 | if (!subject.isEmpty()) tripleJoiner.add(subject); 40 | String relation = proposition.getRelation().toString(); 41 | if (!relation.isEmpty()) tripleJoiner.add(relation); 42 | String object = proposition.getObject().toString(); 43 | if (!object.isEmpty()) tripleJoiner.add(object); 44 | 45 | // Factuality 46 | String factualityString = ""; 47 | String factuality = formatFactuality(proposition.getPolarity().getType().toString(), proposition.getModality().getModalityType().toString()); 48 | if (!factuality.isEmpty()) factualityString = String.format("[factuality=%s]", factuality); 49 | 50 | /*String clausalModifier = proposition.getClauseModifier().toString(); 51 | if (!clausalModifier.isEmpty()) annotations.add("clausalModifier=" + clausalModifier);*/ 52 | 53 | // Attribution 54 | Attribution attribution = proposition.getAttribution(); 55 | String attributionString = ""; 56 | 57 | // Only process the attribution if there is an attribution phrase TODO is this suitable?
58 | if (attribution != null && attribution.getAttributionPhrase() != null) { 59 | StringJoiner attributionAttributesJoiner = new StringJoiner(";"); 60 | String attributionPhrase = attribution.getAttributionPhrase().toString(); 61 | if (!attributionPhrase.isEmpty()) attributionAttributesJoiner.add("phrase:" + attributionPhrase); 62 | String attributionPredicate = attribution.getPredicateVerb().toString(); 63 | if (!attributionPredicate.isEmpty()) attributionAttributesJoiner.add("predicate:" + attributionPredicate); 64 | String attributionFactuality = formatFactuality(attribution.getPolarityType().toString(), attribution.getModalityType().toString()); 65 | if (!attributionFactuality.isEmpty()) attributionAttributesJoiner.add("factuality:" + attributionFactuality); 66 | attributionString = String.format("[attribution=%s]", attributionAttributesJoiner.toString()); 67 | } 68 | 69 | // Quantities 70 | StringJoiner quantityJoiner = new StringJoiner(";"); 71 | String quantitiesString = ""; 72 | ObjectArrayList<Quantity> quantities = new ObjectArrayList<>(); 73 | 74 | // Add all quantities 75 | quantities.addAll(proposition.getSubject().getQuantities()); 76 | quantities.addAll(proposition.getRelation().getQuantities()); 77 | quantities.addAll(proposition.getObject().getQuantities()); 78 | if (quantities.size() > 0) { 79 | for (Quantity q : quantities) { 80 | StringJoiner quantityPhrase = new StringJoiner(" "); 81 | for (IndexedWord w : q.getQuantityWords()) { 82 | quantityPhrase.add(w.originalText()); 83 | } 84 | quantityJoiner.add(String.format("QUANT_%s:%s", q.getId(), quantityPhrase.toString())); 85 | } 86 | quantitiesString = String.format("[quantities=%s]", quantityJoiner.toString()); 87 | } 88 | String output = tripleJoiner.toString() + factualityString + attributionString + quantitiesString; 89 | return output; 90 | } 91 | 92 | /** 93 | * format a factuality pair 94 | * @param polarity: polarity to format 95 | * @param modality: modality to format 96 | * @return formatted factuality 97 | */ 98 | private static String formatFactuality(String polarity, String modality) { 99 | String factuality = ""; 100 | if (!polarity.isEmpty() && !modality.isEmpty()) { 101 | if (polarity.equalsIgnoreCase("POSITIVE")) { 102 | polarity = "+"; 103 | } else { 104 | polarity = "-"; 105 | } 106 | if (modality.equalsIgnoreCase("CERTAINTY")) { 107 | modality = "CT"; 108 | } else { 109 | modality = "PS"; 110 | } 111 | factuality = String.format("(%s,%s)", polarity, modality); 112 | } 113 | return factuality; 114 | } 115 | 116 | /** 117 | * parses a string to a MinIE mode 118 | * @param s: string to parse 119 | * @return MinIE mode 120 | */ 121 | public static MinIE.Mode getMode(String s) { 122 | MinIE.Mode mode; 123 | if (s.equalsIgnoreCase("aggressive")) { 124 | mode = MinIE.Mode.AGGRESSIVE; 125 | } else if (s.equalsIgnoreCase("dictionary")) { 126 | mode = MinIE.Mode.DICTIONARY; 127 | } else if (s.equalsIgnoreCase("complete")) { 128 | mode = MinIE.Mode.COMPLETE; 129 | } else { 130 | mode = MinIE.Mode.SAFE; 131 | } 132 | return mode; 133 | } 134 | 135 | /** 136 | * load a dictionary from a given location in the option set 137 | * @param options: option set to read the locations from 138 | * @return a dictionary read from the specified locations 139 | * @throws IOException 140 | */ 141 | public static Dictionary loadDictionary(OptionSet options) throws IOException { 142 | Dictionary collocationDictionary = null; 143 | ArrayList<String> filenames = new ArrayList<>(); 144 | if (!options.has("dict-overwrite")) { 145 | // if the overwrite
option is not set, add the default dictionaries 146 | filenames.addAll(Arrays.asList(DEFAULT_DICTIONARIES)); 147 | } 148 | if (options.has("dict")) { 149 | filenames.addAll((Collection<String>) options.valuesOf("dict")); 150 | } 151 | String[] filenamesArray = Arrays.copyOf(filenames.toArray(), filenames.size(), String[].class); 152 | //logger.info("Loading dictionaries from " + Arrays.toString(filenamesArray)); 153 | collocationDictionary = new Dictionary(filenamesArray); 154 | //logger.info("Finished loading dictionaries"); 155 | return collocationDictionary; 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/utils/phrase/PhraseUtils.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.utils.phrase; 2 | 3 | import de.uni_mannheim.clausie.phrase.Phrase; 4 | import de.uni_mannheim.constant.SEPARATOR; 5 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 6 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 7 | 8 | /** 9 | * @author Kiril Gashteovski 10 | */ 11 | public class PhraseUtils { 12 | /** 13 | * Given a list of phrases, return their words concatenated into one string. 14 | * @param phraseList: list of phrases 15 | * @return string (words from the phrase list concatenated) 16 | */ 17 | public static String listOfPhrasesToString(ObjectArrayList<Phrase> phraseList){ 18 | StringBuffer sb = new StringBuffer(); 19 | for (Phrase phrase: phraseList){ 20 | sb.append(phrase.getWords()); 21 | sb.append(SEPARATOR.SPACE); 22 | } 23 | return sb.toString().trim(); 24 | } 25 | 26 | /** 27 | * Given a list of annotated phrases, return their words concatenated into one string. 28 | * @param phraseList: list of phrases 29 | * @return string (words from the phrase list concatenated) 30 | */ 31 | public static String listOfAnnotatedPhrasesToString(ObjectArrayList<AnnotatedPhrase> phraseList){ 32 | StringBuffer sb = new StringBuffer(); 33 | for (AnnotatedPhrase aPhrase: phraseList){ 34 | sb.append(aPhrase.getWords()); 35 | sb.append(SEPARATOR.SPACE); 36 | } 37 | return sb.toString().trim(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/tests/minie/Demo.java: -------------------------------------------------------------------------------- 1 | package tests.minie; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.DataInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | 9 | import org.python.util.PythonInterpreter; 10 | 11 | import de.uni_mannheim.minie.MinIE; 12 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 13 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 14 | 15 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 16 | import edu.stanford.nlp.semgraph.SemanticGraph; 17 | 18 | 19 | 20 | /** 21 | * @author Kiril Gashteovski 22 | * @author Yide Song 23 | */ 24 | public class Demo { 25 | public static void main(String args[]) throws IOException, InterruptedException { 26 | 27 | 28 | 29 | // Dependency parsing pipeline initialization 30 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 31 | 32 | // Input sentence 33 | String sentence = "Both taxa are produced in abundance by a variety of coniferous plants, and are typical in the Paleogene of the northern UK and Greenland region (Boulter and Manum 1989; Jolley and Whitham 2004; Jolley and Morton 2007), as well as mid-latitude North America (Smith et al., 2007)
and Arctic Canada (Greenwood and Basinger, 1993)."; 34 | 35 | 36 | 37 | // Generate the extractions (with SAFE mode) 38 | MinIE minie = new MinIE(sentence, parser, MinIE.Mode.SAFE); 39 | System.out.println("New Sentence: " + minie.getNewSentence()); 40 | 41 | // Print the extractions 42 | System.out.println("\nInput sentence: " + sentence); 43 | System.out.println("============================="); 44 | System.out.println("Extractions:"); 45 | for (AnnotatedProposition ap: minie.getPropositions()) { 46 | System.out.println("\tTriple: " + ap.getTripleAsString()); 47 | System.out.print("\tFactuality: " + ap.getFactualityAsString()); 48 | if(ap.getCitePolarity() != null && ap.getCitePurpose() != null){ 49 | System.out.print("\tCite: " + ap.getCiteAsString()); 50 | } 51 | if (ap.getAttribution().getAttributionPhrase() != null) 52 | System.out.print("\tAttribution: " + ap.getAttribution().toStringCompact()); 53 | else 54 | System.out.print("\tAttribution: NONE"); 55 | System.out.println("\n\t----------"); 56 | } 57 | 58 | System.out.println("\n\nDONE!"); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/tests/minie/DetectCitationDemo.java: -------------------------------------------------------------------------------- 1 | package tests.minie; 2 | 3 | import java.io.*; 4 | import org.python.util.PythonInterpreter; 5 | 6 | import de.uni_mannheim.minie.MinIE; 7 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 8 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 9 | 10 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | 13 | 14 | /** 15 | * @author Kiril Gashteovski 16 | */ 17 | public class DetectCitationDemo { 18 | public static void main(String args[]) throws IOException, InterruptedException { 19 | // Dependency parsing pipeline initialization 20 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 21 | 22 | 23 | PrintWriter pw = null; 24 | try { 25 | pw = new PrintWriter(new File("CitationSentences.csv")); 26 | } catch (FileNotFoundException e) { 27 | e.printStackTrace(); 28 | } 29 | 30 | StringBuilder builder = new StringBuilder(); 31 | builder.append("id"); 32 | builder.append(','); 33 | builder.append("Input Sentence"); 34 | builder.append(','); 35 | builder.append("New Sentence"); 36 | builder.append('\n'); 37 | 38 | File file = new File("SVM_model/Evaluation_Data/oa_200randsents.txt"); 39 | 40 | BufferedReader br = null; 41 | try { 42 | br = new BufferedReader(new FileReader(file)); 43 | } catch (FileNotFoundException e) { 44 | // TODO Auto-generated catch block 45 | e.printStackTrace(); 46 | } 47 | 48 | String st; 49 | int id = 0; 50 | try { 51 | while ((st = br.readLine()) != null){ 52 | // strip the sentence id prefix from the line
53 | st = removeId(st); 54 | MinIE minie = new MinIE(st, parser, MinIE.Mode.SAFE); 55 | String nst = minie.getNewSentence(); 56 | 57 | if (minie.isCitation()){ 58 | System.out.println("This sentence is a citation sentence: " + st); 59 | String handleStr = st; // CSV-escape the original sentence 60 | if(st.contains(",")){ 61 | if(st.contains("\"")){ 62 | handleStr = st.replace("\"", "\"\""); 63 | } 64 | handleStr = "\"" + handleStr + "\""; 65 | } 66 | 67 | String handleStr2 = nst; // CSV-escape the new sentence 68 | if(nst.contains(",")){ 69 | if(nst.contains("\"")){ 70 | handleStr2 = nst.replace("\"", "\"\""); 71 | } 72 | handleStr2 = "\"" + handleStr2 + "\""; 73 | } 74 | System.out.println(handleStr); 75 | builder.append(id); 76 | builder.append(','); 77 | builder.append(handleStr); 78 | builder.append(','); 79 | builder.append(handleStr2); 80 | builder.append(','); 81 | builder.append(minie.getCitePolarity()); 82 | builder.append(','); 83 | builder.append(minie.getCitePurpose()); 84 | builder.append('\n'); 85 | id++; 86 | 87 | for (AnnotatedProposition ap: minie.getPropositions()) { 88 | builder.append(','); 89 | builder.append(ap.getTripleAsString()); 90 | builder.append(','); 91 | builder.append(ap.getFactualityAsString()); 92 | builder.append(','); 93 | if (ap.getAttribution().getAttributionPhrase() != null) { 94 | builder.append(ap.getAttribution().toStringCompact()); 95 | }else{ 96 | builder.append("NONE"); 97 | } 98 | builder.append('\n'); 99 | } 100 | builder.append('\n'); 101 | } 102 | } 103 | } catch (IOException e) { 104 | // TODO Auto-generated catch block 105 | e.printStackTrace(); 106 | } 107 | 108 | pw.write('\ufeff'); 109 | pw.write(builder.toString()); 110 | pw.close(); 111 | System.out.println("done!"); 112 | 113 | //runMinIE(); 114 | } 115 | 116 | public static String removeId (String sentence){ 117 | sentence = sentence.replaceAll("S[0-9A-Z]*\\:[0-9]*", ""); 118 | return sentence; 119 | } 120 | 121 | public static void runMinIE() { 122 | // Dependency parsing pipeline initialization 123 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 124 | 125 | // Input sentence 126 | String sentence = "The Joker believes that the hero Batman was not actually born in foggy Gotham City (Walters, 1994)."; 127 | 128 | // Generate the extractions (with SAFE mode) 129 | MinIE minie = new MinIE(sentence, parser, MinIE.Mode.SAFE); 130 | 131 | // Print the extractions 132 | System.out.println("\nInput sentence: " + sentence); 133 | System.out.println("============================="); 134 | System.out.println("Extractions:"); 135 | for (AnnotatedProposition ap: minie.getPropositions()) { 136 | System.out.println("\tTriple: " + ap.getTripleAsString()); 137 | System.out.print("\tFactuality: " + ap.getFactualityAsString()); 138 | if(ap.getCitePolarity() != null && ap.getCitePurpose() != null){ 139 | System.out.print("\tCite: " + ap.getCiteAsString()); 140 | } 141 | if (ap.getAttribution().getAttributionPhrase() != null) 142 | System.out.print("\tAttribution: " + ap.getAttribution().toStringCompact()); 143 | else 144 | System.out.print("\tAttribution: NONE"); 145 | System.out.println("\n\t----------"); 146 | } 147 | 148 | System.out.println("\n\nDONE!"); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/tests/minie/OriginalMinIE.java: -------------------------------------------------------------------------------- 1 | package tests.minie; 2 | 3 | import java.io.*; 4 | import org.python.util.PythonInterpreter; 5 | 6 | import de.uni_mannheim.minie.MinIE; 7 | import
de.uni_mannheim.minie.annotation.AnnotatedProposition; 8 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 9 | 10 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | 13 | 14 | 15 | /** 16 | * @author Kiril Gashteovski 17 | */ 18 | public class OriginalMinIE { 19 | public static void main(String args[]) throws IOException, InterruptedException { 20 | // Dependency parsing pipeline initialization 21 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 22 | 23 | 24 | PrintWriter pw = null; 25 | try { 26 | pw = new PrintWriter(new File("OriginalMinIE.csv")); 27 | } catch (FileNotFoundException e) { 28 | e.printStackTrace(); 29 | } 30 | 31 | StringBuilder builder = new StringBuilder(); 32 | builder.append("id"); 33 | builder.append(','); 34 | builder.append("Input Sentence"); 35 | builder.append('\n'); 36 | 37 | File file = new File("SVM_model/Evaluation_Data/oa_200randsents.txt"); 38 | 39 | BufferedReader br = null; 40 | try { 41 | br = new BufferedReader(new FileReader(file)); 42 | } catch (FileNotFoundException e) { 43 | // TODO Auto-generated catch block 44 | e.printStackTrace(); 45 | } 46 | 47 | String st; 48 | int id = 0; 49 | try { 50 | while ((st = br.readLine()) != null){ 51 | // strip the sentence id prefix from the line 52 | st = removeId(st); 53 | MinIE minie = new MinIE(st, parser, MinIE.Mode.SAFE); 54 | 55 | String handleStr = st; // CSV-escape the input sentence 56 | if(st.contains(",")){ 57 | if(st.contains("\"")){ 58 | handleStr = st.replace("\"", "\"\""); 59 | } 60 | handleStr = "\"" + handleStr + "\""; 61 | } 62 | 63 | builder.append(id); 64 | builder.append(','); 65 | builder.append(handleStr); 66 | builder.append('\n'); 67 | id++; 68 | 69 | for (AnnotatedProposition ap: minie.getPropositions()) { 70 | builder.append(','); 71 | builder.append(ap.getTripleAsString()); 72 | builder.append(','); 73 | builder.append(ap.getFactualityAsString()); 74 | builder.append(','); 75 | if (ap.getAttribution().getAttributionPhrase() != null) { 76 | builder.append(ap.getAttribution().toStringCompact()); 77 | }else{ 78 | builder.append("NONE"); 79 | } 80 | builder.append('\n'); 81 | } 82 | builder.append('\n'); 83 | } 84 | 85 | } catch (IOException e) { 86 | // TODO Auto-generated catch block 87 | e.printStackTrace(); 88 | } 89 | 90 | pw.write('\ufeff'); 91 | pw.write(builder.toString()); 92 | pw.close(); 93 | System.out.println("done!"); 94 | 95 | //runMinIE(); 96 | } 97 | 98 | public static String removeId (String sentence){ 99 | sentence = sentence.replaceAll("S[0-9A-Z]*\\:[0-9]*", ""); 100 | return sentence; 101 | } 102 | 103 | public static void runMinIE() { 104 | // Dependency parsing pipeline initialization 105 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 106 | 107 | // Input sentence 108 | String sentence = "The Joker believes that the hero Batman was not actually born in foggy Gotham City (Walters, 1994)."; 109 | 110 | // Generate the extractions (with SAFE mode) 111 | MinIE minie = new MinIE(sentence, parser, MinIE.Mode.SAFE); 112 | 113 | // Print the extractions 114 | System.out.println("\nInput sentence: " + sentence); 115 | System.out.println("============================="); 116 | System.out.println("Extractions:"); 117 | for (AnnotatedProposition ap: minie.getPropositions()) { 118 | System.out.println("\tTriple: " + ap.getTripleAsString()); 119 | System.out.print("\tFactuality: " + ap.getFactualityAsString()); 120 | if(ap.getCitePolarity() != null && ap.getCitePurpose() != null){ 121 | System.out.print("\tCite: " + ap.getCiteAsString()); 122 | }
123 | if (ap.getAttribution().getAttributionPhrase() != null) 124 | System.out.print("\tAttribution: " + ap.getAttribution().toStringCompact()); 125 | else 126 | System.out.print("\tAttribution: NONE"); 127 | System.out.println("\n\t----------"); 128 | } 129 | 130 | System.out.println("\n\nDONE!"); 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/Fact.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | public class Fact { 4 | 5 | public String subject = null; 6 | public String predicate = null; 7 | public String object = null; 8 | 9 | public Fact(String s, String p, String o) { 10 | this.subject = s; 11 | this.predicate = p; 12 | this.object = o; 13 | } 14 | 15 | public String getSubject() { 16 | return subject; 17 | } 18 | 19 | public void setSubject(String subject) { 20 | this.subject = subject; 21 | } 22 | 23 | public String getPredicate() { 24 | return predicate; 25 | } 26 | 27 | public void setPredicate(String predicate) { 28 | this.predicate = predicate; 29 | } 30 | 31 | public String getObject() { 32 | return object; 33 | } 34 | 35 | public void setObject(String object) { 36 | this.object = object; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/FactsBean.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import java.util.List; 4 | 5 | import javax.xml.bind.annotation.XmlRootElement; 6 | 7 | @XmlRootElement 8 | public class FactsBean { 9 | 10 | public List<Fact> facts; 11 | 12 | public FactsBean(List<Fact> facts) { 13 | this.facts = facts; 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/FactsResource.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | 6 | import javax.ws.rs.POST; 7 | import javax.ws.rs.Path; 8 | import javax.ws.rs.Produces; 9 | import javax.ws.rs.core.MediaType; 10 | 11 | import de.uni_mannheim.minie.MinIE; 12 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 13 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | 16 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 17 | 18 | @Path("/query") 19 | public class FactsResource { 20 | 21 | private static final StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 22 | 23 | @POST 24 | @Produces({MediaType.APPLICATION_JSON}) 25 | public FactsBean query(String sentence) { 26 | MinIE minie = new MinIE(sentence, FactsResource.parser, MinIE.Mode.SAFE); 27 | 28 | List<Fact> facts = new ArrayList<>(); 29 | 30 | for (AnnotatedProposition ap: minie.getPropositions()) { 31 | List<AnnotatedPhrase> triple = ap.getTriple(); 32 | 33 | String s = triple.get(0).toString(); 34 | String p = triple.get(1).toString(); 35 | String o = triple.get(2).toString(); 36 | 37 | Fact fact = new Fact(s, p, o); 38 | facts.add(fact); 39 | } 40 | 41 | return new FactsBean(facts); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/Main.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import java.io.IOException; 4 | import
java.net.URI; 5 | import java.util.logging.Level; 6 | import java.util.logging.Logger; 7 | 8 | 9 | import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory; 10 | import org.glassfish.jersey.server.ResourceConfig; 11 | 12 | import org.glassfish.grizzly.http.server.HttpServer; 13 | 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 16 | 17 | public class Main { 18 | 19 | private static final URI BASE_URI = URI.create("http://localhost:8080/minie/"); 20 | 21 | public static void main(String[] args) { 22 | try { 23 | System.out.println("MinIE Service"); 24 | 25 | final HttpServer server = GrizzlyHttpServerFactory 26 | .createHttpServer(BASE_URI, create(), false); 27 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { 28 | @Override 29 | public void run() { 30 | server.shutdownNow(); 31 | } 32 | })); 33 | server.start(); 34 | 35 | System.out.println(String.format("Application started.%n" + 36 | "Stop the application using CTRL+C")); 37 | 38 | Thread.currentThread().join(); 39 | } catch (IOException | InterruptedException ex) { 40 | Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); 41 | } 42 | 43 | } 44 | 45 | public static ResourceConfig create() { 46 | return new MinIEService(); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/MinIEService.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 4 | import org.glassfish.jersey.jackson.JacksonFeature; 5 | import org.glassfish.jersey.server.ResourceConfig; 6 | 7 | public class MinIEService extends ResourceConfig { 8 | 9 | public MinIEService() { 10 | super(FactsResource.class, JacksonFeature.class); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/clausie.conf: -------------------------------------------------------------------------------- 1 | conservativeSVA = true 2 | conservativeSVOA = false 3 | processCcAllVerbs = true 4 | processCcNonVerbs = true 5 | processAppositions = true 6 | appositionVerb = is 7 | processPossessives = true 8 | processPartmods = true 9 | possessiveVerb = has 10 | lemmatize = false 11 | nary = false 12 | minOptionalArgs = 0 13 | maxOptionalArgs = 1 14 | 15 | dictCopular = /clausie-resources/dict-copular.txt 16 | dictExtCopular = /clausie-resources/dict-ext-copular.txt 17 | dictNotExtCopular = /clausie-resources/dict-not-ext-copular.txt 18 | dictComplexTransitive = /clausie-resources/dict-complex-transitive.txt 19 | dictAdverbsConj = /clausie-resources/dict-adverbs-conj.txt 20 | dictAdverbsIgnore = /clausie-resources/dict-adverbs-ignore.txt 21 | dictAdverbsInclude = /clausie-resources/dict-adverbs-include.txt -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-adverbs-conj.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/src/main/resources/clausie-resources/dict-adverbs-conj.txt -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-adverbs-ignore.txt: -------------------------------------------------------------------------------- 1 | so 2 | then 3 | thus 4 | 
why 5 | as 6 | even -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-adverbs-include.txt: -------------------------------------------------------------------------------- 1 | hardly 2 | barely 3 | scarcely 4 | seldom 5 | rarely -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-complex-transitive.txt: -------------------------------------------------------------------------------- 1 | bring 2 | catch 3 | drive 4 | get 5 | keep 6 | lay 7 | lead 8 | place 9 | put 10 | set 11 | sit 12 | show 13 | stand 14 | slip 15 | take -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-copular.txt: -------------------------------------------------------------------------------- 1 | act 2 | appear 3 | be 4 | become 5 | come 6 | come out 7 | end up 8 | get 9 | go 10 | grow 11 | fall 12 | feel 13 | keep 14 | leave 15 | look 16 | prove 17 | remain 18 | seem 19 | smell 20 | sound 21 | stay 22 | taste 23 | turn 24 | turn up 25 | wind up -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-ext-copular.txt: -------------------------------------------------------------------------------- 1 | act 2 | appear 3 | be 4 | become 5 | come 6 | come out 7 | end up 8 | get 9 | go 10 | grow 11 | fall 12 | feel 13 | keep 14 | leave 15 | look 16 | prove 17 | remain 18 | seem 19 | smell 20 | sound 21 | stay 22 | taste 23 | turn 24 | turn up 25 | wind up 26 | live 27 | come 28 | go 29 | stand 30 | lie 31 | love 32 | do 33 | try -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-not-ext-copular.txt: -------------------------------------------------------------------------------- 1 | die 2 | walk -------------------------------------------------------------------------------- /src/main/resources/minie-resources/certainty-verbs.dict: -------------------------------------------------------------------------------- 1 | say 2 | add 3 | claim 4 | write 5 | publish 6 | know 7 | remember 8 | learn 9 | discover 10 | forget 11 | admit 12 | prove 13 | show 14 | explain 15 | confirm 16 | acknowledge 17 | recall -------------------------------------------------------------------------------- /src/main/resources/minie-resources/certainty-words.dict: -------------------------------------------------------------------------------- 1 | certainly 2 | surely 3 | definitely 4 | undoubtedly 5 | clearly 6 | obviously -------------------------------------------------------------------------------- /src/main/resources/minie-resources/neg-adverbs.dict: -------------------------------------------------------------------------------- 1 | not 2 | never 3 | n't -------------------------------------------------------------------------------- /src/main/resources/minie-resources/neg-determiners.dict: -------------------------------------------------------------------------------- 1 | no 2 | non -------------------------------------------------------------------------------- /src/main/resources/minie-resources/neg-words.dict: -------------------------------------------------------------------------------- 1 | no 2 | not 3 | never 4 | non 5 | n't -------------------------------------------------------------------------------- /src/main/resources/minie-resources/non-subsective-adjectives-cf.dict: 
-------------------------------------------------------------------------------- 1 | anti- 2 | anti 3 | fabricated 4 | fake 5 | fictional 6 | fictitious 7 | imaginary 8 | mythical 9 | phony 10 | false 11 | artificial 12 | erroneous 13 | mistaken 14 | mock 15 | pseudo- 16 | pseudo 17 | simulated 18 | spurious 19 | unsuccessful 20 | counterfeit 21 | deputy 22 | faulty 23 | virtual -------------------------------------------------------------------------------- /src/main/resources/minie-resources/non-subsective-adjectives-modal.dict: -------------------------------------------------------------------------------- 1 | alleged 2 | believed 3 | debatable 4 | disputed 5 | dubious 6 | hypothetical 7 | impossible 8 | improbable 9 | plausible 10 | putative 11 | questionable 12 | so-called 13 | supposed 14 | suspicious 15 | theoretical 16 | uncertain 17 | unlikely 18 | would-be 19 | doubtful 20 | apparent 21 | arguable 22 | assumed 23 | likely 24 | ostensible 25 | possible 26 | potential 27 | predicted 28 | presumed 29 | probable 30 | seeming -------------------------------------------------------------------------------- /src/main/resources/minie-resources/non-subsective-adjectives-temp.dict: -------------------------------------------------------------------------------- 1 | erstwhile 2 | ex- 3 | ex 4 | expected 5 | former 6 | future 7 | historic 8 | onetime 9 | past 10 | proposed -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-adj.dict: -------------------------------------------------------------------------------- 1 | likely 2 | probable 3 | possible -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-adverbs.dict: -------------------------------------------------------------------------------- 1 | probably 2 | possibly 3 | perhaps 4 | generally 5 | likely 6 | unsure 7 | presumably 8 | apparently 9 | seemingly 10 | probable 11 | possible 12 | maybe -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-modal.dict: -------------------------------------------------------------------------------- 1 | might 2 | may 3 | could 4 | can 5 | would 6 | should 7 | shall 8 | must 9 | will 10 | 'll -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-neg-words.dict: -------------------------------------------------------------------------------- 1 | unlikely 2 | unlike 3 | improbable 4 | unbelievable 5 | unbelievably -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-verbs.dict: -------------------------------------------------------------------------------- 1 | think 2 | consider 3 | guess 4 | predict 5 | suggest 6 | believe 7 | doubt 8 | wonder 9 | ask 10 | speculate 11 | theorize 12 | theorise 13 | hypothesize 14 | hypothesise 15 | conjecture 16 | suspect -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-words.dict: -------------------------------------------------------------------------------- 1 | probably 2 | possibly 3 | perhaps 4 | generally 5 | likely 6 | unsure 7 | presumably 8 | apparently 9 | seemingly 10 | probable 11 | possible 12 | maybe -------------------------------------------------------------------------------- 
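These `.dict` resources are plain word lists consumed by `de.uni_mannheim.utils.Dictionary` (one entry per line, as shown in its `load` method above). A small loading sketch (not part of the repository; it assumes `minie-resources` is on the classpath, as it is for the default dictionaries in `de.uni_mannheim.utils.minie.Utils`):

```
import java.io.IOException;

import de.uni_mannheim.utils.Dictionary;

// Hypothetical snippet, not part of MinSCIE: load the negative-words dictionary
// from the classpath resources and run a membership check.
class DictLoadSketch {
    public static void main(String[] args) throws IOException {
        Dictionary negWords = new Dictionary("/minie-resources/neg-words.dict");
        System.out.println(negWords.size());          // 5 entries: no, not, never, non, n't
        System.out.println(negWords.contains("not")); // true
    }
}
```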
/src/main/resources/minie-resources/quantities-adjectives.dict: -------------------------------------------------------------------------------- 1 | many -------------------------------------------------------------------------------- /src/main/resources/minie-resources/quantities-determiners.dict: -------------------------------------------------------------------------------- 1 | some 2 | all 3 | any 4 | each 5 | every 6 | half 7 | many --------------------------------------------------------------------------------
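The REST endpoint defined in `uk.ac.ucl.cs.mr.FactsResource` can be exercised once the Grizzly server from `uk.ac.ucl.cs.mr.Main` is running at `http://localhost:8080/minie/`. A minimal client sketch (not part of the repository; it assumes the service is up and uses only the JDK's `HttpURLConnection`):

```
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

// Hypothetical client, not part of MinSCIE: POSTs one sentence to the running
// MinIE service and prints the JSON reply.
public class QueryClientSketch {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:8080/minie/query");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        String sentence = "The Joker believes that the hero Batman was not actually born in foggy Gotham City (Walters, 1994).";
        try (OutputStream out = conn.getOutputStream()) {
            out.write(sentence.getBytes(StandardCharsets.UTF_8));
        }
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}
```

The response is the JSON serialization of `FactsBean`, i.e. a `facts` array whose elements carry the subject, predicate, and object strings of each extracted triple.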