├── .gitignore ├── CitationSentences.csv ├── LICENSE.txt ├── OriginalMinIE.csv ├── README.md ├── SVM_model ├── Data anlysis.py ├── Data │ ├── annotated_sentences.CSV │ ├── annotated_sentences.txt │ ├── annotated_sentences.xlsx │ └── example.CSV ├── Data_preprocessing.py ├── Evaluation_Data │ └── oa_200randsents.txt ├── Example_pipeline.py ├── Pickle_Data │ ├── citation_with_context.pk │ ├── citation_with_context_vec.pk │ ├── polarities.pk │ ├── purposes.pk │ ├── svm_polarity.pk │ └── svm_purpose.pk ├── SVM.py ├── Word_Embedding.py ├── citation_context_analysis.py ├── save_GloVe_model_To_Local.py ├── save_svm_models.py ├── stopwords.txt ├── test.py ├── tfidf.py └── word_embedding_new.py ├── pom.xml └── src └── main ├── java ├── de │ └── uni_mannheim │ │ ├── clausie │ │ ├── ClausIE.java │ │ ├── Options.java │ │ ├── clause │ │ │ ├── Clause.java │ │ │ └── ClauseDetector.java │ │ ├── conjunction │ │ │ └── ProcessConjunctions.java │ │ ├── constituent │ │ │ ├── Constituent.java │ │ │ ├── IndexedConstituent.java │ │ │ ├── PhraseConstituent.java │ │ │ └── XcompConstituent.java │ │ ├── phrase │ │ │ └── Phrase.java │ │ └── proposition │ │ │ ├── DefaultPropositionGenerator.java │ │ │ ├── Proposition.java │ │ │ └── PropositionGenerator.java │ │ ├── constant │ │ ├── CHARACTER.java │ │ ├── CLAUSE_TYPE.java │ │ ├── NE_TYPE.java │ │ ├── POS_TAG.java │ │ ├── REGEX.java │ │ ├── SEPARATOR.java │ │ └── WORDS.java │ │ ├── minie │ │ ├── MinIE.java │ │ ├── annotation │ │ │ ├── AnnotatedPhrase.java │ │ │ ├── AnnotatedProposition.java │ │ │ ├── Attribution.java │ │ │ ├── Modality.java │ │ │ ├── Polarity.java │ │ │ └── Quantity.java │ │ ├── main │ │ │ ├── Extractor.java │ │ │ └── Main.java │ │ ├── minimize │ │ │ ├── Minimization.java │ │ │ ├── object │ │ │ │ ├── ObjAggressiveMinimization.java │ │ │ │ ├── ObjDictionaryMinimization.java │ │ │ │ └── ObjSafeMinimization.java │ │ │ ├── relation │ │ │ │ ├── RelAggressiveMinimization.java │ │ │ │ ├── RelDictionaryMinimization.java │ │ │ │ └── RelSafeMinimization.java │ │ │ └── subject │ │ │ │ ├── SubjAggressiveMinimization.java │ │ │ │ ├── SubjDictionaryMinimization.java │ │ │ │ └── SubjSafeMinimization.java │ │ ├── proposition │ │ │ └── ImplicitExtractions.java │ │ └── subconstituent │ │ │ ├── FrequencyCandidates.java │ │ │ └── SubConstituent.java │ │ └── utils │ │ ├── Dictionary.java │ │ ├── coreNLP │ │ ├── CoreNLPUtils.java │ │ └── DpUtils.java │ │ ├── fastutils │ │ └── FastUtil.java │ │ ├── minie │ │ └── Utils.java │ │ └── phrase │ │ └── PhraseUtils.java ├── tests │ └── minie │ │ ├── Demo.java │ │ ├── DetectCitationDemo.java │ │ └── OriginalMinIE.java └── uk │ └── ac │ └── ucl │ └── cs │ └── mr │ ├── Fact.java │ ├── FactsBean.java │ ├── FactsResource.java │ ├── Main.java │ └── MinIEService.java └── resources ├── clausie-resources ├── clausie.conf ├── dict-adverbs-conj.txt ├── dict-adverbs-ignore.txt ├── dict-adverbs-include.txt ├── dict-complex-transitive.txt ├── dict-copular.txt ├── dict-ext-copular.txt └── dict-not-ext-copular.txt └── minie-resources ├── certainty-verbs.dict ├── certainty-words.dict ├── neg-adverbs.dict ├── neg-determiners.dict ├── neg-words.dict ├── non-subsective-adjectives-cf.dict ├── non-subsective-adjectives-modal.dict ├── non-subsective-adjectives-temp.dict ├── poss-adj.dict ├── poss-adverbs.dict ├── poss-modal.dict ├── poss-neg-words.dict ├── poss-verbs.dict ├── poss-words.dict ├── quantities-adjectives.dict ├── quantities-determiners.dict ├── wiktionary-mw-titles.txt └── wn-mwe.txt /.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/* 3 | .idea/* 4 | *.iml 5 | *~ 6 | /target/ 7 | .classpath 8 | .project 9 | .settings/* 10 | -------------------------------------------------------------------------------- /CitationSentences.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/CitationSentences.csv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # MinScIE: Citation-centered Open Information Extraction 4 | 5 | An Open Information Extraction (OIE) system which provides structured knowledge enriched with semantic information about citations. This system is based upon the OIE system [MinIE](https://github.com/gkiril/minie). 6 | 7 | ## Open Information Extraction (OIE) 8 | Open Information Extraction (OIE) systems aim to extract previously unseen relations and their arguments from unstructured text in an unsupervised manner. In its simplest form, given a natural language sentence, they extract information in the form of a triple consisting of a subject (S), a relation (R), and an object (O). 9 | 10 | Suppose we have the following input sentence: 11 | ``` 12 | AMD, which is based in U.S., is a technology company. 13 | ``` 14 | 15 | An OIE system aims to make the following extractions: 16 | 17 | ``` 18 | ("AMD"; "is based in"; "U.S.") 19 | ("AMD"; "is"; "technology company") 20 | ``` 21 | 22 | ## Demo 23 | 24 | For the demos, please refer to the classes `tests.minie.Demo.java` and `tests.minie.DetectCitationDemo.java`. 25 | 26 | ## Citing 27 | If you use MinScIE in your work, please cite our [paper](https://madoc.bib.uni-mannheim.de/49216/1/_JCDL19Demo__MinScIE%20%284%29.pdf): 28 | 29 | ``` 30 | @inproceedings{lauscher2019minscie, 31 | title={MinScIE: Citation-centered Open Information Extraction}, 32 | author={Lauscher, Anne and Song, Yide and Gashteovski, Kiril}, 33 | booktitle={Proceedings of ACM/IEEE Joint Conference on Digital Libraries}, 34 | year={2019} 35 | } 36 | ``` 37 | -------------------------------------------------------------------------------- /SVM_model/Data anlysis.py: -------------------------------------------------------------------------------- 1 | # Data analysis, majority/random baselines, and evaluation of those baselines 2 | 3 | import time 4 | import codecs 5 | import random 6 | import numpy as np 7 | from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix 8 | from nltk import word_tokenize 9 | from nltk.stem import WordNetLemmatizer 10 | 11 | 12 | def main(): 13 | texts_polarities = [] 14 | texts_purposes = [] 15 | texts = [] 16 | polarities = [] 17 | purposes = [] 18 | polarities2 = [] 19 | purposes2 = [] 20 | data_number = 0 21 | polarity_information = {"positive": 0, "neutral": 0, "negative": 0} 22 | purpose_information = {"Criticizing": 0, "Comparison": 0, "Use": 0, "Substantiating": 0, "Basis": 0, "Neutral": 0} 23 | 24 | # import data 25 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 26 | data_number = data_number + 1 27 | parts = line.split('\t') 28 | if parts[12].strip() != "0": 29 | texts_polarities.append(parts[5]) 30 | polarities.append(parts[12].strip()) 31 | if parts[11].strip() != "0": 32 | texts_purposes.append(parts[5]) 33 | purposes.append(parts[11].strip())
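# Label encoding used by the counters below:
#   polarity (column 12): 1 = neutral, 2 = positive, 3 = negative
#   purpose  (column 11): 1 = Criticizing, 2 = Comparison, 3 = Use,
#                         4 = Substantiating, 5 = Basis, 6 = Neutral
#   a value of "0" in either column marks an unannotated sentence and is skipped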
34 | if parts[11].strip() != "0" and parts[12].strip() != "0": 35 | texts.append(parts[5]) 36 | purposes2.append(int(parts[11].strip())) 37 | polarities2.append(int(parts[12].strip())) 38 | if parts[12].strip() == "1": 39 | polarity_information["neutral"] += 1 40 | if parts[12].strip() == "2": 41 | polarity_information["positive"] += 1 42 | if parts[12].strip() == "3": 43 | polarity_information["negative"] += 1 44 | if parts[11].strip() == "1": 45 | purpose_information["Criticizing"] += 1 46 | if parts[11].strip() == "2": 47 | purpose_information["Comparison"] += 1 48 | if parts[11].strip() == "3": 49 | purpose_information["Use"] += 1 50 | if parts[11].strip() == "4": 51 | purpose_information["Substantiating"] += 1 52 | if parts[11].strip() == "5": 53 | purpose_information["Basis"] += 1 54 | if parts[11].strip() == "6": 55 | purpose_information["Neutral"] += 1 56 | print("-------------------------------statistic on data----------------------------------------") 57 | print("[INFO] Total data Number: %s" % data_number) 58 | print("[INFO] Data contains %s citations and %s polarities." % (len(texts_polarities), len(polarities))) 59 | print("[INFO] Data contains %s citations and %s purposes." % (len(texts_purposes), len(purposes))) 60 | print("[INFO] Data contains %s citation contexts and %s polarities and %s Purposes." % (len(texts), len(polarities2), len(purposes2))) 61 | print("[INFO] statistic on polarity %s" % polarity_information) 62 | print("[INFO] statistic on purpose %s" % purpose_information) 63 | print("-------------------------------Example----------------------------------------") 64 | print("[INFO] Example context:\n %s" % (texts[0])) 65 | print("[INFO] Has a polarity value of %s" % (polarities2[0])) 66 | 67 | citation_X = texts 68 | polarity_y = polarities2 69 | purpose_y = purposes2 70 | 71 | 72 | print("-------------------------------Baseline Majority-----------------------------") 73 | y1_result = [] 74 | for i in polarity_y: 75 | y1_result.append(1) 76 | print("y1_result %s" % y1_result) 77 | 78 | y2_result = [] 79 | for i in purpose_y: 80 | y2_result.append(6) 81 | print("y2_result %s" % y2_result) 82 | 83 | #print("purpose_y %s" %purpose_y) 84 | polarity_y = np.asarray(polarity_y) 85 | purpose_y = np.asarray(purpose_y) 86 | y1_result = np.asarray(y1_result) 87 | y2_result = np.asarray(y2_result) 88 | 89 | print("-------------------------------Evaluation on Majority-----------------------------") 90 | print("[INFO] Accuracy score for polarity: %s " % accuracy_score(polarity_y, y1_result)) 91 | print("[INFO] Precision score for polarity: %s " % precision_score(polarity_y, y1_result, average="macro")) 92 | print("[INFO] Recall score for polarity: %s " % recall_score(polarity_y, y1_result,average="macro")) 93 | print("[INFO] F1 score for polarity: %s " % f1_score(polarity_y, y1_result, average="macro")) 94 | 95 | print("[INFO] Accuracy score for purpose: %s " % accuracy_score(purpose_y, y2_result)) 96 | print("[INFO] Precision score for purpose: %s " % precision_score(purpose_y, y2_result, average="macro")) 97 | print("[INFO] Recall score for purpose: %s " % recall_score(purpose_y, y2_result,average="macro")) 98 | print("[INFO] F1 score for purpose: %s " % f1_score(purpose_y, y2_result, average="macro")) 99 | 100 | 101 | 102 | print("-------------------------------Baseline Random-----------------------------") 103 | y1_result = [] 104 | for i in polarity_y: 105 | y1_result.append(random.randint(1,3)) 106 | print("y1_result %s" % y1_result) 107 | 108 | y2_result = [] 109 | for i 
in purpose_y: 110 | y2_result.append(random.randint(1,6)) 111 | print("y2_result %s" % y2_result) 112 | 113 | y1_result = np.asarray(y1_result) 114 | y2_result = np.asarray(y2_result) 115 | 116 | print("-------------------------------Evaluation Random-----------------------------") 117 | print("[INFO] Accuracy score for polarity: %s " % accuracy_score(polarity_y, y1_result)) 118 | print("[INFO] Precision score for polarity: %s " % precision_score(polarity_y, y1_result, average="macro")) 119 | print("[INFO] Recall score for polarity: %s " % recall_score(polarity_y, y1_result, average="macro")) 120 | print("[INFO] F1 score for polarity: %s " % f1_score(polarity_y, y1_result, average="macro")) 121 | 122 | print("[INFO] Accuracy score for purpose: %s " % accuracy_score(purpose_y, y2_result)) 123 | print("[INFO] Precision score for purpose: %s " % precision_score(purpose_y, y2_result, average="macro")) 124 | print("[INFO] Recall score for purpose: %s " % recall_score(purpose_y, y2_result, average="macro")) 125 | print("[INFO] F1 score for purpose: %s " % f1_score(purpose_y, y2_result, average="macro")) 126 | 127 | 128 | 129 | 130 | if __name__ == "__main__": 131 | print("[INFO] Pipeline started") 132 | start_time = time.time() 133 | main() 134 | print("[INFO] Total processing time: %s seconds" % (time.time() - start_time)) -------------------------------------------------------------------------------- /SVM_model/Data/annotated_sentences.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Data/annotated_sentences.xlsx -------------------------------------------------------------------------------- /SVM_model/Data/example.CSV: -------------------------------------------------------------------------------- 1 | W99-0621,A88-1019,1999,The second instantiation finds the borders of phrases beginning and end and then pairs them in an optimal way into different phrases,0,"These problems formulations are similar to those studied in Ramshaw and Marcus, 1995 and Church, 1988; Argamon et al , 1998, respectively",1,"The experimental results presented using the SNoW based approach compare favorably with previously published results, both for NPs and SV phrases",0,"A s important, we present a few experiments that shed light on some of the issues involved in using learned predictors that interact to produce the desired inference",0,6,1, 2 | W99-0621,A88-1019,1999,Our earlier example would be marked for base NPs as: I wont to California last May,0,"This approach has been studied in Church, 1988; Argamon et al , 1998",1,331 Architecture The architecture used for the Open/Close predictors is shown in Figure 2,0,"Two SNoW predictors are used, one to predict if the word currently in consideration is the first in the phrase an open bracket, and the other to predict if it is the last a close bracket",0,6,1, 3 | W99-0621,A88-1019,1999,A lot of the work on shallow parsing over the past years has concentrated on manual construction of rules,0,"The observation that shallow syntactic information can be extracted using local information by examining the pattern itself, its nearby context and the local part-of-speech information has motivated the use of learning methods to recognize these patterns Church, 1988; Ramshaw and Marcus, 1995; Argamon et al , 1998; Cardie and Pierce, 1998",1, Research supported by NSF grants IIS-9801638 and SBR-9873450,0,t Research supported by NSF grant 
CCR-9502540,0,6,1, 4 | -------------------------------------------------------------------------------- /SVM_model/Data_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Preprocessing of the data 2 | 3 | import codecs 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support 6 | from sklearn import svm 7 | from sklearn.model_selection import KFold 8 | from sklearn.multiclass import OneVsRestClassifier 9 | from sklearn.svm import LinearSVC 10 | from sklearn.model_selection import train_test_split 11 | from sklearn import preprocessing 12 | import pickle 13 | import re 14 | from nltk.corpus import stopwords 15 | from nltk.tokenize import word_tokenize 16 | from nltk.stem import WordNetLemmatizer 17 | 18 | citation = "" 19 | citation_with_context = "" 20 | texts = [] 21 | texts_with_context = [] 22 | polarities = [] 23 | purposes = [] 24 | 25 | # import data 26 | # Preprocessing: remove uppercase markup tags, emails, URLs etc. from the original texts. 27 | email_regex = r'[0-9a-zA-Z_]{0,19}@[0-9a-zA-Z]{1,13}\.(?:com|cn|net)' 28 | url_regex = r"\"?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\"?" 29 | 30 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 31 | parts = line.split('\t') 32 | if parts[11].strip() != "0" and parts[12].strip() != "0": 33 | citation_with_context = parts[3] + " " + parts[5] + " " + parts[7] + " " + parts[9] 34 | citation_with_context = re.sub(r'<[A-Z]+>.*?</[A-Z]+>', "", citation_with_context) 35 | citation_with_context = re.sub(url_regex, "", citation_with_context) 36 | texts_with_context.append(citation_with_context) 37 | parts[5] = re.sub(r'<[A-Z]+>.*?</[A-Z]+>', "", parts[5]) 38 | parts[5] = re.sub(url_regex, "", parts[5]) 39 | texts.append(parts[5]) 40 | purposes.append(int(parts[11].strip())) 41 | polarities.append(int(parts[12].strip())) 42 | 43 | preprocessed_text_with_context = [] 44 | # Preprocessing: lemmatization 45 | # Preprocessing: remove stopwords 46 | lemmatizer = WordNetLemmatizer() 47 | stop_words = set(stopwords.words('english')) 48 | print(stop_words) 49 | for sen in texts_with_context: 50 | word_tokens = word_tokenize(sen) 51 | # rebuild the sentence token by token, keeping only informative tokens 52 | filtered_sentence = "" 53 | for w in word_tokens: 54 | w = lemmatizer.lemmatize(w) 55 | w = w.lower() 56 | if w not in stop_words and len(w)>1 and len(w)<40: 57 | if filtered_sentence == "": 58 | filtered_sentence = w 59 | else: 60 | filtered_sentence = filtered_sentence + " " + w 61 | print(filtered_sentence) 62 | preprocessed_text_with_context.append(filtered_sentence) 63 | 64 | print(texts[0]) 65 | print(len(texts_with_context)) 66 | print(len(preprocessed_text_with_context)) 67 | print(len(polarities)) 68 | print(len(purposes)) 69 | 70 | #save 71 | with open('./Pickle_Data/citation_with_context.pk', 'wb') as f: 72 | pickle.dump(texts_with_context, f) 73 | 74 | with open('./Pickle_Data/citation.pk', 'wb') as f: 75 | pickle.dump(texts, f) 76 | 77 | with open('./Pickle_Data/pre_citation_with_context.pk', 'wb') as f: 78 | pickle.dump(preprocessed_text_with_context, f) 79 | 80 | with open('./Pickle_Data/polarities.pk', 'wb') as f: 81 | pickle.dump(polarities, f) 82 | 83 | with open('./Pickle_Data/purposes.pk', 'wb') as f: 84 | pickle.dump(purposes, f) 85 | 86 | # Check whether data are saved 87 | # with open('C:/Users/songi/PycharmProjects/MasterThesis/purposes.pk', 'rb') as f: 88 | # data = pickle.load(f) 89 | # print(data)
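# A minimal sanity-check sketch of the preprocessing above on one made-up
# sentence (illustrative only, not taken from the corpus); it reuses the
# lemmatizer and stop_words defined earlier and assumes the NLTK 'punkt',
# 'wordnet' and 'stopwords' resources are installed:
example = "The proposed algorithms were evaluated on 200 annotated sentences"
example_tokens = [lemmatizer.lemmatize(w).lower() for w in word_tokenize(example)]
example_clean = " ".join(w for w in example_tokens if w not in stop_words and 1 < len(w) < 40)
print("[INFO] Before preprocessing: %s" % example)
print("[INFO] After preprocessing: %s" % example_clean)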
-------------------------------------------------------------------------------- /SVM_model/Example_pipeline.py: -------------------------------------------------------------------------------- 1 | import time 2 | import codecs 3 | from sklearn import svm 4 | from sklearn.model_selection import KFold 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn import metrics 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | import numpy as np 10 | from sklearn.metrics import precision_recall_fscore_support 11 | 12 | from nltk import word_tokenize 13 | from nltk.stem import WordNetLemmatizer 14 | 15 | class LemmaTokenizer(object): 16 | def __init__(self): 17 | self.wnl = WordNetLemmatizer() 18 | def __call__(self, doc): 19 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 20 | 21 | def main(): 22 | texts = [] 23 | polarities = [] 24 | 25 | # import data 26 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 27 | parts = line.split('\t') 28 | if parts[12].strip() !="0": 29 | texts.append(parts[5]) 30 | polarities.append(parts[12].strip()) 31 | print("[INFO] Imported %s citation contexts and %s polarities." % (len(texts), len(polarities))) 32 | print("[INFO] Example context:\n %s" % (texts[0])) 33 | print("[INFO] Has a polarity value of %s" % (polarities[0])) 34 | print(set(polarities)) 35 | 36 | # extract features 37 | count_vect = CountVectorizer(tokenizer=LemmaTokenizer()) 38 | x_counts = count_vect.fit_transform(texts) 39 | print(x_counts) 40 | tfidf_transformer = TfidfTransformer() 41 | x_tfidf = tfidf_transformer.fit_transform(x_counts) 42 | 43 | # convert to numpy structures 44 | x = x_tfidf.toarray() 45 | y = np.asarray(polarities) 46 | 47 | # train classifier 48 | kf = KFold(n_splits=10, shuffle=True) 49 | clf = svm.LinearSVC() 50 | for k, (train, test) in enumerate(kf.split(x, y)): 51 | clf.fit(x[train], y[train]) 52 | #print("[INFO] fold %s, score: %s " % (k, clf.score(x[test], y[test]))) 53 | #print(train) 54 | #print(test) 55 | result = clf.predict(x[test]) 56 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(y[test], result, average="macro" ))) 57 | 58 | 59 | 60 | if __name__ == "__main__": 61 | print("[INFO] Pipeline started") 62 | start_time = time.time() 63 | main() 64 | print("[INFO] Total processing time: %s seconds" % (time.time() - start_time)) -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/citation_with_context.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/citation_with_context.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/citation_with_context_vec.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/citation_with_context_vec.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/polarities.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/polarities.pk 
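The pickled artifacts in this folder are consumed as index-aligned lists: `citation_with_context_vec.pk` holds one averaged 300-dimensional GloVe vector per citation context (built in `word_embedding_new.py`), while `polarities.pk` and `purposes.pk` (below) hold the corresponding integer labels (built in `Data_preprocessing.py`). A minimal sketch of loading and sanity-checking them, assuming the working directory is `SVM_model/`:

import pickle

with open('./Pickle_Data/citation_with_context_vec.pk', 'rb') as f:
    vectors = pickle.load(f)
with open('./Pickle_Data/polarities.pk', 'rb') as f:
    polarities = pickle.load(f)

# vector i and label i describe the same citation context
assert len(vectors) == len(polarities)
print("%d contexts, vector dimension %d" % (len(vectors), len(vectors[0])))
print("polarity labels:", sorted(set(polarities)))  # expected {1, 2, 3}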
-------------------------------------------------------------------------------- /SVM_model/Pickle_Data/purposes.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/purposes.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/svm_polarity.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/svm_polarity.pk -------------------------------------------------------------------------------- /SVM_model/Pickle_Data/svm_purpose.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/SVM_model/Pickle_Data/svm_purpose.pk -------------------------------------------------------------------------------- /SVM_model/Word_Embedding.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import nltk 3 | from sklearn import svm 4 | from sklearn.model_selection import KFold 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn import metrics 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | import numpy as np 10 | from sklearn.metrics import precision_recall_fscore_support 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from gensim.test.utils import common_texts, get_tmpfile 14 | from gensim.models import Word2Vec, KeyedVectors 15 | import gensim 16 | from nltk import word_tokenize 17 | from nltk.stem import WordNetLemmatizer 18 | import nltk 19 | 20 | 21 | class LemmaTokenizer(object): 22 | def __init__(self): 23 | self.wnl = WordNetLemmatizer() 24 | 25 | def __call__(self, doc): 26 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 27 | 28 | 29 | def main(): 30 | texts_polarities = [] 31 | texts_purposes = [] 32 | texts = [] 33 | polarities = [] 34 | purposes = [] 35 | polarities2 = [] 36 | purposes2 = [] 37 | vector_citation_x = [] 38 | data_number = 0 39 | polarity_information = {"positive": 0, "neutral": 0, "negative": 0} 40 | purpose_information = {"Criticizing": 0, "Comparison": 0, "Use": 0, "Substantiating": 0, "Basis": 0, "Neutral": 0} 41 | 42 | # import data 43 | for line in codecs.open("./Data/annotated_sentences.txt", "r", "utf-8", 'ignore').readlines(): 44 | data_number = data_number + 1 45 | parts = line.split('\t') 46 | if parts[12].strip() != "0": 47 | texts_polarities.append(parts[5]) 48 | polarities.append(parts[12].strip()) 49 | if parts[11].strip() != "0": 50 | texts_purposes.append(parts[5]) 51 | purposes.append(parts[11].strip()) 52 | if parts[11].strip() != "0" and parts[12].strip() != "0": 53 | texts.append(parts[5]) 54 | purposes2.append(int(parts[11].strip())) 55 | polarities2.append(int(parts[12].strip())) 56 | 57 | citation_X = texts 58 | polarity_y = polarities2 59 | purpose_y = purposes2 60 | 61 | citation_X = np.asarray(citation_X) 62 | print(citation_X) 63 | 64 | #tok_corp=[nltk.word_tokenize(sent) for sent in citation_X] 65 | #model = gensim.models.Word2Vec(tok_corp, min_count=1, size=32) 66 | 67 | #model = 
gensim.models.KeyedVectors.load_word2vec_format('C:/Users/songi/PycharmProjects/MasterThesis/GoogleNews-vectors-negative300.bin', binary=True) 68 | #model.save('word2vec.model') 69 | model = KeyedVectors.load('word2vec.model') 70 | print(model.most_similar('algorithms')) 71 | print(len(model['algorithms'])) 72 | print(model.most_similar('cat')) 73 | 74 | 75 | for sen in citation_X: 76 | token_sen = nltk.word_tokenize(sen) 77 | sen_len = 0 78 | a = np.zeros(300) 79 | for token in token_sen: 80 | if token in model.wv.vocab: 81 | sen_len = sen_len + 1 82 | a = a + model[token] 83 | print(len(a)) 84 | a = a / sen_len 85 | print(a) 86 | vector_citation_x.append(a) 87 | 88 | print(vector_citation_x) 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | if __name__ == "__main__": 97 | print("[INFO] Pipeline started") 98 | main() -------------------------------------------------------------------------------- /SVM_model/citation_context_analysis.py: -------------------------------------------------------------------------------- 1 | import time 2 | import csv 3 | import numpy as np 4 | from nltk import word_tokenize 5 | from nltk.stem import WordNetLemmatizer 6 | 7 | 8 | class LemmaTokenizer(object): 9 | def __init__(self): 10 | self.wnl = WordNetLemmatizer() 11 | 12 | def __call__(self, doc): 13 | return [self.wnl.lemmatize(t) for t in word_tokenize(doc)] 14 | 15 | 16 | def main(): 17 | texts = [] 18 | purpose = [] 19 | polarities = [] 20 | 21 | # import data 22 | i = 0 23 | with open("./example.csv") as f: 24 | reader = csv.reader(f,delimiter=',') 25 | for row in reader: 26 | texts.append([row[3], row[5], row[7], row[9]]) 27 | # row[5] is the citation sentence; row[3], row[7] and row[9] are its 28 | # surrounding context sentences; row[11] and row[12] carry the 29 | # purpose and polarity labels 30 | purpose.append(row[11]) 31 | polarities.append(row[12]) 32 | i = i + 1 33 | print(texts[0]) 34 | 35 | for line in open("./annotated_sentences.csv"): 36 | csv_row = line.split() 37 | #print(line) 38 | 39 | #parts = line.split('\t') 40 | #if parts[12].strip() != "0": 41 | #texts.append(parts[5]) 42 | #polarities.append(parts[12].strip()) 43 | #print("[INFO] Imported %s citation contexts and %s polarities."
% (len(texts), len(polarities))) 44 | #print("[INFO] Example context:\n %s" % (texts[0])) 45 | #print("[INFO] Has a polarity value of %s" % (polarities[0])) 46 | #print(set(polarities)) 47 | 48 | 49 | 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | print("[INFO] Pipeline started") 55 | start_time = time.time() 56 | main() 57 | print("[INFO] Total processing time: %s seconds" % (time.time() - start_time)) -------------------------------------------------------------------------------- /SVM_model/save_GloVe_model_To_Local.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from gensim.models import Word2Vec, KeyedVectors 3 | import nltk 4 | import pickle 5 | import numpy as np 6 | 7 | # load the Stanford GloVe model 8 | filename = 'C:/Users/songi/PycharmProjects/Model/acl_vectors_glove_300d.txt.word2vec' 9 | print('loading model, model file: ', filename) 10 | model = KeyedVectors.load_word2vec_format(filename, binary=False) 11 | 12 | #Some example of word embeding 13 | print('Examples:') 14 | print(model.most_similar('cat')) 15 | 16 | with open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/local_Model.pk', 'wb') as f: 17 | pickle.dump(model, f) 18 | print("--------Vectors saved in local------------") -------------------------------------------------------------------------------- /SVM_model/save_svm_models.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import numpy as np 3 | from sklearn.metrics import precision_recall_fscore_support 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.metrics import precision_score 6 | from sklearn.metrics import recall_score 7 | from sklearn.metrics import f1_score 8 | from sklearn import svm 9 | from sklearn.model_selection import KFold 10 | from sklearn.multiclass import OneVsRestClassifier 11 | from sklearn.svm import LinearSVC 12 | from sklearn.model_selection import train_test_split 13 | from sklearn import preprocessing 14 | from sklearn.linear_model import Ridge 15 | from sklearn.model_selection import cross_val_score 16 | from sklearn.model_selection import GridSearchCV 17 | import pickle 18 | from sklearn.metrics import classification_report 19 | from sklearn.svm import SVC 20 | import itertools 21 | from sklearn.utils import shuffle 22 | 23 | 24 | 25 | #read preprocessed data from local 26 | with open('./Pickle_Data/citation_with_context_vec.pk', 'rb') as f: 27 | vec_texts_with_context = pickle.load(f) 28 | # print(texts_with_context) 29 | with open('./Pickle_Data/pre_citation_with_context_vec.pk', 'rb') as f: 30 | vec_pre_texts_with_context = pickle.load(f) 31 | # 32 | with open('./Pickle_Data/citation_vec.pk', 'rb') as f: 33 | vec_texts = pickle.load(f) 34 | # print(texts) 35 | with open('./Pickle_Data/polarities.pk', 'rb') as f: 36 | polarities = pickle.load(f) 37 | # print(polarities) 38 | with open('./Pickle_Data/purposes.pk', 'rb') as f: 39 | purposes = pickle.load(f) 40 | # print(purposes) 41 | 42 | citation_X = vec_texts 43 | citation_with_context_X = vec_texts_with_context 44 | pre_citation_with_context_X = vec_pre_texts_with_context 45 | polarity_Y = polarities 46 | purpose_Y = purposes 47 | 48 | #change to array 49 | citation_X = np.asarray(citation_X) 50 | citation_with_context_X = np.asarray(citation_with_context_X) 51 | pre_citation_with_context_X = np.asarray(pre_citation_with_context_X) 52 | polarity_Y = np.asarray(polarity_Y) 53 | purpose_Y = np.asarray(purpose_Y) 54 | 
citation_with_context_X.reshape(1,-1) 55 | #print(citation_X) 56 | #print("------------Example of citations and its length:---------------") 57 | #print(len(citation_X)) 58 | #print(citation_X[0]) 59 | #print("------------Example of citations with contexts and its length:------------") 60 | #print(len(citation_with_context_X)) 61 | 62 | 63 | #change NaN element to 0 64 | nan_element = [] 65 | #remove nan in data 66 | for i in range(len(citation_X)): 67 | sample=citation_X[i] 68 | for j in range(len(sample)): 69 | if np.isnan(sample[j]): 70 | sample[j]=0 71 | nan_element.append(i) 72 | #break 73 | # print(nan_element) 74 | # for i in nan_element: 75 | # citation_X = np.delete(citation_X,i,axis = 0) 76 | # polarity_Y = np.delete(polarity_Y,i,axis = 0) 77 | # purpose_Y = np.delete(purpose_Y,i,axis = 0) 78 | 79 | 80 | for i in range(len(citation_with_context_X)): 81 | sample=citation_with_context_X[i] 82 | for j in range(len(sample)): 83 | if np.isnan(sample[j]): 84 | sample[j]=0 85 | 86 | for i in range(len(pre_citation_with_context_X)): 87 | sample = pre_citation_with_context_X[i] 88 | for j in range(len(sample)): 89 | if np.isnan(sample[j]): 90 | sample[j] = 0 91 | 92 | 93 | #shuffle the data 94 | citation_with_context_X, polarity_Y, purpose_Y= shuffle(citation_with_context_X, polarity_Y,purpose_Y, random_state=0) 95 | 96 | 97 | # Use cross validation to evaluate the model on all data (Train and test) 98 | kf = KFold(n_splits=10, shuffle=False) 99 | clf = svm.SVC(kernel='rbf', C=80, gamma=0.4) 100 | accuracy_scores = [] 101 | precision_scores = [] 102 | recall_scores =[] 103 | fscores = [] 104 | for k, (train, test) in enumerate(kf.split(citation_with_context_X, polarity_Y)): 105 | clf.fit(citation_with_context_X[train], polarity_Y[train]) 106 | result = clf.predict(citation_with_context_X[test]) 107 | accuracy_scores.append(accuracy_score(polarity_Y[test], result)) 108 | precision_scores.append(precision_score(polarity_Y[test], result, average="macro")) 109 | recall_scores.append(recall_score(polarity_Y[test], result, average="macro")) 110 | fscores.append(f1_score(polarity_Y[test], result, average="macro")) 111 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(polarity_Y[test], result, average="macro" ))) 112 | 113 | print("Accuracy mean: %s, std. deviation: %s" %(np.mean(accuracy_scores)*100.0,np.std(accuracy_scores)*100.0)) 114 | print("precision_scores mean: %s, std. deviation: %s" %(np.mean(precision_scores)*100.0,np.std(precision_scores)*100.0)) 115 | print("recall_scores mean: %s, std. deviation: %s" %(np.mean(recall_scores)*100.0,np.std(recall_scores)*100.0)) 116 | print("fscores mean: %s, std. 
deviation: %s" %(np.mean(fscores)*100.0,np.std(fscores)*100.0)) 117 | 118 | f = open('Pickle_Data/svm_polarity.pk','wb') 119 | clf.fit(citation_with_context_X, polarity_Y) 120 | print(citation_with_context_X[1].reshape(1,-1)) 121 | result = clf.predict(citation_with_context_X[13].reshape(1,-1)) 122 | print(result) 123 | pickle.dump(clf,f) 124 | f.close() 125 | 126 | 127 | 128 | kf = KFold(n_splits=10, shuffle=False) 129 | clf = svm.SVC(kernel='rbf', C=75, gamma=1.1) 130 | accuracy_scores = [] 131 | precision_scores = [] 132 | recall_scores =[] 133 | fscores = [] 134 | for k, (train, test) in enumerate(kf.split(citation_with_context_X, purpose_Y)): 135 | clf.fit(citation_with_context_X[train], purpose_Y[train]) 136 | result = clf.predict(citation_with_context_X[test]) 137 | accuracy_scores.append(accuracy_score(purpose_Y[test], result)) 138 | precision_scores.append(precision_score(purpose_Y[test], result, average="macro")) 139 | recall_scores.append(recall_score(purpose_Y[test], result, average="macro")) 140 | fscores.append(f1_score(purpose_Y[test], result, average="macro")) 141 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(purpose_Y[test], result, average="macro" ))) 142 | 143 | print("Accuracy mean: %s, std. deviation: %s" %(np.mean(accuracy_scores)*100.0,np.std(accuracy_scores)*100.0)) 144 | print("precision_scores mean: %s, std. deviation: %s" %(np.mean(precision_scores)*100.0,np.std(precision_scores)*100.0)) 145 | print("recall_scores mean: %s, std. deviation: %s" %(np.mean(recall_scores)*100.0,np.std(recall_scores)*100.0)) 146 | print("fscores mean: %s, std. deviation: %s" %(np.mean(fscores)*100.0,np.std(fscores)*100.0)) 147 | 148 | 149 | f = open('Pickle_Data/svm_purpose.pk','wb') 150 | clf.fit(citation_with_context_X, purpose_Y) 151 | result = clf.predict(citation_with_context_X[13].reshape(1,-1)) 152 | print(result) 153 | pickle.dump(clf,f) 154 | f.close() 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /SVM_model/stopwords.txt: -------------------------------------------------------------------------------- 1 | ('the', 'herself', 'our', 'my','yours', 'm', 'your', 'which', 'o', 'shan', 'his', 2 | 'such', 'ain', 'that', 's', 'are', 'was', 'their', 'he', 'being', 3 | 'an', 'there', 'him', 'having', 're', 'it', 'or', 'll', 'ourselves', 4 | 'theirs', 'whom', 'did', 'me', 'than', 'she', 'we', 'd', 5 | 'they', 'themselves', 'itself', 'her', 'those', 'myself', 6 | 'himself', 'a', 'i', 'them', 'this', 'were', 7 | 'is', 'ours', 'be', 'am', 'then', 'to', 'been', 'yourself', 'have', 'so', 8 | 'of', 'same', 'ma', 'by', 'hers', 9 | 'yourselves', 'just', 'you', 't', 'now', 'any', 'y', 'its','A', 'B', 'C', 'D', 'E', 'F', 'G','H', 10 | 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'b', 'c', 'e', 11 | 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o','p','q','r','u','v','w','x','z') -------------------------------------------------------------------------------- /SVM_model/test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from gensim.models import Word2Vec, KeyedVectors 3 | import nltk 4 | import pickle 5 | import numpy as np 6 | 7 | # load the Stanford GloVe model 8 | #filename = 'C:/Users/songi/PycharmProjects/Model/glove.6B.300d.txt.word2vec' 9 | #print('loading model, model file: ', filename) 10 | #model = KeyedVectors.load_word2vec_format(filename, binary=False) 11 | 12 | with 
open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/local_Model.pk', 'rb') as f: 13 | model = pickle.load(f) 14 | 15 | 16 | 17 | # Calculate a vector for the user input text 18 | user_input_text = sys.argv[1] 19 | vector_user_input_text = np.zeros(300) 20 | token_sen = nltk.word_tokenize(user_input_text) 21 | print(token_sen) 22 | 23 | sen_len = 0 24 | for token in token_sen: 25 | if token in model.wv.vocab: 26 | sen_len = sen_len + 1 27 | vector_user_input_text = vector_user_input_text + model[token] 28 | #print(model[token][0]) 29 | 30 | vector_user_input_text = vector_user_input_text/sen_len 31 | #print("vector: ", vector_user_input_text) 32 | 33 | 34 | polarity_information = {"positive": 0, "neutral": 0, "negative": 0} 35 | purpose_information = {"Criticizing": 0, "Comparison": 0, "Use": 0, "Substantiating": 0, "Basis": 0, "Neutral": 0} 36 | 37 | f = open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/svm_polarity.pk','rb') 38 | svm_model = pickle.load(f) 39 | f.close() 40 | 41 | result = svm_model.predict(vector_user_input_text.reshape(1,-1)) 42 | polarity = "" 43 | if result == 1: 44 | polarity = "Neutral" 45 | if result == 2: 46 | polarity = "Positive" 47 | if result == 3: 48 | polarity = "Negative" 49 | print(polarity) 50 | 51 | f = open('C:/Users/songi/PycharmProjects/Master_Thesis/Pickle_Data/svm_purpose.pk','rb') 52 | svm_model = pickle.load(f) 53 | f.close() 54 | 55 | result2 = svm_model.predict(vector_user_input_text.reshape(1,-1)) 56 | purpose = "" 57 | if result2 == 1: 58 | purpose = "Criticizing" 59 | if result2 == 2: 60 | purpose = "Comparison" 61 | if result2 == 3: 62 | purpose = "Use" 63 | if result2 == 4: 64 | purpose = "Substantiating" 65 | if result2 == 5: 66 | purpose = "Basis" 67 | if result2 == 6: 68 | purpose = "Neutral" 69 | print(purpose) 70 | 71 | print ('Number of arguments:', len(sys.argv), 'arguments.') 72 | print ('Argument List:', str(sys.argv[1])) -------------------------------------------------------------------------------- /SVM_model/tfidf.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 2 | import codecs 3 | import numpy as np 4 | from sklearn.metrics import precision_recall_fscore_support 5 | from sklearn import svm 6 | from sklearn.model_selection import KFold 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from sklearn.svm import LinearSVC 9 | from sklearn.model_selection import train_test_split 10 | from sklearn import preprocessing 11 | import pickle 12 | from sklearn.metrics import accuracy_score 13 | from sklearn.metrics import precision_score 14 | from sklearn.metrics import recall_score 15 | from sklearn.metrics import f1_score 16 | 17 | #read preprocessed data from local 18 | with open('./Pickle_Data/citation_with_context.pk', 'rb') as f: 19 | texts_with_context = pickle.load(f) 20 | print(texts_with_context[1]) 21 | with open('./Pickle_Data/pre_citation_with_context.pk', 'rb') as f: 22 | pre_texts_with_context = pickle.load(f) 23 | print(pre_texts_with_context[1]) 24 | with open('./Pickle_Data/citation.pk', 'rb') as f: 25 | texts = pickle.load(f) 26 | print(texts[1]) 27 | with open('./Pickle_Data/polarities.pk', 'rb') as f: 28 | polarities = pickle.load(f) 29 | # print(polarities) 30 | with open('./Pickle_Data/purposes.pk', 'rb') as f: 31 | purposes = pickle.load(f) 32 | # print(purposes) 33 | 34 | citation_X = texts 35 | citation_with_context_X = texts_with_context 36 |
polarity_Y = polarities 37 | purpose_Y = purposes 38 | 39 | citation_X = np.asarray(citation_X) 40 | citation_with_context_X = np.asarray(citation_with_context_X) 41 | polarity_Y = np.asarray(polarity_Y) 42 | purpose_Y = np.asarray(purpose_Y) 43 | 44 | example_document = ["I have an pen dog cat box.", 45 | "I have an apple."] 46 | 47 | print("---------create tfidf matrix for citations with contexts-------------------") 48 | vectorizer = CountVectorizer() 49 | count = vectorizer.fit_transform(citation_with_context_X) 50 | #print(vectorizer.get_feature_names()) 51 | # if "yide" in vectorizer.get_feature_names(): 52 | # print(vectorizer.vocabulary_["yide"]) 53 | print(count.toarray()) 54 | transformer = TfidfTransformer() 55 | tfidf_matrix = transformer.fit_transform(count) 56 | #print(tfidf_matrix.toarray()) 57 | #print(tfidf_matrix.toarray()) 58 | 59 | 60 | polarity_Y = np.asarray(polarities) 61 | purpose_Y = np.asarray(purposes) 62 | tfidf_matrix = tfidf_matrix.toarray() 63 | 64 | # train classifier normal svm 65 | kf = KFold(n_splits=10, shuffle=False) 66 | clf = svm.LinearSVC() 67 | accuracy_scores = [] 68 | precision_scores = [] 69 | recall_scores =[] 70 | fscores = [] 71 | 72 | for k, (train, test) in enumerate(kf.split(tfidf_matrix, polarity_Y)): 73 | clf.fit(tfidf_matrix[train], polarity_Y[train]) 74 | result = clf.predict(tfidf_matrix[test]) 75 | accuracy_scores.append(accuracy_score(polarity_Y[test], result)) 76 | precision_scores.append(precision_score(polarity_Y[test], result, average="macro")) 77 | recall_scores.append(recall_score(polarity_Y[test], result, average="macro")) 78 | fscores.append(f1_score(polarity_Y[test], result, average="macro")) 79 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(polarity_Y[test], result, average="macro" ))) 80 | 81 | print("Accuracy mean: %s, std. deviation: %s" % (np.mean(accuracy_scores) * 100.0, np.std(accuracy_scores) * 100.0)) 82 | print("precision_scores mean: %s, std. deviation: %s" % (np.mean(precision_scores) * 100.0, np.std(precision_scores) * 100.0)) 83 | print("recall_scores mean: %s, std. deviation: %s" % (np.mean(recall_scores) * 100.0, np.std(recall_scores) * 100.0)) 84 | print("fscores mean: %s, std. deviation: %s" % (np.mean(fscores) * 100.0, np.std(fscores) * 100.0)) 85 | 86 | 87 | # train classifier normal svm 88 | kf = KFold(n_splits=10, shuffle=False) 89 | clf = svm.LinearSVC() 90 | accuracy_scores = [] 91 | precision_scores = [] 92 | recall_scores =[] 93 | fscores = [] 94 | 95 | for k, (train, test) in enumerate(kf.split(tfidf_matrix, purpose_Y)): 96 | clf.fit(tfidf_matrix[train], purpose_Y[train]) 97 | result = clf.predict(tfidf_matrix[test]) 98 | accuracy_scores.append(accuracy_score(purpose_Y[test], result)) 99 | precision_scores.append(precision_score(purpose_Y[test], result, average="macro")) 100 | recall_scores.append(recall_score(purpose_Y[test], result, average="macro")) 101 | fscores.append(f1_score(purpose_Y[test], result, average="macro")) 102 | print("[INFO] fold %s, score: %s " % (k, precision_recall_fscore_support(purpose_Y[test], result, average="macro" ))) 103 | 104 | print("Accuracy mean: %s, std. deviation: %s" % (np.mean(accuracy_scores) * 100.0, np.std(accuracy_scores) * 100.0)) 105 | print("precision_scores mean: %s, std. deviation: %s" % (np.mean(precision_scores) * 100.0, np.std(precision_scores) * 100.0)) 106 | print("recall_scores mean: %s, std. deviation: %s" % (np.mean(recall_scores) * 100.0, np.std(recall_scores) * 100.0)) 107 | print("fscores mean: %s, std.
deviation: %s" % (np.mean(fscores) * 100.0, np.std(fscores) * 100.0)) 108 | # 109 | # 110 | # # x_train1, x_test1, y_train1, y_test1 = train_test_split(tfidf_matrix, polarity_Y, random_state=0, train_size=0.8) 111 | # # print("------SVM model: svm.LinearSVC(). input: vector of each citation, label: polarities-------") 112 | # # clf.fit(x_train1,y_train1) 113 | # # result = clf.predict(x_test1) 114 | # # print(precision_recall_fscore_support(y_test1, result, average="macro")) 115 | # # 116 | # # # one vs the rest 117 | # # print("------SVM model: OneVsRestClassifier. input: vector of each citation, label: polarities-------") 118 | # # result = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(x_train1,y_train1).predict(x_test1) 119 | # # print(precision_recall_fscore_support(y_test1, result, average="macro")) 120 | # # #print(result2) 121 | # 122 | # 123 | # # print("---------create tfidf matrix for citations with contexts-------------------") 124 | # # vectorizer = CountVectorizer() 125 | # # count = vectorizer.fit_transform(texts_with_context) 126 | # # transformer = TfidfTransformer() 127 | # # tfidf_matrix2 = transformer.fit_transform(count) 128 | # # print(len(tfidf_matrix2.toarray())) 129 | # # print(len(tfidf_matrix2.toarray()[2])) 130 | # # tfidf_matrix2 = tfidf_matrix2.toarray() 131 | # # 132 | # # # train classifier svm 133 | # # clf = svm.LinearSVC() 134 | # # x_train2, x_test2, y_train2, y_test2 = train_test_split(tfidf_matrix2, polarity_Y, random_state=0, train_size=0.8) 135 | # # print("------SVM model: svm.LinearSVC(). input: vector of each citation with context, label: polarities-------") 136 | # # clf.fit(x_train2,y_train2) 137 | # # result = clf.predict(x_test2) 138 | # # print(precision_recall_fscore_support(y_test2, result, average="macro")) 139 | # # 140 | # # # one vs the rest 141 | # # print("------SVM model: OneVsRestClassifier. input: vector of each citation with context, label: polarities-------") 142 | # # result = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(x_train2,y_train2).predict(x_test2) 143 | # # print(precision_recall_fscore_support(y_test2, result, average="macro")) 144 | # 145 | # 146 | # 147 | # # print("---------create tfidf matrix for citations with contexts and preprocessing-------------------") 148 | # # vectorizer = CountVectorizer() 149 | # # count = vectorizer.fit_transform(pre_texts_with_context) 150 | # # transformer = TfidfTransformer() 151 | # # tfidf_matrix3 = transformer.fit_transform(count) 152 | # # print(len(tfidf_matrix3.toarray())) 153 | # # print(len(tfidf_matrix3.toarray()[2])) 154 | # # tfidf_matrix3 = tfidf_matrix3.toarray() 155 | # # 156 | # # # train classifier svm 157 | # # clf = svm.LinearSVC() 158 | # # x_train3, x_test3, y_train3, y_test3 = train_test_split(tfidf_matrix3, polarity_Y, random_state=0, train_size=0.8) 159 | # # print("------SVM model: svm.LinearSVC(). input: vector of each citation with context, label: polarities-------") 160 | # # clf.fit(x_train3,y_train3) 161 | # # result = clf.predict(x_test3) 162 | # # print(precision_recall_fscore_support(y_test3, result, average="macro")) 163 | # # 164 | # # # one vs the rest 165 | # # print("------SVM model: OneVsRestClassifier. 
input: vector of each citation with context, label: polarities-------") 166 | # # result = OneVsRestClassifier(svm.SVC(kernel='linear')).fit(x_train3,y_train3).predict(x_test3) 167 | # # print(precision_recall_fscore_support(y_test3, result, average="macro")) -------------------------------------------------------------------------------- /SVM_model/word_embedding_new.py: -------------------------------------------------------------------------------- 1 | #Word Embedding: convert sentences to vectors 2 | from gensim.scripts.glove2word2vec import glove2word2vec 3 | from gensim.models import KeyedVectors 4 | import numpy as np 5 | from sklearn.metrics import precision_recall_fscore_support 6 | from sklearn.cross_validation import train_test_split 7 | from sklearn.neighbors import KNeighborsClassifier 8 | from gensim.test.utils import common_texts, get_tmpfile 9 | from gensim.models import Word2Vec, KeyedVectors 10 | import gensim 11 | from nltk import word_tokenize 12 | from nltk.stem import WordNetLemmatizer 13 | import nltk 14 | import pickle 15 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 16 | 17 | # glove_input_file = 'glove.6B.300d.txt' 18 | # word2vec_output_file = 'glove.6B.300d.txt.word2vec' 19 | # glove2word2vec(glove_input_file, word2vec_output_file) 20 | 21 | # load the Stanford GloVe model 22 | filename = '../Model/acl_vectors_glove_300d.txt.word2vec' 23 | model = KeyedVectors.load_word2vec_format(filename, binary=False) 24 | 25 | #Some example of word embeding 26 | print(model.most_similar('algorithms')) 27 | print(len(model['algorithms'])) 28 | print(model.most_similar('cat')) 29 | 30 | # Calculate vector for a example sentence 31 | example_sentence = "example sentence with word embeding" 32 | print("Calculate vector for a example sentence: " + example_sentence) 33 | token_sen = nltk.word_tokenize(example_sentence) 34 | sen_len = 0 35 | a = np.zeros(300) 36 | for token in token_sen: 37 | if token in model.wv.vocab: 38 | sen_len = sen_len + 1 39 | a = a + model[token] 40 | print(model[token][0]) 41 | print(len(a)) 42 | a =a/sen_len 43 | print(a[0]) 44 | 45 | 46 | #read preprocessed data from local 47 | with open('./Pickle_Data/citation_with_context.pk', 'rb') as f: 48 | texts_with_context = pickle.load(f) 49 | # print(texts_with_context) 50 | with open('./Pickle_Data/pre_citation_with_context.pk', 'rb') as f: 51 | pre_texts_with_context = pickle.load(f) 52 | # print(pre_texts_with_context) 53 | with open('./Pickle_Data/citation.pk', 'rb') as f: 54 | texts = pickle.load(f) 55 | # print(texts) 56 | with open('./Pickle_Data/polarities.pk', 'rb') as f: 57 | polarities = pickle.load(f) 58 | # print(polarities) 59 | with open('./Pickle_Data/purposes.pk', 'rb') as f: 60 | purposes = pickle.load(f) 61 | # print(purposes) 62 | 63 | 64 | citation_X = texts 65 | citation_with_context_X = texts_with_context 66 | pre_citation_with_context_X = pre_texts_with_context 67 | polarity_Y = polarities 68 | purpose_Y = purposes 69 | 70 | citation_X = np.asarray(citation_X) 71 | citation_with_context_X = np.asarray(citation_with_context_X) 72 | pre_citation_with_context_X = np.asarray(pre_citation_with_context_X) 73 | #print(citation_X) 74 | print("------------Example of citations and its length:---------------") 75 | print(len(citation_X)) 76 | print(citation_X[0]) 77 | print("------------Example of citations with contexts and its length:------------") 78 | print(len(citation_with_context_X)) 79 | print(citation_with_context_X[0]) 80 | 
print("------------Example of citations with contexts and preprocessing and its length:------------") 81 | print(len(pre_citation_with_context_X)) 82 | print(pre_citation_with_context_X[0]) 83 | 84 | print("---------create tfidf matrix-------------------") 85 | vectorizer = CountVectorizer() 86 | count = vectorizer.fit_transform(citation_with_context_X) 87 | #print(vectorizer.get_feature_names()) 88 | #print(vectorizer.vocabulary_) 89 | #print(count.toarray()) 90 | transformer = TfidfTransformer() 91 | tfidf_matrix = transformer.fit_transform(count) 92 | tfidf_matrix = tfidf_matrix.toarray() 93 | print(len(tfidf_matrix)) 94 | 95 | # Calculate for each citation (whiout contexts) a Vector, save those vectors in a List -> vector_citations_X 96 | # vector_citations_X = [] 97 | # sen_index = 0 98 | # tfidf = 0 99 | # vocabulary_index = 0 100 | # for sen in citation_X: 101 | # token_sen = nltk.word_tokenize(sen) 102 | # sum_tfidt = 0 103 | # vec_sen = np.zeros(300) 104 | # for token in token_sen: 105 | # if token in model.wv.vocab and token in vectorizer.get_feature_names(): 106 | # vocabulary_index = vectorizer.vocabulary_[token] 107 | # tfidf = tfidf_matrix[sen_index][vocabulary_index] 108 | # sum_tfidt = sum_tfidt + tfidf 109 | # vec_sen = vec_sen + model[token] * tfidf #each vectors weighted by the tfidf value 110 | # #print(len(vec_sen)) 111 | # vec_sen = vec_sen / sum_tfidt 112 | # vector_citations_X.append(vec_sen) 113 | # sen_index =sen_index + 1 114 | # print("--------Number of citations converted to vectors: ------------") 115 | # print(len(vector_citations_X)) 116 | # #print("-------Example vector for first citation: --------------------") 117 | # #print(vector_citation_with_contexts_X[0]) 118 | 119 | # Calculate for each citations (with contexts) a Vector, save those vectors in a List -> vector_citation_with_contexts_X 120 | vector_citation_with_contexts_X = [] 121 | sen_index = 0 122 | vocabulary_index = 0 123 | for sen in citation_with_context_X: 124 | token_sen = nltk.word_tokenize(sen) 125 | vec_sen = np.zeros(300) 126 | sen_len=0 127 | for token in token_sen: 128 | if token in model.wv.vocab: 129 | sen_len=sen_len+1 130 | vec_sen = vec_sen + model[token] 131 | #print(len(vec_sen)) 132 | vec_sen = vec_sen / sen_len 133 | vector_citation_with_contexts_X.append(vec_sen) 134 | sen_index = sen_index + 1 135 | print("--------Number of citations converted to vectors: ------------") 136 | print(len(vector_citation_with_contexts_X)) 137 | #print("-------Example vector for first citation: --------------------") 138 | #print(vector_citation_with_contexts_X[0]) 139 | 140 | # # Calculate for each citations (with contexts and preprocessing) a Vector, save those vectors in a List -> vector_citation_with_contexts_X 141 | # vector_pre_citation_with_contexts_X = [] 142 | # sen_index = 0 143 | # tfidf = 0 144 | # vocabulary_index = 0 145 | # for sen in pre_citation_with_context_X: 146 | # token_sen = nltk.word_tokenize(sen) 147 | # sum_tfidf = 0 148 | # vec_sen = np.zeros(300) 149 | # for token in token_sen: 150 | # if token in model.wv.vocab and token in vectorizer.get_feature_names(): 151 | # vocabulary_index = vectorizer.vocabulary_[token] 152 | # tfidf = tfidf_matrix[sen_index][vocabulary_index] 153 | # sum_tfidf = sum_tfidf + tfidf 154 | # vec_sen = vec_sen + model[token] * tfidf 155 | # #print(len(vec_sen)) 156 | # vec_sen = vec_sen / sum_tfidf 157 | # vector_pre_citation_with_contexts_X.append(vec_sen) 158 | # sen_index =sen_index + 1 159 | # print("--------Number of citations converted to 
vectors: ------------") 160 | # print(len(vector_pre_citation_with_contexts_X)) 161 | # #print("-------Example vector for first citation: --------------------") 162 | # #print(vector_citation_with_contexts_X[0]) 163 | 164 | # # Save the vector of each citation to local disk. 165 | # with open('./Pickle_Data/citation_vec.pk', 'wb') as f: 166 | # pickle.dump(vector_citations_X, f) 167 | # print("--------Vectors saved in local------------") 168 | 169 | # Save the vector of each citation (with context) to local disk. 170 | with open('./Pickle_Data/citation_with_context_vec.pk', 'wb') as f: 171 | pickle.dump(vector_citation_with_contexts_X, f) 172 | print("--------Vectors saved in local------------") 173 | 174 | # # Save the vector of each preprocessed citation (with context) to local disk. 175 | # with open('./Pickle_Data/pre_citation_with_context_vec.pk', 'wb') as f: 176 | # pickle.dump(vector_pre_citation_with_contexts_X, f) 177 | # # with open('C:/Users/songi/PycharmProjects/MasterThesis/citation_with_context_vec.pk', 'rb') as f: 178 | # # data = pickle.load(f) 179 | # # print(data[0]) 180 | # print("--------Vectors saved in local------------") -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>de.uni_mannheim</groupId> 6 | <artifactId>minscie</artifactId> 7 | <version>0.0.1-SNAPSHOT</version> 8 | <packaging>jar</packaging> 9 | 10 | <name>minie</name> 11 | <url>http://maven.apache.org</url> 12 | 13 | <properties> 14 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 15 | <maven.compiler.source>1.8</maven.compiler.source> 16 | <maven.compiler.target>1.8</maven.compiler.target> 17 | </properties> 18 | 19 | <dependencies> 20 | <dependency><groupId>it.unimi.dsi</groupId><artifactId>fastutil</artifactId><version>8.1.0</version></dependency> 21 | <dependency><groupId>org.python</groupId><artifactId>jython-standalone</artifactId><version>2.7.0</version></dependency> 22 | <dependency><groupId>edu.stanford.nlp</groupId><artifactId>stanford-corenlp</artifactId><version>3.8.0</version></dependency> 23 | <dependency><groupId>edu.stanford.nlp</groupId><artifactId>stanford-corenlp</artifactId><version>3.8.0</version><classifier>models</classifier></dependency> 24 | <dependency><groupId>org.glassfish.jersey.containers</groupId><artifactId>jersey-container-grizzly2-http</artifactId><version>2.26</version></dependency> 25 | <dependency><groupId>org.glassfish.jersey.inject</groupId><artifactId>jersey-hk2</artifactId><version>2.26</version></dependency> 26 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-json-jackson</artifactId><version>2.26</version></dependency> 27 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-json-processing</artifactId><version>2.26</version></dependency> 28 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-multipart</artifactId><version>2.26</version></dependency> 29 | <dependency><groupId>org.glassfish.jersey.media</groupId><artifactId>jersey-media-sse</artifactId><version>2.26</version></dependency> 30 | <dependency><groupId>net.sf.jopt-simple</groupId><artifactId>jopt-simple</artifactId><version>6.0-alpha-1</version></dependency> 31 | </dependencies> 32 | 33 | <build> 34 | <resources><resource><directory>${basedir}/src/main/resources</directory></resource></resources> 35 | <plugins> 36 | <plugin> 37 | <groupId>org.apache.maven.plugins</groupId> 38 | <artifactId>maven-assembly-plugin</artifactId> 39 | <configuration> 40 | <archive><manifest><mainClass>de.uni_mannheim.minie.main.Main</mainClass></manifest></archive> 41 | <appendAssemblyId>false</appendAssemblyId> 42 | <descriptorRefs><descriptorRef>jar-with-dependencies</descriptorRef></descriptorRefs> 43 | </configuration> 44 | </plugin> 45 | </plugins> 46 | </build> 47 | </project> -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/Options.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.OutputStream; 9 | import java.io.PrintStream; 10 | import java.net.URL; 11 | import java.util.Arrays; 12 | import java.util.Iterator; 13 | import java.util.Properties; 14 | import java.util.Set; 15 | 16 | import de.uni_mannheim.utils.Dictionary; 17 | import edu.stanford.nlp.ling.IndexedWord; 18 | 19 | /** Options handles the ClausIE settings which should be loaded out of a configuration file.
20 | * 21 | * @author Luciano del Corro 22 | * @author Kiril Gashteovski 23 | */ 24 | public class Options { 25 | // information 26 | public Dictionary dictCopular; 27 | public Dictionary dictExtCopular; 28 | public Dictionary dictNotExtCopular; 29 | public Dictionary dictComplexTransitive; 30 | public Dictionary dictAdverbsConj; 31 | public Dictionary dictAdverbsIgnore; 32 | public Dictionary dictAdverbsInclude; 33 | public boolean conservativeSVA; 34 | public boolean conservativeSVOA; 35 | 36 | /** 37 | * Process coordinating conjunctions with common components. All other verbal coordinating 38 | * conjunctions will always be processed. 39 | * 40 | * Example: some sentence 41 | * Option on: ... 42 | * Option off: 43 | * 44 | * default value 45 | * 46 | * Example sentence that is not affected 47 | */ 48 | public boolean processCcAllVerbs; 49 | public boolean processCcNonVerbs; 50 | public boolean processAppositions; 51 | public boolean processPossessives; 52 | public boolean processPartmods; 53 | //public boolean processPassive = false; // NOT SUPPORTED FOR NOW (collapsed semantic graph needed but less stable) 54 | // added for possessives 55 | 56 | // representation 57 | public boolean nary; 58 | public int minOptionalArgs; // only when nary=false 59 | public int maxOptionalArgs; // only when nary=false 60 | public boolean lemmatize; 61 | public String appositionVerb; 62 | public String possessiveVerb; 63 | 64 | // helpers 65 | 66 | /**Constructs the set of options out of a conf file (clausie.conf)*/ 67 | public Options() { 68 | try { 69 | InputStream in = getClass().getResource("/clausie-resources/clausie.conf").openStream(); 70 | setOptions(in); 71 | in.close(); 72 | } catch (IOException e) { 73 | // should not happen 74 | throw new RuntimeException(e); 75 | } 76 | } 77 | 78 | /**Constructs the set of options out of a conf file (fileOrResourceName)*/ 79 | public Options(String fileOrResourceName) throws IOException { 80 | InputStream in = openFileOrResource(fileOrResourceName); 81 | setOptions(in); 82 | in.close(); 83 | } 84 | 85 | private InputStream openFileOrResource(String name) throws IOException { 86 | try { 87 | File file = new File(name); 88 | return new FileInputStream(file); 89 | } catch (FileNotFoundException e) { 90 | } 91 | URL url = getClass().getResource(name); 92 | if (url == null) { 93 | throw new IOException("File or resource '" + name + "' not found."); 94 | } 95 | return url.openStream(); 96 | } 97 | 98 | /** Load options from the configuration file*/ 99 | public void setOptions(InputStream optionsStream) throws IOException { 100 | Properties prop = new Properties(); 101 | prop.load(optionsStream); 102 | 103 | // load the required options 104 | conservativeSVA = Boolean.parseBoolean(getProperty(prop, "conservativeSVA")); 105 | conservativeSVOA = Boolean.parseBoolean(getProperty(prop, "conservativeSVOA")); 106 | processCcAllVerbs = Boolean.parseBoolean(getProperty(prop, "processCcAllVerbs")); 107 | processCcNonVerbs = Boolean.parseBoolean(getProperty(prop, "processCcNonVerbs")); 108 | processAppositions = Boolean.parseBoolean(getProperty(prop, "processAppositions")); 109 | appositionVerb = getProperty(prop, "appositionVerb"); 110 | processPossessives = Boolean.parseBoolean(getProperty(prop, "processPossessives")); 111 | possessiveVerb = getProperty(prop, "possessiveVerb"); 112 | processPartmods = Boolean.parseBoolean(getProperty(prop, "processPartmods")); 113 | lemmatize = Boolean.parseBoolean(getProperty(prop, "lemmatize")); 114 | nary = 
Boolean.parseBoolean(getProperty(prop, "nary")); 115 | minOptionalArgs = Integer.parseInt(getProperty(prop, "minOptionalArgs")); 116 | maxOptionalArgs = Integer.parseInt(getProperty(prop, "maxOptionalArgs")); 117 | 118 | // get dictionaries 119 | dictCopular = getDictionary(prop, "dictCopular"); 120 | dictExtCopular = getDictionary(prop, "dictExtCopular"); 121 | dictNotExtCopular = getDictionary(prop, "dictNotExtCopular"); 122 | dictComplexTransitive = getDictionary(prop, "dictComplexTransitive"); 123 | dictAdverbsConj = getDictionary(prop, "dictAdverbsConj"); 124 | dictAdverbsIgnore = getDictionary(prop, "dictAdverbsIgnore"); 125 | dictAdverbsInclude = getDictionary(prop, "dictAdverbsInclude"); 126 | 127 | // check for unused properties 128 | if (!prop.isEmpty()) { 129 | System.err.println( "Unknown option(s): " 130 | + Arrays.toString( prop.keySet().toArray() )); 131 | } 132 | } 133 | 134 | /** Returns a required option (key) */ 135 | private String getProperty(Properties prop, String key) throws IOException { 136 | String result = prop.getProperty(key); 137 | if (result == null) { 138 | throw new IOException("Missing option: " + key); 139 | } 140 | prop.remove(key); 141 | return result; 142 | } 143 | 144 | /**Loads a dictionary (key) */ 145 | private Dictionary getDictionary(Properties prop, String key) throws IOException { 146 | String name = getProperty(prop, key); 147 | InputStream in = openFileOrResource(name); 148 | Dictionary dict = new Dictionary(); 149 | dict.load(in); 150 | in.close(); 151 | return dict; 152 | } 153 | 154 | /**Checks if the copular dictionary contains a given word*/ 155 | public boolean isCop(IndexedWord word) { 156 | return dictCopular.containsLemmatized(word); 157 | } 158 | 159 | /**Checks if the extended copular dictionary contains a given word*/ 160 | public boolean isExtCop(IndexedWord word) { 161 | return dictExtCopular.containsLemmatized(word); 162 | } 163 | 164 | /**Checks if the non-extended copular dictionary contains a given word*/ 165 | public boolean isNotExtCop(IndexedWord word) { 166 | return dictNotExtCopular.containsLemmatized(word); 167 | } 168 | 169 | /**Checks if the complex transitive dictionary contains a given word*/ 170 | public boolean isComTran(IndexedWord word) { 171 | return dictComplexTransitive.containsLemmatized(word); 172 | } 173 | 174 | /**Returns a string with some initial words of a given dictionary*/ 175 | private String someWords(Set dict) { 176 | if (dict.isEmpty()) return ""; 177 | StringBuffer result = new StringBuffer(); 178 | Iterator it = dict.iterator(); 179 | String sep = ""; 180 | result.append(" ("); 181 | for(int i=0; i<3 && it.hasNext(); i++) { 182 | result.append(sep); 183 | result.append(it.next()); 184 | sep = ", "; 185 | } 186 | if (it.hasNext()) result.append(", ..."); 187 | result.append(")"); 188 | return result.toString(); 189 | } 190 | 191 | public void print(OutputStream out) { 192 | print(out, ""); 193 | } 194 | 195 | /**Print settings*/ 196 | public void print(OutputStream out, String prefix) { 197 | PrintStream pout = new PrintStream(out); 198 | 199 | pout.println(prefix + "CLAUSE DETECTION"); 200 | pout.println(prefix + " Dict. copular : " + dictCopular.size() + someWords(dictCopular.words)); 201 | pout.println(prefix + " Dict. ext-copular : " + dictExtCopular.size() + someWords(dictExtCopular.words)); 202 | pout.println(prefix + " Dict. not ext.-cop. : " + dictNotExtCopular.size() + someWords(dictNotExtCopular.words)); 203 | pout.println(prefix + " Dict. complex trans. 
: " + dictComplexTransitive.size() + someWords(dictComplexTransitive.words)); 204 | pout.println(prefix + " Dict. ignored adverb : " + dictAdverbsIgnore.size() + someWords(dictAdverbsIgnore.words)); 205 | pout.println(prefix + " Dict. included adverb: " + dictAdverbsInclude.size() + someWords(dictAdverbsInclude.words)); 206 | pout.println(prefix + " Dict. conj adverbs : " + dictAdverbsConj.size() + someWords(dictAdverbsConj.words)); 207 | pout.println(prefix + " Conservative SVA : " + conservativeSVA); 208 | pout.println(prefix + " Conservative SVOA : " + conservativeSVOA); 209 | pout.println(prefix + " Process all verb CCs : " + processCcAllVerbs); 210 | pout.println(prefix + " Process non-verb CCs : " + processCcNonVerbs); 211 | pout.println(prefix + " Process appositions : " + processAppositions); 212 | pout.println(prefix + " Process possessives : " + processPossessives); 213 | pout.println(prefix + " Process partmods : " + processPartmods); 214 | 215 | pout.println(prefix + ""); 216 | pout.println(prefix + "REPRESENTATION"); 217 | pout.println(prefix + " n-ary propositions : " + nary); 218 | pout.println(prefix + " Min. opt. args : " + minOptionalArgs); 219 | pout.println(prefix + " Max. opt. args : " + maxOptionalArgs); 220 | pout.println(prefix + " Lemmatize : " + lemmatize); 221 | pout.println(prefix + " Appositions verb : \"" + appositionVerb + "\""); 222 | pout.println(prefix + " Possessive verb : \"" + possessiveVerb + "\""); 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/Constituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import edu.stanford.nlp.ling.IndexedWord; 4 | 5 | /** 6 | * A constituent of a clause. 7 | * 8 | * @author Luciano del Corro 9 | * @author Kiril Gashteovski 10 | * 11 | */ 12 | public abstract class Constituent { 13 | 14 | // -- types ----------------------------------------------------------------------------------- 15 | 16 | /** Constituent types */ 17 | public enum Type { 18 | SUBJECT, VERB, DOBJ, IOBJ, COMPLEMENT, CCOMP, XCOMP, ACOMP, ADVERBIAL, UNKOWN 19 | }; 20 | 21 | /** Constituent status (could be one of the three: required, optional or ignore) */ 22 | public enum Status { 23 | REQUIRED, OPTIONAL, IGNORE 24 | }; 25 | 26 | /** The root vertex of this constituent in {@link #semanticGraph}. This vertex and all its 27 | * descendants are part of the constituent (unless they appear in {@link #excludedVertexes}). */ 28 | protected IndexedWord root; 29 | 30 | // -- member variables ------------------------------------------------------------------------ 31 | 32 | /** Type of this constituent */ 33 | protected Type type; 34 | 35 | 36 | // -- construction ---------------------------------------------------------------------------- 37 | 38 | /** Constructs a constituent of the specified type. */ 39 | protected Constituent(Type type) { 40 | this.type = type; 41 | } 42 | 43 | /** Constructs a constituent of unknown type. */ 44 | protected Constituent() { 45 | this.type = Type.UNKOWN; 46 | } 47 | 48 | // -- getters/setters ------------------------------------------------------------------------- 49 | 50 | /** Returns the type of this constituent. 
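* The type is one of the {@link Type} values (e.g. SUBJECT, VERB, DOBJ or ADVERBIAL).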
*/ 51 | public Type getType() { 52 | return type; 53 | } 54 | 55 | public IndexedWord getRoot() { 56 | return this.root; 57 | } 58 | 59 | // -- utility methods ------------------------------------------------------------------------- 60 | 61 | /** Returns a textual representation of the root word of this constituent. */ 62 | public abstract String rootString(); 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/IndexedConstituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import java.util.List; 4 | import java.util.Set; 5 | import java.util.TreeSet; 6 | 7 | import de.uni_mannheim.utils.coreNLP.DpUtils; 8 | import edu.stanford.nlp.ling.IndexedWord; 9 | import edu.stanford.nlp.semgraph.SemanticGraph; 10 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 11 | 12 | /** A constituent of a clause described by a {@link SemanticGraph}. 13 | * 14 | * Each constituent has a root vertex. The root together with its descendants forms the 15 | * constituent. In some cases, additional vertexes need to be included or excluded; 16 | * these vertexes are also recorded within this class. 17 | * 18 | * Note that the {@link SemanticGraph} may or may not match the graph of the input sentences or the 19 | * other constituents of the same clause. For example, the semantic graphs are modified when 20 | * processing coordinating conjunctions. 21 | * 22 | * @author Luciano del Corro 23 | * @author Kiril Gashteovski 24 | */ 25 | public class IndexedConstituent extends Constituent { 26 | 27 | // -- member variables ------------------------------------------------------------------------ 28 | 29 | /** Semantic graph for this sentence */ 30 | //protected static SemanticGraph sentSemanticGraph; 31 | //protected SemanticGraph sentSemanticGraph; 32 | 33 | /** Semantic graph for this constituent */ 34 | private SemanticGraph semanticGraph; 35 | 36 | /** Additional root vertexes that form this constituent. These vertexes and all their descendants 37 | * are part of the constituent (unless they appear in {@link #excludedVertexes}). */ 38 | private Set<IndexedWord> additionalVertexes; 39 | 40 | /** Vertexes that are excluded from this constituent. All descendants are excluded as well 41 | * (unless they appear in {@link #root} or {@link #additionalVertexes}). */ 42 | public Set<IndexedWord> excludedVertexes; 43 | 44 | // -- construction ---------------------------------------------------------------------------- 45 | 46 | protected IndexedConstituent() { 47 | } 48 | 49 | /** Constructs a new indexed constituent. 
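* The constituent covers the root vertex and its descendants, together with any additional vertexes and minus the excluded vertexes.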
50 | * 51 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 52 | * @param root The root vertex of this constituent ({@see {@link #root}) 53 | * @param additionalVertexes Additional root vertexes that form this constituent ({@see 54 | * {@link #additionalVertexes}) 55 | * @param excludedVertexes Vertexes that are excluded from this constituent ({@see 56 | * {@link #excludedVertexes}) 57 | * @param type type of this constituent 58 | */ 59 | public IndexedConstituent(SemanticGraph semanticGraph, IndexedWord root, Set<IndexedWord> additionalVertexes, 60 | Set<IndexedWord> excludedVertexes, Type type) { 61 | super(type); 62 | this.semanticGraph = semanticGraph; 63 | this.root = root; 64 | this.additionalVertexes = new TreeSet<IndexedWord>(additionalVertexes); 65 | this.excludedVertexes = new TreeSet<IndexedWord>(excludedVertexes); 66 | } 67 | 68 | /** Constructs a simple indexed constituent without additional or excluded vertexes. 69 | * 70 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 71 | * @param root The root vertex of this constituent ({@see {@link #root}) 72 | * @param type type of this constituent 73 | */ 74 | public IndexedConstituent(SemanticGraph semanticGraph, IndexedWord root, Type type) { 75 | this(semanticGraph, root, new TreeSet<IndexedWord>(), new TreeSet<IndexedWord>(), type); 76 | } 77 | 78 | /** Creates a deep copy of this indexed constituent. */ 79 | @Override 80 | public IndexedConstituent clone() { 81 | IndexedConstituent clone = new IndexedConstituent(); 82 | clone.type = type; 83 | clone.semanticGraph = new SemanticGraph(semanticGraph); 84 | clone.root = this.root; 85 | clone.additionalVertexes = new TreeSet<IndexedWord>(this.additionalVertexes); 86 | clone.excludedVertexes = new TreeSet<IndexedWord>(this.excludedVertexes); 87 | return clone; 88 | } 89 | 90 | // -- getters/setters ------------------------------------------------------------------------- 91 | 92 | /** Returns the semantic graph for this constituent ({@see #semanticGraph}). */ 93 | public SemanticGraph getSemanticGraph() { 94 | return semanticGraph; 95 | } 96 | 97 | /** Returns the semantic graph for this sentence ({@see #sentSemanticGraph}). */ 98 | /*public SemanticGraph getSentSemanticGraph() { 99 | return sentSemanticGraph; 100 | }*/ 101 | 102 | /** Sets the semantic graph for this constituent ({@see #semanticGraph}). */ 103 | public void setSemanticGraph(SemanticGraph newSemanticGraph) { 104 | this.semanticGraph = newSemanticGraph; 105 | } 106 | 107 | /** Returns the root vertex of this constituent ({@see {@link #root}). */ 108 | public IndexedWord getRoot() { 109 | return root; 110 | } 111 | 112 | /** Sets the root vertex of this constituent ({@see {@link #root}). */ 113 | public void setRoot(IndexedWord newRoot) { 114 | root = newRoot; 115 | } 116 | 117 | /** Returns additional root vertexes that form this constituent ({@see 118 | * {@link #additionalVertexes}). */ 119 | public Set<IndexedWord> getAdditionalVertexes() { 120 | return additionalVertexes; 121 | } 122 | 123 | /** Returns vertexes that are excluded from this constituent ({@see {@link #excludedVertexes}). */ 124 | public Set<IndexedWord> getExcludedVertexes() { 125 | return excludedVertexes; 126 | } 127 | 128 | /** Checks whether this constituent is a prepositional phrase (i.e., starts with a preposition). */ 129 | public boolean isPrepositionalPhrase(SemanticGraph sentSemanticGraph) { // This is a mess, find another way of fixing it. This is purely heuristic. 
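// Heuristic used below: the constituent counts as prepositional if its root hangs off a 'rel' edge, or off a 'prep' edge whose governor is itself attached through an 'rcmod' relation.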
130 | // It needs to know the semantic graph for the sentence; after this is fixed, the member variable sentSemanticGraph 131 | // can be removed 132 | List<IndexedWord> parents = semanticGraph.getParentList(root); // This is not the cleanest way; the semantics are messed up. 133 | // Especially with the rel we cannot just check if 134 | // the head is a preposition 135 | // (return root.tag().equals("IN")) because the 136 | // parser sometimes includes a preposition in the 137 | // verbal phrase "He is about to win" 138 | for(IndexedWord parent: parents) { 139 | SemanticGraphEdge edge = semanticGraph.getEdge(parent, root); 140 | if(DpUtils.isRel(edge)) 141 | return true; 142 | if(DpUtils.isAnyPrep(edge)) { 143 | List<IndexedWord> ancestors = sentSemanticGraph.getParentList(parent); 144 | 145 | for(IndexedWord ancestor: ancestors) { 146 | SemanticGraphEdge ed = sentSemanticGraph.getEdge(ancestor, parent); 147 | if(DpUtils.isRcmod(ed)) 148 | return true; 149 | } 150 | 151 | } 152 | } 153 | return false; 154 | //return root.tag().equals("IN"); 155 | } 156 | 157 | // -- utility methods ------------------------------------------------------------------------- 158 | 159 | /** Returns a textual representation of the root word of this constituent. */ 160 | public String rootString() { 161 | return root.originalText(); 162 | } 163 | 164 | /** Returns a copy of the semantic graph of this constituent in which all edges (from any 165 | * included vertex) to excluded vertexes have been removed. Useful for proposition generation. */ 166 | public SemanticGraph createReducedSemanticGraph() { 167 | SemanticGraph result = new SemanticGraph(semanticGraph); 168 | DpUtils.removeEdges(result, root, excludedVertexes); 169 | for (IndexedWord v : additionalVertexes) { 170 | DpUtils.removeEdges(result, v, excludedVertexes); 171 | } 172 | return result; 173 | } 174 | 175 | public void setAdditionalVertexes(Set<IndexedWord> aVertexes){ 176 | this.additionalVertexes = aVertexes; 177 | } 178 | public void addVertexToAdditionalVertexes(IndexedWord w){ 179 | this.additionalVertexes.add(w); 180 | } 181 | 182 | public Type getConstituentType(){ 183 | return this.type; 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/PhraseConstituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 4 | import de.uni_mannheim.clausie.phrase.Phrase; 5 | import edu.stanford.nlp.ling.IndexedWord; 6 | 7 | /** 8 | * A phrase expression of a constituent. The constituent is represented as a Phrase. 9 | * 10 | * @author Kiril Gashteovski 11 | * 12 | */ 13 | public class PhraseConstituent extends Constituent { 14 | /** The constituent as a phrase **/ 15 | private Phrase phrase; 16 | 17 | /** Constructs a constituent with a specified textual representation and type. */ 18 | public PhraseConstituent(Phrase p, Type type) { 19 | super(type); 20 | this.phrase = p; 21 | this.root = p.getRoot(); 22 | } 23 | 24 | /** Returns a textual representation of the constituent. 
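* (Here this is the full word list of the underlying phrase, not only the root word.)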
*/ 25 | public String rootString() { 26 | return this.phrase.getWords(); 27 | } 28 | 29 | /** Adding a word to the list of words of the phrase **/ 30 | public void addWordToList(IndexedWord word){ 31 | this.phrase.addWordToList(word); 32 | } 33 | /** Adding all the elements from a list of indexed words to the list of indexed words of the phrase **/ 34 | public void addWordsToList(ObjectArrayList words){ 35 | this.phrase.addWordsToList(words); 36 | } 37 | public Phrase getPhrase(){ 38 | return this.phrase; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/constituent/XcompConstituent.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.constituent; 2 | 3 | import java.util.Set; 4 | import java.util.TreeSet; 5 | 6 | import de.uni_mannheim.clausie.clause.Clause; 7 | import edu.stanford.nlp.ling.IndexedWord; 8 | import edu.stanford.nlp.semgraph.SemanticGraph; 9 | 10 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 11 | 12 | /** An {@code XcompConstituent} of a clause formed out of an xcomp. 13 | * 14 | * Note that the xcomp relation refers to a clause with an external subject. 15 | * The constituent stores the set of clauses that can be derived from the xcomp 16 | * clause. 17 | * 18 | * @author Luciano del Corro 19 | * @author Kiril Gashteovski 20 | * 21 | */ 22 | public class XcompConstituent extends IndexedConstituent { 23 | 24 | /** Clauses derived from this constituent */ 25 | private ObjectArrayList clauses; 26 | 27 | private XcompConstituent() { 28 | } 29 | 30 | /** Constructs a new constituent for the xcomp relation. 31 | * 32 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 33 | * @param root The root vertex of this constituent ({@see {@link #root}) 34 | * @param type type of this constituent 35 | * @param clauses derived from this constituent 36 | */ 37 | public XcompConstituent(SemanticGraph semanticGraph, IndexedWord root, Type type, ObjectArrayList clauses) { 38 | super(semanticGraph, root, type); 39 | this.setClauses(clauses); 40 | } 41 | 42 | /** Constructs a new indexed constituent for the xcomp relation. 43 | * 44 | * @param semanticGraph Semantic graph for this constituent ({@see #semanticGraph}) 45 | * @param root The root vertex of this constituent ({@see {@link #root}) 46 | * @param additionalVertexes Additional root vertexes that form this constituent ({@see 47 | * {@link #additionalVertexes}) 48 | * @param excludedVertexes Vertexes that are excluded from this constituent ({@see 49 | * {@link #excludedVertexes}) 50 | * @param type type of this constituent 51 | * @param clauses derived from this constituent 52 | */ 53 | public XcompConstituent(SemanticGraph semanticGraph, IndexedWord root, Set additionalVertexes, 54 | Set excludedVertexes, Type type, ObjectArrayList clauses) { 55 | super(semanticGraph, root, additionalVertexes, excludedVertexes, type); 56 | this.setClauses(clauses); 57 | } 58 | 59 | /** Returns the clauses derived from the constituent. */ 60 | public ObjectArrayList getClauses() { 61 | return clauses; 62 | } 63 | 64 | /** Sets the clauses derived from the constituent. 
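* @param clauses the clauses derived from this xcomp constituent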
*/ 65 | public void setClauses(ObjectArrayList clauses) { 66 | this.clauses = clauses; 67 | } 68 | 69 | @Override 70 | public XcompConstituent clone() { 71 | XcompConstituent clone = new XcompConstituent(); 72 | clone.type = type; 73 | clone.setSemanticGraph(new SemanticGraph(this.getSemanticGraph())); 74 | clone.root = this.getRoot(); 75 | clone.setAdditionalVertexes(new TreeSet(this.getAdditionalVertexes())); 76 | clone.excludedVertexes = new TreeSet(this.excludedVertexes); 77 | clone.clauses = new ObjectArrayList(clauses); 78 | return clone; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/proposition/DefaultPropositionGenerator.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.proposition; 2 | 3 | import java.util.SortedSet; 4 | import java.util.TreeSet; 5 | 6 | import de.uni_mannheim.clausie.ClausIE; 7 | import de.uni_mannheim.clausie.clause.Clause; 8 | import de.uni_mannheim.clausie.constituent.Constituent; 9 | import de.uni_mannheim.clausie.constituent.IndexedConstituent; 10 | import de.uni_mannheim.clausie.constituent.PhraseConstituent; 11 | import de.uni_mannheim.clausie.constituent.Constituent.Status; 12 | import de.uni_mannheim.clausie.phrase.Phrase; 13 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 14 | 15 | import edu.stanford.nlp.semgraph.SemanticGraph; 16 | 17 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 18 | 19 | 20 | /** 21 | * Currently the default proposition generator generates 3-ary propositions out of a clause 22 | * 23 | * @author Luciano del Corro 24 | * @author Kiril Gashteovski 25 | * 26 | * */ 27 | public class DefaultPropositionGenerator extends PropositionGenerator { 28 | public DefaultPropositionGenerator(ClausIE clausIE) { 29 | super(clausIE); 30 | } 31 | 32 | /** 33 | * @param clause: the clause in which the proposition is generated (and added to the list of propositions in 'clause') 34 | * @param sGraph: semantic graph of the sentence 35 | */ 36 | @Override 37 | public void generate(Clause clause, SemanticGraph sGraph) { 38 | Proposition proposition = new Proposition(); 39 | ObjectArrayList constTypes = new ObjectArrayList(); 40 | 41 | // Process subject 42 | if (clause.getSubject() > -1 && clause.getIncludedConstitsInds().getBoolean(clause.getSubject())) { // subject is -1 when there is an xcomp 43 | Phrase subjPhrase = generate(clause, clause.getSubject(), sGraph); 44 | Constituent subjConstituent = clause.getConstituents().get(clause.getSubject()); 45 | subjPhrase.setRoot(subjConstituent.getRoot()); 46 | proposition.addPhrase(new Phrase(subjPhrase)); 47 | constTypes.add(Constituent.Type.SUBJECT); 48 | } else { 49 | //throw new IllegalArgumentException(); 50 | } 51 | 52 | // Process verb 53 | if (clause.getIncludedConstitsInds().getBoolean(clause.getVerbInd())) { 54 | Phrase relation = generate(clause, clause.getVerbInd(), sGraph); 55 | Constituent verb = clause.getConstituents().get(clause.getVerbInd()); 56 | relation.setRoot(verb.getRoot()); 57 | proposition.addPhrase(new Phrase(relation)); 58 | constTypes.add(Constituent.Type.VERB); 59 | } else { 60 | throw new IllegalArgumentException(); 61 | } 62 | 63 | // Process arguments 64 | SortedSet sortedIndexes = new TreeSet(); 65 | sortedIndexes.addAll(clause.getIobjectsInds()); 66 | sortedIndexes.addAll(clause.getDobjectsInds()); 67 | sortedIndexes.addAll(clause.getXcompsInds()); 68 | sortedIndexes.addAll(clause.getCcompsInds()); 
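// The sorted set collects the indices of all argument constituents (objects, complements, adverbials) so that they are emitted in sentence order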
69 | sortedIndexes.addAll(clause.getAcompsInds()); 70 | sortedIndexes.addAll(clause.getAdverbialInds()); 71 | if (clause.getComplementInd() >= 0) 72 | sortedIndexes.add(clause.getComplementInd()); 73 | for (int index: sortedIndexes) { 74 | Constituent verbConstituent = clause.getConstituents().get(clause.getVerbInd()); 75 | Constituent indexConstituent = clause.getConstituents().get(index); 76 | boolean isVerbIndexedConstituent = verbConstituent instanceof IndexedConstituent; 77 | boolean adverbialsContainIndex = clause.getAdverbialInds().contains(index); 78 | if (isVerbIndexedConstituent && adverbialsContainIndex && 79 | indexConstituent.getRoot().index() < verbConstituent.getRoot().index()) 80 | continue; 81 | 82 | if (clause.getIncludedConstitsInds().getBoolean(index)) { 83 | Phrase argument = generate(clause, index, sGraph); 84 | argument.setRoot(clause.getConstituents().get(index).getRoot()); 85 | proposition.addPhrase(new Phrase(argument)); 86 | constTypes.add(clause.getConstituents().get(index).getType()); 87 | } 88 | } 89 | 90 | // Process adverbials before verb 91 | sortedIndexes.clear(); 92 | sortedIndexes.addAll(clause.getAdverbialInds()); 93 | for (Integer index : sortedIndexes) { 94 | Constituent verbConstituent = clause.getConstituents().get(clause.getVerbInd()); 95 | Constituent indexConstituent = clause.getConstituents().get(index); 96 | boolean isVerbPhraseConstituent = verbConstituent instanceof PhraseConstituent; 97 | // If the verb is a PhraseConstituent or the current constituent's root index is greater than the 98 | // verb constituent's root index -> break 99 | if (isVerbPhraseConstituent || (indexConstituent.getRoot().index() > verbConstituent.getRoot().index())) 100 | break; 101 | if (clause.getIncludedConstitsInds().getBoolean(index)) { 102 | Phrase argument = generate(clause, index, sGraph); 103 | argument.setRoot(clause.getConstituents().get(index).getRoot()); 104 | proposition.getPhrases().add(new Phrase(argument)); 105 | constTypes.add(clause.getConstituents().get(index).getType()); 106 | 107 | if (clause.getConstituentStatus(index, clausIE.getOptions()).equals(Status.OPTIONAL)) { 108 | proposition.addOptionalConstituentIndex(proposition.getPhrases().size()); 109 | } 110 | } 111 | } 112 | 113 | // Make triple if specified + push necessary constituents to the relation 114 | if (!clausIE.getOptions().nary) { 115 | proposition.clearOptionalConstituentIndicesSet(); 116 | if (proposition.getPhrases().size() > 3) { 117 | // Push the necessary constituents to the relation 118 | pushConstituentsToRelation(proposition, constTypes); 119 | 120 | // Merge the rest of the n-ary tuple to the 3rd constituent (making it a triple) 121 | Phrase argPhrase = new Phrase(); 122 | argPhrase.setRoot(proposition.getPhrases().get(2).getRoot()); 123 | for (int i = 2; i < proposition.getPhrases().size(); i++) { 124 | argPhrase.addWordsToList(proposition.getPhrases().get(i).getWordList().clone()); 125 | } 126 | proposition.setPhrase(2, argPhrase); 127 | for (int i = proposition.getPhrases().size() - 1; i > 2; i--) { 128 | proposition.getPhrases().remove(i); 129 | } 130 | } 131 | } 132 | 133 | // We are done 134 | clause.addProposition(proposition); 135 | } 136 | 137 | /** 138 | * Given a constituent index i, push it to the relation of the proposition p 139 | * @param p: the proposition 140 | * @param i: push the i-th phrase to the relation of the proposition 141 | */ 142 | private static void pushConstituentToRelation(Proposition p, int i){ 143 | // New relational phrase. 
The root of the relational phrase is the verb by default 144 | Phrase relation = new Phrase(); 145 | relation.setRoot(p.getPhrases().get(1).getRoot()); 146 | 147 | // Push 148 | relation.addWordsToList(p.getPhrases().get(1).getWordList().clone()); 149 | relation.addWordsToList(p.getPhrases().get(i).getWordList().clone()); 150 | p.setRelation(relation); 151 | 152 | // Clean the i-th constituent 153 | p.getPhrases().get(i).getWordList().clear(); 154 | } 155 | 156 | /** 157 | * Given a proposition and a list of constituency types (corresponding to the phrases of the proposition), 158 | * push the constituents to the relation if needed 159 | * @param p: the proposition 160 | * @param types: the list of constituency types 161 | */ 162 | private static void pushConstituentsToRelation(Proposition p, ObjectArrayList<Constituent.Type> types){ 163 | // Push constituents to the relation if the 4th constituent is an adverbial 164 | // (for SVA(A), SVC(A), SVO(A), SVOA) 165 | if (types.get(3) == Constituent.Type.ADVERBIAL){ 166 | // If the adverbial consists of more than one word, don't push the previous constituent 167 | if (p.getPhrases().get(3).getWordList().size() > 1) { 168 | // If CCOMP don't push it 169 | if (types.get(2) == Constituent.Type.CCOMP) { 170 | return; 171 | } 172 | pushConstituentToRelation(p, 2); 173 | } 174 | // If the adverbial consists of one adverb, push the adverb to the relation 175 | else if (p.getPhrases().get(3).getWordList().size() == 1){ 176 | if (CoreNLPUtils.isAdverb(p.getPhrases().get(3).getWordList().get(0).tag())) 177 | pushConstituentToRelation(p, 3); 178 | else 179 | pushConstituentToRelation(p, 2); 180 | } 181 | } 182 | // If the 3rd constituent is an indirect/direct object or an adverbial (for SVOO/SVOC, SVOA) 183 | else if (types.get(2) == Constituent.Type.IOBJ || types.get(2) == Constituent.Type.DOBJ || 184 | types.get(2) == Constituent.Type.ADVERBIAL){ 185 | pushConstituentToRelation(p, 2); 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/proposition/Proposition.java: -------------------------------------------------------------------------------- 1 | 2 | package de.uni_mannheim.clausie.proposition; 3 | 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | import de.uni_mannheim.clausie.phrase.Phrase; 8 | import de.uni_mannheim.constant.SEPARATOR; 9 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 10 | 11 | /** Stores a proposition. 12 | * 13 | * @author Luciano del Corro 14 | * @author Kiril Gashteovski 15 | * 16 | */ 17 | public class Proposition { 18 | 19 | /** Constituents of the proposition */ 20 | private ObjectArrayList<Phrase> phrases = new ObjectArrayList<Phrase>(); 21 | 22 | /** Position of optional constituents */ 23 | private Set<Integer> optional = new HashSet<Integer>(); 24 | 25 | // TODO: types of constituents (e.g., optionality) sentence ID etc. 
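// Phrase layout convention: phrases.get(0) holds the subject, phrases.get(1) the relation, and phrases.get(2) onwards the arguments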
26 | 27 | public Proposition() { 28 | } 29 | 30 | /** 31 | * Removes a word from a constituent 32 | * @param i: the constituent index 33 | * @param j: the word index within the constituent 34 | */ 35 | public void removeWordFromConstituent(int i, int j){ 36 | this.phrases.get(i).removeWordFromList(j); 37 | } 38 | 39 | /** Returns a list of constituents of the proposition */ 40 | public ObjectArrayList getConstituents(){ 41 | return this.phrases; 42 | } 43 | 44 | /** Returns the subject of the proposition */ 45 | public Phrase subject() { 46 | return this.phrases.get(0); 47 | } 48 | 49 | /** Returns the relation of the proposition */ 50 | public Phrase relation() { 51 | return phrases.get(1); 52 | } 53 | 54 | /** Returns the object of the proposition (should be used when working with triples only!) */ 55 | public Phrase object(){ 56 | return phrases.get(2); 57 | } 58 | 59 | /** Sets the relation of the proposition */ 60 | public void setRelation(Phrase rel){ 61 | phrases.set(1, rel); 62 | } 63 | 64 | /** Returns a constituent in a given position */ 65 | public Phrase argument(int i) { 66 | return phrases.get(i + 2); 67 | } 68 | 69 | /** Returns the number of arguments */ 70 | public int noArguments() { 71 | return phrases.size() - 2; 72 | } 73 | 74 | /** Checks if an argument is optional */ 75 | public boolean isOptionalArgument(int i) { 76 | return optional.contains(i + 2); 77 | } 78 | 79 | /** 80 | * Given a proposition, this function turns it into a "sentence" by concatenating the constituents' strings 81 | * @return 82 | */ 83 | public String propositionToString(){ 84 | StringBuffer sb = new StringBuffer(); 85 | for (int i = 0; i < phrases.size(); i++){ 86 | sb.append(phrases.get(i).getWords()); 87 | sb.append(SEPARATOR.SPACE); 88 | } 89 | return sb.toString().trim(); 90 | } 91 | 92 | public ObjectArrayList getPhrases(){ 93 | return this.phrases; 94 | } 95 | public void addPhrase (Phrase p){ 96 | this.phrases.add(p); 97 | } 98 | public void setPhrase(int i, Phrase p){ 99 | this.phrases.set(i, p); 100 | } 101 | /** Get optional constituents' indices **/ 102 | public Set getOptinoalConstituentsIndices(){ 103 | return this.optional; 104 | } 105 | /** Add index of an optional constituent **/ 106 | public void addOptionalConstituentIndex(int i){ 107 | this.optional.add(i); 108 | } 109 | /** Clear the set of optional constituent indices **/ 110 | public void clearOptionalConstituentIndicesSet(){ 111 | this.optional.clear(); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | StringBuffer sb = new StringBuffer(); 117 | String sep = "("; 118 | 119 | for (int i=0; i < phrases.size(); i++) { 120 | String constituent = phrases.get(i).getWords(); 121 | sb.append(sep); 122 | sep = ", "; 123 | sb.append("\""); 124 | sb.append(constituent); 125 | sb.append("\""); 126 | if (optional.contains(i)) { 127 | sb.append("?"); 128 | } 129 | } 130 | sb.append(")"); 131 | return sb.toString(); 132 | } 133 | 134 | @Override 135 | public Proposition clone() { 136 | Proposition clone = new Proposition(); 137 | clone.phrases = new ObjectArrayList(this.phrases.clone()); 138 | clone.optional = new HashSet(this.optional); 139 | return clone; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/clausie/proposition/PropositionGenerator.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.clausie.proposition; 2 | 3 | import java.util.Collection; 4 | import 
java.util.Collections; 5 | import java.util.HashSet; 6 | import java.util.Set; 7 | import java.util.TreeSet; 8 | 9 | import de.uni_mannheim.clausie.ClausIE; 10 | import de.uni_mannheim.clausie.clause.Clause; 11 | import de.uni_mannheim.clausie.constituent.Constituent; 12 | import de.uni_mannheim.clausie.constituent.IndexedConstituent; 13 | import de.uni_mannheim.clausie.constituent.PhraseConstituent; 14 | import de.uni_mannheim.clausie.phrase.Phrase; 15 | import de.uni_mannheim.utils.coreNLP.DpUtils; 16 | import edu.stanford.nlp.ling.IndexedWord; 17 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 18 | import edu.stanford.nlp.trees.GrammaticalRelation; 19 | import edu.stanford.nlp.semgraph.SemanticGraph; 20 | 21 | /** 22 | * Handles the generation of propositions out of a given clause 23 | * 24 | * @author Luciano del Corro 25 | * @author Kiril Gashteovski 26 | * 27 | */ 28 | public abstract class PropositionGenerator { 29 | 30 | ClausIE clausIE; 31 | 32 | /** Relations to be excluded in every constituent of a clause except the verb */ 33 | protected static final Set<GrammaticalRelation> EXCLUDE_RELATIONS; 34 | 35 | /** Relations to be excluded in the verb */ 36 | protected static final Set<GrammaticalRelation> EXCLUDE_RELATIONS_VERB; 37 | 38 | static { 39 | EXCLUDE_RELATIONS = new HashSet<GrammaticalRelation>(); 40 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER); 41 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.APPOSITIONAL_MODIFIER); 42 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.PARATAXIS); 43 | EXCLUDE_RELATIONS.add(EnglishGrammaticalRelations.valueOf("dep")); 44 | 45 | EXCLUDE_RELATIONS_VERB = new HashSet<GrammaticalRelation>(); 46 | EXCLUDE_RELATIONS_VERB.addAll(EXCLUDE_RELATIONS); 47 | EXCLUDE_RELATIONS_VERB.add(EnglishGrammaticalRelations.valueOf("dep")); // without this, some adverbs or auxiliaries will end up in the relation 48 | } 49 | 50 | /** Constructs a proposition generator*/ 51 | public PropositionGenerator(ClausIE clausIE) { 52 | this.clausIE = clausIE; 53 | } 54 | 55 | /** Generates propositions for a given clause*/ 56 | public abstract void generate(Clause clause, SemanticGraph sGraph); 57 | 58 | /** Generates a textual representation of a given constituent plus a set of words*/ 59 | private Phrase generatePhrase(IndexedConstituent constituent, Collection<IndexedWord> words, SemanticGraph sGraph) { 60 | Phrase phrase = new Phrase(); 61 | 62 | if (constituent.isPrepositionalPhrase(sGraph)) { 63 | // TODO: before, it was: constituent.getRoot().originalText(). For some reason, in the case for 64 | // "in Los Angeles", the word "in" returns empty string for originalText(), and the actual word for word(). 
65 | // Check if this compromises the code in some way 66 | // TODO: see if you could find a faster way to make this check (not to go through the list of all excluded 67 | // words, for instance: use a flag as an input parameter) 68 | if (!constituent.excludedVertexes.contains(constituent.getRoot())){ 69 | phrase.addWordToList(constituent.getRoot()); 70 | } 71 | } 72 | 73 | for (IndexedWord word : words) { 74 | if (DpUtils.filterTokens(word)) 75 | continue; 76 | phrase.addWordToList(word); 77 | } 78 | 79 | return phrase; 80 | } 81 | 82 | /** Generates a textual representation of a given constituent in a given clause*/ 83 | public Phrase generate(Clause clause, int constituentIndex, SemanticGraph sGraph) { 84 | Set excludeRelations = EXCLUDE_RELATIONS; 85 | if (clause.getVerbInd() == constituentIndex) { 86 | excludeRelations = EXCLUDE_RELATIONS_VERB; 87 | } 88 | 89 | return generate(clause, constituentIndex, excludeRelations, Collections. emptySet(), sGraph); 90 | } 91 | 92 | /** Generates a textual representation of a given constituent in a given clause **/ 93 | public Phrase generate(Clause clause, int constituentIndex, Collection excludeRelations, 94 | Collection excludeRelationsTop, SemanticGraph sGraph) { 95 | 96 | Constituent constituent = clause.getConstituents().get(constituentIndex); 97 | 98 | if (constituent instanceof PhraseConstituent) { 99 | PhraseConstituent tConstituent = ((PhraseConstituent) constituent); 100 | return tConstituent.getPhrase(); 101 | } else if (constituent instanceof IndexedConstituent) { 102 | IndexedConstituent iconstituent = (IndexedConstituent) constituent; 103 | SemanticGraph subgraph = iconstituent.createReducedSemanticGraph(); 104 | DpUtils.removeEdges(subgraph, iconstituent.getRoot(), excludeRelations, excludeRelationsTop); 105 | Set words = new TreeSet(subgraph.descendants(iconstituent.getRoot())); 106 | 107 | for (IndexedWord v : iconstituent.getAdditionalVertexes()) { 108 | words.addAll(subgraph.descendants(v)); 109 | } 110 | if (iconstituent.isPrepositionalPhrase(sGraph)) 111 | words.remove(iconstituent.getRoot()); 112 | 113 | Phrase phrase = generatePhrase(iconstituent, words, sGraph); 114 | return phrase; 115 | } else { 116 | throw new IllegalArgumentException(); 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/CHARACTER.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class CHARACTER { 7 | public static String VBAR = "|"; 8 | public static String QMARK = "?"; 9 | public static String ASTERISK = "*"; 10 | public static String PLUS = "+"; 11 | public static String LBRACE = "{"; 12 | public static String RBRACE = "}"; 13 | public static String LBRACKET = "["; 14 | public static String RBRACKET = "]"; 15 | public static String LPARENTHESIS = "("; 16 | public static String RPARENTHESIS = ")"; 17 | public static String CARET = "^"; 18 | public static String EQUAL = "="; 19 | public static String DOT = "."; 20 | public static String COMMA = ","; 21 | public static String TAB = "\t"; 22 | public static String NEW_LINE = "\n"; 23 | public static String CRETURN = "\r"; 24 | public static String SPACE = " "; 25 | public static String MINUS = "-"; 26 | public static String QUOTATION_MARK = "\""; 27 | public static String SEMI_COLON = ";"; 28 | public static String COLON = ":"; 29 | public static String UNDERSCORE = "_"; 
30 | public static String EMPTY_STRING = ""; 31 | public static String LESS = "<"; 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/CLAUSE_TYPE.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class CLAUSE_TYPE { 7 | public static final String ST_SV = "SV"; 8 | public static final String ST_SVA = "SVA"; 9 | public static final String ST_SVO = "SVO"; 10 | public static final String ST_SVC = "SVC"; 11 | public static final String ST_SVOA = "SVOA"; 12 | public static final String ST_SVOO = "SVOO"; 13 | public static final String ST_SVOC = "SVOC"; 14 | public static final String ST_EXISTENTIAL = "EXISTENTIAL"; 15 | public static final String ST_UNKNOWN = "UNKNOWN"; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/NE_TYPE.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class NE_TYPE { 7 | public static final String MISC = "MISC"; 8 | public static final String PERSON = "PERSON"; 9 | public static final String LOCATION = "LOCATION"; 10 | public static final String ORGANIZATION = "ORGANIZATION"; 11 | public static final String DATE = "DATE"; 12 | public static final String DURATION = "DURATION"; 13 | public static final String TIME = "TIME"; 14 | public static final String MONEY = "MONEY"; 15 | public static final String NUMBER = "NUMBER"; 16 | public static final String ORDINAL = "ORDINAL"; 17 | public static final String NO_NER = "O"; 18 | public static final String ENTITY = "ENTITY"; 19 | public static final String SET = "SET"; 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/POS_TAG.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class POS_TAG { 7 | public static final String CD = "CD"; 8 | public static final String DT = "DT"; 9 | public static final String VB = "VB"; 10 | public static final String VBD = "VBD"; 11 | public static final String VBG = "VBG"; 12 | public static final String VBN = "VBN"; 13 | public static final String VBP = "VBP"; 14 | public static final String VBZ = "VBZ"; 15 | public static final String MD = "MD"; 16 | public static final String NN = "NN"; 17 | public static final String NNS = "NNS"; 18 | public static final String NNP = "NNP"; 19 | public static final String NNPS = "NNPS"; 20 | public static final String JJ = "JJ"; 21 | public static final String JJR = "JJR"; 22 | public static final String JJS = "JJS"; 23 | public static final String RB = "RB"; 24 | public static final String RBR = "RBR"; 25 | public static final String RBS = "RBS"; 26 | public static final String RP = "RP"; 27 | public static final String PR = "PR"; // used for both PRP and PRP$ 28 | public static final String PRP = "PRP"; 29 | public static final String PRP_P = "PRP$"; 30 | public static final String WP = "WP"; 31 | public static final String WP_P = "WP$"; 32 | public static final String WDT = "WDT"; 33 | public static final String WRB = "WRB"; 34 | public static final String POS = "POS"; 35 | public static final String SYM = "SYM"; 
36 | public static final String IN = "IN"; 37 | public static final String TO = "TO"; 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/SEPARATOR.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | /** 4 | * @author Kiril Gashteovski 5 | */ 6 | public class SEPARATOR { 7 | public static final String SPACE = " "; 8 | public static final String TAB = "\t"; 9 | public static final String COMMA = ","; 10 | public static final String MINUS = "-"; 11 | public static final String NEW_LINE = "\n"; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/constant/WORDS.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.constant; 2 | 3 | import java.io.IOException; 4 | 5 | import de.uni_mannheim.utils.Dictionary; 6 | 7 | /** 8 | * @author Kiril Gashteovski 9 | */ 10 | public class WORDS { 11 | // A set of non-subsective modal adjectives 12 | public static Dictionary NON_SUBSECTIVE_JJ_MODAL; 13 | static { 14 | try { 15 | NON_SUBSECTIVE_JJ_MODAL = new Dictionary("/minie-resources/non-subsective-adjectives-modal.dict"); 16 | } catch (IOException e) { 17 | throw new Error(e); 18 | } 19 | } 20 | 21 | // A set of non-subsective cf. adjectives 22 | public static Dictionary NON_SUBSECTIVE_JJ_CF; 23 | static { 24 | try { 25 | NON_SUBSECTIVE_JJ_CF = new Dictionary("/minie-resources/non-subsective-adjectives-cf.dict"); 26 | } catch (IOException e) { 27 | throw new Error(e); 28 | } 29 | } 30 | 31 | // A set of non-subsective temp. adjectives 32 | public static Dictionary NON_SUBSECTIVE_JJ_TEMP; 33 | static { 34 | try { 35 | NON_SUBSECTIVE_JJ_TEMP = new Dictionary("/minie-resources/non-subsective-adjectives-temp.dict"); 36 | } catch (IOException e) { 37 | throw new Error(e); 38 | } 39 | } 40 | 41 | public static String word = "word"; 42 | public static String idx = "idx"; 43 | public static String factuality = "Factuality"; 44 | public static String attribution = "Attribution"; 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/annotation/Attribution.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.annotation; 2 | 3 | import de.uni_mannheim.constant.CHARACTER; 4 | import de.uni_mannheim.constant.SEPARATOR; 5 | 6 | /** 7 | * A class representing the attribution 8 | * @param attributionPhrase: a phrase containing the words for the attribution 9 | * @param modality: the modality of the attribution (possibility or certainty) 10 | * @param polarity: the polarity of the attribution (positive or negative) 11 | * @param predicateVerb: the predicate verb (as a string in its lemma version) 12 | * 13 | * @author Kiril Gashteovski 14 | */ 15 | 16 | public class Attribution { 17 | private AnnotatedPhrase attributionPhrase; 18 | private Modality.Type modality; 19 | private Polarity.Type polarity; 20 | private String predicateVerb; 21 | 22 | 23 | /** Some string constants necessary for detecting the attribution **/ 24 | public static String ACCORDING = "according"; 25 | 26 | /** Default constructor: modality == certainty, polarity == positive, attributionPhrase == null */ 27 | public Attribution(){ 28 | this.attributionPhrase = null; 29 | this.modality = Modality.Type.CERTAINTY; 30 | 
this.polarity = Polarity.Type.POSITIVE; 31 | this.predicateVerb = CHARACTER.EMPTY_STRING; 32 | } 33 | 34 | /** Constructor with a given attribution phrase. The modality and polarity are by default 'certainty' and 'positive' 35 | * respectively 36 | * 37 | * @param: attributionPhrase: the attribution phrase 38 | * @param: pVerb: the predicate verb (a string) 39 | */ 40 | public Attribution(AnnotatedPhrase attributionPhrase, String pVerb){ 41 | this.attributionPhrase = attributionPhrase; 42 | this.modality = Modality.Type.CERTAINTY; 43 | this.polarity = Polarity.Type.POSITIVE; 44 | this.predicateVerb = pVerb; 45 | } 46 | 47 | /** 48 | * Fully parameterized constructor 49 | * @param attributionPhrase: the attribution phrase 50 | * @param pol: polarity type 51 | * @param mod: modality type 52 | * @param pVerb: predicate verb 53 | */ 54 | public Attribution(AnnotatedPhrase attributionPhrase, Polarity.Type pol, Modality.Type mod, String pVerb){ 55 | this.attributionPhrase = attributionPhrase; 56 | this.modality = mod; 57 | this.polarity = pol; 58 | this.predicateVerb = pVerb; 59 | } 60 | /** Copy constructor **/ 61 | public Attribution(Attribution s){ 62 | this.attributionPhrase = s.getAttributionPhrase(); 63 | this.modality = s.getModalityType(); 64 | this.polarity = s.getPolarityType(); 65 | this.predicateVerb = s.getPredicateVerb(); 66 | } 67 | 68 | // Getters 69 | public AnnotatedPhrase getAttributionPhrase(){ 70 | return this.attributionPhrase; 71 | } 72 | public Modality.Type getModalityType(){ 73 | return this.modality; 74 | } 75 | public Polarity.Type getPolarityType(){ 76 | return this.polarity; 77 | } 78 | public String getPredicateVerb(){ 79 | return this.predicateVerb; 80 | } 81 | 82 | // Setters 83 | public void setAttributionPhrase(AnnotatedPhrase s){ 84 | this.attributionPhrase = s; 85 | } 86 | public void setModalityType(Modality.Type t){ 87 | this.modality = t; 88 | } 89 | public void setPolarityType(Polarity.Type t){ 90 | this.polarity = t; 91 | } 92 | public void setPredicateVerb(String pVerb){ 93 | this.predicateVerb = pVerb; 94 | } 95 | 96 | // Clear the attribution 97 | public void clear(){ 98 | this.attributionPhrase = null; 99 | this.modality = Modality.Type.CERTAINTY; 100 | this.polarity = Polarity.Type.POSITIVE; 101 | this.predicateVerb = CHARACTER.EMPTY_STRING; 102 | } 103 | 104 | // Write down the attribution in the format (attribution_phrase, predicate, polarity, modality) 105 | @Override 106 | public String toString(){ 107 | StringBuffer sb = new StringBuffer(); 108 | sb.append(CHARACTER.LPARENTHESIS); 109 | 110 | // Append the attribution phrase 111 | for (int i = 0; i < this.attributionPhrase.getWordList().size(); i++) { 112 | sb.append(this.attributionPhrase.getWordList().get(i).word()); 113 | if (i < this.attributionPhrase.getWordList().size() - 1) 114 | sb.append(SEPARATOR.SPACE); 115 | } 116 | 117 | sb.append(SEPARATOR.COMMA); 118 | sb.append(SEPARATOR.SPACE); 119 | 120 | // Append the predicate verb 121 | sb.append("Predicate: "); 122 | sb.append(this.predicateVerb); 123 | sb.append(SEPARATOR.COMMA); 124 | sb.append(SEPARATOR.SPACE); 125 | 126 | // Append the polarity 127 | sb.append("POLARITY: "); 128 | if (this.polarity == Polarity.Type.POSITIVE) 129 | sb.append(Polarity.ST_POSITIVE); 130 | else 131 | sb.append(Polarity.ST_NEGATIVE); 132 | sb.append(SEPARATOR.SPACE); 133 | sb.append(SEPARATOR.COMMA); 134 | sb.append(SEPARATOR.SPACE); 135 | 136 | // Append the modality 137 | sb.append("MODALITY: "); 138 | if (this.modality == Modality.Type.CERTAINTY) 139 | 
sb.append(Modality.ST_CERTAINTY); 140 | else 141 | sb.append(Modality.ST_POSSIBILITY); 142 | sb.append(CHARACTER.RPARENTHESIS); 143 | sb.append(SEPARATOR.SPACE); 144 | sb.append(SEPARATOR.COMMA); 145 | sb.append(SEPARATOR.SPACE); 146 | 147 | sb.append("POLARITY: "); 148 | if (this.polarity == Polarity.Type.POSITIVE) 149 | sb.append(Polarity.ST_POSITIVE); 150 | else 151 | sb.append(Polarity.ST_NEGATIVE); 152 | 153 | return sb.toString().trim(); 154 | } 155 | 156 | /** Return the attribution as a string in format "(Attribution Phrase, (POLARITY, MODALITY))" **/ 157 | public String toStringCompact() { 158 | StringBuffer sb = new StringBuffer(); 159 | sb.append(CHARACTER.LPARENTHESIS); 160 | 161 | // Append the attribution phrase 162 | for (int i = 0; i < this.attributionPhrase.getWordList().size(); i++) { 163 | sb.append(this.attributionPhrase.getWordList().get(i).word()); 164 | if (i < this.attributionPhrase.getWordList().size() - 1) 165 | sb.append(SEPARATOR.SPACE); 166 | } 167 | 168 | sb.append(SEPARATOR.COMMA); 169 | sb.append(SEPARATOR.SPACE); 170 | 171 | // Append the factuality 172 | sb.append(CHARACTER.LPARENTHESIS); 173 | if (this.polarity == Polarity.Type.POSITIVE) 174 | sb.append(Polarity.ST_PLUS); 175 | else 176 | sb.append(Polarity.ST_MINUS); 177 | sb.append(SEPARATOR.COMMA); 178 | if (this.modality == Modality.Type.CERTAINTY) 179 | sb.append(Modality.ST_CT); 180 | else 181 | sb.append(Modality.ST_PS); 182 | sb.append(CHARACTER.RPARENTHESIS); 183 | sb.append(CHARACTER.RPARENTHESIS); 184 | 185 | return sb.toString(); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/annotation/Polarity.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.annotation; 2 | 3 | import edu.stanford.nlp.ling.IndexedWord; 4 | import edu.stanford.nlp.semgraph.SemanticGraph; 5 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 6 | 7 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 8 | 9 | import java.io.IOException; 10 | 11 | import de.uni_mannheim.constant.CHARACTER; 12 | import de.uni_mannheim.constant.POS_TAG; 13 | import de.uni_mannheim.constant.SEPARATOR; 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | import de.uni_mannheim.utils.Dictionary; 16 | 17 | /** 18 | * Annotation for polarity 19 | * 20 | * @author Kiril Gashteovski 21 | */ 22 | public class Polarity { 23 | /** Annotations for polarity, can be just "POSITIVE" or "NEGATIVE" */ 24 | public static enum Type {POSITIVE, NEGATIVE}; 25 | 26 | /** Static strings for polarity **/ 27 | public static String ST_POSITIVE = "POSITIVE"; 28 | public static String ST_PLUS = "+"; 29 | public static String ST_NEGATIVE = "NEGATIVE"; 30 | public static String ST_MINUS = "-"; 31 | 32 | /** List of negative words and edges (if any found) **/ 33 | private ObjectArrayList<IndexedWord> negativeWords; 34 | private ObjectArrayList<SemanticGraphEdge> negativeEdges; 35 | 36 | /** Polarity type **/ 37 | private Polarity.Type polarityType; 38 | 39 | /** A set of all negative words **/ 40 | public static Dictionary NEG_WORDS; 41 | static { 42 | try { 43 | NEG_WORDS = new Dictionary("/minie-resources/neg-words.dict"); 44 | } catch (IOException e) { 45 | throw new Error(e); 46 | } 47 | } 48 | 49 | /** A set of negative adverbs **/ 50 | public static Dictionary NEG_ADVERBS; 51 | static { 52 | try { 53 | NEG_ADVERBS = new Dictionary("/minie-resources/neg-adverbs.dict"); 54 | } catch (IOException e) { 55 | throw new Error(e); 56 | }
57 | } 58 | 59 | /** Set of negative determiners **/ 60 | public static Dictionary NEG_DETERMINERS; 61 | static { 62 | try { 63 | NEG_DETERMINERS = new Dictionary("/minie-resources/neg-determiners.dict"); 64 | } catch (IOException e) { 65 | throw new Error(e); 66 | } 67 | } 68 | 69 | /** Default constructor. Assumes positive polarity type by default **/ 70 | public Polarity(){ 71 | this.polarityType = Type.POSITIVE; 72 | this.negativeEdges = new ObjectArrayList(); 73 | this.negativeWords = new ObjectArrayList(); 74 | } 75 | 76 | /** 77 | * Constructor given the polarity type. Creates empty lists for negative words and edges 78 | * @param t: polarity type 79 | */ 80 | public Polarity(Polarity.Type t){ 81 | this.polarityType = t; 82 | this.negativeEdges = new ObjectArrayList(); 83 | this.negativeWords = new ObjectArrayList(); 84 | } 85 | /** 86 | * Copy constructor 87 | * @param p: polarity object 88 | */ 89 | public Polarity(Polarity p){ 90 | this.polarityType = p.getType(); 91 | this.negativeEdges = p.getNegativeEdges(); 92 | this.negativeWords = p.getNegativeWords(); 93 | } 94 | /** 95 | * Parametric constructor, given the polarity types, negative words, negative edges 96 | * @param t: polarity type 97 | * @param negWords: list of negative words 98 | * @param negEdges: list of negative edges 99 | */ 100 | public Polarity(Polarity.Type t, ObjectArrayList negWords, ObjectArrayList negEdges){ 101 | this.polarityType = t; 102 | this.negativeEdges = negEdges; 103 | this.negativeWords = negWords; 104 | } 105 | 106 | /** Getters **/ 107 | public Polarity.Type getType(){ 108 | return this.polarityType; 109 | } 110 | public ObjectArrayList getNegativeWords(){ 111 | return this.negativeWords; 112 | } 113 | public ObjectArrayList getNegativeEdges(){ 114 | return this.negativeEdges; 115 | } 116 | 117 | /** Setters **/ 118 | public void setType(Polarity.Type t){ 119 | this.polarityType = t; 120 | } 121 | public void setNegativeWords(ObjectArrayList negWords){ 122 | this.negativeWords = negWords; 123 | } 124 | public void setNegativeEdges(ObjectArrayList negEdges){ 125 | this.negativeEdges = negEdges; 126 | } 127 | 128 | /** Adding elements to lists **/ 129 | public void addNegativeEdge(SemanticGraphEdge e){ 130 | this.negativeEdges.add(e); 131 | } 132 | public void addNegativeWord(IndexedWord w){ 133 | this.negativeWords.add(w); 134 | } 135 | 136 | /** Clear the polarity object, i.e. set its default values (type = positive, neg. words and edges are empty lists) */ 137 | public void clear(){ 138 | this.polarityType = Type.POSITIVE; 139 | this.negativeEdges = new ObjectArrayList(); 140 | this.negativeWords = new ObjectArrayList(); 141 | } 142 | 143 | /** 144 | * Given a phrase and its sentence semantic graph, detect the polarity type. If negative polarity is found, add the 145 | * negative words and edges to their appropriate lists from the Polarity class. 
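* Only negative adverbs (from NEG_ADVERBS) and negative determiners (from NEG_DETERMINERS) are checked here.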
146 | * 147 | * @param phrase: phrase (essentially, list of words, which are part of some sentence) 148 | * @param sentenceSemGraph: the semantic graph of the phrase's sentence 149 | * @return polarity object 150 | */ 151 | public static Polarity getPolarity(AnnotatedPhrase phrase, SemanticGraph sentenceSemGraph){ 152 | Polarity pol = new Polarity(); 153 | 154 | for (int i = 0; i < phrase.getWordList().size(); i++){ 155 | // Check for negative adverbs 156 | if (CoreNLPUtils.isAdverb(phrase.getWordList().get(i).tag())){ 157 | if (Polarity.NEG_ADVERBS.contains(phrase.getWordList().get(i).lemma())){ 158 | Polarity.setNegPol(pol, phrase.getWordList().get(i), sentenceSemGraph.getEdge( 159 | sentenceSemGraph.getParent(phrase.getWordList().get(i)), 160 | phrase.getWordList().get(i))); 161 | } 162 | } 163 | // Check for negative determiners 164 | else if (phrase.getWordList().get(i).tag().equals(POS_TAG.DT)){ 165 | if (Polarity.NEG_DETERMINERS.contains(phrase.getWordList().get(i).lemma())){ 166 | Polarity.setNegPol(pol, phrase.getWordList().get(i), sentenceSemGraph.getEdge( 167 | sentenceSemGraph.getParent(phrase.getWordList().get(i)), 168 | phrase.getWordList().get(i))); 169 | } 170 | } 171 | } 172 | 173 | return pol; 174 | } 175 | 176 | /** 177 | * Given a polarity object, negative word and a negative edge, set the polarity type to "negative" and add the 178 | * negative words and edges to their appropriate lists 179 | * 180 | * @param pol: polarity object 181 | * @param negWord: negative word 182 | * @param negEdge: negative edge 183 | */ 184 | private static void setNegPol(Polarity pol, IndexedWord negWord, SemanticGraphEdge negEdge){ 185 | pol.setType(Polarity.Type.NEGATIVE); 186 | pol.addNegativeWord(negWord); 187 | pol.addNegativeEdge(negEdge); 188 | } 189 | 190 | /** Given a polarity object, convert it into a string */ 191 | @Override 192 | public String toString(){ 193 | StringBuffer sb = new StringBuffer(); 194 | sb.append(CHARACTER.LPARENTHESIS); 195 | if (this.polarityType == Polarity.Type.POSITIVE) 196 | sb.append(CHARACTER.PLUS); 197 | else { 198 | sb.append(CHARACTER.MINUS); 199 | sb.append(CHARACTER.COMMA); 200 | sb.append(SEPARATOR.SPACE); 201 | for (SemanticGraphEdge edge: this.negativeEdges){ 202 | sb.append(edge.toString()); 203 | sb.append(CHARACTER.COMMA); 204 | sb.append(SEPARATOR.SPACE); 205 | } 206 | } 207 | 208 | sb.append(CHARACTER.RPARENTHESIS); 209 | return sb.toString().trim(); 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/annotation/Quantity.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.annotation; 2 | 3 | import java.io.IOException; 4 | 5 | import de.uni_mannheim.constant.CHARACTER; 6 | import de.uni_mannheim.utils.Dictionary; 7 | import edu.stanford.nlp.ling.IndexedWord; 8 | import edu.stanford.nlp.semgraph.SemanticGraphEdge; 9 | 10 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 11 | 12 | /** 13 | * Annotation for quantity 14 | * 15 | * @author Kiril Gashteovski 16 | */ 17 | public class Quantity { 18 | /** The quantity words */ 19 | private ObjectArrayList qWords; 20 | /** The quantity edges **/ 21 | private ObjectArrayList qEdges; 22 | /** The quantity ID */ 23 | private String id; 24 | 25 | /** A set of quantity determiners **/ 26 | public static Dictionary DT_QUANTITIES; 27 | static { 28 | try { 29 | DT_QUANTITIES = new Dictionary("/minie-resources/quantities-determiners.dict"); 30 | } 
catch (IOException e) { 31 | throw new Error(e); 32 | } 33 | } 34 | 35 | /** A set of quantity adjectives **/ 36 | public static Dictionary JJ_QUANTITIES; 37 | static { 38 | try { 39 | JJ_QUANTITIES = new Dictionary("/minie-resources/quantities-adjectives.dict"); 40 | } catch (IOException e) { 41 | throw new Error(e); 42 | } 43 | } 44 | 45 | /** Static strings used for quantities **/ 46 | public static String ST_QUANTITY = "QUANTITY"; 47 | public static String ST_QUANT = "QUANT"; 48 | 49 | /** Strings used for IDs for quantities **/ 50 | public static String SUBJECT_ID = "S"; 51 | public static String RELATION_ID = "R"; 52 | public static String OBJECT_ID = "O"; 53 | 54 | /** Default constructor **/ 55 | public Quantity() { 56 | this.qWords = new ObjectArrayList(); 57 | this.qEdges = new ObjectArrayList(); 58 | this.id = CHARACTER.EMPTY_STRING; 59 | } 60 | /** Copy constructor **/ 61 | public Quantity(Quantity q){ 62 | this.qWords = q.getQuantityWords(); 63 | this.qEdges = q.getQuantityEdges(); 64 | this.id = q.getId(); 65 | } 66 | /** 67 | * Given a list of indexed words and a list of semantic graph edges, create a quantity object which will have 68 | * qWords as quantity words and qEdges as quantity edges (no ID = empty string) 69 | * @param qWords: quantity words 70 | * @param qEdges: quantity edges 71 | */ 72 | public Quantity(ObjectArrayList qWords, ObjectArrayList qEdges){ 73 | this.qWords = qWords.clone(); 74 | this.qEdges = qEdges.clone(); 75 | this.id = CHARACTER.EMPTY_STRING; 76 | } 77 | /** 78 | * Given a list of indexed words, a list of semantic graph edges and an ID, create a quantity object which will have 79 | * qWords as quantity words and qEdges as quantity edges and ID as an id 80 | * @param qWords: quantity words 81 | * @param qEdges: quantity edges 82 | * @param id: the ID of the quantity 83 | */ 84 | public Quantity(ObjectArrayList qWords, ObjectArrayList qEdges, String id){ 85 | this.qWords = qWords.clone(); 86 | this.qEdges = qEdges.clone(); 87 | this.id = id; 88 | } 89 | 90 | /** Get the quantity words **/ 91 | public ObjectArrayList getQuantityWords(){ 92 | return this.qWords; 93 | } 94 | /** Get the quantity edges **/ 95 | public ObjectArrayList getQuantityEdges(){ 96 | return this.qEdges; 97 | } 98 | /** Get the quantity ID **/ 99 | public String getId(){ 100 | return this.id; 101 | } 102 | 103 | /** Set the quantity words **/ 104 | public void setWords(ObjectArrayList words){ 105 | this.qWords = words; 106 | } 107 | /** Set the quantity edges **/ 108 | public void setEdges(ObjectArrayList edges){ 109 | this.qEdges = edges; 110 | } 111 | /** Set the quantity ID **/ 112 | public void setId(String id){ 113 | this.id = id; 114 | } 115 | 116 | /** Add word to the word list of quantities **/ 117 | public void addWord(IndexedWord w) { 118 | this.qWords.add(w); 119 | } 120 | 121 | /** Given a quantity object, convert it into a string */ 122 | @Override 123 | public String toString(){ 124 | StringBuffer sb = new StringBuffer(); 125 | 126 | // Write the words in the format (WORD_1 WORD_2 ... 
WORD_n) 127 | //sb.append(CHARACTER.LPARENTHESIS); 128 | //sb.append(Quantity.ST_QUANT); 129 | //sb.append(CHARACTER.UNDERSCORE); 130 | sb.append(this.id); 131 | sb.append(CHARACTER.EQUAL); 132 | for (int i = 0; i < this.qWords.size(); i++){ 133 | sb.append(this.qWords.get(i).word()); 134 | if (i < this.qWords.size() - 1) 135 | sb.append(CHARACTER.SPACE); 136 | } 137 | //sb.append(CHARACTER.RPARENTHESIS); 138 | 139 | return sb.toString().trim(); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/main/Extractor.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.main; 2 | 3 | import de.uni_mannheim.clausie.ClausIE; 4 | import de.uni_mannheim.minie.MinIE; 5 | import de.uni_mannheim.utils.Dictionary; 6 | import de.uni_mannheim.utils.minie.Utils; 7 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 8 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * This class acts as a generic interface to the MinIE system 14 | * 15 | * @author Martin Achenbach 16 | * @author Kiril Gashteovski 17 | */ 18 | public class Extractor { 19 | private StanfordCoreNLP parser; 20 | private ClausIE clausIE; 21 | private MinIE minIE; 22 | private Dictionary dictionary; 23 | 24 | /** 25 | * default constructor 26 | */ 27 | public Extractor() { 28 | // initialize the parser 29 | this.parser = CoreNLPUtils.StanfordDepNNParser(); 30 | 31 | // initialize ClausIE 32 | this.clausIE = new ClausIE(); 33 | 34 | // initialize MinIE 35 | this.minIE = new MinIE(); 36 | 37 | // set up default dictionary 38 | try { 39 | this.setDictionary(new Dictionary(Utils.DEFAULT_DICTIONARIES)); 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | 45 | /** 46 | * constructor with dictionary, helpful when running in dictionary mode 47 | * @param dictionary: dictionary 48 | */ 49 | public Extractor(Dictionary dictionary) { 50 | // initialize the parser 51 | this.parser = CoreNLPUtils.StanfordDepNNParser(); 52 | 53 | // initialize ClausIE 54 | this.clausIE = new ClausIE(); 55 | 56 | // initialize MinIE 57 | this.minIE = new MinIE(); 58 | 59 | // set dictionary 60 | this.setDictionary(dictionary); 61 | } 62 | 63 | /** 64 | * set the dictionary for dictionary mode 65 | * @param dictionary: dictionary to use 66 | */ 67 | public void setDictionary(Dictionary dictionary) { 68 | this.dictionary = dictionary; 69 | } 70 | 71 | /** 72 | * analyze a sentence using a specific mode 73 | * @param sentence: sentence to analyze 74 | * @param mode: MinIE mode 75 | * @return the results of MinIE 76 | */ 77 | public MinIE analyzeSentence(String sentence, MinIE.Mode mode) { 78 | // first reset objects 79 | this.clausIE.clear(); 80 | this.minIE.clear(); 81 | 82 | // parse the sentence 83 | this.clausIE.setSemanticGraph(CoreNLPUtils.parse(this.parser, sentence)); 84 | // detect clauses 85 | this.clausIE.detectClauses(); 86 | // generate propositions 87 | this.clausIE.generatePropositions(this.clausIE.getSemanticGraph()); 88 | 89 | // start minimizing 90 | this.minIE.setSemanticGraph(this.clausIE.getSemanticGraph()); 91 | this.minIE.setPropositions(this.clausIE); 92 | this.minIE.setPolarity(); 93 | this.minIE.setModality(); 94 | 95 | // minimize in given mode 96 | switch (mode) { 97 | case AGGRESSIVE: 98 | this.minIE.minimizeAggressiveMode(); 99 | break; 100 | case DICTIONARY: 101 | 
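// In dictionary mode, phrases whose lemmatized, lowercased form appears in the provided collocation dictionary are kept intact, while all others are pruned (see the Subj/Rel/ObjDictionaryMinimization classes below)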
this.minIE.minimizeDictionaryMode(this.dictionary.words()); 102 | break; 103 | case SAFE: 104 | this.minIE.minimizeSafeMode(); 105 | break; 106 | case COMPLETE: 107 | break; 108 | } 109 | // remove duplicates 110 | this.minIE.removeDuplicates(); 111 | return this.minIE; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/main/Main.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.main; 2 | 3 | 4 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 5 | 6 | import joptsimple.OptionException; 7 | import joptsimple.OptionParser; 8 | import joptsimple.OptionSet; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.DataInputStream; 12 | import java.io.FileInputStream; 13 | import java.io.FileNotFoundException; 14 | import java.io.FileOutputStream; 15 | import java.io.IOException; 16 | import java.io.InputStream; 17 | import java.io.InputStreamReader; 18 | import java.io.OutputStream; 19 | import java.io.PrintStream; 20 | 21 | import java.util.logging.Logger; 22 | 23 | import de.uni_mannheim.utils.Dictionary; 24 | import de.uni_mannheim.minie.MinIE; 25 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 26 | import de.uni_mannheim.utils.minie.Utils; 27 | 28 | /** 29 | * Main class that acts as a console interface to the MinIE system 30 | * 31 | * @author Martin Achenbach 32 | * @author Kiril Gashteovski 33 | */ 34 | public class Main { 35 | /** used MinIE mode **/ 36 | private static MinIE.Mode mode; 37 | 38 | /** console logger **/ 39 | private final static Logger logger = Logger.getLogger(String.valueOf(Main.class)); 40 | 41 | /** 42 | * main function to call from console with available options 43 | * @param args: console arguments 44 | * @throws IOException 45 | */ 46 | public static void main(String[] args) throws IOException { 47 | // init the optionParser 48 | OptionParser optionParser = initOptionParser(); 49 | OptionSet options; 50 | // parse options 51 | try { 52 | options = optionParser.parse(args); 53 | } catch (OptionException e) { 54 | System.err.println(e.getMessage()); 55 | System.out.println(""); 56 | optionParser.printHelpOn(System.out); 57 | return; 58 | } 59 | 60 | //print help if need, if yes, break 61 | if (options.has("h")) { 62 | optionParser.printHelpOn(System.out); 63 | return; 64 | } 65 | 66 | // setup input and output 67 | logger.info("Setting up input and output streams..."); 68 | InputStream in = getInputStream(options); 69 | OutputStream out = getOutputStream(options); 70 | BufferedReader din = new BufferedReader(new InputStreamReader(in)); 71 | PrintStream dout = new PrintStream(out, true, "UTF-8"); 72 | 73 | // get mode 74 | mode = Utils.getMode((String) options.valueOf("m")); 75 | logger.info("Mode set to " + mode); 76 | 77 | // initialize extractor 78 | Extractor extractor; 79 | 80 | 81 | if (mode == MinIE.Mode.DICTIONARY) { 82 | // load multi-word dictionaries if in dictionary mode 83 | Dictionary collocationDictionary = Utils.loadDictionary(options); 84 | extractor = new Extractor(collocationDictionary); 85 | } else { 86 | // if not use default constructor 87 | extractor = new Extractor(); 88 | } 89 | logger.info("\n\nSetup finished, ready to take input sentence:"); 90 | 91 | // start analyzing 92 | long start = System.currentTimeMillis(); 93 | String line; 94 | int counter = 0; 95 | while ((line = din.readLine()) != null) { 96 | // skip empty lines 97 | if (line.isEmpty()) continue; 
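// Each non-empty input line is treated as a single sentence and analyzed independently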
98 | 99 | //logger.info("Start analyzing sentence: " + line); 100 | 101 | // parse sentence 102 | MinIE result = extractor.analyzeSentence(line, mode); 103 | 104 | // print results from MinIE 105 | ObjectArrayList propositions = result.getPropositions(); 106 | dout.println("\nOutput:"); 107 | if (options.has("p")) { 108 | result.getSentenceSemanticGraph().prettyPrint(); 109 | dout.print("\n"); 110 | } 111 | 112 | if (propositions.size() < 1) { 113 | dout.println("No extraction found."); 114 | dout.print("\n"); 115 | } else { 116 | for (AnnotatedProposition proposition : result.getPropositions()) { 117 | dout.println(Utils.formatProposition(proposition)); 118 | } 119 | dout.print("\n"); 120 | } 121 | counter++; 122 | } 123 | // finished analyzing 124 | long end = System.currentTimeMillis(); 125 | //logger.info("Analyzing time: " + (end - start) / 1000. + "s"); 126 | // clean up 127 | in.close(); 128 | out.close(); 129 | } 130 | 131 | /** 132 | * initializes and configures the option parser 133 | * @return a configured option parser 134 | */ 135 | private static OptionParser initOptionParser() { 136 | OptionParser optionParser = new OptionParser(); 137 | optionParser 138 | .accepts("f", "input file (if absent, MinIE reads from stdin)") 139 | .withOptionalArg() 140 | .describedAs("file") 141 | .ofType(String.class); 142 | optionParser 143 | .accepts("o", "output file (if absent, MinIE writes to stdout)") 144 | .withRequiredArg() 145 | .describedAs("file") 146 | .ofType(String.class); 147 | optionParser 148 | .accepts("m", "specification mode; allowed values: \"safe\", \"dictionary\", \"aggressive\", \"complete\"; defaults to \"safe\"") 149 | .withRequiredArg() 150 | .describedAs("mode") 151 | .ofType(String.class) 152 | .defaultsTo("safe"); 153 | optionParser 154 | .accepts("dict", "path of the multi-word expression dictionaries (can be several paths separated by ';'); \"dictionary\" mode only") 155 | .withOptionalArg() 156 | .ofType(String.class) 157 | .withValuesSeparatedBy(';'); 158 | optionParser 159 | .accepts("dict-overwrite", "if set, the default dictionary (multi-word expressions from WordNet and Wiktionary), will be overwritten, else new dictionaries will be appended") 160 | .withOptionalArg(); 161 | optionParser 162 | .accepts("p", "print the dependency parse of the input sentence"); 163 | optionParser 164 | .accepts("h", "show help"); 165 | return optionParser; 166 | } 167 | 168 | /** 169 | * returns input stream according to given options 170 | * @param options: option set for option parser 171 | * @return input stream 172 | */ 173 | private static InputStream getInputStream(OptionSet options) { 174 | InputStream in = null; 175 | // check if input file was specified 176 | if (options.has("f")) { 177 | try { 178 | String filename = (String)options.valueOf("f"); 179 | in = new FileInputStream(filename); 180 | logger.info("Reading from file " + filename); 181 | } catch (FileNotFoundException e) { 182 | e.printStackTrace(); 183 | } 184 | } else { 185 | // default to stdin 186 | in = System.in; 187 | logger.info("Reading from stdin"); 188 | } 189 | return new DataInputStream(in); 190 | } 191 | 192 | /** 193 | * returns output stream according to given options 194 | * @param options: option set for option parser 195 | * @return output stream 196 | */ 197 | private static OutputStream getOutputStream(OptionSet options) { 198 | OutputStream out = null; 199 | // check if output file was specified 200 | if (options.has("o")) { 201 | try { 202 | String filename = (String) 
options.valueOf("o"); 203 | out = new FileOutputStream(filename); 204 | logger.info("Writing to file " + filename); 205 | } catch (FileNotFoundException e) { 206 | e.printStackTrace(); 207 | } 208 | } else { 209 | // default to stdout 210 | out = System.out; 211 | logger.info("Writing to stdout"); 212 | } 213 | return new PrintStream(out); 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/object/ObjAggressiveMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.object; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Quantity; 10 | 11 | import edu.stanford.nlp.ling.IndexedWord; 12 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 14 | import edu.stanford.nlp.semgraph.SemanticGraph; 15 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 16 | import edu.stanford.nlp.trees.GrammaticalRelation; 17 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 18 | 19 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 20 | 21 | /** 22 | * @author Kiril Gashteovski 23 | */ 24 | public class ObjAggressiveMinimization { 25 | /** 26 | * Object aggressive minimization 27 | * @param object: object phrase 28 | * @param sg: semantic graph of the sentence 29 | */ 30 | public static void minimizeObject(AnnotatedPhrase object, SemanticGraph sg){ 31 | // Don't minimize if the phrase contains one word or no words (rare cases) 32 | if (object.getWordList() == null || object.getWordList().size() <= 1){ 33 | return; 34 | } 35 | 36 | // Don't minimize if the phrase is a multi word NER or multiple nouns in a sequence 37 | String seqPosNer = CoreNLPUtils.wordsToPosMergedNerSeq(object.getWordList()); 38 | if (seqPosNer.matches(REGEX.MULTI_WORD_ENTITY) || seqPosNer.matches(REGEX.MULTI_WORD_NOUN)){ 39 | return; 40 | } 41 | 42 | // Do safe minimization first 43 | ObjSafeMinimization.minimizeObject(object, sg); 44 | 45 | // List of words to be dropped 46 | ObjectArrayList dropWords = new ObjectArrayList<>(); 47 | 48 | // Drop some type of modifiers 49 | Set excludeRels = new HashSet<>(); 50 | excludeRels.add(EnglishGrammaticalRelations.ADVERBIAL_MODIFIER); 51 | excludeRels.add(EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER); 52 | excludeRels.add(EnglishGrammaticalRelations.DETERMINER); 53 | excludeRels.add(EnglishGrammaticalRelations.PREDETERMINER); 54 | excludeRels.add(EnglishGrammaticalRelations.NUMERIC_MODIFIER); 55 | excludeRels.add(EnglishGrammaticalRelations.NUMBER_MODIFIER); 56 | excludeRels.add(EnglishGrammaticalRelations.POSSESSION_MODIFIER); 57 | excludeRels.add(EnglishGrammaticalRelations.POSSESSIVE_MODIFIER); 58 | excludeRels.add(EnglishGrammaticalRelations.QUANTIFIER_MODIFIER); 59 | excludeRels.add(EnglishGrammaticalRelations.TEMPORAL_MODIFIER); 60 | excludeRels.add(EnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER); 61 | excludeRels.add(EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER); 62 | //excludeRels.add(EnglishGrammaticalRelations.AUX_MODIFIER); 63 | for (IndexedWord w: object.getWordList()) { 64 | // Skip the words that were included afterwards (not part of the DP) 65 | if (w.index() < 0) 66 | continue; 67 | 68 | // Get 
the relevant modifiers to be dropped (their modifiers as well) 69 | Set modifiers = sg.getChildrenWithRelns(w, excludeRels); 70 | for (IndexedWord m: modifiers) { 71 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 72 | for (IndexedWord sm: subModifiers) 73 | //if (!sm.tag().equals(POS_TAG.IN)) 74 | dropWords.add(sm); 75 | } 76 | dropWords.addAll(modifiers); 77 | 78 | // Drop quantities 79 | if (w.ner().equals(Quantity.ST_QUANTITY)) 80 | dropWords.add(w); 81 | } 82 | object.getWordList().removeAll(dropWords); 83 | // add words to dropped word list 84 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 85 | object.addDroppedWords(dropWords); 86 | dropWords.clear(); 87 | 88 | // If [IN|TO] .* [IN|TO] => drop [IN|TO] .*, i.e. -> drop PP attachments 89 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_PREP_ALL_PREP); 90 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(object.getWordCoreLabelList()); 91 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 92 | while (tMatcher.find()){ 93 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 94 | for (int i = 0; i < matchedWords.size(); i++) { 95 | if (matchedWords.get(i).tag().equals(POS_TAG.IN) || matchedWords.get(i).tag().equals(POS_TAG.TO)) { 96 | if (i == 0) { 97 | if (matchedWords.get(i).tag().equals(POS_TAG.TO) && CoreNLPUtils.isVerb(matchedWords.get(i+1).tag())) 98 | break; 99 | dropWords.add(matchedWords.get(i)); 100 | } else break; 101 | } else { 102 | dropWords.add(matchedWords.get(i)); 103 | } 104 | } 105 | } 106 | object.getWordList().removeAll(dropWords); 107 | // add words to dropped word list 108 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 109 | object.addDroppedWords(dropWords); 110 | dropWords.clear(); 111 | 112 | // TODO: if QUANT + NP + IN => drop "QUANT + NP" ? 
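// Illustrative note, not from the original source: for an object like "failed to comply", the VB_1+ TO VB_2 rule below drops "failed to" and keeps only the final verb "comply"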
113 | 114 | // If VB_1+ TO VB_2 => drop VB_1+ TO .* 115 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_TO_VB); 116 | tMatcher = tPattern.getMatcher(object.getWordCoreLabelList()); 117 | matchedWords = new ObjectArrayList<>(); 118 | while (tMatcher.find()){ 119 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 120 | for (int i = 0; i < matchedWords.size(); i++) { 121 | if (matchedWords.get(i).tag().equals(POS_TAG.TO)) { 122 | dropWords.add(matchedWords.get(i)); 123 | break; 124 | } else { 125 | dropWords.add(matchedWords.get(i)); 126 | } 127 | } 128 | } 129 | object.getWordList().removeAll(dropWords); 130 | // add words to dropped word list 131 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 132 | object.addDroppedWords(dropWords); 133 | dropWords.clear(); 134 | 135 | // Drop auxilaries 136 | for (IndexedWord w: object.getWordList()) { 137 | if (w.index() < 0) 138 | continue; 139 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.AUX_MODIFIER); 140 | for (IndexedWord m: modifiers) { 141 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 142 | for (IndexedWord sm: subModifiers) 143 | dropWords.add(sm); 144 | } 145 | dropWords.addAll(modifiers); 146 | } 147 | object.getWordList().removeAll(dropWords); 148 | // add words to dropped word list 149 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 150 | object.addDroppedWords(dropWords); 151 | dropWords.clear(); 152 | 153 | // Drop noun modifiers with different NERs 154 | for (IndexedWord w: object.getWordList()) { 155 | if (w.index() < 0) 156 | continue; 157 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER); 158 | for (IndexedWord mw: modifiers) { 159 | if (!w.ner().equals(mw.ner())) { 160 | dropWords.add(mw); 161 | dropWords.addAll(CoreNLPUtils.getSubTreeSortedNodes(mw, sg, null)); 162 | } 163 | } 164 | } 165 | object.getWordList().removeAll(dropWords); 166 | // add words to dropped word list 167 | object.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 168 | object.addDroppedWords(dropWords); 169 | dropWords.clear(); 170 | } 171 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/object/ObjDictionaryMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.object; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import de.uni_mannheim.minie.minimize.object.ObjSafeMinimization; 9 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 10 | 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | import edu.stanford.nlp.util.CoreMap; 13 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 14 | 15 | /** 16 | * @author Kiril Gashteovski 17 | */ 18 | public class ObjDictionaryMinimization { 19 | /** 20 | * Minimize only the objects that are considered to have "non-frequent patterns" 21 | * @param obj: the object phrase 22 | * @param sg: semantic graph of the sentence 23 | * @param freqObjs: dictionary of multi-word expressions (frequent objects) 24 | */ 25 | public static void minimizeObject(AnnotatedPhrase obj, SemanticGraph sg, ObjectOpenHashSet collocations){ 26 | // Do the safe minimization first 27 
| ObjSafeMinimization.minimizeObject(obj, sg); 28 | 29 | // If the object is frequent, don't minimize anything 30 | if (collocations.contains(CoreNLPUtils.listOfWordsToLemmaString(obj.getWordList()).toLowerCase())){ 31 | return; 32 | } 33 | 34 | // Minimization object 35 | Minimization simp = new Minimization(obj, sg, collocations); 36 | 37 | // remWords: list of words to be removed (reusable variable) 38 | // matchWords: list of matched words from the regex (reusable variable) 39 | List<CoreMap> remWords = new ArrayList<>(); 40 | List<CoreMap> matchWords = new ArrayList<>(); 41 | 42 | // Dictionary minimization on the noun phrases and named entities within the obj. phrase 43 | simp.nounPhraseDictMinimization(remWords, matchWords); 44 | simp.namedEntityDictionaryMinimization(remWords, matchWords); 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/object/ObjSafeMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.object; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import edu.stanford.nlp.semgraph.SemanticGraph; 9 | import edu.stanford.nlp.util.CoreMap; 10 | 11 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 12 | 13 | /** 14 | * @author Kiril Gashteovski 15 | */ 16 | public class ObjSafeMinimization { 17 | /** 18 | * Minimize only the objects that are considered to have "safe patterns" 19 | * @param object: the object phrase 20 | * @param sg: the semantic graph of the whole sentence 21 | */ 22 | public static void minimizeObject(AnnotatedPhrase object, SemanticGraph sg){ 23 | Minimization simp = new Minimization(object, sg, new ObjectOpenHashSet<String>()); 24 | 25 | // remWords: list of words to be removed (reusable variable) 26 | // matchWords: list of matched words from the regex (reusable variable) 27 | List<CoreMap> remWords = new ArrayList<>(); 28 | List<CoreMap> matchWords = new ArrayList<>(); 29 | 30 | // Safe minimization on the noun phrases and named entities 31 | simp.nounPhraseSafeMinimization(remWords, matchWords); 32 | simp.namedEntitySafeMinimization(remWords, matchWords); 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/relation/RelAggressiveMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.relation; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Quantity; 10 | 11 | import edu.stanford.nlp.ling.IndexedWord; 12 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 14 | import edu.stanford.nlp.semgraph.SemanticGraph; 15 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 16 | import edu.stanford.nlp.trees.GrammaticalRelation; 17 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 18 | 19 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 20 | 21 | /** 22 | * @author Kiril Gashteovski 23 | */ 24 | public class RelAggressiveMinimization { 25 | /** 26 | * Always minimize the relation towards the root word 27 |
* @param relation: relation phrase 28 | * @param sg: sentence semantic graph (dependency parse graph) 29 | */ 30 | public static void minimizeRelation(AnnotatedPhrase relation, SemanticGraph sg){ 31 | // Don't minimize if the phrase contains one word or no words (rare cases) 32 | if (relation.getWordList() == null || relation.getWordList().size() <= 1){ 33 | return; 34 | } 35 | 36 | // Do safe minimization first 37 | RelSafeMinimization.minimizeRelation(relation, sg); 38 | 39 | // List of words to be dropped 40 | ObjectArrayList dropWords = new ObjectArrayList<>(); 41 | 42 | // Drop some type of modifiers 43 | Set excludeRels = new HashSet<>(); 44 | excludeRels.add(EnglishGrammaticalRelations.ADVERBIAL_MODIFIER); 45 | excludeRels.add(EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER); 46 | excludeRels.add(EnglishGrammaticalRelations.DETERMINER); 47 | excludeRels.add(EnglishGrammaticalRelations.PREDETERMINER); 48 | excludeRels.add(EnglishGrammaticalRelations.NUMERIC_MODIFIER); 49 | excludeRels.add(EnglishGrammaticalRelations.NUMBER_MODIFIER); 50 | excludeRels.add(EnglishGrammaticalRelations.POSSESSION_MODIFIER); 51 | excludeRels.add(EnglishGrammaticalRelations.POSSESSIVE_MODIFIER); 52 | excludeRels.add(EnglishGrammaticalRelations.QUANTIFIER_MODIFIER); 53 | excludeRels.add(EnglishGrammaticalRelations.TEMPORAL_MODIFIER); 54 | excludeRels.add(EnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER); 55 | //excludeRels.add(EnglishGrammaticalRelations.AUX_MODIFIER); 56 | for (IndexedWord w: relation.getWordList()) { 57 | // Skip the words that were included afterwards (not part of the DP) 58 | if (w.index() < 0) 59 | continue; 60 | 61 | // Get the relevant modifiers to be dropped (their modifiers as well) 62 | Set modifiers = sg.getChildrenWithRelns(w, excludeRels); 63 | for (IndexedWord m: modifiers) { 64 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 65 | for (IndexedWord sm: subModifiers) 66 | if (!sm.tag().equals(POS_TAG.IN)) 67 | dropWords.add(sm); 68 | } 69 | dropWords.addAll(modifiers); 70 | 71 | // Drop quantities 72 | if (w.ner().equals(Quantity.ST_QUANTITY)) 73 | dropWords.add(w); 74 | } 75 | relation.getWordList().removeAll(dropWords); 76 | // add words to dropped word list 77 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 78 | relation.addDroppedWords(dropWords); 79 | dropWords.clear(); 80 | 81 | // If [IN|TO] .* [IN|TO] => drop [IN|TO] .*, i.e. 
-> drop PP attachments 82 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_PREP_ALL_PREP); 83 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(relation.getWordCoreLabelList()); 84 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 85 | while (tMatcher.find()){ 86 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 87 | for (int i = 0; i < matchedWords.size(); i++) { 88 | if (matchedWords.get(i).tag().equals(POS_TAG.IN) || matchedWords.get(i).tag().equals(POS_TAG.TO)) { 89 | if (i == 0) { 90 | if (matchedWords.get(i).tag().equals(POS_TAG.TO) && CoreNLPUtils.isVerb(matchedWords.get(i+1).tag())) 91 | break; 92 | dropWords.add(matchedWords.get(i)); 93 | } else break; 94 | } else { 95 | dropWords.add(matchedWords.get(i)); 96 | } 97 | } 98 | } 99 | relation.getWordList().removeAll(dropWords); 100 | // add words to dropped word list 101 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 102 | relation.addDroppedWords(dropWords); 103 | dropWords.clear(); 104 | 105 | // TODO: if QUANT + NP + IN => drop "QUANT + NP" ? 106 | 107 | // If VB_1+ TO VB_2 => drop VB_1+ TO .* 108 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_TO_VB); 109 | tMatcher = tPattern.getMatcher(relation.getWordCoreLabelList()); 110 | matchedWords = new ObjectArrayList<>(); 111 | while (tMatcher.find()){ 112 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 113 | for (int i = 0; i < matchedWords.size(); i++) { 114 | if (matchedWords.get(i).tag().equals(POS_TAG.TO)) { 115 | dropWords.add(matchedWords.get(i)); 116 | break; 117 | } else { 118 | dropWords.add(matchedWords.get(i)); 119 | } 120 | } 121 | } 122 | relation.getWordList().removeAll(dropWords); 123 | // add words to dropped word list 124 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 125 | relation.addDroppedWords(dropWords); 126 | dropWords.clear(); 127 | 128 | // Drop auxilaries 129 | for (IndexedWord w: relation.getWordList()) { 130 | if (w.index() < 0) 131 | continue; 132 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.AUX_MODIFIER); 133 | for (IndexedWord m: modifiers) { 134 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 135 | for (IndexedWord sm: subModifiers) 136 | dropWords.add(sm); 137 | } 138 | dropWords.addAll(modifiers); 139 | } 140 | relation.getWordList().removeAll(dropWords); 141 | // add words to dropped word list 142 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 143 | relation.addDroppedWords(dropWords); 144 | dropWords.clear(); 145 | 146 | // Drop noun modifiers with different NERs 147 | for (IndexedWord w: relation.getWordList()) { 148 | if (w.index() < 0) 149 | continue; 150 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER); 151 | for (IndexedWord mw: modifiers) { 152 | if (!w.ner().equals(mw.ner())) { 153 | dropWords.add(mw); 154 | dropWords.addAll(CoreNLPUtils.getSubTreeSortedNodes(mw, sg, null)); 155 | } 156 | } 157 | } 158 | relation.getWordList().removeAll(dropWords); 159 | // add words to dropped word list 160 | relation.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 161 | relation.addDroppedWords(dropWords); 162 | dropWords.clear(); 163 | } 164 | } 165 | -------------------------------------------------------------------------------- 
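A minimal usage sketch (not part of the repository) showing how the safe and aggressive minimization modes above surface through the `Extractor` interface; the input sentence is an illustrative assumption, and the printed form depends on `AnnotatedProposition.toString()`:

```
import de.uni_mannheim.minie.MinIE;
import de.uni_mannheim.minie.annotation.AnnotatedProposition;
import de.uni_mannheim.minie.main.Extractor;

public class MinimizationModeSketch {
    public static void main(String[] args) {
        // Extractor wires up the CoreNLP parser, ClausIE and MinIE (see Extractor.java)
        Extractor extractor = new Extractor();
        String sentence = "The famous professor quickly published a very detailed study.";

        // SAFE keeps most modifiers; AGGRESSIVE additionally drops adverbial and
        // adjectival modifiers, determiners, quantities and PP attachments
        for (MinIE.Mode mode : new MinIE.Mode[] {MinIE.Mode.SAFE, MinIE.Mode.AGGRESSIVE}) {
            MinIE result = extractor.analyzeSentence(sentence, mode);
            System.out.println("Mode: " + mode);
            for (AnnotatedProposition proposition : result.getPropositions()) {
                System.out.println("\t" + proposition);
            }
        }
    }
}
```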
/src/main/java/de/uni_mannheim/minie/minimize/relation/RelDictionaryMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.relation; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import de.uni_mannheim.minie.minimize.relation.RelSafeMinimization; 9 | 10 | import edu.stanford.nlp.semgraph.SemanticGraph; 11 | import edu.stanford.nlp.util.CoreMap; 12 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 13 | 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | 16 | /** 17 | * @author Kiril Gashteovski 18 | */ 19 | public class RelDictionaryMinimization { 20 | /** 21 | * Minimize only the relations that are considered to have "non-frequent patterns" 22 | * @param rel: the relation phrase 23 | * @param sg: semantic graph of the sentence 24 | * @param collocations: dictionary of multi-word expressions (frequent relations) 25 | */ 26 | public static void minimizeRelation(AnnotatedPhrase rel, SemanticGraph sg, ObjectOpenHashSet<String> collocations){ 27 | // Do the safe minimization first 28 | RelSafeMinimization.minimizeRelation(rel, sg); 29 | 30 | // If the relation is frequent, don't minimize anything 31 | if (collocations.contains(CoreNLPUtils.listOfWordsToLemmaString(rel.getWordList()).toLowerCase())){ 32 | return; 33 | } 34 | 35 | // (Safe minimization was already applied above, so only the 36 | // dictionary-based pruning remains) 37 | 38 | // remWords: list of words to be removed (reusable variable) 39 | // matchWords: list of matched words from the regex (reusable variable) 40 | List<CoreMap> remWords = new ArrayList<>(); 41 | List<CoreMap> matchWords = new ArrayList<>(); 42 | 43 | // Move to the dict.
minimization of the noun phrases within the relation 44 | Minimization simp = new Minimization(rel, sg, collocations); 45 | simp.nounPhraseDictMinimization(remWords, matchWords); 46 | simp.namedEntityDictionaryMinimization(remWords, matchWords); 47 | } 48 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/relation/RelSafeMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.relation; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Polarity; 10 | import de.uni_mannheim.minie.minimize.Minimization; 11 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 12 | 13 | import edu.stanford.nlp.ling.IndexedWord; 14 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 15 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 16 | import edu.stanford.nlp.semgraph.SemanticGraph; 17 | import edu.stanford.nlp.util.CoreMap; 18 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 19 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 20 | 21 | /** 22 | * @author Kiril Gashteovski 23 | */ 24 | public class RelSafeMinimization { 25 | /** 26 | * Minimize only the relations that are considered to have "safe patterns" 27 | * @param rel: the relation phrase 28 | * @param sg: semantic graph of the sentence 29 | */ 30 | public static void minimizeRelation(AnnotatedPhrase rel, SemanticGraph sg){ 31 | // Minimize left/right of the verb 32 | minimizationLeftFromVerb(rel, sg); 33 | minimizationRightFromVerb(rel, sg); 34 | } 35 | 36 | /** 37 | * Minimize the relations considered to have "safe patterns", and occur from the left of the verb 38 | * @param rel: the relation phrase 39 | * @param sg: the semantic graph of the sentence 40 | */ 41 | private static void minimizationLeftFromVerb(AnnotatedPhrase rel, SemanticGraph sg){ 42 | // Minimization object 43 | Minimization simp = new Minimization(rel, sg, new ObjectOpenHashSet()); 44 | 45 | // remWords: list of words to be removed (reusable variable) 46 | // matchWords: list of matched words from the regex (reusable variable) 47 | List remWords = new ArrayList<>(); 48 | List matchWords = new ArrayList<>(); 49 | 50 | simp.verbPhraseSafeMinimization(remWords, matchWords); 51 | } 52 | 53 | /** 54 | * Minimize the relations considered to have "safe patterns", and occur from the right of the verb 55 | * @param rel: the relation phrase 56 | * @param sg: the semantic graph of the sentence 57 | */ 58 | private static void minimizationRightFromVerb(AnnotatedPhrase rel, SemanticGraph sg){ 59 | // Minimization object 60 | Minimization simp = new Minimization(rel, sg, new ObjectOpenHashSet()); 61 | 62 | // remWords: list of words to be removed (reusable variable) 63 | // matchWords: list of matched words from the regex (reusable variable) 64 | List remWords = new ArrayList<>(); 65 | List matchWords = new ArrayList<>(); 66 | 67 | // Safe minimization on the noun phrases and named entities within the rel. 
phrase 68 | simp.nounPhraseSafeMinimization(remWords, matchWords); 69 | simp.namedEntitySafeMinimization(remWords, matchWords); 70 | rel = simp.getPhrase(); 71 | 72 | // Reusable variables 73 | ObjectOpenHashSet droppedWords = new ObjectOpenHashSet(); 74 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 75 | ObjectArrayList verbs = new ObjectArrayList<>(); 76 | List children; 77 | 78 | // Flags for checking certain conditions 79 | boolean containsNEG; 80 | boolean isAdverb; 81 | 82 | // If ^VB+ RB+ VB+ => drop RB+ 83 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_VB_RB_VB); 84 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(rel.getWordCoreLabelList()); 85 | while (tMatcher.find()){ 86 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 87 | // Check if the first word of the matched words is the first word of the relation 88 | if (matchedWords.get(0).index() != rel.getWordList().get(0).index()) 89 | break; 90 | 91 | verbs = CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), 0); 92 | for (int i = 0; i < matchedWords.size(); i++){ 93 | isAdverb = matchedWords.get(i).tag().equals(POS_TAG.RB); 94 | containsNEG = Polarity.NEG_WORDS.contains(matchedWords.get(i).lemma().toLowerCase()); 95 | 96 | if (isAdverb && !containsNEG) { 97 | // If the adverb is the head word, don't drop it 98 | children = sg.getChildList(rel.getWordList().get(i)); 99 | children.retainAll(verbs); 100 | if (children.size() == 0) { 101 | droppedWords.addAll(CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), i)); 102 | } 103 | break; 104 | } 105 | } 106 | 107 | if (droppedWords.size() > 0){ 108 | rel.removeWordsFromList(droppedWords); 109 | // add words to dropped word list 110 | rel.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, droppedWords)); 111 | rel.addDroppedWords(droppedWords); 112 | droppedWords = new ObjectOpenHashSet(); 113 | } 114 | } 115 | 116 | // If ^VB+ RB+ => drop RB+ 117 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_RB); 118 | tMatcher = tPattern.getMatcher(rel.getWordCoreLabelList()); 119 | while (tMatcher.find()){ 120 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 121 | // Check if the first word of the matched words is the first word of the relation 122 | if (matchedWords.get(0).index() != rel.getWordList().get(0).index()) 123 | break; 124 | 125 | verbs = CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), 0); 126 | for (int i = 0; i < matchedWords.size(); i++){ 127 | isAdverb = matchedWords.get(i).tag().equals(POS_TAG.RB); 128 | containsNEG = Polarity.NEG_WORDS.contains(matchedWords.get(i).lemma().toLowerCase()); 129 | 130 | if (isAdverb && !containsNEG) { 131 | // If the adverb is the head word, don't drop it 132 | children = sg.getChildList(rel.getWordList().get(i)); 133 | children.retainAll(verbs); 134 | if (children.size() == 0) { 135 | droppedWords.addAll(CoreNLPUtils.getChainedTagNoNER(rel.getWordList(), i)); 136 | } 137 | break; 138 | } 139 | } 140 | 141 | if (droppedWords.size() > 0){ 142 | rel.removeWordsFromList(droppedWords); 143 | // add words to dropped word list 144 | rel.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, droppedWords)); 145 | rel.addDroppedWords(droppedWords); 146 | droppedWords = new ObjectOpenHashSet(); 147 | } 148 | } 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/subject/SubjAggressiveMinimization.java: 
-------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.subject; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | 6 | import de.uni_mannheim.constant.POS_TAG; 7 | import de.uni_mannheim.constant.REGEX; 8 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 9 | import de.uni_mannheim.minie.annotation.Quantity; 10 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 11 | 12 | import edu.stanford.nlp.ling.IndexedWord; 13 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 14 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 15 | import edu.stanford.nlp.semgraph.SemanticGraph; 16 | import edu.stanford.nlp.trees.EnglishGrammaticalRelations; 17 | import edu.stanford.nlp.trees.GrammaticalRelation; 18 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 19 | 20 | public class SubjAggressiveMinimization { 21 | 22 | /** 23 | * Always minimize the subject towards the root word 24 | * @param subject: subject phrase 25 | * @param sg: semantic graph of the sentence 26 | */ 27 | public static void minimizeSubject(AnnotatedPhrase subject, SemanticGraph sg){ 28 | // Don't minimize if the phrase contains one word or no words (rare cases) 29 | if (subject.getWordList() == null || subject.getWordList().size() <= 1){ 30 | return; 31 | } 32 | // Don't minimize if the phrase is a multi word NER or multiple nouns in a sequence 33 | String seqPosNer = CoreNLPUtils.wordsToPosMergedNerSeq(subject.getWordList()); 34 | if (seqPosNer.matches(REGEX.MULTI_WORD_ENTITY) || seqPosNer.matches(REGEX.MULTI_WORD_NOUN)){ 35 | return; 36 | } 37 | 38 | // Do safe minimization first 39 | SubjSafeMinimization.minimizeSubject(subject, sg); 40 | 41 | // List of words to be dropped 42 | ObjectArrayList dropWords = new ObjectArrayList<>(); 43 | 44 | // Drop some type of modifiers 45 | Set excludeRels = new HashSet<>(); 46 | excludeRels.add(EnglishGrammaticalRelations.ADVERBIAL_MODIFIER); 47 | excludeRels.add(EnglishGrammaticalRelations.ADJECTIVAL_MODIFIER); 48 | excludeRels.add(EnglishGrammaticalRelations.DETERMINER); 49 | excludeRels.add(EnglishGrammaticalRelations.PREDETERMINER); 50 | excludeRels.add(EnglishGrammaticalRelations.NUMERIC_MODIFIER); 51 | excludeRels.add(EnglishGrammaticalRelations.NUMBER_MODIFIER); 52 | excludeRels.add(EnglishGrammaticalRelations.POSSESSION_MODIFIER); 53 | excludeRels.add(EnglishGrammaticalRelations.POSSESSIVE_MODIFIER); 54 | excludeRels.add(EnglishGrammaticalRelations.QUANTIFIER_MODIFIER); 55 | excludeRels.add(EnglishGrammaticalRelations.TEMPORAL_MODIFIER); 56 | excludeRels.add(EnglishGrammaticalRelations.NP_ADVERBIAL_MODIFIER); 57 | excludeRels.add(EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER); 58 | //excludeRels.add(EnglishGrammaticalRelations.AUX_MODIFIER); 59 | for (IndexedWord w: subject.getWordList()) { 60 | // Skip the words that were included afterwards (not part of the DP) 61 | if (w.index() < 0) 62 | continue; 63 | 64 | // Get the relevant modifiers to be dropped (their modifiers as well) 65 | Set modifiers = sg.getChildrenWithRelns(w, excludeRels); 66 | for (IndexedWord m: modifiers) { 67 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 68 | for (IndexedWord sm: subModifiers) 69 | //if (!sm.tag().equals(POS_TAG.IN)) 70 | dropWords.add(sm); 71 | } 72 | dropWords.addAll(modifiers); 73 | 74 | // Drop quantities 75 | if (w.ner().equals(Quantity.ST_QUANTITY)) 76 | dropWords.add(w); 77 | } 78 | subject.getWordList().removeAll(dropWords); 79 | 
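// Note: unlike the object and relation variants, this first block clears dropWords without recording them via addDroppedEdges/addDroppedWords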
dropWords.clear(); 80 | 81 | // If [IN|TO] .* [IN|TO] => drop [IN|TO] .*, i.e. -> drop PP attachments 82 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_PREP_ALL_PREP); 83 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(subject.getWordCoreLabelList()); 84 | ObjectArrayList matchedWords = new ObjectArrayList<>(); 85 | while (tMatcher.find()){ 86 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 87 | for (int i = 0; i < matchedWords.size(); i++) { 88 | if (matchedWords.get(i).tag().equals(POS_TAG.IN) || matchedWords.get(i).tag().equals(POS_TAG.TO)) { 89 | if (i == 0) { 90 | if (matchedWords.get(i).tag().equals(POS_TAG.TO) && CoreNLPUtils.isVerb(matchedWords.get(i+1).tag())) 91 | break; 92 | dropWords.add(matchedWords.get(i)); 93 | } else break; 94 | } else { 95 | dropWords.add(matchedWords.get(i)); 96 | } 97 | } 98 | } 99 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 100 | subject.addDroppedWords(dropWords); 101 | subject.getWordList().removeAll(dropWords); 102 | dropWords.clear(); 103 | 104 | // TODO: if QUANT + NP + IN => drop "QUANT + NP" ? 105 | 106 | // If VB_1+ TO VB_2 => drop VB_1+ TO .* 107 | tPattern = TokenSequencePattern.compile(REGEX.T_VB_TO_VB); 108 | tMatcher = tPattern.getMatcher(subject.getWordCoreLabelList()); 109 | matchedWords = new ObjectArrayList<>(); 110 | while (tMatcher.find()){ 111 | matchedWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 112 | for (int i = 0; i < matchedWords.size(); i++) { 113 | if (matchedWords.get(i).tag().equals(POS_TAG.TO)) { 114 | dropWords.add(matchedWords.get(i)); 115 | break; 116 | } else { 117 | dropWords.add(matchedWords.get(i)); 118 | } 119 | } 120 | } 121 | subject.getWordList().removeAll(dropWords); 122 | // add words to dropped word list 123 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 124 | subject.addDroppedWords(dropWords); 125 | dropWords.clear(); 126 | 127 | // Drop auxilaries 128 | for (IndexedWord w: subject.getWordList()) { 129 | if (w.index() < 0) 130 | continue; 131 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.AUX_MODIFIER); 132 | for (IndexedWord m: modifiers) { 133 | ObjectArrayList subModifiers = CoreNLPUtils.getSubTreeSortedNodes(m, sg, null); 134 | for (IndexedWord sm: subModifiers) 135 | dropWords.add(sm); 136 | } 137 | dropWords.addAll(modifiers); 138 | } 139 | subject.getWordList().removeAll(dropWords); 140 | // add words to dropped word list 141 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 142 | subject.addDroppedWords(dropWords); 143 | dropWords.clear(); 144 | 145 | // Drop noun modifiers with different NERs 146 | for (IndexedWord w: subject.getWordList()) { 147 | if (w.index() < 0) 148 | continue; 149 | Set modifiers = sg.getChildrenWithReln(w, EnglishGrammaticalRelations.NOUN_COMPOUND_MODIFIER); 150 | for (IndexedWord mw: modifiers) { 151 | if (!w.ner().equals(mw.ner())) { 152 | dropWords.add(mw); 153 | dropWords.addAll(CoreNLPUtils.getSubTreeSortedNodes(mw, sg, null)); 154 | } 155 | } 156 | } 157 | subject.getWordList().removeAll(dropWords); 158 | // add words to dropped word list 159 | subject.addDroppedEdges(CoreNLPUtils.listOfIndexedWordsToParentEdges(sg, dropWords)); 160 | subject.addDroppedWords(dropWords); 161 | dropWords.clear(); 162 | } 163 | } -------------------------------------------------------------------------------- 
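In dictionary mode, the Subj/Rel/ObjDictionaryMinimization classes prune only sub-constituents that are not listed in a collocation dictionary. Below is a sketch of how such a dictionary could be supplied through the `Extractor`; the resource path `/my-collocations.dict` is a hypothetical example (assumed format: one multi-word expression per line, lowercased and lemmatized):

```
import java.io.IOException;

import de.uni_mannheim.minie.MinIE;
import de.uni_mannheim.minie.main.Extractor;
import de.uni_mannheim.utils.Dictionary;

public class DictionaryModeSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical classpath resource with expressions that should survive
        // minimization, e.g. "open information extraction"
        Dictionary collocations = new Dictionary("/my-collocations.dict");
        Extractor extractor = new Extractor(collocations);

        MinIE result = extractor.analyzeSentence(
                "Open information extraction systems extract triples from text.",
                MinIE.Mode.DICTIONARY);
        result.getPropositions().forEach(System.out::println);
    }
}
```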
/src/main/java/de/uni_mannheim/minie/minimize/subject/SubjDictionaryMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.subject; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import de.uni_mannheim.minie.minimize.subject.SubjSafeMinimization; 9 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 10 | 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | import edu.stanford.nlp.util.CoreMap; 13 | 14 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 15 | 16 | public class SubjDictionaryMinimization { 17 | /** 18 | * Minimize the subject according to the 'Dictionary minimization' rules. 19 | * Subjects that are frequent (i.e. found in the collocation dictionary) are left intact. 20 | * @param subject: the subject phrase 21 | * @param sg: semantic graph of the whole sentence 22 | * @param collocations: a multi-word dictionary of frequent subjects (collocations) 23 | **/ 24 | public static void minimizeSubject(AnnotatedPhrase subject, SemanticGraph sg, ObjectOpenHashSet<String> collocations){ 25 | // Do the safe minimization first 26 | SubjSafeMinimization.minimizeSubject(subject, sg); 27 | 28 | // If the subject is frequent, don't minimize anything 29 | if (collocations.contains(CoreNLPUtils.listOfWordsToLemmaString(subject.getWordList()).toLowerCase())){ 30 | return; 31 | } 32 | 33 | // Minimization object 34 | Minimization simp = new Minimization(subject, sg, collocations); 35 | 36 | // remWords: list of words to be removed (reusable variable) 37 | // matchWords: list of matched words from the regex (reusable variable) 38 | List<CoreMap> remWords = new ArrayList<>(); 39 | List<CoreMap> matchWords = new ArrayList<>(); 40 | 41 | // Dictionary minimization on the noun phrases and named entities within the subj.
phrase 42 | simp.nounPhraseDictMinimization(remWords, matchWords); 43 | simp.removeVerbsBeforeNouns(remWords, matchWords); 44 | simp.namedEntityDictionaryMinimization(remWords, matchWords); 45 | } 46 | } -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/minimize/subject/SubjSafeMinimization.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.minimize.subject; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 7 | import de.uni_mannheim.minie.minimize.Minimization; 8 | import edu.stanford.nlp.semgraph.SemanticGraph; 9 | import edu.stanford.nlp.util.CoreMap; 10 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 11 | 12 | public class SubjSafeMinimization { 13 | /** 14 | * Minimize only the subjects that are considered to have "safe patterns" 15 | * @param subject: the subject phrase 16 | * @param sg: the semantic graph of the whole sentence 17 | */ 18 | public static void minimizeSubject(AnnotatedPhrase subject, SemanticGraph sg){ 19 | Minimization simp = new Minimization(subject, sg, new ObjectOpenHashSet()); 20 | 21 | // remWords: list of words to be removed (reusable variable) 22 | // matchWords: list of matched words from the regex (reusable variable) 23 | List remWords = new ArrayList<>(); 24 | List matchWords = new ArrayList<>(); 25 | 26 | // Safe minimization on the noun phrases and named entities 27 | simp.nounPhraseSafeMinimization(remWords, matchWords); 28 | simp.namedEntitySafeMinimization(remWords, matchWords); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/minie/subconstituent/FrequencyCandidates.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.minie.subconstituent; 2 | 3 | import de.uni_mannheim.clausie.phrase.Phrase; 4 | import de.uni_mannheim.constant.REGEX; 5 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 6 | 7 | import edu.stanford.nlp.ling.IndexedWord; 8 | import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; 9 | import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; 10 | import edu.stanford.nlp.semgraph.SemanticGraph; 11 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 12 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 13 | 14 | /** 15 | * @author Kiril Gashteovski 16 | */ 17 | public class FrequencyCandidates { 18 | /** The phrase from which the frequency candidates are generated from **/ 19 | private Phrase phrase; 20 | /** The sentence semantic graph **/ 21 | private SemanticGraph sg; 22 | /** The sub constituents' candidates from the phrase **/ 23 | private ObjectOpenHashSet candidates; 24 | 25 | /** Default constructor **/ 26 | public FrequencyCandidates(){ 27 | this.phrase = new Phrase(); 28 | this.sg = new SemanticGraph(); 29 | this.candidates = new ObjectOpenHashSet<>(); 30 | } 31 | 32 | /** Parametric constructor **/ 33 | public FrequencyCandidates(Phrase p, SemanticGraph sentenceSg){ 34 | this.phrase = p; 35 | this.sg = sentenceSg; 36 | this.phrase.detectRoot(this.sg); 37 | this.candidates = new ObjectOpenHashSet<>(); 38 | } 39 | 40 | /** Generate the frequency candidates by default: 41 | * 1) the whole phrase itself 42 | * 2) the root word 43 | * 3) the chained words from the root 44 | * 4) the chained sub-constituent candidates 45 | **/ 46 | public void 
generateDefaultFreqCandidates(){ 47 | // Stopping conditions: the phrase is just one word, or there are no words at all (it happens sometimes) 48 | if (this.phrase.getWordList().size() == 0){ 49 | return; 50 | } 51 | else if (this.phrase.getWordList().size() == 1){ 52 | this.candidates.add(this.phrase.getWordList().get(0).lemma().toLowerCase()); 53 | return; 54 | } 55 | 56 | // 1) the whole phrase itself 57 | this.candidates.add(CoreNLPUtils.listOfWordsToLemmaString(this.phrase.getWordList()).toLowerCase()); 58 | 59 | // 2) the root word 60 | this.candidates.add(this.phrase.getRoot().lemma().toLowerCase()); 61 | 62 | // 3) the chained words from the root 63 | ObjectArrayList<IndexedWord> chainedRootWords = 64 | CoreNLPUtils.getChainedWords(this.phrase.getRoot(), this.phrase.getWordList()); 65 | this.candidates.add(CoreNLPUtils.listOfWordsToLemmaString(chainedRootWords).toLowerCase()); 66 | } 67 | 68 | /** Generate candidates for each noun phrase within the phrase **/ 69 | public void generateNounPhraseFreqCandidates(){ 70 | SubConstituent sc = new SubConstituent(this.sg); 71 | 72 | // Generate candidates for [DT|RB|JJ]+ NN+ 73 | TokenSequencePattern tPattern = TokenSequencePattern.compile(REGEX.T_DT_RB_JJ_PR_NN); 74 | TokenSequenceMatcher tMatcher = tPattern.getMatcher(this.phrase.getWordCoreLabelList()); 75 | this.generateCandidatesFromTokenRegexMatch(tMatcher, sc); 76 | } 77 | 78 | /** 79 | * Given a token sequence matcher for regular expressions for sequences over tokens, get the sub-constituents and 80 | * store them in the sub-constituent object sc 81 | * @param tMatcher: token sequence matcher for regular expressions for sequences over tokens 82 | * @param sc: sub-constituent object 83 | */ 84 | public void generateCandidatesFromTokenRegexMatch(TokenSequenceMatcher tMatcher, SubConstituent sc){ 85 | // The matched list of words and their "root" 86 | ObjectArrayList<IndexedWord> matchWords; 87 | IndexedWord matchRoot; 88 | 89 | // Given a match, get the subconstituents 90 | while (tMatcher.find()){ 91 | matchWords = CoreNLPUtils.listOfCoreMapWordsToIndexedWordList(tMatcher.groupNodes()); 92 | matchRoot = CoreNLPUtils.getRootFromWordList(this.sg, matchWords); 93 | sc.setRoot(matchRoot); 94 | sc.setWords(matchWords); 95 | sc.generateSubConstituentsFromLeft(); 96 | for (String cand: sc.getStringSubConstituents()){ 97 | this.candidates.add(cand); 98 | } 99 | sc.clearSubConstituentsAndCandidates(); 100 | } 101 | } 102 | 103 | // Getters 104 | public Phrase getPhrase(){ 105 | return this.phrase; 106 | } 107 | public SemanticGraph getSentenceSemGraph(){ 108 | return this.sg; 109 | } 110 | public ObjectOpenHashSet<String> getCandidates(){ 111 | return this.candidates; 112 | } 113 | 114 | // Setters 115 | public void setPhrase(Phrase p){ 116 | this.phrase = p; 117 | } 118 | public void setSentenceSemGraph(SemanticGraph sentenceSg){ 119 | this.sg = sentenceSg; 120 | } 121 | public void setCandidates(ObjectOpenHashSet<String> cands){ 122 | this.candidates = cands; 123 | } 124 | 125 | /** Clear the frequency candidates object (empty phrase and semantic graph, and clear the list of candidates) **/ 126 | public void clear(){ 127 | this.phrase = new Phrase(); 128 | this.sg = new SemanticGraph(); 129 | this.candidates.clear(); 130 | } 131 | /** Clear the candidates list **/ 132 | public void clearCandidates(){ 133 | this.candidates.clear(); 134 | } 135 | } 136 | --------------------------------------------------------------------------------
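The candidate-generation flow above is easiest to see end-to-end. A minimal usage sketch (not part of the repository; it assumes `phrase` and `sg` have already been produced by MinIE's parsing step):

```
import de.uni_mannheim.clausie.phrase.Phrase;
import de.uni_mannheim.minie.subconstituent.FrequencyCandidates;
import edu.stanford.nlp.semgraph.SemanticGraph;

// Hypothetical helper, not part of MinSCIE: collects and prints the frequency
// candidates for one phrase of an already-parsed sentence.
class FrequencyCandidatesSketch {
    static void printCandidates(Phrase phrase, SemanticGraph sg) {
        FrequencyCandidates fc = new FrequencyCandidates(phrase, sg); // also detects the phrase root
        fc.generateDefaultFreqCandidates();    // whole phrase, root word, chained root words
        fc.generateNounPhraseFreqCandidates(); // [DT|RB|JJ]+ NN+ sub-constituents
        for (String candidate : fc.getCandidates()) {
            System.out.println(candidate);     // lemmatized, lower-cased candidate strings
        }
        fc.clearCandidates();                  // the object can be reused for the next phrase
    }
}
```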
/src/main/java/de/uni_mannheim/utils/Dictionary.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.utils; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | 8 | import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; 9 | 10 | import edu.stanford.nlp.ling.IndexedWord; 11 | 12 | /** 13 | * A dictionary stores a set of strings. 14 | * 15 | * @author Kiril Gashteovski 16 | */ 17 | public class Dictionary { 18 | 19 | /** Stores the strings */ 20 | public ObjectOpenHashSet<String> words; 21 | 22 | /** Default constructor **/ 23 | public Dictionary() { 24 | this.words = new ObjectOpenHashSet<String>(); 25 | } 26 | 27 | /** Creates an empty set of strings (the dictionary) and then loads the dictionary from the input stream **/ 28 | public Dictionary(InputStream in) throws IOException { 29 | this.words = new ObjectOpenHashSet<String>(); 30 | this.load(in); 31 | } 32 | 33 | /** Creates an empty set of strings (the dictionary) and then loads the dictionary from the resource path **/ 34 | public Dictionary(String resourcePath) throws IOException { 35 | this.words = new ObjectOpenHashSet<String>(); 36 | this.load(resourcePath); 37 | } 38 | 39 | /** Creates an empty set of strings (the dictionary) and then loads the dictionary from multiple resources 40 | * @throws IOException **/ 41 | public Dictionary(String [] resourcePaths) throws IOException { 42 | this.words = new ObjectOpenHashSet<String>(); 43 | this.load(resourcePaths); 44 | } 45 | 46 | /** The size of the dictionary (number of words) **/ 47 | public int size() { 48 | return this.words.size(); 49 | } 50 | 51 | /** Checks if a certain word (as a string) is in the dictionary **/ 52 | public boolean contains(String word) { 53 | return this.words.contains(word); 54 | } 55 | 56 | /** Checks if a certain word (IndexedWord object) is in the dictionary in its lemmatized form **/ 57 | public boolean containsLemmatized(IndexedWord word) { 58 | return this.words.contains(word.lemma()); 59 | } 60 | 61 | private InputStream getInputStreamFromResource(String resourceName) throws IOException { 62 | return this.getClass().getResource(resourceName).openStream(); 63 | } 64 | 65 | /** Loads a dictionary from a resource path 66 | * @throws IOException 67 | **/ 68 | public void load(String resourcePath) throws IOException { 69 | this.load(this.getInputStreamFromResource(resourcePath)); 70 | } 71 | 72 | /** Loads a dictionary from several resource paths 73 | * @throws IOException **/ 74 | public void load(String [] resourcePaths) throws IOException { 75 | for (String path: resourcePaths) { 76 | this.load(path); 77 | } 78 | } 79 | 80 | /** Loads the dictionary out of an {@link InputStream}.
81 | * Each line of the original file should contain one dictionary entry 82 | */ 83 | public void load(InputStream in) throws IOException { 84 | DataInput data = new DataInputStream(in); 85 | String line = data.readLine(); 86 | while (line != null) { 87 | line = line.trim(); 88 | if (line.length() > 0) { 89 | this.words.add(line); 90 | } 91 | line = data.readLine(); 92 | } 93 | } 94 | 95 | /** Get the set of words **/ 96 | public ObjectOpenHashSet<String> words() { 97 | return this.words; 98 | } 99 | 100 | /** Add entries to the dictionary **/ 101 | public void addWords(ObjectOpenHashSet<String> ws) { 102 | this.words.addAll(ws); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/utils/minie/Utils.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.utils.minie; 2 | 3 | import edu.stanford.nlp.ling.IndexedWord; 4 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 5 | import joptsimple.OptionSet; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.Collection; 11 | import java.util.StringJoiner; 12 | 13 | import de.uni_mannheim.minie.MinIE; 14 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 15 | import de.uni_mannheim.minie.annotation.Attribution; 16 | import de.uni_mannheim.minie.annotation.Quantity; 17 | import de.uni_mannheim.utils.Dictionary; 18 | 19 | /** 20 | * Helper class for MinIE 21 | * 22 | * @author Martin Achenbach 23 | * @author Kiril Gashteovski 24 | */ 25 | public class Utils { 26 | /** MinIE default dictionaries **/ 27 | public static String [] DEFAULT_DICTIONARIES = new String [] {"/minie-resources/wn-mwe.txt", 28 | "/minie-resources/wiktionary-mw-titles.txt"}; 29 | 30 | /** 31 | * formats an annotated proposition in Ollie style 32 | * @param proposition: annotated proposition to format 33 | * @return formatted proposition 34 | */ 35 | public static String formatProposition(AnnotatedProposition proposition) { 36 | // First the triple 37 | StringJoiner tripleJoiner = new StringJoiner(";", "(", ")"); 38 | String subject = proposition.getSubject().toString(); 39 | if (!subject.isEmpty()) tripleJoiner.add(subject); 40 | String relation = proposition.getRelation().toString(); 41 | if (!relation.isEmpty()) tripleJoiner.add(relation); 42 | String object = proposition.getObject().toString(); 43 | if (!object.isEmpty()) tripleJoiner.add(object); 44 | 45 | // Factuality 46 | String factualityString = ""; 47 | String factuality = formatFactuality(proposition.getPolarity().getType().toString(), proposition.getModality().getModalityType().toString()); 48 | if (!factuality.isEmpty()) factualityString = String.format("[factuality=%s]", factuality); 49 | 50 | /*String clausalModifier = proposition.getClauseModifier().toString(); 51 | if (!clausalModifier.isEmpty()) annotations.add("clausalModifier=" + clausalModifier);*/ 52 | 53 | // Attribution 54 | Attribution attribution = proposition.getAttribution(); 55 | String attributionString = ""; 56 | 57 | // Only process the attribution if there is an attribution phrase TODO is this suitable?
58 | if (attribution != null && attribution.getAttributionPhrase() != null) { 59 | StringJoiner attributionAttributesJoiner = new StringJoiner(";"); 60 | String attributionPhrase = attribution.getAttributionPhrase().toString(); 61 | if (!attributionPhrase.isEmpty()) attributionAttributesJoiner.add("phrase:" + attributionPhrase); 62 | String attributionPredicate = attribution.getPredicateVerb().toString(); 63 | if (!attributionPredicate.isEmpty()) attributionAttributesJoiner.add("predicate:" + attributionPredicate); 64 | String attributionFactuality = formatFactuality(attribution.getPolarityType().toString(), attribution.getModalityType().toString()); 65 | if (!attributionFactuality.isEmpty()) attributionAttributesJoiner.add("factuality:" + attributionFactuality); 66 | attributionString = String.format("[attribution=%s]", attributionAttributesJoiner.toString()); 67 | } 68 | 69 | // Quantities 70 | StringJoiner quantityJoiner = new StringJoiner(";"); 71 | String quantitiesString = ""; 72 | ObjectArrayList<Quantity> quantities = new ObjectArrayList<>(); 73 | 74 | // Add all quantities 75 | quantities.addAll(proposition.getSubject().getQuantities()); 76 | quantities.addAll(proposition.getRelation().getQuantities()); 77 | quantities.addAll(proposition.getObject().getQuantities()); 78 | if (quantities.size() > 0) { 79 | for (Quantity q : quantities) { 80 | StringJoiner quantityPhrase = new StringJoiner(" "); 81 | for (IndexedWord w : q.getQuantityWords()) { 82 | quantityPhrase.add(w.originalText()); 83 | } 84 | quantityJoiner.add(String.format("QUANT_%s:%s", q.getId(), quantityPhrase.toString())); 85 | } 86 | quantitiesString = String.format("[quantities=%s]", quantityJoiner.toString()); 87 | } 88 | String output = tripleJoiner.toString() + factualityString + attributionString + quantitiesString; 89 | return output; 90 | } 91 | 92 | /** 93 | * format a factuality pair 94 | * @param polarity: polarity to format 95 | * @param modality: modality to format 96 | * @return formatted factuality 97 | */ 98 | private static String formatFactuality(String polarity, String modality) { 99 | String factuality = ""; 100 | if (!polarity.isEmpty() && !modality.isEmpty()) { 101 | if (polarity.equalsIgnoreCase("POSITIVE")) { 102 | polarity = "+"; 103 | } else { 104 | polarity = "-"; 105 | } 106 | if (modality.equalsIgnoreCase("CERTAINTY")) { 107 | modality = "CT"; 108 | } else { 109 | modality = "PS"; 110 | } 111 | factuality = String.format("(%s,%s)", polarity, modality); 112 | } 113 | return factuality; 114 | } 115 | 116 | /** 117 | * parses a string to a MinIE mode 118 | * @param s: string to parse 119 | * @return MinIE mode 120 | */ 121 | public static MinIE.Mode getMode(String s) { 122 | MinIE.Mode mode; 123 | if (s.equalsIgnoreCase("aggressive")) { 124 | mode = MinIE.Mode.AGGRESSIVE; 125 | } else if (s.equalsIgnoreCase("dictionary")) { 126 | mode = MinIE.Mode.DICTIONARY; 127 | } else if (s.equalsIgnoreCase("complete")) { 128 | mode = MinIE.Mode.COMPLETE; 129 | } else { 130 | mode = MinIE.Mode.SAFE; 131 | } 132 | return mode; 133 | } 134 | 135 | /** 136 | * load a dictionary from a given location in the option set 137 | * @param options: option set to read the locations from 138 | * @return a dictionary read from the specified locations 139 | * @throws IOException 140 | */ 141 | public static Dictionary loadDictionary(OptionSet options) throws IOException { 142 | Dictionary collocationDictionary = null; 143 | ArrayList<String> filenames = new ArrayList<>(); 144 | if (!options.has("dict-overwrite")) { 145 | // if the overwrite
option is not set, add the default dictionaries 146 | filenames.addAll(Arrays.asList(DEFAULT_DICTIONARIES)); 147 | } 148 | if (options.has("dict")) { 149 | filenames.addAll((Collection<String>) options.valuesOf("dict")); 150 | } 151 | String[] filenamesArray = Arrays.copyOf(filenames.toArray(), filenames.size(), String[].class); 152 | //logger.info("Loading dictionaries from " + Arrays.toString(filenamesArray)); 153 | collocationDictionary = new Dictionary(filenamesArray); 154 | //logger.info("Finished loading dictionaries"); 155 | return collocationDictionary; 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/de/uni_mannheim/utils/phrase/PhraseUtils.java: -------------------------------------------------------------------------------- 1 | package de.uni_mannheim.utils.phrase; 2 | 3 | import de.uni_mannheim.clausie.phrase.Phrase; 4 | import de.uni_mannheim.constant.SEPARATOR; 5 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 6 | import it.unimi.dsi.fastutil.objects.ObjectArrayList; 7 | 8 | /** 9 | * @author Kiril Gashteovski 10 | */ 11 | public class PhraseUtils { 12 | /** 13 | * Given a list of phrases, return their words concatenated into one string. 14 | * @param phraseList: list of phrases 15 | * @return string (words from the phrase list concatenated) 16 | */ 17 | public static String listOfPhrasesToString(ObjectArrayList<Phrase> phraseList){ 18 | StringBuffer sb = new StringBuffer(); 19 | for (Phrase phrase: phraseList){ 20 | sb.append(phrase.getWords()); 21 | sb.append(SEPARATOR.SPACE); 22 | } 23 | return sb.toString().trim(); 24 | } 25 | 26 | /** 27 | * Given a list of annotated phrases, return their words concatenated into one string. 28 | * @param phraseList: list of phrases 29 | * @return string (words from the phrase list concatenated) 30 | */ 31 | public static String listOfAnnotatedPhrasesToString(ObjectArrayList<AnnotatedPhrase> phraseList){ 32 | StringBuffer sb = new StringBuffer(); 33 | for (AnnotatedPhrase aPhrase: phraseList){ 34 | sb.append(aPhrase.getWords()); 35 | sb.append(SEPARATOR.SPACE); 36 | } 37 | return sb.toString().trim(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/tests/minie/Demo.java: -------------------------------------------------------------------------------- 1 | package tests.minie; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.DataInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.InputStreamReader; 8 | 9 | import org.python.util.PythonInterpreter; 10 | 11 | import de.uni_mannheim.minie.MinIE; 12 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 13 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 14 | 15 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 16 | import edu.stanford.nlp.semgraph.SemanticGraph; 17 | 18 | 19 | 20 | /** 21 | * @author Kiril Gashteovski 22 | * @author Yide Song 23 | */ 24 | public class Demo { 25 | public static void main(String args[]) throws IOException, InterruptedException { 26 | 27 | 28 | 29 | // Dependency parsing pipeline initialization 30 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 31 | 32 | // Input sentence 33 | String sentence = "Both taxa are produced in abundance by a variety of coniferous plants, and are typical in the Paleogene of the northern UK and Greenland region (Boulter and Manum 1989; Jolley and Whitham 2004; Jolley and Morton 2007), as well as mid-latitude North America (Smith et al., 2007)
and Arctic Canada (Greenwood and Basinger, 1993)."; 34 | 35 | 36 | 37 | // Generate the extractions (with SAFE mode) 38 | MinIE minie = new MinIE(sentence, parser, MinIE.Mode.SAFE); 39 | System.out.println("New Sentence: " + minie.getNewSentence()); 40 | 41 | // Print the extractions 42 | System.out.println("\nInput sentence: " + sentence); 43 | System.out.println("============================="); 44 | System.out.println("Extractions:"); 45 | for (AnnotatedProposition ap: minie.getPropositions()) { 46 | System.out.println("\tTriple: " + ap.getTripleAsString()); 47 | System.out.print("\tFactuality: " + ap.getFactualityAsString()); 48 | if(ap.getCitePolarity() != null && ap.getCitePurpose() != null){ 49 | System.out.print("\tCite: " + ap.getCiteAsString()); 50 | } 51 | if (ap.getAttribution().getAttributionPhrase() != null) 52 | System.out.print("\tAttribution: " + ap.getAttribution().toStringCompact()); 53 | else 54 | System.out.print("\tAttribution: NONE"); 55 | System.out.println("\n\t----------"); 56 | } 57 | 58 | System.out.println("\n\nDONE!"); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/tests/minie/DetectCitationDemo.java: -------------------------------------------------------------------------------- 1 | package tests.minie; 2 | 3 | import java.io.*; 4 | import org.python.util.PythonInterpreter; 5 | 6 | import de.uni_mannheim.minie.MinIE; 7 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 8 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 9 | 10 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | 13 | 14 | /** 15 | * @author Kiril Gashteovski 16 | */ 17 | public class DetectCitationDemo { 18 | public static void main(String args[]) throws IOException, InterruptedException { 19 | // Dependency parsing pipeline initialization 20 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 21 | 22 | 23 | PrintWriter pw = null; 24 | try { 25 | pw = new PrintWriter(new File("CitationSentences.csv")); 26 | } catch (FileNotFoundException e) { 27 | e.printStackTrace(); 28 | } 29 | 30 | StringBuilder builder = new StringBuilder(); 31 | builder.append("id"); 32 | builder.append(','); 33 | builder.append("Input Sentence"); 34 | builder.append(','); 35 | builder.append("New Sentence"); 36 | builder.append('\n'); 37 | 38 | File file = new File("SVM_model/Evaluation_Data/oa_200randsents.txt"); 39 | 40 | BufferedReader br = null; 41 | try { 42 | br = new BufferedReader(new FileReader(file)); 43 | } catch (FileNotFoundException e) { 44 | // TODO Auto-generated catch block 45 | e.printStackTrace(); 46 | } 47 | 48 | String st; 49 | int id = 0; 50 | try { 51 | while ((st = br.readLine()) != null){ 52 | // strip the sentence id prefix from the line
53 | st = removeId(st); 54 | MinIE minie = new MinIE(st, parser, MinIE.Mode.SAFE); 55 | String nst = minie.getNewSentence(); 56 | 57 | if (minie.isCitation()){ 58 | System.out.println("This sentence is a citation sentence: " + st); 59 | String handleStr = st; // CSV-escape the original sentence 60 | if(st.contains(",")){ 61 | if(st.contains("\"")){ 62 | handleStr = st.replace("\"", "\"\""); 63 | } 64 | handleStr = "\"" + handleStr + "\""; 65 | } 66 | 67 | String handleStr2 = nst; // CSV-escape the new sentence 68 | if(nst.contains(",")){ 69 | if(nst.contains("\"")){ 70 | handleStr2 = nst.replace("\"", "\"\""); 71 | } 72 | handleStr2 = "\"" + handleStr2 + "\""; 73 | } 74 | System.out.println(handleStr); 75 | builder.append(id); 76 | builder.append(','); 77 | builder.append(handleStr); 78 | builder.append(','); 79 | builder.append(handleStr2); 80 | builder.append(','); 81 | builder.append(minie.getCitePolarity()); 82 | builder.append(','); 83 | builder.append(minie.getCitePurpose()); 84 | builder.append('\n'); 85 | id++; 86 | 87 | for (AnnotatedProposition ap: minie.getPropositions()) { 88 | builder.append(','); 89 | builder.append(ap.getTripleAsString()); 90 | builder.append(','); 91 | builder.append(ap.getFactualityAsString()); 92 | builder.append(','); 93 | if (ap.getAttribution().getAttributionPhrase() != null) { 94 | builder.append(ap.getAttribution().toStringCompact()); 95 | }else{ 96 | builder.append("NONE"); 97 | } 98 | builder.append('\n'); 99 | } 100 | builder.append('\n'); 101 | } 102 | } 103 | } catch (IOException e) { 104 | // TODO Auto-generated catch block 105 | e.printStackTrace(); 106 | } 107 | 108 | pw.write('\ufeff'); 109 | pw.write(builder.toString()); 110 | pw.close(); 111 | System.out.println("done!"); 112 | 113 | //runMinIE(); 114 | } 115 | 116 | public static String removeId (String sentence){ 117 | sentence = sentence.replaceAll("S[0-9A-Z]*\\:[0-9]*", ""); 118 | return sentence; 119 | } 120 | 121 | public static void runMinIE() { 122 | // Dependency parsing pipeline initialization 123 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 124 | 125 | // Input sentence 126 | String sentence = "The Joker believes that the hero Batman was not actually born in foggy Gotham City (Walters, 1994)."; 127 | 128 | // Generate the extractions (with SAFE mode) 129 | MinIE minie = new MinIE(sentence, parser, MinIE.Mode.SAFE); 130 | 131 | // Print the extractions 132 | System.out.println("\nInput sentence: " + sentence); 133 | System.out.println("============================="); 134 | System.out.println("Extractions:"); 135 | for (AnnotatedProposition ap: minie.getPropositions()) { 136 | System.out.println("\tTriple: " + ap.getTripleAsString()); 137 | System.out.print("\tFactuality: " + ap.getFactualityAsString()); 138 | if(ap.getCitePolarity() != null && ap.getCitePurpose() != null){ 139 | System.out.print("\tCite: " + ap.getCiteAsString()); 140 | } 141 | if (ap.getAttribution().getAttributionPhrase() != null) 142 | System.out.print("\tAttribution: " + ap.getAttribution().toStringCompact()); 143 | else 144 | System.out.print("\tAttribution: NONE"); 145 | System.out.println("\n\t----------"); 146 | } 147 | 148 | System.out.println("\n\nDONE!"); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/main/java/tests/minie/OriginalMinIE.java: -------------------------------------------------------------------------------- 1 | package tests.minie; 2 | 3 | import java.io.*; 4 | import org.python.util.PythonInterpreter; 5 | 6 | import de.uni_mannheim.minie.MinIE; 7 | import
de.uni_mannheim.minie.annotation.AnnotatedProposition; 8 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 9 | 10 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | 13 | 14 | 15 | /** 16 | * @author Kiril Gashteovski 17 | */ 18 | public class OriginalMinIE { 19 | public static void main(String args[]) throws IOException, InterruptedException { 20 | // Dependency parsing pipeline initialization 21 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 22 | 23 | 24 | PrintWriter pw = null; 25 | try { 26 | pw = new PrintWriter(new File("OriginalMinIE.csv")); 27 | } catch (FileNotFoundException e) { 28 | e.printStackTrace(); 29 | } 30 | 31 | StringBuilder builder = new StringBuilder(); 32 | builder.append("id"); 33 | builder.append(','); 34 | builder.append("Input Sentence"); 35 | builder.append('\n'); 36 | 37 | File file = new File("SVM_model/Evaluation_Data/oa_200randsents.txt"); 38 | 39 | BufferedReader br = null; 40 | try { 41 | br = new BufferedReader(new FileReader(file)); 42 | } catch (FileNotFoundException e) { 43 | // TODO Auto-generated catch block 44 | e.printStackTrace(); 45 | } 46 | 47 | String st; 48 | int id = 0; 49 | try { 50 | while ((st = br.readLine()) != null){ 51 | // strip the sentence id prefix from the line 52 | st = removeId(st); 53 | MinIE minie = new MinIE(st, parser, MinIE.Mode.SAFE); 54 | 55 | String handleStr = st; // CSV-escape the input sentence 56 | if(st.contains(",")){ 57 | if(st.contains("\"")){ 58 | handleStr = st.replace("\"", "\"\""); 59 | } 60 | handleStr = "\"" + handleStr + "\""; 61 | } 62 | 63 | builder.append(id); 64 | builder.append(','); 65 | builder.append(handleStr); 66 | builder.append('\n'); 67 | id++; 68 | 69 | for (AnnotatedProposition ap: minie.getPropositions()) { 70 | builder.append(','); 71 | builder.append(ap.getTripleAsString()); 72 | builder.append(','); 73 | builder.append(ap.getFactualityAsString()); 74 | builder.append(','); 75 | if (ap.getAttribution().getAttributionPhrase() != null) { 76 | builder.append(ap.getAttribution().toStringCompact()); 77 | }else{ 78 | builder.append("NONE"); 79 | } 80 | builder.append('\n'); 81 | } 82 | builder.append('\n'); 83 | } 84 | 85 | } catch (IOException e) { 86 | // TODO Auto-generated catch block 87 | e.printStackTrace(); 88 | } 89 | 90 | pw.write('\ufeff'); 91 | pw.write(builder.toString()); 92 | pw.close(); 93 | System.out.println("done!"); 94 | 95 | //runMinIE(); 96 | } 97 | 98 | public static String removeId (String sentence){ 99 | sentence = sentence.replaceAll("S[0-9A-Z]*\\:[0-9]*", ""); 100 | return sentence; 101 | } 102 | 103 | public static void runMinIE() { 104 | // Dependency parsing pipeline initialization 105 | StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 106 | 107 | // Input sentence 108 | String sentence = "The Joker believes that the hero Batman was not actually born in foggy Gotham City (Walters, 1994)."; 109 | 110 | // Generate the extractions (with SAFE mode) 111 | MinIE minie = new MinIE(sentence, parser, MinIE.Mode.SAFE); 112 | 113 | // Print the extractions 114 | System.out.println("\nInput sentence: " + sentence); 115 | System.out.println("============================="); 116 | System.out.println("Extractions:"); 117 | for (AnnotatedProposition ap: minie.getPropositions()) { 118 | System.out.println("\tTriple: " + ap.getTripleAsString()); 119 | System.out.print("\tFactuality: " + ap.getFactualityAsString()); 120 | if(ap.getCitePolarity() != null && ap.getCitePurpose() != null){ 121 | System.out.print("\tCite: " + ap.getCiteAsString()); 122 | }
123 | if (ap.getAttribution().getAttributionPhrase() != null) 124 | System.out.print("\tAttribution: " + ap.getAttribution().toStringCompact()); 125 | else 126 | System.out.print("\tAttribution: NONE"); 127 | System.out.println("\n\t----------"); 128 | } 129 | 130 | System.out.println("\n\nDONE!"); 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/Fact.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | public class Fact { 4 | 5 | public String subject = null; 6 | public String predicate = null; 7 | public String object = null; 8 | 9 | public Fact(String s, String p, String o) { 10 | this.subject = s; 11 | this.predicate = p; 12 | this.object = o; 13 | } 14 | 15 | public String getSubject() { 16 | return subject; 17 | } 18 | 19 | public void setSubject(String subject) { 20 | this.subject = subject; 21 | } 22 | 23 | public String getPredicate() { 24 | return predicate; 25 | } 26 | 27 | public void setPredicate(String predicate) { 28 | this.predicate = predicate; 29 | } 30 | 31 | public String getObject() { 32 | return object; 33 | } 34 | 35 | public void setObject(String object) { 36 | this.object = object; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/FactsBean.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import java.util.List; 4 | 5 | import javax.xml.bind.annotation.XmlRootElement; 6 | 7 | @XmlRootElement 8 | public class FactsBean { 9 | 10 | public List<Fact> facts; 11 | 12 | public FactsBean(List<Fact> facts) { 13 | this.facts = facts; 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/FactsResource.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | 6 | import javax.ws.rs.POST; 7 | import javax.ws.rs.Path; 8 | import javax.ws.rs.Produces; 9 | import javax.ws.rs.core.MediaType; 10 | 11 | import de.uni_mannheim.minie.MinIE; 12 | import de.uni_mannheim.minie.annotation.AnnotatedPhrase; 13 | import de.uni_mannheim.minie.annotation.AnnotatedProposition; 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | 16 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 17 | 18 | @Path("/query") 19 | public class FactsResource { 20 | 21 | private static final StanfordCoreNLP parser = CoreNLPUtils.StanfordDepNNParser(); 22 | 23 | @POST 24 | @Produces({MediaType.APPLICATION_JSON}) 25 | public FactsBean query(String sentence) { 26 | MinIE minie = new MinIE(sentence, FactsResource.parser, MinIE.Mode.SAFE); 27 | 28 | List<Fact> facts = new ArrayList<>(); 29 | 30 | for (AnnotatedProposition ap: minie.getPropositions()) { 31 | List<AnnotatedPhrase> triple = ap.getTriple(); 32 | 33 | String s = triple.get(0).toString(); 34 | String p = triple.get(1).toString(); 35 | String o = triple.get(2).toString(); 36 | 37 | Fact fact = new Fact(s, p, o); 38 | facts.add(fact); 39 | } 40 | 41 | return new FactsBean(facts); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/Main.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import java.io.IOException; 4 | import
java.net.URI; 5 | import java.util.logging.Level; 6 | import java.util.logging.Logger; 7 | 8 | 9 | import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory; 10 | import org.glassfish.jersey.server.ResourceConfig; 11 | 12 | import org.glassfish.grizzly.http.server.HttpServer; 13 | 14 | import de.uni_mannheim.utils.coreNLP.CoreNLPUtils; 15 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 16 | 17 | public class Main { 18 | 19 | private static final URI BASE_URI = URI.create("http://localhost:8080/minie/"); 20 | 21 | public static void main(String[] args) { 22 | try { 23 | System.out.println("MinIE Service"); 24 | 25 | final HttpServer server = GrizzlyHttpServerFactory 26 | .createHttpServer(BASE_URI, create(), false); 27 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { 28 | @Override 29 | public void run() { 30 | server.shutdownNow(); 31 | } 32 | })); 33 | server.start(); 34 | 35 | System.out.println(String.format("Application started.%n" + 36 | "Stop the application using CTRL+C")); 37 | 38 | Thread.currentThread().join(); 39 | } catch (IOException | InterruptedException ex) { 40 | Logger.getLogger(Main.class.getName()).log(Level.SEVERE, null, ex); 41 | } 42 | 43 | } 44 | 45 | public static ResourceConfig create() { 46 | return new MinIEService(); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/uk/ac/ucl/cs/mr/MinIEService.java: -------------------------------------------------------------------------------- 1 | package uk.ac.ucl.cs.mr; 2 | 3 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 4 | import org.glassfish.jersey.jackson.JacksonFeature; 5 | import org.glassfish.jersey.server.ResourceConfig; 6 | 7 | public class MinIEService extends ResourceConfig { 8 | 9 | public MinIEService() { 10 | super(FactsResource.class, JacksonFeature.class); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/clausie.conf: -------------------------------------------------------------------------------- 1 | conservativeSVA = true 2 | conservativeSVOA = false 3 | processCcAllVerbs = true 4 | processCcNonVerbs = true 5 | processAppositions = true 6 | appositionVerb = is 7 | processPossessives = true 8 | processPartmods = true 9 | possessiveVerb = has 10 | lemmatize = false 11 | nary = false 12 | minOptionalArgs = 0 13 | maxOptionalArgs = 1 14 | 15 | dictCopular = /clausie-resources/dict-copular.txt 16 | dictExtCopular = /clausie-resources/dict-ext-copular.txt 17 | dictNotExtCopular = /clausie-resources/dict-not-ext-copular.txt 18 | dictComplexTransitive = /clausie-resources/dict-complex-transitive.txt 19 | dictAdverbsConj = /clausie-resources/dict-adverbs-conj.txt 20 | dictAdverbsIgnore = /clausie-resources/dict-adverbs-ignore.txt 21 | dictAdverbsInclude = /clausie-resources/dict-adverbs-include.txt -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-adverbs-conj.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gkiril/MinSCIE/ea6b53003f65e2043dc46ebddbe8460e2a2b7dbc/src/main/resources/clausie-resources/dict-adverbs-conj.txt -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-adverbs-ignore.txt: -------------------------------------------------------------------------------- 1 | so 2 | then 3 | thus 4 | 
why 5 | as 6 | even -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-adverbs-include.txt: -------------------------------------------------------------------------------- 1 | hardly 2 | barely 3 | scarcely 4 | seldom 5 | rarely -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-complex-transitive.txt: -------------------------------------------------------------------------------- 1 | bring 2 | catch 3 | drive 4 | get 5 | keep 6 | lay 7 | lead 8 | place 9 | put 10 | set 11 | sit 12 | show 13 | stand 14 | slip 15 | take -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-copular.txt: -------------------------------------------------------------------------------- 1 | act 2 | appear 3 | be 4 | become 5 | come 6 | come out 7 | end up 8 | get 9 | go 10 | grow 11 | fall 12 | feel 13 | keep 14 | leave 15 | look 16 | prove 17 | remain 18 | seem 19 | smell 20 | sound 21 | stay 22 | taste 23 | turn 24 | turn up 25 | wind up -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-ext-copular.txt: -------------------------------------------------------------------------------- 1 | act 2 | appear 3 | be 4 | become 5 | come 6 | come out 7 | end up 8 | get 9 | go 10 | grow 11 | fall 12 | feel 13 | keep 14 | leave 15 | look 16 | prove 17 | remain 18 | seem 19 | smell 20 | sound 21 | stay 22 | taste 23 | turn 24 | turn up 25 | wind up 26 | live 27 | come 28 | go 29 | stand 30 | lie 31 | love 32 | do 33 | try -------------------------------------------------------------------------------- /src/main/resources/clausie-resources/dict-not-ext-copular.txt: -------------------------------------------------------------------------------- 1 | die 2 | walk -------------------------------------------------------------------------------- /src/main/resources/minie-resources/certainty-verbs.dict: -------------------------------------------------------------------------------- 1 | say 2 | add 3 | claim 4 | write 5 | publish 6 | know 7 | remember 8 | learn 9 | discover 10 | forget 11 | admit 12 | prove 13 | show 14 | explain 15 | confirm 16 | acknowledge 17 | recall -------------------------------------------------------------------------------- /src/main/resources/minie-resources/certainty-words.dict: -------------------------------------------------------------------------------- 1 | certainly 2 | surely 3 | definitely 4 | undoubtedly 5 | clearly 6 | obviously -------------------------------------------------------------------------------- /src/main/resources/minie-resources/neg-adverbs.dict: -------------------------------------------------------------------------------- 1 | not 2 | never 3 | n't -------------------------------------------------------------------------------- /src/main/resources/minie-resources/neg-determiners.dict: -------------------------------------------------------------------------------- 1 | no 2 | non -------------------------------------------------------------------------------- /src/main/resources/minie-resources/neg-words.dict: -------------------------------------------------------------------------------- 1 | no 2 | not 3 | never 4 | non 5 | n't -------------------------------------------------------------------------------- /src/main/resources/minie-resources/non-subsective-adjectives-cf.dict: 
-------------------------------------------------------------------------------- 1 | anti- 2 | anti 3 | fabricated 4 | fake 5 | fictional 6 | fictitious 7 | imaginary 8 | mythical 9 | phony 10 | false 11 | artificial 12 | erroneous 13 | mistaken 14 | mock 15 | pseudo- 16 | pseudo 17 | simulated 18 | spurious 19 | unsuccessful 20 | counterfeit 21 | deputy 22 | faulty 23 | virtual -------------------------------------------------------------------------------- /src/main/resources/minie-resources/non-subsective-adjectives-modal.dict: -------------------------------------------------------------------------------- 1 | alleged 2 | believed 3 | debatable 4 | disputed 5 | dubious 6 | hypothetical 7 | impossible 8 | improbable 9 | plausible 10 | putative 11 | questionable 12 | so-called 13 | supposed 14 | suspicious 15 | theoretical 16 | uncertain 17 | unlikely 18 | would-be 19 | doubtful 20 | apparent 21 | arguable 22 | assumed 23 | likely 24 | ostensible 25 | possible 26 | potential 27 | predicted 28 | presumed 29 | probable 30 | seeming -------------------------------------------------------------------------------- /src/main/resources/minie-resources/non-subsective-adjectives-temp.dict: -------------------------------------------------------------------------------- 1 | erstwhile 2 | ex- 3 | ex 4 | expected 5 | former 6 | future 7 | historic 8 | onetime 9 | past 10 | proposed -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-adj.dict: -------------------------------------------------------------------------------- 1 | likely 2 | probable 3 | possible -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-adverbs.dict: -------------------------------------------------------------------------------- 1 | probably 2 | possibly 3 | perhaps 4 | generally 5 | likely 6 | unsure 7 | presumably 8 | apparently 9 | seemingly 10 | probable 11 | possible 12 | maybe -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-modal.dict: -------------------------------------------------------------------------------- 1 | might 2 | may 3 | could 4 | can 5 | would 6 | should 7 | shall 8 | must 9 | will 10 | 'll -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-neg-words.dict: -------------------------------------------------------------------------------- 1 | unlikely 2 | unlike 3 | improbable 4 | unbelievable 5 | unbelievably -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-verbs.dict: -------------------------------------------------------------------------------- 1 | think 2 | consider 3 | guess 4 | predict 5 | suggest 6 | believe 7 | doubt 8 | wonder 9 | ask 10 | speculate 11 | theorize 12 | theorise 13 | hypothesize 14 | hypothesise 15 | conjecture 16 | suspect -------------------------------------------------------------------------------- /src/main/resources/minie-resources/poss-words.dict: -------------------------------------------------------------------------------- 1 | probably 2 | possibly 3 | perhaps 4 | generally 5 | likely 6 | unsure 7 | presumably 8 | apparently 9 | seemingly 10 | probable 11 | possible 12 | maybe -------------------------------------------------------------------------------- 
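These `.dict` resources are plain word lists consumed by `de.uni_mannheim.utils.Dictionary` (one entry per line, as shown in its `load` method above). A small loading sketch (not part of the repository; it assumes `minie-resources` is on the classpath, as it is for the default dictionaries in `de.uni_mannheim.utils.minie.Utils`):

```
import java.io.IOException;

import de.uni_mannheim.utils.Dictionary;

// Hypothetical snippet, not part of MinSCIE: load the negative-words dictionary
// from the classpath resources and run a membership check.
class DictLoadSketch {
    public static void main(String[] args) throws IOException {
        Dictionary negWords = new Dictionary("/minie-resources/neg-words.dict");
        System.out.println(negWords.size());          // 5 entries: no, not, never, non, n't
        System.out.println(negWords.contains("not")); // true
    }
}
```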
/src/main/resources/minie-resources/quantities-adjectives.dict: -------------------------------------------------------------------------------- 1 | many -------------------------------------------------------------------------------- /src/main/resources/minie-resources/quantities-determiners.dict: -------------------------------------------------------------------------------- 1 | some 2 | all 3 | any 4 | each 5 | every 6 | half 7 | many --------------------------------------------------------------------------------
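The REST endpoint defined in `uk.ac.ucl.cs.mr.FactsResource` can be exercised once the Grizzly server from `uk.ac.ucl.cs.mr.Main` is running at `http://localhost:8080/minie/`. A minimal client sketch (not part of the repository; it assumes the service is up and uses only the JDK's `HttpURLConnection`):

```
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

// Hypothetical client, not part of MinSCIE: POSTs one sentence to the running
// MinIE service and prints the JSON reply.
public class QueryClientSketch {
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://localhost:8080/minie/query");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        String sentence = "The Joker believes that the hero Batman was not actually born in foggy Gotham City (Walters, 1994).";
        try (OutputStream out = conn.getOutputStream()) {
            out.write(sentence.getBytes(StandardCharsets.UTF_8));
        }
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}
```

The response is the JSON serialization of `FactsBean`, i.e. a `facts` array whose elements carry the subject, predicate, and object strings of each extracted triple.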