├── CONFIG.txt ├── LICENSE ├── README.md ├── all_predictions_4.0 ├── code │ ├── convertOutputArgs.py │ ├── convertOutputTriggers.py │ ├── readLargeInput.py │ ├── readLargeInput.pyc │ ├── writeArgGold.py │ ├── writeArgLiblinear.py │ ├── writeRealisGold.py │ ├── writeRealisLiblinear.py │ ├── writeTriggerGold.py │ └── writeTriggerLiblinear.py ├── runAll.sh ├── runAll_providedTriggers.sh ├── runArguments.sh ├── runArguments_providedTriggers.sh ├── runRealis.sh ├── runRealis_providedTriggers.sh └── runTriggers.sh ├── config.py ├── outputFormatting ├── Chinese_run.sh ├── English_run.sh ├── Spanish_run.sh ├── argument_nugget_linking.py ├── finalForm_KBP.py ├── formatTriggers │ ├── format_andrew_triggers │ │ └── format_andrew.py │ ├── format_hector_triggers │ │ └── format_hector.py │ └── format_jun_triggers │ │ └── format_jun.py ├── out │ ├── cleanStore.sh │ ├── mergeSubmissions │ │ └── mergeSubmissions.py │ ├── mergeSubmissions_coreference │ │ ├── argument_nugget_linking.py │ │ └── mergeSubmissions.py │ └── moveToStore.sh ├── stopwords.txt ├── writeDocMap.py └── writeTriggerOutput.py └── preprocessing_2.0 ├── CoreNLP_scripts ├── StanfordCoreNLP-chinese.properties.simple ├── StanfordCoreNLP-spanish.properties.simple ├── prefixLines.py ├── prepareCoreNLP_input.py ├── runCoreNLP_Chn.sh ├── runCoreNLP_Eng.sh └── runCoreNLP_Span.sh ├── MaltParser_scripts ├── convertToCoNLL.py └── convertToParsingFile.py ├── createSetFiles └── writeDataFromFiles.py ├── entityExtraction ├── code │ ├── addEntitiesToText.py │ ├── addEntitiesToText.sh │ └── unify │ │ ├── processEntities.py │ │ └── unifyEntities.py ├── convertTestSet.py ├── runEntities.sh ├── runEntities_Chinese.sh └── runEntities_Spanish.sh ├── processChinese.sh ├── processEnglish.sh ├── processSpanish.sh └── readCoreNLP ├── convertCoreNLPFormat.py ├── convertCoreNLPFormat.sh ├── getRootnames.py ├── read_CoreNLP_XML.py └── write_parsing_from_CoreNLP.py /CONFIG.txt: -------------------------------------------------------------------------------- 1 | WORD_EMBEDDING_DIR=/home/andrew/DEFT_code_testing/dependencies/wordVectors 2 | CORENLP_DIR=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 3 | MALTPARSER_DIR=/home/andrew/NLP_tools/MaltParser/maltparser-1.9.0 4 | NER_DIR=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 5 | MODEL_DIR=/home/andrew/DEFT_code_testing/dependencies/models 6 | LIBLINEAR_DIR=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 7 | POS_DIR=/home/andrew/DEFT_code_testing/dependencies/pos 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 ahsi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | CMU Multilingual Event Extractor 2 | =============================== 3 | 4 | Requirements: 5 | ------------- 6 | - Python 2 7 | - Stanford CoreNLP 8 | - MaltParser 9 | - LIBLINEAR 10 | - Stanford NER 11 | - Model Files (available at http://cairo.lti.cs.cmu.edu/~ahsi/CMUCS_multilingual_event_extraction_models/CMUCS_Multilingual_Event_Extractor_models.tar.gz) 12 | 13 | Installation: 14 | ------------- 15 | Modify the CONFIG.txt file to point to the directories containing the required software and model files, then run "python config.py" from this directory. 16 | 17 | Usage: 18 | ------ 19 | Usage is split by language. Although all model files use multilingual training, each language must be processed separately at test time, because the preprocessing steps differ across languages. 20 | 21 | To run the code, cd into "preprocessing_2.0", then run one of the following: 22 | 23 | - "./processEnglish.sh FILELIST" 24 | - "./processChinese.sh FILELIST" 25 | - "./processSpanish.sh FILELIST" 26 | 27 | where FILELIST is a file listing the raw text files to be processed, one absolute path per line. 28 | 29 | Output files are stored in outputFormatting/out/store/, which contains the following subdirectories: 30 | 31 | arguments/ 32 | corpusLinking/ 33 | linking/ 34 | nuggets/ 35 | 36 | The overall output format closely matches that of the 2016 TAC KBP Event Argument Extraction and Linking (EAL) Task, with some slight modifications. Files in the arguments/ subdirectory follow the EAL format exactly, except for two additional columns at the end of each line giving the starting offset and ID of the associated event nugget. Files in the corpusLinking/ and linking/ subdirectories exactly match the EAL Task specifications. Files in the nuggets/ directory exactly match the format of the 2016 TAC KBP Event Nugget Detection Task. 37 |
-------------------------------------------------------------------------------- /all_predictions_4.0/code/convertOutputArgs.py: --------------------------------------------------------------------------------
1 | # script to convert the liblinear output files to work with my evaluation script 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 5: 6 | print "Expect roles dict, prediction file, easy reading file, output file."
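# Example invocation (hypothetical filenames, mirroring the calls in runArguments.sh / runRealis.sh):
#   python convertOutputArgs.py roles.dict output.test.arguments test.out.easyRead testSet.predictions
# sys.argv[1] is the "ROLE:ID" dictionary written at training time, sys.argv[2] the raw liblinear
# prediction file (an optional leading "labels ..." header is skipped below), sys.argv[3] the
# tab-separated .easyRead file, and sys.argv[4] the output, written as one
# "text|||sent_N|||ROLE" line per prediction.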
7 | sys.exit() 8 | 9 | predictions = [] 10 | labelOnly = True 11 | input = open(sys.argv[2], "r") 12 | for line in input: 13 | if line.startswith("labels"): 14 | labelOnly = False 15 | continue 16 | if labelOnly: 17 | predictions.append(line.strip()) 18 | else: 19 | temp = line.split(" ")[0] 20 | predictions.append(temp) 21 | input.close() 22 | 23 | roleDict = dict() 24 | input = open(sys.argv[1], "r") 25 | for line in input: 26 | tokens = line.strip().split(":") 27 | roleDict[tokens[1]] = tokens[0] 28 | input.close() 29 | 30 | input = open(sys.argv[3], "r") 31 | output = open(sys.argv[4], "w") 32 | index = 0 33 | for line in input: 34 | tokens = line.strip().split("\t") 35 | sentStr = tokens[0] 36 | text = tokens[2] 37 | 38 | predictedRole = predictions[index] 39 | 40 | output.write(text + "|||" + sentStr + "|||" + roleDict[predictedRole] + "\n") 41 | index += 1 42 | input.close() 43 | output.close() 44 | 45 | main() 46 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/convertOutputTriggers.py: -------------------------------------------------------------------------------- 1 | # script to convert the liblinear output files to work with my evaluation script 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 5: 6 | print "Expect roles dict, prediction file, easy reading file, output file." 7 | sys.exit() 8 | 9 | predictions = [] 10 | input = open(sys.argv[2], "r") 11 | for line in input: 12 | predictions.append(line.strip()) 13 | input.close() 14 | 15 | roleDict = dict() 16 | input = open(sys.argv[1], "r") 17 | for line in input: 18 | tokens = line.strip().split(":") 19 | roleDict[tokens[1]] = tokens[0] 20 | input.close() 21 | 22 | input = open(sys.argv[3], "r") 23 | output = open(sys.argv[4], "w") 24 | index = 0 25 | 26 | for index in range(len(predictions)): 27 | predictedRole = roleDict[predictions[index]] 28 | output.write(predictedRole + "\n") 29 | input.close() 30 | output.close() 31 | 32 | main() 33 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/readLargeInput.py: -------------------------------------------------------------------------------- 1 | # file to contain the most up-to-date version of readInput() 2 | 3 | class Sentence: 4 | def __init__(self, wordsArg, lemmasArg, labelsArg, posTagsArg, entityArg, goldArgParam, docIDParam, startParam, endParam, realisArg, allStarts=[]): 5 | self.words = wordsArg 6 | self.lemmas = lemmasArg 7 | self.labels = labelsArg 8 | self.posTags = posTagsArg 9 | self.entities = entityArg 10 | self.goldArgs = goldArgParam 11 | self.depByGovIndex = dict() 12 | self.depByDepIndex = dict() 13 | 14 | self.startOffset = startParam 15 | self.endOffset = endParam 16 | 17 | self.docID = docIDParam 18 | 19 | self.realisLabels = realisArg 20 | 21 | self.offsets = allStarts 22 | 23 | def addDependency(self, dep): 24 | gIndex = dep.gIndex 25 | dIndex = dep.dIndex 26 | 27 | if gIndex not in self.depByGovIndex: 28 | self.depByGovIndex[gIndex] = set() 29 | if dIndex not in self.depByDepIndex: 30 | self.depByDepIndex[dIndex] = set() 31 | 32 | self.depByGovIndex[gIndex].add(dep) 33 | self.depByDepIndex[dIndex].add(dep) 34 | 35 | class Dependency: 36 | def __init__(self, depTypeArg, governorArg, gIndexArg, dependentArg, dIndexArg): 37 | self.depType = depTypeArg 38 | self.governor = governorArg 39 | self.gIndex = gIndexArg 40 | self.dependent = dependentArg 41 | self.dIndex = dIndexArg 42 | 43 | class ArgumentParse: 44 | def __init__(self, 
beginArg, roleArg, triggerArg, triggerIndexArg): 45 | self.begin = beginArg 46 | self.role = roleArg 47 | self.triggerText = triggerArg 48 | self.triggerIndex = triggerIndexArg 49 | 50 | class Argument: 51 | def __init__(self, textParam, roleParam, associatedIndexesParam, triggerTextParam, triggerIndexParam): 52 | self.text = textParam 53 | self.role = roleParam 54 | self.associatedIndexes = associatedIndexesParam 55 | self.triggerText = triggerTextParam 56 | self.triggerIndex = triggerIndexParam 57 | 58 | def minIndex(self): 59 | minVal = -1 60 | for index in self.associatedIndexes: 61 | if index < minVal or minVal == -1: 62 | minVal = index 63 | return minVal 64 | 65 | 66 | # extracts argument info, returns as a list 67 | # FORMAT: "ArgsGold[begin|||ROLE|||triggerText|||triggerStart;]" 68 | def readArguments(line, realisMode): 69 | argList = [] 70 | 71 | start = line.find("[") 72 | 73 | cleaned = line[start+1:] 74 | tokens = cleaned.split(";;;") 75 | for tok in tokens: 76 | if tok != "]": 77 | subparts = tok.split("|||") 78 | if realisMode: 79 | if len(subparts) == 4: 80 | curArg = ArgumentParse(subparts[0], "UNK_REALIS", subparts[2], int(subparts[3])) 81 | else: 82 | curArg = ArgumentParse(subparts[0], subparts[4], subparts[2], int(subparts[3])) 83 | else: 84 | curArg = ArgumentParse(subparts[0], subparts[1], subparts[2], int(subparts[3])) 85 | argList.append(curArg) 86 | 87 | return argList 88 | 89 | # input: list of argument information (one ArgumentParse list per word) 90 | # output: list of Arguments for the sentence 91 | def extractGoldArgs(argLists, words, converter): 92 | outputList = [] 93 | 94 | for index in range(len(argLists)): 95 | curList = argLists[index] 96 | 97 | for parse in curList: 98 | if parse.begin == "B": 99 | textParam, associatedIndexes = extractArgument(argLists, index, parse.role, words, parse.triggerIndex) 100 | 101 | converted = -1 # can be -1 if trigger is not within sentence boundaries (e.g. 
our sentence segmentation is off) 102 | if parse.triggerIndex in converter: 103 | converted = converter[parse.triggerIndex] 104 | 105 | curArg = Argument(textParam, parse.role, associatedIndexes, parse.triggerText, converted) 106 | outputList.append(curArg) 107 | 108 | return outputList 109 | 110 | # extract the words/indexes associated with a particular entity candidate 111 | def extractArgument(argLists, index, role, words, triggerIndex): 112 | text = words[index] 113 | indexes = set() 114 | indexes.add(index) 115 | 116 | altIndex = index + 1 117 | while altIndex < len(words): 118 | curList = argLists[altIndex] 119 | found = False 120 | for parse in curList: 121 | if parse.begin != "I": 122 | continue 123 | if role == parse.role and triggerIndex == parse.triggerIndex: 124 | indexes.add(altIndex) 125 | text += " " + words[altIndex] 126 | found = True 127 | break 128 | if found == False: 129 | break 130 | 131 | altIndex += 1 132 | 133 | return text, indexes 134 | 135 | class EntityParse: 136 | def __init__(self, beginArg, typeArg, subtypeArg, headArg, corefArg): 137 | self.begin = beginArg 138 | self.entType = typeArg 139 | self.subtype = subtypeArg 140 | self.head = headArg 141 | self.corefStr = corefArg 142 | 143 | class Entity: 144 | def __init__(self, textParam, typeParam, subtypeParam, associatedIndexesParam, argRoleParam, argTriggerParam, headParam, corefParam, startParam, endParam): 145 | self.text = textParam 146 | self.entType = typeParam 147 | self.subtype = subtypeParam 148 | self.associatedIndexes = associatedIndexesParam 149 | self.head = headParam 150 | self.corefStr = corefParam 151 | 152 | self.start = startParam 153 | self.end = endParam 154 | 155 | self.argRole = argRoleParam 156 | self.argTrigger = argTriggerParam 157 | 158 | def minIndex(self): 159 | minVal = -1 160 | for index in self.associatedIndexes: 161 | if index < minVal or minVal == -1: 162 | minVal = index 163 | return minVal 164 | 165 | def maxIndex(self): 166 | maxVal = -1 167 | for index in self.associatedIndexes: 168 | if index > maxVal or maxVal == -1: 169 | maxVal = index 170 | return maxVal 171 | 172 | # extracts entity info, returns as a list 173 | # FORMAT: "EntitiesGold[begin|||PER|||Individual|||headWord;]" 174 | def readEntities(line): 175 | entList = [] 176 | 177 | start = line.find("[") 178 | 179 | cleaned = line[start+1:] 180 | tokens = cleaned.split(";;;") 181 | for tok in tokens: 182 | if tok != "]": 183 | subparts = tok.split("|||") 184 | curEnt = EntityParse(subparts[0], subparts[1], subparts[2], subparts[3], subparts[4]) 185 | entList.append(curEnt) 186 | 187 | return entList 188 | 189 | # input: list of entity information (one EntityParse list per word) 190 | # output: list of Entities for the sentence 191 | def extractCandidateArgs(entityLists, words, starts, ends): 192 | outputList = [] 193 | 194 | for index in range(len(entityLists)): 195 | curList = entityLists[index] 196 | 197 | for parse in curList: 198 | if parse.begin == "B": 199 | textParam, associatedIndexes, entStart, entEnd = extractCandidate(entityLists, index, parse.entType, parse.subtype, words, parse.head, starts, ends) 200 | 201 | curEntity = Entity(textParam, parse.entType, parse.subtype, associatedIndexes, "", "", parse.head, parse.corefStr, entStart, entEnd) 202 | outputList.append(curEntity) 203 | 204 | return outputList 205 | 206 | # extract the words/indexes associated with a particular entity candidate 207 | def extractCandidate(entityLists, index, entType, subtype, words, head, starts, ends): 208 | typeName = 
entType + "_" + subtype 209 | text = words[index] 210 | indexes = set() 211 | indexes.add(index) 212 | 213 | entStart = starts[index] 214 | entEnd = ends[index] 215 | 216 | altIndex = index + 1 217 | while altIndex < len(words): 218 | curList = entityLists[altIndex] 219 | found = False 220 | for parse in curList: 221 | if parse.begin != "I": 222 | continue 223 | curName = parse.entType + "_" + parse.subtype 224 | if curName == typeName and parse.head == head: 225 | indexes.add(altIndex) 226 | text += " " + words[altIndex] 227 | found = True 228 | entEnd = ends[altIndex] 229 | break 230 | if found == False: 231 | break 232 | 233 | altIndex += 1 234 | 235 | return text, indexes, entStart, entEnd 236 | 237 | def scanInput(filename, parsingFilename, inputTriggers = None, entityOut = None, realisMode = False): 238 | print "Reading " + filename 239 | input = open(filename, "r") 240 | parsingInput = open(parsingFilename, "r") 241 | 242 | possibleLabels = set() 243 | possibleArgs = set() 244 | possibleArgs.add("NONE") 245 | 246 | words = [] 247 | lemmas = [] 248 | posTags = [] 249 | labels = [] 250 | 251 | docID = "" 252 | 253 | entityInfo = [] 254 | argInfo = [] 255 | 256 | starts = [] 257 | ends = [] 258 | 259 | indexConverter = dict() # converts from character offsets -> within-sentence word indexes 260 | 261 | count = 0 262 | for line in input: 263 | # if empty line 264 | if line.strip() == "": 265 | # if we have data, process and reset 266 | if len(words) > 0: 267 | entCandidates = extractCandidateArgs(entityInfo, words, starts, ends) 268 | 269 | goldArgs = extractGoldArgs(argInfo, words, indexConverter) 270 | for arg in goldArgs: 271 | possibleArgs.add(arg.role) 272 | 273 | words = [] 274 | lemmas = [] 275 | posTags = [] 276 | labels = [] 277 | entityInfo = [] 278 | argInfo = [] 279 | 280 | starts = [] 281 | ends = [] 282 | 283 | realisList = [] 284 | 285 | docID = "" 286 | 287 | indexConverter = dict() 288 | else: 289 | tokens = line.strip().split("\t") 290 | ### How to read input (by token): 291 | ### 0: start index, 1: end index, 2: word, 3: lemma, 4: posTag, 5: docID, 6:gold entities, 7: trigger type, 8: trigger subtype, 9: argument role, 10: trigger realis (optional) 292 | 293 | start = int(tokens[0]) 294 | indexConverter[start] = len(words) 295 | 296 | starts.append(int(tokens[0])) 297 | ends.append(int(tokens[1])) 298 | 299 | curWord = tokens[2] 300 | curPOS = tokens[4] 301 | curLabel = tokens[7] + "_" + tokens[8] 302 | 303 | docID = tokens[5] 304 | 305 | words.append(curWord) 306 | lemmas.append(tokens[3]) 307 | posTags.append(curPOS) 308 | 309 | if inputTriggers != None: 310 | labels.append(inputTriggers[count]) 311 | else: 312 | labels.append(curLabel) 313 | 314 | if len(tokens) >= 11: 315 | realisList.append(tokens[10]) 316 | 317 | curEnt = readEntities(tokens[6]) 318 | entityInfo.append(curEnt) 319 | 320 | curArg = readArguments(tokens[9], realisMode) 321 | argInfo.append(curArg) 322 | 323 | possibleLabels.add(curLabel) 324 | 325 | count += 1 326 | 327 | return possibleLabels, possibleArgs 328 | 329 | def readInput(input, parsingInput, inputTriggers = None, entityOut = None, count=0, realisMode = False): 330 | words = [] 331 | lemmas = [] 332 | posTags = [] 333 | labels = [] 334 | 335 | docID = "" 336 | 337 | entityInfo = [] 338 | argInfo = [] 339 | 340 | starts = [] 341 | ends = [] 342 | 343 | realisList = [] 344 | 345 | indexConverter = dict() # converts from character offsets -> within-sentence word indexes 346 | 347 | eof = False 348 | 349 | sentence = None 350 | 351 | while 
True: 352 | line = input.readline() 353 | eof = line == "" 354 | 355 | # if empty line 356 | if line.strip() == "": 357 | # if we have data, process and reset 358 | if len(words) > 0: 359 | entCandidates = extractCandidateArgs(entityInfo, words, starts, ends) 360 | goldArgs = extractGoldArgs(argInfo, words, indexConverter) 361 | 362 | sentence = Sentence(words, lemmas, labels, posTags, entCandidates, goldArgs, docID, starts[0], ends[len(ends)-1], realisList, starts) 363 | 364 | if entityOut != None: 365 | for ent in entCandidates: 366 | text = ent.text 367 | coref = ent.corefStr + "_" + docID 368 | #print text + "\t" + coref + "\t" + str(ent.start) + "\t" + str(ent.end) 369 | entityOut.write(text + "\t" + coref + "\t" + str(ent.start) + "\t" + str(ent.end) + "\n") 370 | 371 | words = [] 372 | lemmas = [] 373 | posTags = [] 374 | labels = [] 375 | entityInfo = [] 376 | argInfo = [] 377 | 378 | starts = [] 379 | ends = [] 380 | 381 | realisList = [] 382 | 383 | docID = "" 384 | 385 | indexConverter = dict() 386 | else: 387 | tokens = line.strip().split("\t") 388 | ### How to read input (by token): 389 | ### 0: start index, 1: end index, 2: word, 3: lemma, 4: posTag, 5: docID, 6:gold entities, 7: trigger type, 8: trigger subtype, 9: argument role, 10: trigger realis (optional) 390 | 391 | start = int(tokens[0]) 392 | indexConverter[start] = len(words) 393 | 394 | starts.append(int(tokens[0])) 395 | ends.append(int(tokens[1])) 396 | 397 | curWord = tokens[2] 398 | curPOS = tokens[4] 399 | curLabel = tokens[7] + "_" + tokens[8] 400 | 401 | docID = tokens[5] 402 | 403 | words.append(curWord) 404 | lemmas.append(tokens[3]) 405 | posTags.append(curPOS) 406 | 407 | if inputTriggers != None: 408 | labels.append(inputTriggers[count]) 409 | else: 410 | labels.append(curLabel) 411 | 412 | if len(tokens) >= 11: 413 | realisList.append(tokens[10]) 414 | 415 | curEnt = readEntities(tokens[6]) 416 | entityInfo.append(curEnt) 417 | 418 | curArg = readArguments(tokens[9], realisMode) 419 | argInfo.append(curArg) 420 | 421 | count += 1 422 | if eof or sentence != None: 423 | break 424 | 425 | if entityOut != None: 426 | entityOut.close() 427 | 428 | # add dependencies 429 | while True: 430 | line = parsingInput.readline() 431 | clean = line.strip() 432 | 433 | if clean == "": 434 | break 435 | else: 436 | # rare case -- we have a token "|", can't do splitting like normal 437 | if "||||" in clean: 438 | depType, gov, govIndex, dep, depIndex = parseDep_Exception(clean) 439 | else: 440 | tokens = clean.split("|||") 441 | depType = tokens[0] 442 | gov = tokens[1] 443 | govIndex = int(tokens[2]) 444 | dep = tokens[3] 445 | depIndex = int(tokens[len(tokens) - 1]) 446 | ###depIndex = int(tokens[4]) 447 | 448 | # account for off-by-one (in CoreNLP, 0 = Root, rather than first word) 449 | curDependency = Dependency(depType, gov, govIndex - 1, dep, depIndex - 1) 450 | sentence.addDependency(curDependency) 451 | 452 | return sentence, not eof, count 453 | 454 | # method to parse a dependency when one of the words contains "|" at the front or end 455 | def parseDep_Exception(clean): 456 | # depType 457 | tempIndex = clean.find("|||") 458 | depType = clean[:tempIndex] 459 | clean = clean[tempIndex+3:] 460 | 461 | # gov word 462 | tempIndex = clean.find("|||") 463 | altIndex = clean.find("||||") 464 | curWord = "" 465 | while tempIndex == altIndex: 466 | curWord += clean[0] 467 | clean = clean[1:] 468 | 469 | tempIndex = clean.find("|||") 470 | altIndex = clean.find("||||") 471 | curWord += clean[:tempIndex] 472 | gov = 
curWord 473 | clean = clean[tempIndex+3:] 474 | 475 | # govIndex 476 | tempIndex = clean.find("|||") 477 | govIndex = int(clean[:tempIndex]) 478 | clean = clean[tempIndex+3:] 479 | 480 | # dep word 481 | tempIndex = clean.find("|||") 482 | altIndex = clean.find("||||") 483 | curWord = "" 484 | while tempIndex == altIndex: 485 | curWord += clean[0] 486 | clean = clean[1:] 487 | 488 | tempIndex = clean.find("|||") 489 | altIndex = clean.find("||||") 490 | curWord += clean[:tempIndex] 491 | dep = curWord 492 | clean = clean[tempIndex+3:] 493 | 494 | try: 495 | depIndex = int(clean) 496 | except ValueError: 497 | start = clean.rfind("|") 498 | depIndex = int(clean[start+1:]) 499 | 500 | return depType, gov, govIndex, dep, depIndex 501 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/readLargeInput.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahsi/Multilingual_Event_Extraction/eed002e864e16dc06c2b2970267b1465adcf825c/all_predictions_4.0/code/readLargeInput.pyc -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeArgGold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | from nltk.stem.wordnet import WordNetLemmatizer 6 | from readLargeInput import * 7 | 8 | 9 | DEBUG=False 10 | SMALLDEBUG=False 11 | PREDICTION_DEBUG=False 12 | EMPTY_TRIGGER="not_trigger_not_trigger" 13 | EMPTY_ROLE="NONE" 14 | stepSize=1 15 | beamSize=1 16 | maxIters=20 17 | 18 | def processSentence(curSentence, possibleLabels, possibleArgs, output, sentIndex): 19 | writtenSet = set() 20 | for arg in curSentence.goldArgs: 21 | argString = arg.text + "|||sent_" + str(sentIndex) + "|||" + arg.role 22 | if argString not in writtenSet: 23 | output.write(argString + "\n") 24 | writtenSet.add(argString) 25 | 26 | def processEntity(curSentence, entityIndex, possibleArgs, triggerIndex, triggerLabel, output, sentIndex): 27 | curWords = curSentence.words 28 | triggerWord = curWords[triggerIndex] 29 | curEntity = curSentence.entities[entityIndex] 30 | 31 | goldArgRole = "NONE" 32 | 33 | # find what the gold val is for this entity 34 | curGoldArgs = curSentence.goldArgs 35 | foundArg = None 36 | for arg in curGoldArgs: 37 | argText = arg.text 38 | minIndex = arg.minIndex() 39 | argTriggerIndex = arg.triggerIndex 40 | 41 | # if same entity (text and location) and same associated trigger 42 | if argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex and goldArgRole == "NONE": 43 | goldArgRole = arg.role 44 | foundArg = arg 45 | elif argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex: 46 | print "Found duplicate!" 47 | print argText + "\t" + arg.triggerText + "\t" + arg.role 48 | print "Alternate:" 49 | print foundArg.text + "\t" + foundArg.triggerText + "\t" + goldArgRole 50 | sys.exit() 51 | if goldArgRole != EMPTY_ROLE: 52 | output.write(curEntity.text + "|||" + "sent_" + str(sentIndex) + "|||" + goldArgRole + "\n") 53 | 54 | def main(): 55 | if len(sys.argv) != 4: 56 | print "Expect input training data, output args file, output sentences file." 
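# Example call (hypothetical filenames): python writeArgGold.py train.data gold.args gold.sentences
# A dependency file named <input>.parsing is expected next to the input file. The args file gets one
# "text|||sent_N|||ROLE" line per gold argument (duplicates within a sentence are skipped), and the
# sentences file gets each sentence's tokens joined by spaces, one sentence per line.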
57 | sys.exit() 58 | 59 | possibleLabels, possibleArgs = scanInput(sys.argv[1], sys.argv[1] + ".parsing") 60 | 61 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 62 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 63 | 64 | output = open(sys.argv[2], "w") 65 | 66 | # go over each sentence in the training data 67 | # NOTE: in this version, using gold triggers, gold entity mentions 68 | input = open(sys.argv[1], "r") 69 | parsingInput = open(sys.argv[1] + ".parsing", "r") 70 | count = 0 71 | 72 | sentenceOutput = open(sys.argv[3], "w") 73 | while True: 74 | sentence, valid, dummy = readInput(input, parsingInput) 75 | 76 | if count % 1000 == 0: 77 | print "Processing sentence " + str(count) 78 | if DEBUG: 79 | print "Sentence length: " + str(len(sentence.words)) 80 | print "Total entities: " + str(len(sentence.entities)) 81 | 82 | if sentence != None: 83 | processSentence(sentence, possibleLabels, possibleArgs, output, count) 84 | 85 | for word in sentence.words: 86 | sentenceOutput.write(word + " ") 87 | sentenceOutput.write("\n") 88 | 89 | count += 1 90 | if not valid: 91 | break 92 | output.close() 93 | sentenceOutput.close() 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeRealisGold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | from readLargeInput import * 6 | 7 | 8 | DEBUG=False 9 | SMALLDEBUG=False 10 | PREDICTION_DEBUG=False 11 | EMPTY_TRIGGER="not_trigger_not_trigger" 12 | EMPTY_ROLE="NONE" 13 | stepSize=1 14 | beamSize=1 15 | maxIters=20 16 | 17 | def processSentence(curSentence, possibleLabels, possibleArgs, output, sentIndex): 18 | writtenSet = set() 19 | for arg in curSentence.goldArgs: 20 | argString = arg.text + "|||sent_" + str(sentIndex) + "|||" + arg.role 21 | if argString not in writtenSet: 22 | output.write(argString + "\n") 23 | writtenSet.add(argString) 24 | 25 | def processEntity(curSentence, entityIndex, possibleArgs, triggerIndex, triggerLabel, output, sentIndex): 26 | curWords = curSentence.words 27 | triggerWord = curWords[triggerIndex] 28 | curEntity = curSentence.entities[entityIndex] 29 | 30 | goldArgRole = "NONE" 31 | 32 | # find what the gold val is for this entity 33 | curGoldArgs = curSentence.goldArgs 34 | foundArg = None 35 | for arg in curGoldArgs: 36 | argText = arg.text 37 | minIndex = arg.minIndex() 38 | argTriggerIndex = arg.triggerIndex 39 | 40 | # if same entity (text and location) and same associated trigger 41 | if argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex and goldArgRole == "NONE": 42 | goldArgRole = arg.role 43 | foundArg = arg 44 | elif argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex: 45 | print "Found duplicate!" 46 | print argText + "\t" + arg.triggerText + "\t" + arg.role 47 | print "Alternate:" 48 | print foundArg.text + "\t" + foundArg.triggerText + "\t" + goldArgRole 49 | sys.exit() 50 | if goldArgRole != EMPTY_ROLE: 51 | output.write(curEntity.text + "|||" + "sent_" + str(sentIndex) + "|||" + goldArgRole + "\n") 52 | 53 | def main(): 54 | if len(sys.argv) != 4: 55 | print "Expect input training data, output args file, output sentences file." 
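# This script is structurally identical to writeArgGold.py; the only difference is that
# scanInput()/readInput() are called with realisMode=True, so the final field of each
# "text|||sent_N|||..." line carries the realis label (or "UNK_REALIS" when the gold annotation
# has no realis column) rather than the argument role.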
56 | sys.exit() 57 | 58 | possibleLabels, possibleArgs = scanInput(sys.argv[1], sys.argv[1] + ".parsing", realisMode = True) 59 | 60 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 61 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 62 | 63 | output = open(sys.argv[2], "w") 64 | 65 | # go over each sentence in the training data 66 | # NOTE: in this version, using gold triggers, gold entity mentions 67 | input = open(sys.argv[1], "r") 68 | parsingInput = open(sys.argv[1] + ".parsing", "r") 69 | count = 0 70 | 71 | sentenceOutput = open(sys.argv[3], "w") 72 | while True: 73 | sentence, valid, dummy = readInput(input, parsingInput, realisMode = True) 74 | 75 | if count % 1000 == 0: 76 | print "Processing sentence " + str(count) 77 | if DEBUG: 78 | print "Sentence length: " + str(len(sentence.words)) 79 | print "Total entities: " + str(len(sentence.entities)) 80 | 81 | if sentence != None: 82 | processSentence(sentence, possibleLabels, possibleArgs, output, count) 83 | 84 | for word in sentence.words: 85 | sentenceOutput.write(word + " ") 86 | sentenceOutput.write("\n") 87 | 88 | count += 1 89 | if not valid: 90 | break 91 | output.close() 92 | sentenceOutput.close() 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeTriggerGold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | from readLargeInput import * 6 | 7 | 8 | DEBUG=False 9 | SMALLDEBUG=False 10 | PREDICTION_DEBUG=False 11 | EMPTY_TRIGGER="not_trigger_not_trigger" 12 | EMPTY_ROLE="NONE" 13 | stepSize=1 14 | beamSize=1 15 | maxIters=20 16 | 17 | def processSentence(curSentence, possibleLabels, possibleArgs, output, sentIndex): 18 | curWords = curSentence.words 19 | curGold = curSentence.labels 20 | for triggerIndex in range(len(curWords)): 21 | word = curWords[triggerIndex] 22 | triggerLabel = curGold[triggerIndex] 23 | 24 | output.write(triggerLabel + "\n") 25 | 26 | def main(): 27 | if len(sys.argv) != 3: 28 | print "Expect input training data, output args file." 
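# Example call (hypothetical filenames): python writeTriggerGold.py test.data triggers.gold
# The output contains one "TYPE_SUBTYPE" trigger label per input token, in corpus order, with the
# EMPTY_TRIGGER value ("not_trigger_not_trigger") presumably marking non-trigger tokens.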
29 | sys.exit() 30 | 31 | possibleLabels, possibleArgs = scanInput(sys.argv[1], sys.argv[1] + ".parsing") 32 | 33 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 34 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 35 | 36 | output = open(sys.argv[2], "w") 37 | 38 | # go over each sentence in the training data 39 | # NOTE: in this version, using gold triggers, gold entity mentions 40 | input = open(sys.argv[1], "r") 41 | parsingInput = open(sys.argv[1] + ".parsing", "r") 42 | count = 0 43 | while True: 44 | sentence, valid, nothing = readInput(input, parsingInput) 45 | 46 | if count % 1000 == 0: 47 | print "Processing sentence " + str(count) 48 | if DEBUG: 49 | print "Sentence length: " + str(len(sentence.words)) 50 | print "Total entities: " + str(len(sentence.entities)) 51 | 52 | if sentence != None: 53 | processSentence(sentence, possibleLabels, possibleArgs, output, count) 54 | 55 | count += 1 56 | if not valid: 57 | break 58 | output.close() 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeTriggerLiblinear.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | import os 6 | from readLargeInput import * 7 | 8 | 9 | #### FILEPATHS TO BE SET BY USER 10 | WORD_EMBEDDING_PATH="/home/andrew/DEFT_code_testing/dependencies/wordVectors" 11 | UNIV_POS_PATH="/home/andrew/DEFT_code_testing/dependencies/pos" 12 | 13 | 14 | DEBUG=False 15 | SMALLDEBUG=False 16 | PREDICTION_DEBUG=False 17 | # for when the input data is too big, turn this off. 18 | EASY_READ_OUTPUT=False 19 | EMPTY_TRIGGER="not_trigger_not_trigger" 20 | stepSize=1 21 | beamSize=1 22 | maxIters=20 23 | 24 | ### Original word vectors 25 | # English vectors only 26 | WORD2VEC_FILENAME=WORD_EMBEDDING_PATH+"/en-wiki-april-6-2015.word2vec_vectors" 27 | # Chinese vectors only 28 | CHINESE_WORD2VEC_FILENAME=WORD_EMBEDDING_PATH+"/chinese-wiki-20160305.word2vec" 29 | # Spanish vectors only 30 | SPANISH_WORD2VEC_FILENAME=WORD_EMBEDDING_PATH+"/es-wiki-may-2-2016.word2vec_vectors" 31 | # multilingual -- Noah 32 | #MULTI_WORD2VEC_FILENAME="/home/andrew/data/LORELEI/word_vectors_noah/embeddings/andrew_scripts/eng-chn-noah.wordVectors" 33 | 34 | ### ACE subset word vectors 35 | #WORD2VEC_FILENAME="../../multilingualWordVectors/createWordVectorSubset/en-wiki-april-6-2015.word2vec.ACE.subset" 36 | #CHINESE_WORD2VEC_FILENAME="../../multilingualWordVectors/createWordVectorSubset/chinese-wiki-march-5-2016.word2vec.ACE.subset" 37 | #SPANISH_WORD2VEC_FILENAME="empty.txt" 38 | #MULTI_WORD2VEC_FILENAME="../../multilingualWordVectors/vectorAlignment/out/multilingualEngChn.formatted.final" 39 | 40 | ### empty word vectors 41 | #WORD2VEC_FILENAME="empty.txt" 42 | #CHINESE_WORD2VEC_FILENAME="empty.txt" 43 | #SPANISH_WORD2VEC_FILENAME="empty.txt" 44 | MULTI_WORD2VEC_FILENAME="empty.txt" 45 | 46 | ### dictionary based on CEDICT, containing the ACE English trigger words 47 | #TRIGGER_BILINGUAL_DICTIONARY_LIST="../../multilingualWordVectors/extractTriggerWords/ACE.English.triggerWords.translations" 48 | TRIGGER_BILINGUAL_DICTIONARY_LIST="empty.txt" 49 | ### dictionary based on CEDICT, containing the entire ACE English lexicon 50 | #BILINGUAL_DICTIONARY_LIST="../../multilingualWordVectors/extractTriggerWords/ACE.English.lexicon.translations" 51 | 
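# Both dictionary files are parsed in main() as whitespace-separated pairs, one entry per line,
# with a Chinese word in the first column and one English translation in the second (a word may
# appear on several lines, one per translation). Pointing them at an empty file, as is done here
# with "empty.txt", effectively disables the bilingual-dictionary features.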
BILINGUAL_DICTIONARY_LIST="empty.txt" 52 | 53 | wordVecs = dict() # dict: word -> vector 54 | chineseWordVecs = dict() 55 | multiWordVecs = dict() 56 | spanishWordVecs = dict() 57 | bilingualDictionary = dict() 58 | triggerBilingualDictionary = dict() 59 | 60 | univPOS_EngFile=UNIV_POS_PATH+"/en-ptb.map" 61 | univPOS_ChnFile=UNIV_POS_PATH+"/zh-ctb6.map" 62 | univPOS_SpanFile=UNIV_POS_PATH+"/es-cast3lb.map" 63 | universalPOS_converter = dict() 64 | 65 | def readConfigFile(filename): 66 | input = open(filename, "r") 67 | returnPath = "" 68 | for line in input: 69 | if line.startswith("WORD_EMBEDDING_DIR"): 70 | returnPath = line.strip().split("=")[1] 71 | input.close() 72 | 73 | return returnPath 74 | 75 | class State: 76 | def __init__(self, triggerParam = None, entParam = None, scoreParam = 0.0): 77 | if triggerParam == None: 78 | self.triggerStates = [] 79 | else: 80 | self.triggerStates = triggerParam 81 | if entParam == None: 82 | self.entStates = dict() 83 | else: 84 | self.entStates = entParam 85 | 86 | self.score = scoreParam 87 | 88 | def addTrigger(self, trigger): 89 | self.triggerStates.append(trigger) 90 | 91 | def updateScore(self, val): 92 | self.score += val 93 | 94 | def addEntityAssignment(self, arg): 95 | key = arg.text + "|||" + str(arg.minIndex()) + "|||" + str(arg.triggerIndex) 96 | self.entStates[key] = arg.role 97 | 98 | def copy(self): 99 | altScore = self.score 100 | altTriggerStates = [] 101 | altEntStates = dict() 102 | 103 | for state in self.triggerStates: 104 | altTriggerStates.append(state) 105 | for key in self.entStates: 106 | altEntStates[key] = self.entStates[key] 107 | 108 | return State(altTriggerStates, altEntStates, altScore) 109 | 110 | def processSentence(curSentence, output, easyOutput, roleIndexDict, featureIndexDict, sentenceIndex, testMode=False): 111 | # handle each word in the sentence 112 | triggerIndex = 0 113 | curWords = curSentence.words 114 | curGold = curSentence.labels 115 | curEnts = curSentence.entities 116 | 117 | for triggerIndex in range(len(curWords)): 118 | word = curWords[triggerIndex] 119 | triggerLabel = curGold[triggerIndex] 120 | 121 | processWord(curSentence, triggerIndex, triggerLabel, output, easyOutput, roleIndexDict, featureIndexDict, sentenceIndex, testMode) 122 | 123 | def processWord(curSentence, triggerIndex, triggerLabel, output, easyOutput, roleIndexDict, featureIndexDict, sentenceIndex, testMode): 124 | curWords = curSentence.words 125 | triggerWord = curWords[triggerIndex] 126 | 127 | features = genFeatures(curSentence, triggerIndex, triggerLabel) 128 | 129 | if triggerLabel not in roleIndexDict: 130 | roleIndexDict[triggerLabel] = len(roleIndexDict) + 1 131 | argID = roleIndexDict[triggerLabel] 132 | output.write(str(argID)) 133 | if EASY_READ_OUTPUT: 134 | easyOutput.write("sent_" + str(sentenceIndex) + "\tPhrase:\t" + curWords[triggerIndex] + "\tRole:\t" + triggerLabel) 135 | 136 | for feature in features: 137 | easyOutput.write("\t" + feature) 138 | 139 | # place all feature names in here 140 | featureIDs = [] 141 | # place word embedding features in here (i.e. 
non-binary features) 142 | word2vecDict = dict() 143 | 144 | for feature in features: 145 | if testMode and feature not in featureIndexDict and not feature.startswith("WORD2VEC"): 146 | continue 147 | 148 | if feature.startswith("WORD2VEC"): 149 | temp = feature.find("=") 150 | featureName = feature[:temp] 151 | featureVal = feature[temp+1:] 152 | 153 | if featureName not in featureIndexDict: 154 | featureIndexDict[featureName] = len(featureIndexDict) + 1 155 | 156 | # add the corresponding feature ID to the list(s) 157 | featureIDs.append(featureIndexDict[featureName]) 158 | word2vecDict[featureIndexDict[featureName]] = featureVal 159 | else: 160 | if feature not in featureIndexDict: 161 | featureIndexDict[feature] = len(featureIndexDict) + 1 162 | featureID = featureIndexDict[feature] 163 | featureIDs.append(featureID) 164 | 165 | sortedIDs = sorted(featureIDs) 166 | for featureID in sortedIDs: 167 | if featureID in word2vecDict: 168 | val = word2vecDict[featureID] 169 | output.write(" " + str(featureID) + ":" + str(val)) 170 | else: 171 | output.write(" " + str(featureID) + ":1") 172 | output.write("\n") 173 | if EASY_READ_OUTPUT: 174 | easyOutput.write("\n") 175 | 176 | def main(): 177 | if len(sys.argv) != 4 and len(sys.argv) != 6: 178 | print len(sys.argv) 179 | print "Expect mode (train/test), feature file, output liblinear file, and (if test-mode) feature dictionary, role dictionary." 180 | print "Expect input training data, output liblinear file, input dev data, output liblinear file, input test data, output liblinear file." 181 | sys.exit() 182 | 183 | try: 184 | WORD_EMBEDDING_PATH = readConfigFile("../CONFIG.txt") 185 | except: 186 | print "Could not find CONFIG.txt. Terminating..." 187 | sys.exit() 188 | 189 | trainMode = (sys.argv[1] == "train") 190 | if not trainMode and sys.argv[1] != "test": 191 | sys.exit() 192 | 193 | textFile = sys.argv[2] 194 | outputFile = sys.argv[3] 195 | 196 | roleIndexDict = dict() 197 | featureIndexDict = dict() 198 | if not trainMode: 199 | featureIndexDict = readDict(sys.argv[4]) 200 | roleIndexDict = readDict(sys.argv[5]) 201 | 202 | # open UniversalPOS converter 203 | input = open(univPOS_EngFile, "r") 204 | for line in input: 205 | tokens = line.strip().split("\t") 206 | source = tokens[0] 207 | target = tokens[1] 208 | 209 | universalPOS_converter[source] = target 210 | input.close() 211 | input = open(univPOS_ChnFile, "r") 212 | for line in input: 213 | tokens = line.strip().split("\t") 214 | source = tokens[0] 215 | target = tokens[1] 216 | 217 | universalPOS_converter[source] = target 218 | input.close() 219 | input = open(univPOS_SpanFile, "r") 220 | for line in input: 221 | tokens = line.strip().split("\t") 222 | source = tokens[0] 223 | target = tokens[1] 224 | 225 | universalPOS_converter[source] = target 226 | input.close() 227 | 228 | # open the bilingual dictionary 229 | # usage: add a new feature. 
For english words, activates if the word appears; for chinese words, activates any associated translations 230 | input = open(BILINGUAL_DICTIONARY_LIST, "r") 231 | for line in input: 232 | tokens = line.strip().split() 233 | chineseWord = tokens[0] 234 | englishWord = tokens[1] 235 | 236 | if chineseWord not in bilingualDictionary: 237 | bilingualDictionary[chineseWord] = set() 238 | if englishWord not in bilingualDictionary: 239 | bilingualDictionary[englishWord] = set() 240 | bilingualDictionary[chineseWord].add(englishWord) 241 | input.close() 242 | 243 | input = open(TRIGGER_BILINGUAL_DICTIONARY_LIST, "r") 244 | for line in input: 245 | tokens = line.strip().split() 246 | chineseWord = tokens[0] 247 | englishWord = tokens[1] 248 | 249 | if chineseWord not in triggerBilingualDictionary: 250 | triggerBilingualDictionary[chineseWord] = set() 251 | if englishWord not in triggerBilingualDictionary: 252 | triggerBilingualDictionary[englishWord] = set() 253 | triggerBilingualDictionary[chineseWord].add(englishWord) 254 | input.close() 255 | 256 | 257 | # open and read word vectors 258 | input = open(WORD2VEC_FILENAME, "r") 259 | for line in input: 260 | # skip any headers 261 | if line.count(" ") < 5: 262 | continue 263 | else: 264 | index = line.find(" ") 265 | curWord = line[:index] 266 | rest = line[index+1:] 267 | tokens = rest.strip().split(" ") 268 | 269 | numTokens = [] 270 | for tok in tokens: 271 | numTokens.append(float(tok)) 272 | 273 | wordVecs[curWord] = numTokens 274 | input.close() 275 | 276 | input = open(CHINESE_WORD2VEC_FILENAME, "r") 277 | for line in input: 278 | # skip any headers 279 | if line.count(" ") < 5: 280 | continue 281 | else: 282 | index = line.find(" ") 283 | curWord = line[:index] 284 | rest = line[index+1:] 285 | tokens = rest.strip().split(" ") 286 | 287 | numTokens = [] 288 | for tok in tokens: 289 | numTokens.append(float(tok)) 290 | 291 | chineseWordVecs[curWord] = numTokens 292 | input.close() 293 | 294 | input = open(SPANISH_WORD2VEC_FILENAME, "r") 295 | for line in input: 296 | # skip any headers 297 | if line.count(" ") < 5: 298 | continue 299 | else: 300 | index = line.find(" ") 301 | curWord = line[:index] 302 | rest = line[index+1:] 303 | tokens = rest.strip().split(" ") 304 | 305 | numTokens = [] 306 | for tok in tokens: 307 | numTokens.append(float(tok)) 308 | 309 | spanishWordVecs[curWord] = numTokens 310 | input.close() 311 | 312 | input = open(MULTI_WORD2VEC_FILENAME, "r") 313 | for line in input: 314 | # skip any headers 315 | if line.count(" ") < 5: 316 | continue 317 | else: 318 | index = line.find(" ") 319 | curWord = line[:index] 320 | rest = line[index+1:] 321 | tokens = rest.strip().split(" ") 322 | 323 | numTokens = [] 324 | for tok in tokens: 325 | numTokens.append(float(tok)) 326 | 327 | multiWordVecs[curWord] = numTokens 328 | input.close() 329 | 330 | ### Training 331 | if trainMode: 332 | possibleLabels, possibleArgs = scanInput(textFile, textFile + ".parsing") 333 | 334 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 335 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 336 | 337 | 338 | # go over each sentence in the training data 339 | # NOTE: in this version, using gold triggers, gold entity mentions 340 | output = open(outputFile, "w") 341 | easyOutput = open(outputFile + ".easyRead", "w") 342 | 343 | input = open(textFile, "r") 344 | parsingInput = open(textFile + ".parsing", "r") 345 | count = 0 346 | 347 | print "Writing training set" 348 | while True: 349 | 
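# readInput() (defined in readLargeInput.py) consumes one blank-line-delimited sentence per call
# and returns a (Sentence-or-None, more-data-remaining flag, running token count) triple, so this
# loop streams the corpus sentence by sentence instead of loading the whole file at once.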
sentence, valid, nothing = readInput(input, parsingInput) 350 | 351 | if count % 1000 == 0: 352 | print "Processing sentence " + str(count) 353 | if DEBUG: 354 | print "Sentence length: " + str(len(sentence.words)) 355 | print "Total entities: " + str(len(sentence.entities)) 356 | 357 | if sentence != None: 358 | processSentence(sentence, output, easyOutput, roleIndexDict, featureIndexDict, count) 359 | 360 | count += 1 361 | 362 | if not valid: 363 | break 364 | output.close() 365 | easyOutput.close() 366 | input.close() 367 | parsingInput.close() 368 | 369 | writeDicts(featureIndexDict, roleIndexDict, "features.dict", "roles.dict") 370 | ### Testing 371 | else: 372 | output = open(outputFile, "w") 373 | easyOutput = open(outputFile + ".easyRead", "w") 374 | 375 | input = open(textFile, "r") 376 | parsingInput = open(textFile + ".parsing", "r") 377 | 378 | count = 0 379 | print "Writing test set" 380 | while True: 381 | sentence, valid, nothing = readInput(input, parsingInput) 382 | 383 | if count % 1000 == 0: 384 | print "Processing sentence " + str(count) 385 | if DEBUG: 386 | print "Sentence length: " + str(len(sentence.words)) 387 | print "Total entities: " + str(len(sentence.entities)) 388 | 389 | if sentence != None: 390 | processSentence(sentence, output, easyOutput, roleIndexDict, featureIndexDict, count, testMode=True) 391 | 392 | count += 1 393 | 394 | if not valid: 395 | break 396 | output.close() 397 | easyOutput.close() 398 | input.close() 399 | parsingInput.close() 400 | 401 | 402 | def writeDicts(featureDict, roleDict, filenameF, filenameR): 403 | output = open(filenameF, "w") 404 | for feature in featureDict: 405 | curID = featureDict[feature] 406 | output.write(feature + ":" + str(curID) + "\n") 407 | output.close() 408 | 409 | output = open(filenameR, "w") 410 | for role in roleDict: 411 | curID = roleDict[role] 412 | output.write(role + ":" + str(curID) + "\n") 413 | output.close() 414 | 415 | def readDict(filename): 416 | input = open(filename, "r") 417 | curDict = dict() 418 | for line in input: 419 | clean = line.strip() 420 | splitPoint = clean.rfind(":") 421 | index = clean[:splitPoint] 422 | val = int(clean[splitPoint+1:]) 423 | curDict[index] = val 424 | return curDict 425 | 426 | def isYear(text): 427 | if len(text) < 4: 428 | return False 429 | for index in range(4): 430 | if text[index] not in string.digits: 431 | return False 432 | # sometimes errors from Stanford segmenter 433 | if len(text) != 4: 434 | if text[4] not in string.punctuation: 435 | return False 436 | return True 437 | 438 | 439 | # returns (absolute value) of distance between entity and trigger 440 | def calcArgTriggerDistance(triggerIndex, start, end): 441 | if triggerIndex < start: 442 | return start - triggerIndex 443 | elif triggerIndex > end: 444 | return triggerIndex - end 445 | else: 446 | return 0 447 | 448 | def toUnivPOS(tag): 449 | # removeNumbers at end of POS tag if needed (Stanford Spanish seems to add this) 450 | tempTag = "" 451 | for character in tag: 452 | if character not in string.digits: 453 | tempTag += character 454 | 455 | copyPOS = tempTag 456 | while len(copyPOS) > 0: 457 | if copyPOS in universalPOS_converter: 458 | return universalPOS_converter[copyPOS] 459 | else: 460 | copyPOS = copyPOS[:-1] 461 | return tempTag 462 | 463 | # assuming each can be represented by an indicator function 464 | # i.e. 
binary features 465 | # GOTOFEATURES 466 | def genFeatures(curSentence, index, proposedLabel): 467 | words = curSentence.words 468 | lemmas = curSentence.lemmas 469 | posTags = curSentence.posTags 470 | 471 | # default 472 | previousWord = "" 473 | prevPOS = "" 474 | previousWord_2 = "" 475 | prevPOS_2 = "" 476 | if index != 0: 477 | previousWord = words[index-1] 478 | prevPOS = posTags[index-1] 479 | if index != 1: 480 | previousWord_2 = words[index-2] 481 | prevPOS_2 = posTags[index-2] 482 | 483 | nextWord = "" 484 | nextPOS = "" 485 | nextWord_2 = "" 486 | nextPOS_2 = "" 487 | if index != len(words) - 1: 488 | nextWord = words[index+1] 489 | nextPOS = posTags[index+1] 490 | if index != len(words) - 2: 491 | nextWord_2 = words[index+2] 492 | nextWord_2 = posTags[index+2] 493 | 494 | word = words[index] 495 | lemma = lemmas[index] 496 | curPOS = posTags[index] 497 | 498 | featureSet = set() 499 | 500 | # if the sentence is for the title (approximate) 501 | foundUnderscore = False 502 | foundNEWS = False 503 | for word in words: 504 | if "_" in word: 505 | foundUnderscore = True 506 | if "NEWS" in word: 507 | foundNEWS = True 508 | isTitle = foundUnderscore and foundNEWS 509 | if isTitle: 510 | featureSet.add("isTitle") 511 | 512 | # length of current word 513 | featureSet.add(str(len(word)) + "_lengthCurWord") 514 | 515 | # unigrams -- words 516 | featureSet.add(word + "_curWord") 517 | featureSet.add(previousWord + "_prevWord") 518 | featureSet.add(nextWord + "_nextWord") 519 | featureSet.add(previousWord_2 + "_prevWord2") 520 | featureSet.add(nextWord_2 + "_nextWord2") 521 | 522 | # unigrams -- words lowercase 523 | featureSet.add(word.lower() + "_curWordLower") 524 | featureSet.add(previousWord.lower() + "_prevWordLower") 525 | featureSet.add(nextWord.lower() + "_nextWordLower") 526 | featureSet.add(previousWord_2.lower() + "_prevWord2Lower") 527 | featureSet.add(nextWord_2.lower() + "_nextWord2Lower") 528 | 529 | # unigrams -- lemma 530 | featureSet.add(lemma + "_curLemma") 531 | 532 | # bigrams -- words 533 | featureSet.add(word + "_curWord" + "|||" + previousWord + "_prevWord") 534 | featureSet.add(word + "_curWord" + "|||" + nextWord + "_nextWord") 535 | featureSet.add(word + "_curWord" + "|||" + previousWord_2 + "_prevWord2") 536 | featureSet.add(word + "_curWord" + "|||" + nextWord_2 + "_nextWord2") 537 | 538 | # bigrams -- words lowercase 539 | featureSet.add(word.lower() + "_curWordLower" + "|||" + previousWord.lower() + "_prevWordLower") 540 | featureSet.add(word.lower() + "_curWordLower" + "|||" + nextWord.lower() + "_nextWordLower") 541 | featureSet.add(word.lower() + "_curWordLower" + "|||" + previousWord_2.lower() + "_prevWordLower2") 542 | featureSet.add(word.lower() + "_curWordLower" + "|||" + nextWord_2.lower() + "_nextWordLower2") 543 | 544 | # bigrams -- word + POS 545 | featureSet.add(word + "_curWord" + "|||" + curPOS + "_curPOS") 546 | featureSet.add(word + "_curWord" + "|||" + toUnivPOS(curPOS) + "_curUNIVPOS") 547 | 548 | 549 | # word-"shape" features 550 | if "_" in word: 551 | featureSet.add("containsUnderscore") 552 | 553 | # if number 554 | number = True 555 | for character in word: 556 | if character not in string.digits: 557 | number = False 558 | break 559 | if number: 560 | featureSet.add("isNumber") 561 | 562 | # capitalized 563 | firstChar = word[0] 564 | if firstChar in string.ascii_uppercase: 565 | featureSet.add("isCapitalized") 566 | else: 567 | featureSet.add("isNotCapitalized") 568 | 569 | # punctuationOnly 570 | punct = True 571 | for character 
in word: 572 | if character not in string.punctuation: 573 | punct = False 574 | break 575 | if punct: 576 | featureSet.add("isPunctuation") 577 | 578 | # POS features 579 | if curPOS.startswith("V"): 580 | featureSet.add("POS_VType") 581 | featureSet.add(curPOS + "_curPOS") 582 | featureSet.add(prevPOS + "_prevPOS") 583 | featureSet.add(nextPOS + "_nextPOS") 584 | featureSet.add(prevPOS_2 + "_prevPOS2") 585 | featureSet.add(nextPOS_2 + "_nextPOS2") 586 | 587 | if toUnivPOS(curPOS).startswith("V"): 588 | featureSet.add("UNIVPOS_VType") 589 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS") 590 | featureSet.add(toUnivPOS(prevPOS) + "_prevUNIVPOS") 591 | featureSet.add(toUnivPOS(nextPOS) + "_nextUNIVPOS") 592 | featureSet.add(toUnivPOS(prevPOS_2) + "_prevUNIVPOS2") 593 | featureSet.add(toUnivPOS(nextPOS_2) + "_nextUNIVPOS2") 594 | 595 | # POS bigrams 596 | featureSet.add(curPOS + "_curPOS" + "|||" + prevPOS + "_prevPOS") 597 | featureSet.add(curPOS + "_curPOS" + "|||" + nextPOS + "_nextPOS") 598 | featureSet.add(curPOS + "_curPOS" + "|||" + prevPOS_2 + "_prevPOS2") 599 | featureSet.add(curPOS + "_curPOS" + "|||" + nextPOS_2 + "_nextPOS2") 600 | 601 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(prevPOS) + "_prevUNIVPOS") 602 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(nextPOS) + "_nextUNIVPOS") 603 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(prevPOS_2) + "_prevUNIVPOS2") 604 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(nextPOS_2) + "_nextUNIVPOS2") 605 | 606 | # dependency parsing features -- governor 607 | if index in curSentence.depByGovIndex: 608 | for dependency in curSentence.depByGovIndex[index]: 609 | depType = dependency.depType 610 | dependent = dependency.dependent 611 | 612 | featureSet.add("ParsingGov_" + depType + "_type") 613 | featureSet.add("ParsingGov_" + dependent + "_dependent") 614 | featureSet.add("ParsingGov_" + depType + "_type" + "|||" + dependent + "_dependent") 615 | 616 | # dependency parsing features -- dependent 617 | if index in curSentence.depByDepIndex: 618 | for dependency in curSentence.depByDepIndex[index]: 619 | depType = dependency.depType 620 | governor = dependency.governor 621 | 622 | featureSet.add("ParsingDep_" + depType + "_type") 623 | featureSet.add("ParsingDep_" + governor + "_governor") 624 | featureSet.add("ParsingDep_" + depType + "_type" + "|||" + governor + "_governor") 625 | 626 | # only include if we have the word as one of our vectors 627 | word = words[index] 628 | if word in wordVecs: 629 | curVector = wordVecs[word] 630 | vecLocation = 0 631 | for tok in curVector: 632 | featureSet.add("WORD2VEC_ENG_" + str(vecLocation) + "=" + str(tok)) 633 | vecLocation += 1 634 | 635 | word = words[index] 636 | if word in chineseWordVecs: 637 | curVector = chineseWordVecs[word] 638 | vecLocation = 0 639 | for tok in curVector: 640 | featureSet.add("WORD2VEC_CHN_" + str(vecLocation) + "=" + str(tok)) 641 | vecLocation += 1 642 | 643 | word = words[index] 644 | if word in spanishWordVecs: 645 | curVector = spanishWordVecs[word] 646 | vecLocation = 0 647 | for tok in curVector: 648 | featureSet.add("WORD2VEC_SPAN_" + str(vecLocation) + "=" + str(tok)) 649 | vecLocation += 1 650 | 651 | word = words[index] 652 | if word in multiWordVecs: 653 | curVector = multiWordVecs[word] 654 | vecLocation = 0 655 | for tok in curVector: 656 | featureSet.add("WORD2VEC_MULTI_" + str(vecLocation) + "=" + str(tok)) 657 | vecLocation += 1 658 | 659 | # bilingual 
dictionary features 660 | word = words[index] 661 | if word in bilingualDictionary: 662 | wordSet = bilingualDictionary[word] 663 | # if English word: 664 | if len(wordSet) == 0: 665 | featureSet.add("WORD_TRANSLATION_" + word) 666 | else: 667 | for translation in wordSet: 668 | featureSet.add("WORD_TRANSLATION_" + translation) 669 | 670 | word = words[index] 671 | if word in triggerBilingualDictionary: 672 | wordSet = triggerBilingualDictionary[word] 673 | # if English word: 674 | if len(wordSet) == 0: 675 | featureSet.add("TRIGGER_WORD_TRANSLATION_" + word) 676 | else: 677 | for translation in wordSet: 678 | featureSet.add("TRIGGER_WORD_TRANSLATION_" + translation) 679 | 680 | return featureSet 681 | 682 | if __name__ == "__main__": 683 | main() 684 | -------------------------------------------------------------------------------- /all_predictions_4.0/runAll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Start TRIGGERS" 4 | ./runTriggers.sh 1 $1 5 | echo "START ARGUMENTS" 6 | ./runArguments.sh 1 $1 7 | echo "Start REALIS" 8 | ./runRealis.sh 1 $1 9 | -------------------------------------------------------------------------------- /all_predictions_4.0/runAll_providedTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "START ARGUMENTS" 4 | ./runArguments_providedTriggers.sh 1 $1 5 | echo "Start REALIS" 6 | ./runRealis_providedTriggers.sh 1 $1 7 | -------------------------------------------------------------------------------- /all_predictions_4.0/runArguments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeArgLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../currentPredictionsForTriggers/testSet.predictions 14 | 15 | cd ../ 16 | # running on the test set 17 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.model output.test.arguments 18 | 19 | # report results on the data 20 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict output.test.arguments code/test.out.easyRead currentPredictionsForArgs/testSet.predictions 21 | # record trigger easyRead data 22 | cp code/test.out.easyRead arguments.out.easyRead 23 | cp code/test.out.entityCoref arguments.out.entityCoref 24 | -------------------------------------------------------------------------------- /all_predictions_4.0/runArguments_providedTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeArgLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.features.dict 
/home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict NONE 14 | 15 | cd ../ 16 | # running on the test set 17 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.model output.test.arguments 18 | 19 | # report results on the data 20 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict output.test.arguments code/test.out.easyRead currentPredictionsForArgs/testSet.predictions 21 | # record trigger easyRead data 22 | cp code/test.out.easyRead arguments.out.easyRead 23 | cp code/test.out.entityCoref arguments.out.entityCoref 24 | -------------------------------------------------------------------------------- /all_predictions_4.0/runRealis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeRealisLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict ../currentPredictionsForTriggers/testSet.predictions 14 | 15 | cd ../ 16 | 17 | # testing on the training/validation/testing sets 18 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.model output.test.realis 19 | 20 | # report results on the data 21 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict output.test.realis code/test.out.easyRead currentPredictionsForRealis/testSet.predictions 22 | cp code/test.out.easyRead realis.out.easyRead 23 | -------------------------------------------------------------------------------- /all_predictions_4.0/runRealis_providedTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeRealisLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict NONE 14 | 15 | cd ../ 16 | 17 | # testing on the training/validation/testing sets 18 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.model output.test.realis 19 | 20 | # report results on the data 21 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict output.test.realis code/test.out.easyRead currentPredictionsForRealis/testSet.predictions 22 | cp code/test.out.easyRead realis.out.easyRead 23 | -------------------------------------------------------------------------------- /all_predictions_4.0/runTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; 
then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeTriggerLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.roles.dict 14 | 15 | cd ../ 16 | # running on the test set 17 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.model output.test.triggers 18 | 19 | # report results on the data 20 | python code/convertOutputTriggers.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.roles.dict output.test.triggers code/test.out.easyRead currentPredictionsForTriggers/testSet.predictions 21 | # record trigger easyRead data 22 | cp code/test.out.easyRead triggers.out.easyRead 23 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # script to adjust the filepaths as needed 2 | import sys 3 | 4 | def parseConfigFile(filename): 5 | input = open(filename, "r") 6 | 7 | embeddingPath = "" 8 | corenlpPath = "" 9 | parserPath = "" 10 | nerPath = "" 11 | for line in input: 12 | if line.startswith("WORD_EMBEDDING_DIR"): 13 | embeddingPath = line.strip().split("=")[1] 14 | elif line.startswith("CORENLP_DIR"): 15 | corenlpPath = line.strip().split("=")[1] 16 | elif line.startswith("MALTPARSER_DIR"): 17 | parserPath = line.strip().split("=")[1] 18 | elif line.startswith("NER_DIR"): 19 | nerPath = line.strip().split("=")[1] 20 | elif line.startswith("MODEL_DIR"): 21 | modelPath = line.strip().split("=")[1] 22 | elif line.startswith("LIBLINEAR_DIR"): 23 | liblinearPath = line.strip().split("=")[1] 24 | elif line.startswith("POS_DIR"): 25 | posPath = line.strip().split("=")[1] 26 | input.close() 27 | 28 | return corenlpPath, embeddingPath, parserPath, nerPath, modelPath, liblinearPath, posPath 29 | 30 | def main(): 31 | try: 32 | corenlpDir, embeddingDir, parserDir, nerDir, modelDir, liblinearDir, posDir = parseConfigFile("CONFIG.txt") 33 | except: 34 | print "Please run in same directory as CONFIG.txt." 
35 | sys.exit() 36 | 37 | # update the Chinese code for running Maltparser 38 | curInput = open("preprocessing_2.0/processChinese.sh", "r") 39 | lines = [] 40 | for line in curInput: 41 | lines.append(line) 42 | curInput.close() 43 | output = open("preprocessing_2.0/processChinese.sh", "w") 44 | for line in lines: 45 | if line.startswith("python") or line.startswith("java"): 46 | tokens = line.strip().split() 47 | for tok in tokens: 48 | if tok.startswith("python") or tok.startswith("java"): 49 | output.write(tok) 50 | elif tok.endswith(".map"): 51 | if "/" not in tok: 52 | output.write(" " + posDir + "/" + tok) 53 | else: 54 | tmpToks = tok.split("/") 55 | output.write(" " + posDir + "/" + tmpToks[len(tmpToks)-1]) 56 | elif tok.endswith(".jar"): 57 | if "/" not in tok: 58 | output.write(" " + parserDir + "/" + tok) 59 | else: 60 | tmpToks = tok.split("/") 61 | output.write(" " + parserDir + "/" + tmpToks[len(tmpToks)-1]) 62 | else: 63 | output.write(" " + tok) 64 | output.write("\n") 65 | elif line.startswith("cp"): 66 | tokens = line.strip().split() 67 | for tok in tokens: 68 | if tok == "cp": 69 | output.write(tok) 70 | elif tok.endswith(".mco"): 71 | if "/" not in tok: 72 | output.write(" " + modelDir + "/maltparser/" + tok) 73 | else: 74 | tmpToks = tok.split("/") 75 | output.write(" " + modelDir + "/maltparser/" + tmpToks[len(tmpToks)-1]) 76 | else: 77 | output.write(" " + tok) 78 | output.write("\n") 79 | else: 80 | output.write(line) 81 | output.close() 82 | 83 | # update the Spanish code for running Maltparser 84 | curInput = open("preprocessing_2.0/processSpanish.sh", "r") 85 | lines = [] 86 | for line in curInput: 87 | lines.append(line) 88 | curInput.close() 89 | output = open("preprocessing_2.0/processSpanish.sh", "w") 90 | for line in lines: 91 | if line.startswith("python") or line.startswith("java"): 92 | tokens = line.strip().split() 93 | for tok in tokens: 94 | if tok.startswith("python") or tok.startswith("java"): 95 | output.write(tok) 96 | elif tok.endswith(".map"): 97 | if "/" not in tok: 98 | output.write(" " + posDir + "/" + tok) 99 | else: 100 | tmpToks = tok.split("/") 101 | output.write(" " + posDir + "/" + tmpToks[len(tmpToks)-1]) 102 | elif tok.endswith(".jar"): 103 | if "/" not in tok: 104 | output.write(" " + parserDir + "/" + tok) 105 | else: 106 | tmpToks = tok.split("/") 107 | output.write(" " + parserDir + "/" + tmpToks[len(tmpToks)-1]) 108 | else: 109 | output.write(" " + tok) 110 | output.write("\n") 111 | elif line.startswith("cp"): 112 | tokens = line.strip().split() 113 | for tok in tokens: 114 | if tok == "cp": 115 | output.write(tok) 116 | elif tok.endswith(".mco"): 117 | if "/" not in tok: 118 | output.write(" " + modelDir + "/maltparser/" + tok) 119 | else: 120 | tmpToks = tok.split("/") 121 | output.write(" " + modelDir + "/maltparser/" + tmpToks[len(tmpToks)-1]) 122 | else: 123 | output.write(" " + tok) 124 | output.write("\n") 125 | else: 126 | output.write(line) 127 | output.close() 128 | 129 | 130 | # update the output formatting code 131 | filenames = ["outputFormatting/English_run.sh", "outputFormatting/Chinese_run.sh", "outputFormatting/Spanish_run.sh"] 132 | for filename in filenames: 133 | curInput = open(filename, "r") 134 | lines = [] 135 | for line in curInput: 136 | lines.append(line) 137 | curInput.close() 138 | output = open(filename, "w") 139 | for line in lines: 140 | if line.startswith("python"): 141 | tokens = line.strip().split() 142 | for tok in tokens: 143 | if tok.startswith("python"): 144 | output.write(tok) 145 | elif 
tok.endswith(".dict"): 146 | if "/" not in tok: 147 | output.write(" " + modelDir + "/liblinear/" + tok) 148 | else: 149 | tmpToks = tok.split("/") 150 | output.write(" " + modelDir + "/liblinear/" + tmpToks[len(tmpToks)-1]) 151 | else: 152 | output.write(" " + tok) 153 | output.write("\n") 154 | else: 155 | output.write(line) 156 | output.close() 157 | 158 | # update the liblinear files 159 | filenames = ["runArguments.sh", "runArguments_providedTriggers.sh", "runRealis_providedTriggers.sh", "runRealis.sh", "runTriggers.sh"] 160 | 161 | curInput = open("all_predictions_4.0/code/writeTriggerLiblinear.py", "r") 162 | lines = [] 163 | for line in curInput: 164 | lines.append(line) 165 | curInput.close() 166 | output = open("all_predictions_4.0/code/writeTriggerLiblinear.py", "w") 167 | for line in lines: 168 | if line.startswith("WORD_EMBEDDING_PATH="): 169 | output.write("WORD_EMBEDDING_PATH=\"" + embeddingDir + "\"\n") 170 | elif line.startswith("UNIV_POS_PATH="): 171 | output.write("UNIV_POS_PATH=\"" + posDir + "\"\n") 172 | else: 173 | output.write(line) 174 | output.close() 175 | 176 | for filename in filenames: 177 | liblinearInput = open("all_predictions_4.0/" + filename, "r") 178 | lines = [] 179 | for line in liblinearInput: 180 | lines.append(line) 181 | liblinearInput.close() 182 | output = open("all_predictions_4.0/" + filename, "w") 183 | for line in lines: 184 | if line.startswith("LIBLINEAR_PATH="): 185 | output.write("LIBLINEAR_PATH=" + liblinearDir + "\n") 186 | elif line.startswith("${LIBLINEAR_PATH}") or line.startswith("python"): 187 | tokens = line.strip().split() 188 | for tok in tokens: 189 | if tok.startswith("${LIBLINEAR_PATH}") or tok.startswith("python"): 190 | output.write(tok) 191 | elif tok.endswith(".model") or tok.endswith(".dict"): 192 | if "/" not in tok: 193 | output.write(" " + modelDir + "/liblinear/" + tok) 194 | else: 195 | tmpToks = tok.split("/") 196 | output.write(" " + modelDir + "/liblinear/" + tmpToks[len(tmpToks)-1]) 197 | else: 198 | output.write(" " + tok) 199 | output.write("\n") 200 | else: 201 | output.write(line) 202 | output.close() 203 | 204 | # update the CoreNLP filepath - English 205 | corenlpInput = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Eng.sh", "r") 206 | lines = [] 207 | for line in corenlpInput: 208 | lines.append(line) 209 | corenlpInput.close() 210 | output = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Eng.sh", "w") 211 | for line in lines: 212 | if line.startswith("STANFORD_CORENLP"): 213 | output.write("STANFORD_CORENLP=" + corenlpDir + "\n") 214 | else: 215 | output.write(line) 216 | output.close() 217 | 218 | # update the NER filepath - English 219 | corenlpInput = open("preprocessing_2.0/entityExtraction/runEntities.sh", "r") 220 | lines = [] 221 | for line in corenlpInput: 222 | lines.append(line) 223 | corenlpInput.close() 224 | output = open("preprocessing_2.0/entityExtraction/runEntities.sh", "w") 225 | for line in lines: 226 | if line.startswith("STANFORD_NER"): 227 | output.write("STANFORD_NER=" + nerDir + "\n") 228 | elif line.startswith("\tjava -mx16g -cp"): 229 | tokens = line.strip().split() 230 | output.write("\t") 231 | for tok in tokens: 232 | if tok.endswith(".gz"): 233 | if "/" not in tok: 234 | output.write(" " + modelDir + "/entities/" + tok) 235 | else: 236 | tmpToks = tok.split("/") 237 | output.write(" " + modelDir + "/entities/" + tmpToks[len(tmpToks)-1]) 238 | elif tok == "java": 239 | output.write(tok) 240 | else: 241 | output.write(" " + tok) 242 | output.write("\n") 243 | else: 
244 | output.write(line) 245 | output.close() 246 | 247 | # update the CoreNLP filepath - Chinese 248 | corenlpInput = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Chn.sh", "r") 249 | lines = [] 250 | for line in corenlpInput: 251 | lines.append(line) 252 | corenlpInput.close() 253 | output = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Chn.sh", "w") 254 | for line in lines: 255 | if line.startswith("STANFORD_CORENLP"): 256 | output.write("STANFORD_CORENLP=" + corenlpDir + "\n") 257 | else: 258 | output.write(line) 259 | output.close() 260 | 261 | # update the NER filepath - Chinese 262 | corenlpInput = open("preprocessing_2.0/entityExtraction/runEntities_Chinese.sh", "r") 263 | lines = [] 264 | for line in corenlpInput: 265 | lines.append(line) 266 | corenlpInput.close() 267 | output = open("preprocessing_2.0/entityExtraction/runEntities_Chinese.sh", "w") 268 | for line in lines: 269 | if line.startswith("STANFORD_NER"): 270 | output.write("STANFORD_NER=" + nerDir + "\n") 271 | elif line.startswith("\tjava -mx16g -cp"): 272 | tokens = line.strip().split() 273 | output.write("\t") 274 | for tok in tokens: 275 | if tok.endswith(".gz"): 276 | if "/" not in tok: 277 | output.write(" " + modelDir + "/entities/" + tok) 278 | else: 279 | tmpToks = tok.split("/") 280 | output.write(" " + modelDir + "/entities/" + tmpToks[len(tmpToks)-1]) 281 | elif tok == "java": 282 | output.write(tok) 283 | else: 284 | output.write(" " + tok) 285 | output.write("\n") 286 | else: 287 | output.write(line) 288 | output.close() 289 | 290 | # update the CoreNLP filepath - Spanish 291 | corenlpInput = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Span.sh", "r") 292 | lines = [] 293 | for line in corenlpInput: 294 | lines.append(line) 295 | corenlpInput.close() 296 | output = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Span.sh", "w") 297 | for line in lines: 298 | if line.startswith("STANFORD_CORENLP"): 299 | output.write("STANFORD_CORENLP=" + corenlpDir + "\n") 300 | else: 301 | output.write(line) 302 | output.close() 303 | 304 | # update the NER filepath - Spanish 305 | corenlpInput = open("preprocessing_2.0/entityExtraction/runEntities_Spanish.sh", "r") 306 | lines = [] 307 | for line in corenlpInput: 308 | lines.append(line) 309 | corenlpInput.close() 310 | output = open("preprocessing_2.0/entityExtraction/runEntities_Spanish.sh", "w") 311 | for line in lines: 312 | if line.startswith("STANFORD_NER"): 313 | output.write("STANFORD_NER=" + nerDir + "\n") 314 | elif line.startswith("\tjava -mx16g -cp"): 315 | tokens = line.strip().split() 316 | output.write("\t") 317 | for tok in tokens: 318 | if tok.endswith(".gz"): 319 | if "/" not in tok: 320 | output.write(" " + modelDir + "/entities/" + tok) 321 | else: 322 | tmpToks = tok.split("/") 323 | output.write(" " + modelDir + "/entities/" + tmpToks[len(tmpToks)-1]) 324 | elif tok == "java": 325 | output.write(tok) 326 | else: 327 | output.write(" " + tok) 328 | output.write("\n") 329 | else: 330 | output.write(line) 331 | output.close() 332 | 333 | main() 334 | -------------------------------------------------------------------------------- /outputFormatting/Chinese_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # extract nugget output 4 | cd formatTriggers/format_andrew_triggers/ 5 | python format_andrew.py ../../../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions ../../../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Chn andrew.triggers.out 6 | 
cd ../../ 7 | 8 | # arguments: 1.) test.out file, from ../argumentPrediction/ 2.) Easy-read arguments file 3.) Roles dictionary 4.) Entity coref output 5.) docmap file 6.) stopwords file 7.) realis file 9 | python finalForm_KBP.py ../all_predictions_4.0/output.test.arguments ../all_predictions_4.0/arguments.out.easyRead /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../all_predictions_4.0/arguments.out.entityCoref ../preprocessing_2.0/documents.paths.tmp stopwords.txt ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 10 | 11 | # write nugget output, one per document 12 | python writeTriggerOutput.py formatTriggers/format_andrew_triggers/andrew.triggers.out 13 | 14 | # connect arguments and nuggets together 15 | python argument_nugget_linking.py ../preprocessing_2.0/documents.rootnames.tmp 16 | 17 | cd out 18 | ./moveToStore.sh 19 | -------------------------------------------------------------------------------- /outputFormatting/English_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # extract nugget output 4 | cd formatTriggers/format_andrew_triggers/ 5 | python format_andrew.py ../../../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions ../../../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp andrew.triggers.out 6 | cd ../../ 7 | 8 | # arguments: 1.) test.out file, from ../argumentPrediction/ 2.) Easy-read arguments file 3.) Roles dictionary 4.) Entity coref output 5.) docmap file 6.) stopwords file 7.) realis file 9 | python finalForm_KBP.py ../all_predictions_4.0/output.test.arguments ../all_predictions_4.0/arguments.out.easyRead /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../all_predictions_4.0/arguments.out.entityCoref ../preprocessing_2.0/documents.paths.tmp stopwords.txt ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 10 | 11 | # write nugget output, one per document 12 | python writeTriggerOutput.py formatTriggers/format_andrew_triggers/andrew.triggers.out 13 | 14 | # connect arguments and nuggets together 15 | python argument_nugget_linking.py ../preprocessing_2.0/documents.rootnames.tmp 16 | 17 | cd out 18 | ./moveToStore.sh 19 | -------------------------------------------------------------------------------- /outputFormatting/Spanish_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # extract nugget output 4 | cd formatTriggers/format_andrew_triggers/ 5 | python format_andrew.py ../../../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions ../../../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Span andrew.triggers.out 6 | cd ../../ 7 | 8 | # arguments: 1.) test.out file, from ../argumentPrediction/ 2.) Easy-read arguments file 3.) Roles dictionary 4.) Entity coref output 5.) docmap file 6.) stopwords file 7.) 
realis file 9 | python finalForm_KBP.py ../all_predictions_4.0/output.test.arguments ../all_predictions_4.0/arguments.out.easyRead /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../all_predictions_4.0/arguments.out.entityCoref ../preprocessing_2.0/documents.paths.tmp stopwords.txt ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 10 | 11 | # write nugget output, one per document 12 | python writeTriggerOutput.py formatTriggers/format_andrew_triggers/andrew.triggers.out 13 | 14 | # connect arguments and nuggets together 15 | python argument_nugget_linking.py ../preprocessing_2.0/documents.rootnames.tmp 16 | 17 | cd out 18 | ./moveToStore.sh 19 | -------------------------------------------------------------------------------- /outputFormatting/argument_nugget_linking.py: -------------------------------------------------------------------------------- 1 | # script to link together the argument and nugget files 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Expect list of filenames." 7 | sys.exit() 8 | 9 | argDir = "out/arguments/" 10 | argModDir = "out/linked_arguments/" 11 | nuggetDir = "out/nuggets/" 12 | linkingDir = "out/linking/" 13 | corpusLinking = "out/corpusLinking/corpusLinking" 14 | 15 | filenames = [] 16 | input = open(sys.argv[1], "r") 17 | for line in input: 18 | filenames.append(line.strip()) 19 | input.close() 20 | 21 | corpusID = 1 22 | corpusOut = open(corpusLinking, "w") 23 | 24 | for filename in filenames: 25 | nuggetDict = dict() # eventType -> offset -> ID 26 | nuggetNameDict = dict() # eventType -> offset -> nugget_string 27 | argumentDict = dict() # nuggetID -> attached_arguments 28 | 29 | try: 30 | input = open(nuggetDir + filename, "r") 31 | for line in input: 32 | if not line.startswith("#") and not line.startswith("@"): 33 | tokens = line.strip().split("\t") 34 | eventType = tokens[5] 35 | startOffset = tokens[3].split(",")[0] 36 | nuggetID = tokens[2] 37 | nuggetName = tokens[4] 38 | 39 | if eventType not in nuggetDict: 40 | nuggetDict[eventType] = dict() 41 | nuggetNameDict[eventType] = dict() 42 | nuggetDict[eventType][startOffset] = nuggetID 43 | nuggetNameDict[eventType][startOffset] = nuggetName 44 | 45 | input.close() 46 | except: 47 | print "No nugget file found for " + filename + "; continuing..." 
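# For each line in this document's arguments file, look up the matching nugget by (event type, trigger offset), replace the seventh column with the nugget's character span, and append the nugget ID; arguments are also grouped by nugget ID for the linking output written below.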
48 | 49 | input = open(argDir + filename, "r") 50 | output = open(argModDir + filename, "w") 51 | for line in input: 52 | tokens = line.strip().split("\t") 53 | triggerOffset = tokens[11] 54 | eventType = tokens[2] 55 | argumentID = tokens[0] 56 | 57 | 58 | # rewrite the arguments/ file 59 | nuggetID = nuggetDict[eventType][triggerOffset] 60 | nuggetSpan = triggerOffset + "-" + str(len(nuggetNameDict[eventType][triggerOffset]) + int(triggerOffset) - 1) 61 | count = 0 62 | for tok in tokens: 63 | if count == 6: 64 | output.write(nuggetSpan + "\t") 65 | else: 66 | output.write(tok + "\t") 67 | count += 1 68 | output.write(nuggetID + "\n") 69 | 70 | # store information for linking file 71 | if nuggetID not in argumentDict: 72 | argumentDict[nuggetID] = [] 73 | argumentDict[nuggetID].append(argumentID) 74 | input.close() 75 | output.close() 76 | 77 | # reread coreference and write linking file 78 | seenNuggets = set() 79 | linkingID = 1 80 | 81 | output = open(linkingDir + filename, "w") 82 | 83 | try: 84 | input = open(nuggetDir + filename, "r") 85 | for line in input: 86 | if line.startswith("@"): 87 | tokens = line.strip().split("\t") 88 | nuggets = tokens[2].split(",") 89 | first = True 90 | 91 | outputArgs = [] 92 | 93 | for nuggetID in nuggets: 94 | if nuggetID in argumentDict: 95 | seenNuggets.add(nuggetID) 96 | 97 | argumentList = argumentDict[nuggetID] 98 | for arg in argumentList: 99 | outputArgs.append(arg) 100 | 101 | 102 | if len(outputArgs) > 0: 103 | output.write(str(linkingID) + "\t") 104 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 105 | corpusID += 1 106 | 107 | for index in range(len(outputArgs)): 108 | if index == 0: 109 | output.write(outputArgs[index]) 110 | else: 111 | output.write(" " + outputArgs[index]) 112 | output.write("\n") 113 | 114 | linkingID += 1 115 | 116 | # now, write any singleton nuggets 117 | for nugget in argumentDict: 118 | if nugget not in seenNuggets: 119 | output.write(str(linkingID) + "\t") 120 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 121 | corpusID += 1 122 | 123 | argumentList = argumentDict[nugget] 124 | for index in range(len(argumentList)): 125 | if index == 0: 126 | output.write(argumentList[index]) 127 | linkingID += 1 128 | else: 129 | output.write(" " + argumentList[index]) 130 | output.write("\n") 131 | 132 | input.close() 133 | except: 134 | print "Skipping empty nugget file again..." 
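# Close this document's linking file; the corpus-level corpusLinking file stays open until every document has been processed.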
135 | 136 | output.close() 137 | 138 | 139 | corpusOut.close() 140 | 141 | main() 142 | -------------------------------------------------------------------------------- /outputFormatting/finalForm_KBP.py: -------------------------------------------------------------------------------- 1 | # script to change the format to the TAC KBP 2015 required formatting 2 | import string 3 | import sys 4 | 5 | responseIDs = set() 6 | eventIDs = dict() # dict from eventString -> ID 7 | 8 | seenResponses = set() 9 | 10 | stopwordSet = set() 11 | 12 | corefClusters = dict() # dict from corefID -> set of strings 13 | 14 | docDict = dict() # dict from docID -> filename 15 | 16 | # example below: 17 | #sent_1 Phrase: they CorefStr: coref_2 Role: Person Trigger: arriving EventType: Movement_Transport EntityType: PER.Group DOCID: CNNHL_ENG_20030416_133739.9 START: 127 END: 131 18 | class ProcessedArgument: 19 | def __init__(self, easyReadLine, assignedRole, confArg, curRealis): 20 | tokens = easyReadLine.strip().split("\t") 21 | sentStr = tokens[0] 22 | self.text = tokens[2] 23 | 24 | self.corefID = tokens[4] + "_" + tokens[14] 25 | self.triggerText = tokens[8] 26 | 27 | eventTokens = tokens[10].split("_") 28 | self.eventType = convertEventType(eventTokens[0])+ "." + convertEventType(eventTokens[1]) 29 | 30 | self.realis = curRealis 31 | if self.realis == "UNK_REALIS": 32 | self.realis = "ACTUAL" 33 | 34 | self.entityType = convertEntityType(tokens[12].split(".")[0]) 35 | 36 | self.role = assignedRole 37 | 38 | self.JET_role = tokens[6] 39 | 40 | self.confidence = confArg 41 | self.docID = tokens[14] 42 | if self.docID.endswith(".mpdf"): 43 | self.docID = self.docID[:-5] 44 | 45 | if self.docID.startswith("CMN"): 46 | tmp = removeWhitespace(self.text) 47 | self.text = tmp 48 | 49 | self.baseStart = tokens[16] 50 | self.baseEnd = str(int(tokens[18]) - 1) # note that we should be one less -- ACE vs KBP differences 51 | 52 | self.sentStart = tokens[20] 53 | self.sentEnd = str(int(tokens[22]) - 1) # note that we should be one less -- ACE vs KBP differences 54 | 55 | # use this to integrate with the event nugget data (Feb. 
10, 2017) 56 | self.triggerOffset = tokens[24] 57 | 58 | # ACE -> KBP changes 59 | if self.eventType == "Contact.Phone-Write": 60 | self.eventType = "Contact.Correspondence" 61 | 62 | if self.eventType == "Transaction.Transfer-Ownership": 63 | if self.role == "Artifact": 64 | self.role = "Thing" # ACE "Artifact" maps to the KBP "Thing" role 65 | 66 | if self.eventType == "Movement.Transport": 67 | if self.role == "Artifact": 68 | self.eventType = "Movement.Transport-Artifact" 69 | else: 70 | self.eventType = "Movement.Transport-Person" 71 | 72 | if self.role.startswith("Time"): 73 | self.role = "Time" 74 | 75 | if self.JET_role.startswith("Time"): 76 | self.JET_role = "Time" 77 | 78 | def activateJET(self): 79 | self.role = self.JET_role 80 | 81 | def removeWhitespace(arg): 82 | alt = "" 83 | for character in arg: 84 | if character not in string.whitespace: 85 | alt += character 86 | return alt 87 | 88 | def convertWhitespace(arg): 89 | alt = "" 90 | for character in arg: 91 | if character in string.whitespace: 92 | alt += " " 93 | else: 94 | alt += character 95 | return alt 96 | 97 | def isYear(text): 98 | if len(text) != 4: 99 | return False 100 | for index in range(4): 101 | if text[index] not in string.digits: 102 | return False 103 | return True 104 | 105 | def isNumber(text): 106 | for character in text: 107 | if character not in string.digits: 108 | return False 109 | return True 110 | 111 | def isDay(text, prevMonth): 112 | if prevMonth and (len(text) == 1 or len(text) == 2): 113 | if isNumber(text): 114 | num = int(text) 115 | if num >= 1 and num <= 31: 116 | if len(text) == 1: 117 | return "0" + str(num) 118 | else: 119 | return str(num) 120 | return "" 121 | if len(text) == 3: 122 | numPart = text[0] 123 | if isNumber(numPart): 124 | num = int(numPart) 125 | if num >=1 and num <= 31: 126 | return "0" + str(num) 127 | return "" 128 | if len(text) == 4: 129 | numPart = text[0] + text[1] 130 | if isNumber(numPart): 131 | num = int(numPart) 132 | if num >=1 and num <= 31: 133 | return str(num) 134 | return "" 135 | 136 | return "" 137 | 138 | def isMonth(text): 139 | temp = text.lower() 140 | if temp == "january" or temp == "jan" or temp == "jan.": 141 | return "01" 142 | elif temp == "february" or temp == "feb" or temp == "feb.": 143 | return "02" 144 | elif temp == "march" or temp == "mar" or temp == "mar.": 145 | return "03" 146 | elif temp == "april" or temp == "apr" or temp == "apr.": 147 | return "04" 148 | elif temp == "may": 149 | return "05" 150 | elif temp == "june" or temp == "jun" or temp == "jun.": 151 | return "06" 152 | elif temp == "july" or temp == "jul" or temp == "jul.": 153 | return "07" 154 | elif temp == "august" or temp == "aug" or temp == "aug.": 155 | return "08" 156 | elif temp == "september" or temp == "sept" or temp == "sept."
or temp == "sep" or temp == "sep.": 157 | return "09" 158 | elif temp == "october" or temp == "oct" or temp == "oct.": 159 | return "10" 160 | elif temp == "november" or temp == "nov" or temp == "nov.": 161 | return "11" 162 | elif temp == "december" or temp == "dec" or temp == "dec.": 163 | return "12" 164 | else: 165 | return "" 166 | 167 | def timeNormalization(timeString): 168 | year = "XXXX" 169 | month = "XX" 170 | day = "XX" 171 | 172 | prevMonth = False 173 | 174 | tokens = convertWhitespace(timeString).split(" ") 175 | for tok in tokens: 176 | if isYear(tok): 177 | year = tok 178 | continue 179 | monthStr = isMonth(tok) 180 | if monthStr != "": 181 | month = monthStr 182 | prevMonth = True 183 | continue 184 | 185 | dayStr = isDay(tok, prevMonth) 186 | if dayStr != "": 187 | day = dayStr 188 | continue 189 | prevMonth = False 190 | 191 | finalString = year + "-" + month + "-" + day 192 | return finalString 193 | 194 | def validEntityType(argument): 195 | role = argument.role 196 | entityType = argument.entityType 197 | 198 | # if we don't know the entity type, assume valid 199 | if entityType == "NULL": 200 | return True 201 | 202 | validSet = set() 203 | 204 | if role == "Adjudicator": 205 | validSet.add("PER") 206 | validSet.add("ORG") 207 | validSet.add("GPE") 208 | elif role == "Agent": 209 | validSet.add("PER") 210 | validSet.add("ORG") 211 | validSet.add("GPE") 212 | validSet.add("FAC") 213 | elif role == "Artifact": 214 | validSet.add("VEH") 215 | validSet.add("WEA") 216 | validSet.add("FAC") 217 | validSet.add("ORG") 218 | validSet.add("COM") 219 | elif role == "Attacker": 220 | validSet.add("PER") 221 | validSet.add("ORG") 222 | validSet.add("GPE") 223 | elif role == "Beneficiary": 224 | validSet.add("PER") 225 | validSet.add("ORG") 226 | validSet.add("GPE") 227 | elif role == "Buyer": 228 | validSet.add("PER") 229 | validSet.add("ORG") 230 | validSet.add("GPE") 231 | elif role == "Crime": 232 | validSet.add("CRIME") 233 | elif role == "Defendant": 234 | validSet.add("PER") 235 | validSet.add("ORG") 236 | validSet.add("GPE") 237 | elif role == "Destination": 238 | validSet.add("GPE") 239 | validSet.add("LOC") 240 | validSet.add("FAC") 241 | elif role == "Entity": 242 | validSet.add("ORG") 243 | validSet.add("GPE") 244 | validSet.add("PER") 245 | elif role == "Giver": 246 | validSet.add("ORG") 247 | validSet.add("GPE") 248 | validSet.add("PER") 249 | elif role == "Instrument": 250 | validSet.add("WEA") 251 | validSet.add("VEH") 252 | elif role == "Money": 253 | validSet.add("MONEY") 254 | validSet.add("NUM") 255 | elif role == "Org": 256 | validSet.add("ORG") 257 | elif role == "Origin": 258 | validSet.add("GPE") 259 | validSet.add("LOC") 260 | validSet.add("FAC") 261 | elif role == "Person": 262 | validSet.add("PER") 263 | elif role == "Place": 264 | validSet.add("GPE") 265 | validSet.add("LOC") 266 | validSet.add("FAC") 267 | elif role == "Plaintiff": 268 | validSet.add("PER") 269 | validSet.add("ORG") 270 | validSet.add("GPE") 271 | elif role == "Position": 272 | validSet.add("JOB") 273 | elif role == "Price": 274 | validSet.add("MONEY") 275 | validSet.add("NUM") 276 | elif role == "Prosecutor": 277 | validSet.add("PER") 278 | validSet.add("ORG") 279 | validSet.add("GPE") 280 | elif role == "Recipient": 281 | validSet.add("PER") 282 | validSet.add("ORG") 283 | validSet.add("GPE") 284 | elif role == "Seller": 285 | validSet.add("PER") 286 | validSet.add("ORG") 287 | validSet.add("GPE") 288 | elif role == "Sentence": 289 | validSet.add("SENTENCE") 290 | elif role == 
"Target": 291 | validSet.add("PER") 292 | validSet.add("ORG") 293 | validSet.add("VEH") 294 | validSet.add("FAC") 295 | validSet.add("WEA") 296 | elif role == "Vehicle": 297 | validSet.add("VEH") 298 | elif role == "Victim": 299 | validSet.add("PER") 300 | elif role.startswith("Time"): 301 | validSet.add("TIME") 302 | elif role == "Audience": 303 | validSet.add("PER") 304 | validSet.add("ORG") 305 | validSet.add("GPE") 306 | elif role == "Thing": 307 | validSet.add("VEH") 308 | validSet.add("WEA") 309 | validSet.add("ORG") 310 | validSet.add("FAC") 311 | else: 312 | print "Don't recognize this role: " + role 313 | return False 314 | 315 | if entityType not in validSet: 316 | return False 317 | return True 318 | 319 | def validRole(argument): 320 | eventType = argument.eventType 321 | role = argument.role 322 | 323 | validSet = set() 324 | 325 | notKBP2016_set = set(["Business.Mergeorg", "Business.Startorg", "Business.Endorg", "Life.Beborn", "Business.Declarebankruptcy", "Justice.Releaseparole", "Justice.Chargeindict", "Justice.Trialhearing", "Business.Declare-Bankruptcy", "Business.Merge-Org", "Life.Marry", "Life.Divorce", "Personnel.Nominate", "Justice.Release-Parole", "Justice.Trial-Hearing", "Justice.Sentence", "Justice.Fine", "Justice.Charge-Indict", "Justice.Sue", "Justice.Extradite", "Justice.Acquit", "Justice.Convict", "Justice.Appeal", "Justice.Execute", "Justice.Pardon", "Manufacture.Artifact"]) 326 | if eventType in notKBP2016_set: 327 | return False 328 | 329 | 330 | # if eventType == "Business.Declare-Bankruptcy": 331 | # validSet.add("Org") 332 | # elif eventType == "Business.Merge-Org": 333 | # validSet.add("Org") 334 | if eventType == "Conflict.Attack": 335 | validSet.add("Attacker") 336 | validSet.add("Target") 337 | validSet.add("Instrument") 338 | elif eventType == "Conflict.Demonstrate": 339 | validSet.add("Entity") 340 | elif eventType == "Contact.Meet": 341 | validSet.add("Entity") 342 | elif eventType == "Contact.Correspondence": 343 | validSet.add("Entity") 344 | elif eventType == "Contact.Contact": 345 | validSet.add("Entity") 346 | elif eventType == "Contact.Broadcast": 347 | validSet.add("Audience") 348 | validSet.add("Entity") 349 | # elif eventType == "Life.Marry": 350 | # validSet.add("Person") 351 | # elif eventType == "Life.Divorce": 352 | # validSet.add("Person") 353 | elif eventType == "Life.Injure": 354 | validSet.add("Agent") 355 | validSet.add("Victim") 356 | validSet.add("Instrument") 357 | elif eventType == "Life.Die": 358 | validSet.add("Agent") 359 | validSet.add("Victim") 360 | validSet.add("Instrument") 361 | elif eventType == "Movement.Transport-Person": 362 | validSet.add("Agent") 363 | validSet.add("Person") 364 | validSet.add("Instrument") 365 | validSet.add("Origin") 366 | validSet.add("Destination") 367 | elif eventType == "Movement.Transport-Artifact": 368 | validSet.add("Agent") 369 | validSet.add("Artifact") 370 | validSet.add("Instrument") 371 | validSet.add("Origin") 372 | validSet.add("Destination") 373 | elif eventType == "Personnel.Start-Position": 374 | validSet.add("Person") 375 | validSet.add("Entity") 376 | validSet.add("Position") 377 | elif eventType == "Personnel.End-Position": 378 | validSet.add("Person") 379 | validSet.add("Entity") 380 | validSet.add("Position") 381 | # elif eventType == "Personnel.Nominate": 382 | # validSet.add("Agent") 383 | # validSet.add("Person") 384 | # validSet.add("Position") 385 | elif eventType == "Personnel.Elect": 386 | validSet.add("Person") 387 | validSet.add("Agent") 388 | 
validSet.add("Position") 389 | elif eventType == "Transaction.Transaction": 390 | validSet.add("Giver") 391 | validSet.add("Recipient") 392 | validSet.add("Beneficiary") 393 | elif eventType == "Transaction.Transfer-Ownership": 394 | validSet.add("Giver") 395 | validSet.add("Recipient") 396 | validSet.add("Beneficiary") 397 | validSet.add("Thing") 398 | elif eventType == "Transaction.Transfer-Money": 399 | validSet.add("Giver") 400 | validSet.add("Recipient") 401 | validSet.add("Beneficiary") 402 | validSet.add("Money") 403 | elif eventType == "Justice.Arrest-Jail": 404 | validSet.add("Agent") 405 | validSet.add("Person") 406 | validSet.add("Crime") 407 | # elif eventType == "Justice.Release-Parole": 408 | # validSet.add("Entity") 409 | # validSet.add("Person") 410 | # validSet.add("Crime") 411 | # elif eventType == "Justice.Trial-Hearing": 412 | # validSet.add("Prosecutor") 413 | # validSet.add("Adjudicator") 414 | # validSet.add("Defendant") 415 | # validSet.add("Crime") 416 | # elif eventType == "Justice.Sentence": 417 | # validSet.add("Adjudicator") 418 | # validSet.add("Defendant") 419 | # validSet.add("Sentence") 420 | # validSet.add("Crime") 421 | # elif eventType == "Justice.Fine": 422 | # validSet.add("Adjudicator") 423 | # validSet.add("Entity") 424 | # validSet.add("Money") 425 | # validSet.add("Crime") 426 | # elif eventType == "Justice.Charge-Indict": 427 | # validSet.add("Prosecutor") 428 | # validSet.add("Adjudicator") 429 | # validSet.add("Defendant") 430 | # validSet.add("Crime") 431 | # elif eventType == "Justice.Sue": 432 | # validSet.add("Plantiff") 433 | # validSet.add("Adjudicator") 434 | # validSet.add("Defendant") 435 | # validSet.add("Crime") 436 | # elif eventType == "Justice.Extradite": 437 | # validSet.add("Agent") 438 | # validSet.add("Person") 439 | # validSet.add("Origin") 440 | # validSet.add("Destination") 441 | # validSet.add("Crime") 442 | # elif eventType == "Justice.Acquit": 443 | # validSet.add("Adjudicator") 444 | # validSet.add("Defendant") 445 | # validSet.add("Crime") 446 | # elif eventType == "Justice.Convict": 447 | # validSet.add("Adjudicator") 448 | # validSet.add("Defendant") 449 | # validSet.add("Crime") 450 | # elif eventType == "Justice.Appeal": 451 | # validSet.add("Prosecutor") 452 | # validSet.add("Adjudicator") 453 | # validSet.add("Defendant") 454 | # validSet.add("Crime") 455 | # elif eventType == "Justice.Execute": 456 | # validSet.add("Agent") 457 | # validSet.add("Person") 458 | # validSet.add("Crime") 459 | # elif eventType == "Justice.Pardon": 460 | # validSet.add("Adjudicator") 461 | # validSet.add("Defendant") 462 | # validSet.add("Crime") 463 | # elif eventType == "Manufacture.Artifact": 464 | # validSet.add("Agent") 465 | # validSet.add("Artifact") 466 | # validSet.add("Instrument") 467 | else: 468 | print "Don't recognize this event type: " + eventType 469 | return False 470 | 471 | if role == "Place" and eventType.startswith("Movement"): 472 | return False 473 | 474 | if role == "Place" or role.startswith("Time"): 475 | return True 476 | 477 | if role not in validSet: 478 | return False 479 | return True 480 | 481 | 482 | 483 | def main(): 484 | if len(sys.argv) != 8: 485 | print "Expect predictions file, easyRead file, roles dict, coref file, docID dictionary file, stopwords list, realisOutput." 486 | print "Output to be placed in out/arguments/ and out/linking" 487 | sys.exit() 488 | 489 | # first, write an empty file for each docID. 
At least make sure we have a file, even if we don't find any arguments 490 | input = open(sys.argv[6], "r") 491 | for line in input: 492 | word = line.strip() 493 | stopwordSet.add(word) 494 | input.close() 495 | 496 | input = open(sys.argv[5], "r") 497 | for line in input: 498 | tokens = line.strip().split("\t") 499 | key = tokens[0] 500 | 501 | if key.endswith(".mpdf"): 502 | key = key[:-5] 503 | 504 | 505 | filename = tokens[1] 506 | docDict[key] = filename 507 | 508 | output = open("out/arguments/" + key, "w") 509 | output.close() 510 | output = open("out/linking/" + key, "w") 511 | output.close() 512 | output = open("out/corpusLinking/corpusLinking", "w") 513 | 514 | input.close() 515 | 516 | predictionsRaw = [] 517 | confidence = [] 518 | input = open(sys.argv[1], "r") 519 | labelOnly = True 520 | for line in input: 521 | if line.startswith("labels"): 522 | labelOnly = False 523 | continue 524 | if labelOnly: 525 | predictionsRaw.append(line.strip()) 526 | confidence.append("0.5") 527 | else: 528 | tempTokens = line.split(" ") 529 | temp = tempTokens[0] 530 | predictionsRaw.append(temp) 531 | confidence.append(tempTokens[int(temp)]) 532 | input.close() 533 | 534 | input = open(sys.argv[4], "r") 535 | for line in input: 536 | tokens = line.strip().split("\t") 537 | text = tokens[0] 538 | corefID = tokens[1] 539 | start = tokens[2] 540 | end = str(int(tokens[3]) - 1) 541 | 542 | if corefID not in corefClusters: 543 | corefClusters[corefID] = set() 544 | corefClusters[corefID].add(text + "|||" + start + "|||" + end) 545 | input.close() 546 | 547 | roleDict = dict() 548 | input = open(sys.argv[3], "r") 549 | for line in input: 550 | tokens = line.strip().split(":") 551 | 552 | ### 2016 -- convert labels to correct format 553 | roleDict[tokens[1]] = convertRoleLabels(tokens[0]) 554 | input.close() 555 | 556 | # read the realis labels 557 | realis = [] 558 | input = open(sys.argv[7], "r") 559 | for line in input: 560 | start = line.strip().rfind("|") 561 | realis.append(line.strip()[start+1:]) 562 | input.close() 563 | 564 | predictions = [] 565 | input = open(sys.argv[2], "r") 566 | index = 0 567 | for line in input: 568 | predictedRole = roleDict[predictionsRaw[index]] 569 | curConf = confidence[index] 570 | curRealis = realis[index] 571 | 572 | arg = ProcessedArgument(line, predictedRole, curConf, curRealis) 573 | predictions.append(arg) 574 | 575 | index += 1 576 | input.close() 577 | 578 | docDict_Args = dict() # dict from docID -> set of string 579 | docDict_Linking = dict() # dict from docID -> dict:{eventID -> set of responseIDs} 580 | 581 | # the system predicted ones 582 | for arg in predictions: 583 | if arg.role == "NONE": 584 | continue 585 | 586 | if not validRole(arg) or not validEntityType(arg): 587 | continue 588 | 589 | 590 | argString, docID, eventID, responseID, responseString = readArgument(arg) 591 | 592 | if responseString in seenResponses: 593 | continue 594 | 595 | seenResponses.add(responseString) 596 | 597 | if docID not in docDict_Args: 598 | docDict_Args[docID] = set() 599 | docDict_Linking[docID] = dict() 600 | docDict_Args[docID].add(argString) 601 | 602 | if arg.realis != "GENERIC": 603 | if eventID not in docDict_Linking[docID]: 604 | docDict_Linking[docID][eventID] = set() 605 | docDict_Linking[docID][eventID].add(responseID) 606 | 607 | for docID in docDict_Args: 608 | output = open("out/arguments/" + docID, "w") 609 | for line in docDict_Args[docID]: 610 | output.write(line) 611 | output.close() 612 | 613 | corpusOutput = 
open("out/corpusLinking/corpusLinking", "w") 614 | corpusCount = 1 615 | for docID in docDict_Linking: 616 | output = open("out/linking/" + docID, "w") 617 | eventCount = 1 618 | for eventID in docDict_Linking[docID]: 619 | output.write(str(eventCount) + "\t") 620 | corpusOutput.write(str(corpusCount) + "\t" + docID + "-" + str(eventCount) + "\n") 621 | 622 | eventCount += 1 623 | corpusCount += 1 624 | 625 | idSet = docDict_Linking[docID][eventID] 626 | line = "" 627 | for item in idSet: 628 | line += str(item) + " " 629 | line = line.strip() 630 | output.write(line.strip() + "\n") 631 | output.close() 632 | corpusOutput.close() 633 | 634 | def properNoun(text): 635 | tokens = text.split(" ") 636 | proper = False 637 | for tok in tokens: 638 | if tok.lower() in stopwordSet: 639 | continue 640 | elif tok.lower() != tok: 641 | proper = True 642 | return proper 643 | 644 | def canonicalForm(stringSet): 645 | best = "" 646 | bestStart = -1 647 | bestEnd = -1 648 | bestCapital = False 649 | containsComma = False 650 | for item in stringSet: 651 | tokens = item.split("|||") 652 | text = tokens[0] 653 | start = tokens[1] 654 | end = tokens[2] 655 | 656 | proper = properNoun(text) 657 | 658 | if best == "": 659 | best = text 660 | bestStart = start 661 | bestEnd = end 662 | if "," in text: 663 | containsComma = True 664 | if proper: 665 | bestCapital = True 666 | elif proper: 667 | if not bestCapital: 668 | best = text 669 | bestStart = start 670 | bestEnd = end 671 | bestCapital = True 672 | if "," in text: 673 | containsComma = True 674 | elif "," not in text and (len(text) > len(best) or containsComma): 675 | best = text 676 | bestStart = start 677 | bestEnd = end 678 | bestCapital = True 679 | elif not bestCapital and "," not in text and (len(text) > len(best) or containsComma): 680 | best = text 681 | bestStart = start 682 | bestEnd = end 683 | 684 | return best, bestStart, bestEnd 685 | 686 | def convertOffset(value, docID): 687 | ### counting the XML now in offsets, don't need below 688 | return value 689 | 690 | #filename = docDict[docID] 691 | #input = open(filename, "r") 692 | #nonXML_Index = 0 693 | #withXML_Index = 0 694 | 695 | #debug = "" 696 | 697 | #inXML = False 698 | #broke = False 699 | #for line in input: 700 | # for character in line: 701 | # if nonXML_Index == value: 702 | # broke = True 703 | # break 704 | 705 | # withXML_Index += 1 706 | # if character == "<": 707 | # inXML = True 708 | # elif character == ">": 709 | # inXML = False 710 | # elif not inXML: 711 | # nonXML_Index += 1 712 | # debug += character 713 | 714 | #input.close() 715 | 716 | #if not broke: 717 | # print nonXML_Index 718 | # print value 719 | # print filename 720 | # print "ERROR!!!!" 
721 | # sys.exit() 722 | 723 | #print "\t\t" + debug 724 | 725 | #return withXML_Index 726 | 727 | def readArgument(inputArg): 728 | responseID = len(responseIDs) 729 | responseIDs.add(responseID) 730 | 731 | docID = inputArg.docID 732 | ### NEW -- remove .xml extension 733 | if docID.endswith(".xml"): 734 | docID = docID[:-4] 735 | 736 | 737 | eventType = inputArg.eventType 738 | role = inputArg.role 739 | 740 | CAS_String, CAS_start, CAS_end = canonicalForm(corefClusters[inputArg.corefID]) 741 | # adjust whitespace 742 | temp = convertWhitespace(CAS_String) 743 | CAS_String = temp 744 | 745 | if role == "Time": 746 | alternate_CAS_String = timeNormalization(CAS_String) 747 | CAS_String = alternate_CAS_String 748 | 749 | #offsets = CAS_start + "-" + CAS_end 750 | adjusted_CAS_start = convertOffset(int(CAS_start), docID) 751 | adjusted_CAS_end = convertOffset(int(CAS_end), docID) 752 | 753 | if adjusted_CAS_end < adjusted_CAS_start: 754 | adjusted_CAS_end = adjusted_CAS_start 755 | 756 | offsets = str(adjusted_CAS_start) + "-" + str(adjusted_CAS_end) 757 | 758 | adjusted_sentStart = convertOffset(int(inputArg.sentStart), docID) 759 | adjusted_sentEnd = convertOffset(int(inputArg.sentEnd), docID) 760 | justificationOffset = str(adjusted_sentStart) + "-" + str(adjusted_sentEnd) 761 | 762 | baseFiller = inputArg.text 763 | adjusted_baseStart = convertOffset(int(inputArg.baseStart), docID) 764 | adjusted_baseEnd = convertOffset(int(inputArg.baseEnd), docID) 765 | 766 | ### Linking with nuggets -- ColdStart++ 767 | triggerOffset = inputArg.triggerOffset 768 | 769 | ### KBP2016 -- no entity coref 770 | CAS_String = baseFiller 771 | CAS_start = adjusted_baseStart 772 | CAS_end = adjusted_baseEnd 773 | offsets = str(CAS_start) + "-" + str(CAS_end) 774 | 775 | ### KBP2016 -- justification must be < 200 characters 776 | while adjusted_sentEnd - adjusted_sentStart >= 200: 777 | if adjusted_baseEnd != adjusted_sentEnd: 778 | adjusted_sentEnd -= 1 779 | elif adjusted_baseStart != adjusted_sentStart: 780 | adjusted_sentStart += 1 781 | else: 782 | adjusted_sentEnd -= 1 783 | justificationOffset = str(adjusted_sentStart) + "-" + str(adjusted_sentEnd) 784 | 785 | if adjusted_baseEnd < adjusted_baseStart: 786 | adjusted_baseEnd = adjusted_baseStart 787 | baseFillerOffsets = str(adjusted_baseStart) + "-" + str(adjusted_baseEnd) 788 | 789 | argJustificationOffsets = "NIL" 790 | realis = inputArg.realis 791 | confidence = inputArg.confidence # [0-1] 792 | 793 | # 2016 -- link things together if they have the same docID and same eventType 794 | eventString = docID + "_" + eventType 795 | if eventString not in eventIDs: 796 | eventIDs[eventString] = len(eventIDs) 797 | 798 | 799 | ### original version below (before ColdStart++, used for TAC KBP 2016) 800 | #outputString = str(responseID) + "\t" + docID + "\t" + eventType + "\t" + role + "\t" + CAS_String + "\t" + offsets + "\t" + justificationOffset + "\t" + baseFillerOffsets + "\t" + argJustificationOffsets + "\t" + realis + "\t" + confidence + "\n" 801 | # new version -- used for ColdStart++ merging with nuggets 802 | outputString = str(responseID) + "\t" + docID + "\t" + eventType + "\t" + role + "\t" + CAS_String + "\t" + offsets + "\t" + justificationOffset + "\t" + baseFillerOffsets + "\t" + argJustificationOffsets + "\t" + realis + "\t" + confidence + "\t" + triggerOffset + "\n" 803 | 804 | # below: not for output, but for identifying arguments that end up having the same ID (e.g. 
both play the AGENT role of some trigger in the same sentence) 805 | responseString = docID + "\t" + eventType + "\t" + role + "\t" + CAS_String + "\t" + offsets + "\t" + justificationOffset + "\t" + baseFillerOffsets + "\t" + argJustificationOffsets + "\t" + realis + "\n" 806 | 807 | return outputString, docID, eventIDs[eventString], responseID, responseString 808 | 809 | def convertRoleLabels(label): 810 | newLabel = "" 811 | prevChar = "" 812 | first = True 813 | for character in label: 814 | if first: 815 | newLabel += character.upper() 816 | first = False 817 | elif prevChar in string.punctuation: 818 | newLabel += character.upper() 819 | else: 820 | newLabel += character 821 | 822 | prevChar = character 823 | 824 | return newLabel 825 | 826 | def convertEventType(text): 827 | tmp = convertRoleLabels(text) 828 | 829 | if tmp == "Transportperson": 830 | return "Transport-Person" 831 | elif tmp == "Transportartifact": 832 | return "Transport-Artifact" 833 | elif tmp == "Endposition": 834 | return "End-Position" 835 | elif tmp == "Startposition": 836 | return "Start-Position" 837 | elif tmp == "Arrestjail": 838 | return "Arrest-Jail" 839 | elif tmp == "Transfermoney": 840 | return "Transfer-Money" 841 | elif tmp == "Transferownership": 842 | return "Transfer-Ownership" 843 | else: 844 | return tmp 845 | 846 | def convertEntityType(text): 847 | if text == "weapon": 848 | return "WEA" 849 | elif text == "vehicle": 850 | return "VEH" 851 | elif text == "sentence": 852 | return "Sentence" 853 | elif text == "crime": 854 | return "CRIME" 855 | elif text == "title": 856 | return "Title" 857 | elif text == "money": 858 | return "MONEY" 859 | elif text == "time": 860 | return "TIME" 861 | return text 862 | 863 | main() 864 | -------------------------------------------------------------------------------- /outputFormatting/formatTriggers/format_andrew_triggers/format_andrew.py: -------------------------------------------------------------------------------- 1 | # script to convert my output files to the Event Nugget Output format 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Need output triggers (with role names), createSetFiles file, output file." 
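# Each nugget line written below is tab-separated: system ID, document ID, mention ID, "start,end" character offsets, trigger text, event type, realis (always "Actual"), and confidence (fixed at 0.5).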
7 | sys.exit() 8 | 9 | triggers = [] 10 | input = open(sys.argv[1], "r") 11 | for line in input: 12 | triggers.append(line.strip().lower()) 13 | input.close() 14 | 15 | input = open(sys.argv[2], "r") 16 | output = open(sys.argv[3], "w") 17 | 18 | curDoc = "" 19 | mentionID = 1 20 | index = 0 21 | 22 | for line in input: 23 | clean = line.strip() 24 | if clean != "": 25 | tokens = line.strip().split("\t") 26 | docID = tokens[5] 27 | 28 | if docID.endswith(".xml"): 29 | docID = docID[:-4] 30 | 31 | if docID != curDoc: 32 | if curDoc != "": 33 | output.write("#EndOfDocument\n") 34 | output.write("#BeginOfDocument " + docID + "\n") 35 | curDoc = docID 36 | mentionID = 1 37 | 38 | startOffset = tokens[0] 39 | endOffset = tokens[1] 40 | word = tokens[2] 41 | 42 | # skip rest if the word isn't a trigger 43 | if triggers[index] == "not_trigger_not_trigger": 44 | index += 1 45 | continue 46 | 47 | output.write("andrewSystem\t" + curDoc + "\t" + str(mentionID) + "\t" + startOffset + "," + endOffset + "\t" + word + "\t" + triggers[index] + "\tActual" + "\t0.5" + "\n") 48 | mentionID += 1 49 | index += 1 50 | 51 | output.write("#EndOfDocument\n") 52 | input.close() 53 | output.close() 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /outputFormatting/formatTriggers/format_hector_triggers/format_hector.py: -------------------------------------------------------------------------------- 1 | # script to convert my output files to the Event Nugget Output format 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 3: 6 | print "Need nuggets from Jun, output file." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | 12 | docID = "" 13 | for line in input: 14 | if line.startswith("#BeginOfDocument"): 15 | tokens = line.strip().split() 16 | name = tokens[1] 17 | if name.endswith(".xml"): 18 | name = name[:-4] 19 | output.write(tokens[0] + " " + name + "\n") 20 | elif line.startswith("#EndOfDocument"): 21 | output.write(line) 22 | elif line.startswith("@Coreference"): 23 | output.write(line) 24 | else: 25 | tokens = line.strip().split("\t") 26 | 27 | if tokens[5] == "OUTSIDE": 28 | continue 29 | 30 | # labelTokens = tokens[5].split(".") 31 | labelTokens = tokens[5].split("_") 32 | label = labelTokens[0] + "_" + labelTokens[1] 33 | 34 | # output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + tokens[5].lower() + "\t" + tokens[6] + "\n") 35 | # output.write("hectorSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\n") 36 | output.write("hectorSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\t" + "0.5" + "\n") 37 | input.close() 38 | output.close() 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | main() 47 | -------------------------------------------------------------------------------- /outputFormatting/formatTriggers/format_jun_triggers/format_jun.py: -------------------------------------------------------------------------------- 1 | # script to convert my output files to the Event Nugget Output format 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 3: 6 | print "Need nuggets from Jun, output file." 
7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | 12 | docID = "" 13 | for line in input: 14 | if line.startswith("#BeginOfDocument"): 15 | tokens = line.strip().split() 16 | name = tokens[1] 17 | if name.endswith(".xml"): 18 | name = name[:-4] 19 | output.write(tokens[0] + " " + name + "\n") 20 | elif line.startswith("#EndOfDocument"): 21 | output.write(line) 22 | elif line.startswith("@Coreference"): 23 | output.write(line) 24 | else: 25 | tokens = line.strip().split("\t") 26 | 27 | labelTokens = tokens[5].split(".") 28 | label = labelTokens[0] + "_" + labelTokens[1] 29 | 30 | # output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + tokens[5].lower() + "\t" + tokens[6] + "\n") 31 | # output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\n") 32 | output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\t" + tokens[8] + "\n") 33 | input.close() 34 | output.close() 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | main() 43 | -------------------------------------------------------------------------------- /outputFormatting/out/cleanStore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # move to store and clean again 4 | cd store 5 | for file in arguments/*; do rm "$file" ; done 6 | for file in linking/*; do rm "$file" ; done 7 | for file in nuggets/*; do rm "$file" ; done 8 | rm corpusLinking/* 9 | 10 | # return to original directory 11 | cd ../ 12 | -------------------------------------------------------------------------------- /outputFormatting/out/mergeSubmissions/mergeSubmissions.py: -------------------------------------------------------------------------------- 1 | # script to merge the submissions from three sources into a single directory 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Expect a list of files to read. 
Assuming directories are under andrew/ hector/ and jun/" 7 | sys.exit() 8 | 9 | filenames = [] 10 | input = open(sys.argv[1], "r") 11 | for line in input: 12 | filenames.append(line.strip()) 13 | 14 | dirList = ["andrew", "hector", "jun"] 15 | 16 | corpusOutput = open("all/corpusLinking/corpusLinking", "w") 17 | corpusID = 1 18 | 19 | for filename in filenames: 20 | idCount = 1 21 | linesToWrite = [] 22 | output = open("all/arguments/" + filename, "w") 23 | 24 | # for making the linking file -- link together all arguments with the same event type 25 | idsByEventType = dict() # eventType -> list of ids 26 | 27 | for inDir in dirList: 28 | input = open(inDir + "/arguments/" + filename, "r") 29 | 30 | for line in input: 31 | start = line.find("\t") + 1 32 | data = line[start:] 33 | 34 | if data not in linesToWrite: 35 | linesToWrite.append(data) 36 | output.write(str(idCount) + "\t" + data) 37 | 38 | tokens = line.strip().split("\t") 39 | eventType = tokens[2] 40 | 41 | if eventType not in idsByEventType: 42 | idsByEventType[eventType] = [] 43 | idsByEventType[eventType].append(idCount) 44 | 45 | idCount += 1 46 | input.close() 47 | 48 | output.close() 49 | 50 | output = open("all/linking/" + filename, "w") 51 | linkingID = 1 52 | 53 | for eventType in idsByEventType: 54 | idList = idsByEventType[eventType] 55 | output.write(str(linkingID) + "\t") 56 | corpusOutput.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 57 | 58 | for index in range(len(idList)): 59 | if index == len(idList) - 1: 60 | output.write(str(idList[index]) + "\n") 61 | else: 62 | output.write(str(idList[index]) + " ") 63 | 64 | linkingID += 1 65 | corpusID += 1 66 | 67 | 68 | corpusOutput.close() 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | main() 83 | -------------------------------------------------------------------------------- /outputFormatting/out/mergeSubmissions_coreference/argument_nugget_linking.py: -------------------------------------------------------------------------------- 1 | # script to link together the argument and nugget files 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Expect list of filenames." 
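# NOTE: this linking script appears to expect the merged all/arguments/ and all/nuggets/ directories written by the companion mergeSubmissions.py in this folder; the input/output paths are hard-coded just below.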
7 | sys.exit() 8 | 9 | argDir = "all/arguments/" 10 | argModDir = "all/linked_arguments/" 11 | nuggetDir = "all/nuggets/" 12 | linkingDir = "all/linking/" 13 | corpusLinking = "all/corpusLinking/corpusLinking" 14 | 15 | filenames = [] 16 | input = open(sys.argv[1], "r") 17 | for line in input: 18 | filenames.append(line.strip()) 19 | input.close() 20 | 21 | corpusID = 1 22 | corpusOut = open(corpusLinking, "w") 23 | 24 | for filename in filenames: 25 | print filename 26 | nuggetDict = dict() # eventType -> offset -> ID 27 | argumentDict = dict() # nuggetID -> attached_arguments 28 | 29 | input = open(nuggetDir + filename, "r") 30 | for line in input: 31 | if not line.startswith("#") and not line.startswith("@"): 32 | tokens = line.strip().split("\t") 33 | eventType = tokens[5] 34 | startOffset = tokens[3].split(",")[0] 35 | nuggetID = tokens[2] 36 | 37 | if eventType not in nuggetDict: 38 | nuggetDict[eventType] = dict() 39 | nuggetDict[eventType][startOffset] = nuggetID 40 | input.close() 41 | 42 | input = open(argDir + filename, "r") 43 | output = open(argModDir + filename, "w") 44 | for line in input: 45 | print line 46 | tokens = line.strip().split("\t") 47 | print len(tokens) 48 | triggerOffset = tokens[11] 49 | eventType = tokens[2] 50 | argumentID = tokens[0] 51 | 52 | 53 | # rewrite the arguments/ file 54 | nuggetID = nuggetDict[eventType][triggerOffset] 55 | for tok in tokens: 56 | output.write(tok + "\t") 57 | output.write(nuggetID + "\n") 58 | 59 | # store information for linking file 60 | if nuggetID not in argumentDict: 61 | argumentDict[nuggetID] = [] 62 | argumentDict[nuggetID].append(argumentID) 63 | input.close() 64 | output.close() 65 | 66 | # reread coreference and write linking file 67 | seenNuggets = set() 68 | linkingID = 1 69 | 70 | input = open(nuggetDir + filename, "r") 71 | output = open(linkingDir + filename, "w") 72 | for line in input: 73 | if line.startswith("@"): 74 | tokens = line.strip().split("\t") 75 | nuggets = tokens[2].split(",") 76 | first = True 77 | 78 | outputArgs = [] 79 | 80 | for nuggetID in nuggets: 81 | if nuggetID in argumentDict: 82 | seenNuggets.add(nuggetID) 83 | 84 | argumentList = argumentDict[nuggetID] 85 | for arg in argumentList: 86 | outputArgs.append(arg) 87 | 88 | 89 | if len(outputArgs) > 0: 90 | output.write(str(linkingID) + "\t") 91 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 92 | corpusID += 1 93 | 94 | for index in range(len(outputArgs)): 95 | if index == 0: 96 | output.write(outputArgs[index]) 97 | else: 98 | output.write(" " + outputArgs[index]) 99 | output.write("\n") 100 | 101 | linkingID += 1 102 | 103 | # now, write any singleton nuggets 104 | for nugget in argumentDict: 105 | if nugget not in seenNuggets: 106 | output.write(str(linkingID) + "\t") 107 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 108 | corpusID += 1 109 | 110 | argumentList = argumentDict[nugget] 111 | for index in range(len(argumentList)): 112 | if index == 0: 113 | output.write(argumentList[index]) 114 | linkingID += 1 115 | else: 116 | output.write(" " + argumentList[index]) 117 | output.write("\n") 118 | 119 | 120 | output.close() 121 | input.close() 122 | 123 | 124 | corpusOut.close() 125 | 126 | main() 127 | -------------------------------------------------------------------------------- /outputFormatting/out/mergeSubmissions_coreference/mergeSubmissions.py: -------------------------------------------------------------------------------- 1 | # script to merge the submissions 
from three sources into a single directory 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Writes the nuggets/ and arguments/ files. Use another script for linking." 7 | print "Expect a list of files to read. Assuming directories are under andrew/ hector/ and jun/" 8 | sys.exit() 9 | 10 | filenames = [] 11 | input = open(sys.argv[1], "r") 12 | for line in input: 13 | filenames.append(line.strip()) 14 | 15 | dirList = ["hector", "jun", "andrew"] 16 | 17 | corpusID = 1 18 | for filename in filenames: 19 | # Begin Nugget Writing 20 | # process nuggets first 21 | output = open("all/nuggets/" + filename, "w") 22 | output.write("#BeginOfDocument" + " " + filename + "\n") 23 | 24 | writtenNuggetKeys = set() 25 | writtenNuggets = set() 26 | nuggetID_toKey = dict() 27 | corefDict = dict() # dict from key -> set of coreferent nugget ids 28 | for inDir in dirList: 29 | try: 30 | input = open(inDir + "/nuggets/" + filename, "r") 31 | except: 32 | continue 33 | 34 | for line in input: 35 | if line.startswith("#"): 36 | continue 37 | elif line.startswith("@"): 38 | tokens = line.strip().split("\t")[2].split(",") 39 | first = inDir + "_" + tokens[0] 40 | firstKey = nuggetID_toKey[first] 41 | rest = tokens[1:] 42 | 43 | # add the remaining nuggets to the corefSet of the first nugget 44 | for tok in rest: 45 | curID = inDir + "_" + tok 46 | corefDict[firstKey].add(curID) 47 | 48 | 49 | # maybe I don't need this? Merging later may take care of it 50 | ''' 51 | # delete the corefSets of the other nuggets 52 | curKey = nuggetID_toKey[curID] 53 | if curKey in corefDict: 54 | del corefDict[curKey] 55 | ''' 56 | 57 | 58 | 59 | else: 60 | tokens = line.strip().split("\t") 61 | key = tokens[3] + "_" + tokens[5] # key = offset_label 62 | nuggetID = inDir + "_" + tokens[2] 63 | 64 | # if we haven't seen the key yet, add it 65 | if key not in writtenNuggetKeys: 66 | output.write("mergedSystem\t" + filename + "\t" + nuggetID + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + tokens[5] + "\t" + tokens[6] + "\n") 67 | writtenNuggetKeys.add(key) 68 | writtenNuggets.add(nuggetID) 69 | curSet = set() 70 | curSet.add(nuggetID) 71 | corefDict[key] = curSet 72 | 73 | nuggetID_toKey[nuggetID] = key 74 | # if we have seen it, then need to add to right corefSet 75 | else: 76 | corefDict[key].add(nuggetID) 77 | nuggetID_toKey[nuggetID] = key 78 | 79 | input.close() 80 | 81 | # before writing coref, merge together any overlapping coref sets 82 | done = True 83 | first = True 84 | while not done or first: 85 | first = False 86 | done = True 87 | 88 | removeKey = None 89 | # if we find overlap, break out and start over again 90 | for key in corefDict: 91 | curSet = corefDict[key] 92 | for altKey in corefDict: 93 | if key == altKey: 94 | continue 95 | 96 | altSet = corefDict[altKey] 97 | overlap = False 98 | 99 | for nugget in curSet: 100 | if nugget in altSet: 101 | overlap = True 102 | done = False 103 | break 104 | 105 | if overlap: 106 | for nugget in altSet: 107 | corefDict[key].add(nugget) 108 | 109 | removeKey = altKey 110 | break 111 | 112 | if removeKey != None: 113 | break 114 | if removeKey != None: 115 | print removeKey 116 | del corefDict[removeKey] 117 | 118 | corefID = 1 119 | for key in corefDict: 120 | writeList = [] 121 | for nugget in corefDict[key]: 122 | if nugget in writtenNuggets: 123 | writeList.append(nugget) 124 | 125 | if len(writeList) > 1: 126 | output.write("@Coreference\tR" + str(corefID) + "\t") 127 | first = True 128 | for nugget in writeList: 129 | if first: 130 | 
output.write(nugget) 131 | first = False 132 | else: 133 | output.write("," + nugget) 134 | output.write("\n") 135 | corefID += 1 136 | output.write("#EndOfDocument\n") 137 | ### End Nugget writing 138 | 139 | ### Begin argument writing 140 | idCount = 1 141 | linesToWrite = [] 142 | output = open("all/arguments/" + filename, "w") 143 | 144 | # write everything EXCEPT the last column. Use the other script to get that. 145 | for inDir in dirList: 146 | input = open(inDir + "/arguments/" + filename, "r") 147 | 148 | for line in input: 149 | start = line.find("\t") + 1 150 | end = line.rfind("\t") 151 | data = line[start:end] 152 | 153 | if data not in linesToWrite: 154 | linesToWrite.append(data) 155 | output.write(str(idCount) + "\t" + data + "\n") 156 | 157 | idCount += 1 158 | input.close() 159 | 160 | output.close() 161 | 162 | main() 163 | -------------------------------------------------------------------------------- /outputFormatting/out/moveToStore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./cleanStore.sh 3 | 4 | cp linked_arguments/* arguments/. 5 | 6 | cd store 7 | 8 | cp -r ../arguments/ . 9 | cp -r ../linking/ . 10 | cp -r ../corpusLinking/ . 11 | cp -r ../nuggets/ . 12 | 13 | cd ../ 14 | for file in arguments/*; do rm "$file" ; done 15 | for file in linking/*; do rm "$file" ; done 16 | for file in nuggets/*; do rm "$file" ; done 17 | for file in linked_arguments/*; do rm "$file" ; done 18 | rm corpusLinking/* 19 | -------------------------------------------------------------------------------- /outputFormatting/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | 
five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 
448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero 572 | -------------------------------------------------------------------------------- /outputFormatting/writeDocMap.py: -------------------------------------------------------------------------------- 1 | # script to write the docmap file 2 | import sys 3 | 4 | def getRootname(line): 5 | # first, remove any absolute path 6 | text = line 7 | if "/" in text: 8 | start = text.rfind("/") + 1 9 | text = text[start:] 10 | 11 | # remove the extension 12 | if "." in text: 13 | end = text.rfind(".") 14 | text = text[:end] 15 | 16 | return text 17 | 18 | def main(): 19 | if len(sys.argv) != 2: 20 | print "Expect list of documents with absolute paths." 21 | sys.exit() 22 | 23 | input = open(sys.argv[1], "r") 24 | lines = [] 25 | for line in input: 26 | lines.append(line.strip()) 27 | input.close() 28 | 29 | output = open("documents.paths.tmp", "w") 30 | for line in lines: 31 | rootname = getRootname(line) 32 | output.write(rootname + "\t" + line + "\n") 33 | output.close() 34 | 35 | output = open("documents.rootnames.tmp", "w") 36 | for line in lines: 37 | rootname = getRootname(line) 38 | output.write(rootname + "\n") 39 | output.close() 40 | 41 | 42 | main() 43 | -------------------------------------------------------------------------------- /outputFormatting/writeTriggerOutput.py: -------------------------------------------------------------------------------- 1 | # script to write the trigger output -- one file per document 2 | import sys 3 | import string 4 | 5 | def main(): 6 | if len(sys.argv) != 2: 7 | print "Expect list of triggers." 
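# The trigger list is expected in the Event Nugget format parsed below: #BeginOfDocument / @Coreference / #EndOfDocument markers plus tab-separated nugget lines (system, docID, mentionID, offsets, word, label, realis, confidence).
# Illustrative example of the conversion performed here: a label such as "movement_transportperson" is rewritten to "Movement.Transport-Person" by convertEventType().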
8 | sys.exit() 9 | 10 | storeDir = "out/nuggets/" 11 | 12 | input = open(sys.argv[1], "r") 13 | curDoc = "" 14 | for line in input: 15 | if line.startswith("#BeginOfDocument"): 16 | tokens = line.strip().split() 17 | name = tokens[1] 18 | if name.endswith(".xml"): 19 | name = name[:-4] 20 | 21 | curDoc = name 22 | output = open(storeDir + curDoc, "w") 23 | 24 | output.write(tokens[0] + " " + name + "\n") 25 | elif line.startswith("#EndOfDocument"): 26 | output.write(line) 27 | output.close() 28 | elif line.startswith("@Coreference"): 29 | output.write(line) 30 | else: 31 | tokens = line.strip().split("\t") 32 | 33 | sysName = tokens[0] 34 | docID = tokens[1] 35 | mentionID = tokens[2] 36 | offsets = tokens[3] 37 | word = tokens[4] 38 | label = tokens[5] 39 | realis = tokens[6] 40 | 41 | confidence = tokens[7] 42 | 43 | eventTokens = label.split("_") 44 | eventType = convertEventType(eventTokens[0])+ "." + convertEventType(eventTokens[1]) 45 | 46 | if eventType == "Contact.Phone-Write": 47 | eventType = "Contact.Correspondence" 48 | 49 | if eventType == "Movement.Transport": 50 | eventType = "Movement.Transport-Person" 51 | 52 | output.write(sysName + "\t" + docID + "\t" + mentionID + "\t" + offsets + "\t" + word + "\t" + eventType + "\t" + realis + "\t" + confidence + "\n") 53 | 54 | 55 | input.close() 56 | 57 | def convertRoleLabels(label): 58 | newLabel = "" 59 | prevChar = "" 60 | first = True 61 | for character in label: 62 | if first: 63 | newLabel += character.upper() 64 | first = False 65 | elif prevChar in string.punctuation: 66 | newLabel += character.upper() 67 | else: 68 | newLabel += character 69 | 70 | prevChar = character 71 | 72 | return newLabel 73 | 74 | def convertEventType(text): 75 | tmp = convertRoleLabels(text) 76 | 77 | if tmp == "Transportperson": 78 | return "Transport-Person" 79 | elif tmp == "Transportartifact": 80 | return "Transport-Artifact" 81 | elif tmp == "Endposition": 82 | return "End-Position" 83 | elif tmp == "Startposition": 84 | return "Start-Position" 85 | elif tmp == "Arrestjail": 86 | return "Arrest-Jail" 87 | elif tmp == "Transfermoney": 88 | return "Transfer-Money" 89 | elif tmp == "Transferownership": 90 | return "Transfer-Ownership" 91 | else: 92 | return tmp 93 | 94 | main() 95 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/StanfordCoreNLP-chinese.properties.simple: -------------------------------------------------------------------------------- 1 | annotators = segment, ssplit, pos 2 | 3 | customAnnotatorClass.segment = edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator 4 | 5 | segment.model = edu/stanford/nlp/models/segmenter/chinese/ctb.gz 6 | segment.sighanCorporaDict = edu/stanford/nlp/models/segmenter/chinese 7 | segment.serDictionary = edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz 8 | segment.sighanPostProcessing = true 9 | 10 | ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[！？]+ 11 | 12 | pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger 13 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/StanfordCoreNLP-spanish.properties.simple: -------------------------------------------------------------------------------- 1 | annotators = tokenize, ssplit, pos 2 | 3 | tokenize.language = es 4 | 5 | pos.model = edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger 6 | --------------------------------------------------------------------------------
/preprocessing_2.0/CoreNLP_scripts/prefixLines.py: -------------------------------------------------------------------------------- 1 | # script that takes a txt file and prepends the given string to each line 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Expect text file, string to prepend, output file." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[3], "w") 11 | 12 | prefix = sys.argv[2] 13 | 14 | for line in input: 15 | output.write(prefix + line) 16 | input.close() 17 | output.close() 18 | 19 | main() 20 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/prepareCoreNLP_input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def main(): 4 | 5 | if len(sys.argv) != 4: 6 | print "Expect input, output, $PWD." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | pwd = sys.argv[3] 12 | 13 | for line in input: 14 | output.write(pwd + line[1:]) 15 | input.close() 16 | 17 | output.close() 18 | 19 | main() 20 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Chn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STANFORD_CORENLP=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 4 | INPUTS=$1 5 | 6 | CURRENT_PATH=${PWD} 7 | 8 | echo "Call Stanford CoreNLP..." 9 | java -cp "$STANFORD_CORENLP/*" -Xmx16g edu.stanford.nlp.pipeline.StanfordCoreNLP -filelist $INPUTS -props ${CURRENT_PATH}/CoreNLP_scripts/StanfordCoreNLP-chinese.properties.simple -threads 8 -outputDirectory ${CURRENT_PATH}/CoreNLP_scripts/tmp_Chn/ -outputExtension .out 10 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Eng.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STANFORD_CORENLP=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 4 | INPUTS=$1 5 | 6 | CURRENT_PATH=${PWD} 7 | 8 | 9 | echo "Call Stanford CoreNLP..." 10 | java -cp "$STANFORD_CORENLP/*" -Xmx16g edu.stanford.nlp.pipeline.StanfordCoreNLP -filelist $INPUTS -annotators tokenize,ssplit,pos,lemma,ner,parse -threads 8 -outputDirectory ${CURRENT_PATH}/CoreNLP_scripts/tmp_Eng/ -outputExtension .out 11 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Span.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STANFORD_CORENLP=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 4 | INPUTS=$1 5 | 6 | CURRENT_PATH=${PWD} 7 | 8 | echo "Call Stanford CoreNLP..." 
9 | java -cp "$STANFORD_CORENLP/*" -Xmx16g edu.stanford.nlp.pipeline.StanfordCoreNLP -filelist $INPUTS -props ${CURRENT_PATH}/CoreNLP_scripts/StanfordCoreNLP-spanish.properties.simple -threads 8 -outputDirectory ${CURRENT_PATH}/CoreNLP_scripts/tmp_Span/ -outputExtension .out 10 | -------------------------------------------------------------------------------- /preprocessing_2.0/MaltParser_scripts/convertToCoNLL.py: -------------------------------------------------------------------------------- 1 | # script to convert from createSetFiles file to CoNLL format 2 | import sys 3 | import string 4 | 5 | def convertPOS(pos, converter): 6 | newPOS = "" 7 | for character in pos: 8 | if character not in string.digits: 9 | newPOS += character 10 | 11 | copyPOS = newPOS 12 | while len(copyPOS) > 0: 13 | if copyPOS in converter: 14 | return converter[copyPOS], copyPOS 15 | else: 16 | copyPOS = copyPOS[:-1] 17 | 18 | return newPOS, newPOS 19 | 20 | def removeDigits(pos): 21 | newPOS = "" 22 | for character in pos: 23 | if character not in string.digits: 24 | newPOS += character 25 | return newPOS 26 | 27 | def main(): 28 | if len(sys.argv) != 4: 29 | print "Expect createSetFiles data, output file, universal POS Tag file." 30 | sys.exit() 31 | posConverter = dict() 32 | 33 | input = open(sys.argv[3], "r") 34 | for line in input: 35 | tokens = line.strip().split("\t") 36 | posConverter[tokens[0]] = tokens[1] 37 | input.close() 38 | 39 | input = open(sys.argv[1], "r") 40 | output = open(sys.argv[2], "w") 41 | 42 | prevBlank = True 43 | wordCount = 0 44 | for line in input: 45 | if line.strip() != "": 46 | tokens = line.strip().split("\t") 47 | if wordCount == 0: 48 | output.write("# " + tokens[5] + "\n") 49 | 50 | wordCount += 1 51 | word = tokens[2] 52 | pos = tokens[4] 53 | 54 | newPOS, originalPOS = convertPOS(pos, posConverter) 55 | 56 | if len(newPOS) == 0: 57 | newPOS = "_" 58 | originalPOS = "_" 59 | 60 | output.write(str(wordCount) + "\t" + word + "\t_\t" + newPOS + "\t" + originalPOS + "\t_\t_\t_\t_\t_\n") 61 | prevBlank = False 62 | else: 63 | wordCount = 0 64 | if not prevBlank: 65 | output.write("\n") 66 | prevBlank = True 67 | 68 | input.close() 69 | output.close() 70 | 71 | main() 72 | -------------------------------------------------------------------------------- /preprocessing_2.0/MaltParser_scripts/convertToParsingFile.py: -------------------------------------------------------------------------------- 1 | # script to convert from the CoNLL output back to my createSetFiles output 2 | import sys 3 | 4 | def printDep(dep, output, wordDict): 5 | relation = dep[0] 6 | relationIndex = dep[1] 7 | wordIndex = dep[2] 8 | 9 | output.write(relation + "|||" + wordDict[relationIndex] + "|||" + relationIndex + "|||" + wordDict[wordIndex] + "|||" + wordIndex + "\n") 10 | 11 | def main(): 12 | if len(sys.argv) != 3: 13 | print "Expect input file, output file." 
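# Illustrative invocation (as used in processChinese.sh): python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Chinese.conll.tmp.output createSetFiles/setFile.noEntities.tmp.Chn.parsing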
14 | sys.exit() 15 | 16 | input = open(sys.argv[1], "r") 17 | output = open(sys.argv[2], "w") 18 | 19 | wordDict = dict() 20 | wordDict["0"] = "ROOT" 21 | depList = [] 22 | for line in input: 23 | if line.strip() == "": 24 | # print out dependencies if needed 25 | if len(wordDict) != 0: 26 | for dep in depList: 27 | printDep(dep, output, wordDict) 28 | wordDict = dict() 29 | wordDict["0"] = "ROOT" 30 | depList = [] 31 | output.write("\n") 32 | elif not line.startswith("#"): 33 | tokens = line.strip().split("\t") 34 | wordIndex = tokens[0] 35 | word = tokens[1] 36 | relation = tokens[7] 37 | relationIndex = tokens[6] 38 | 39 | wordDict[wordIndex] = word 40 | 41 | depList.append( (relation, relationIndex, wordIndex) ) 42 | 43 | if len(wordDict) != 0: 44 | for dep in depList: 45 | printDep(dep, output, wordDict) 46 | wordDict = dict() 47 | depList = [] 48 | output.write("\n") 49 | 50 | input.close() 51 | output.close() 52 | 53 | main() 54 | -------------------------------------------------------------------------------- /preprocessing_2.0/createSetFiles/writeDataFromFiles.py: -------------------------------------------------------------------------------- 1 | # reads each line, write the data from those files to a single file 2 | 3 | import sys 4 | 5 | def main(): 6 | if len(sys.argv) != 3: 7 | print "Expect list of files, output file." 8 | sys.exit() 9 | 10 | files = [] 11 | input = open(sys.argv[1], "r") 12 | for line in input: 13 | files.append(line.strip()) 14 | input.close() 15 | 16 | output = open(sys.argv[2], "w") 17 | parsingOutput = open(sys.argv[2] + ".parsing", "w") 18 | for filename in files: 19 | tempIndex = filename.find(".mergedAnnotations") 20 | tempName = filename[:tempIndex] 21 | parsingFilename = tempName + ".parsingAnnotations" 22 | 23 | input = open(filename, "r") 24 | for line in input: 25 | output.write(line) 26 | input.close() 27 | 28 | try: 29 | input = open(parsingFilename, "r") 30 | for line in input: 31 | parsingOutput.write(line) 32 | input.close() 33 | except IOError: 34 | print "Could not open file: " + parsingFilename 35 | print "Continuing..."
36 | output.close() 37 | parsingOutput.close() 38 | 39 | main() 40 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/addEntitiesToText.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import xml.etree.ElementTree as ET 3 | import string 4 | from xml.sax.saxutils import escape 5 | 6 | # set to maintain unique coref labels across all entities 7 | corefSet = set() 8 | 9 | def processEvents(root, triggerDict, argDict): 10 | if root.tag == "event": 11 | eventType = root.attrib["TYPE"] 12 | eventSubtype = root.attrib["SUBTYPE"] 13 | 14 | for child in root: 15 | if child.tag == "event_mention": 16 | processEventMentions(child, triggerDict, argDict, eventType, eventSubtype) 17 | 18 | else: 19 | for child in root: 20 | processEvents(child, triggerDict, argDict) 21 | 22 | def processEventMentions(root, triggerDict, argDict, eventType, eventSubtype): 23 | anchorText = "" 24 | anchorIndex = -1 25 | for child in root: 26 | anchorText, anchorIndex = processEvent_Helper_Anchor(child, triggerDict, argDict, eventType, eventSubtype) 27 | if anchorText != "": 28 | break 29 | 30 | for child in root: 31 | processEvent_Helper_Arg(child, triggerDict, argDict, eventType, eventSubtype, anchorText, anchorIndex) 32 | 33 | def processEvent_Helper_Anchor(root, triggerDict, argDict, eventType, eventSubtype): 34 | if root.tag == "anchor": 35 | for child in root: 36 | return processEvent_Anchor(child, triggerDict, eventType, eventSubtype) 37 | else: 38 | returnStr = "" 39 | returnNum = -1 40 | for child in root: 41 | returnStr, returnNum = processEvent_Helper_Anchor(child, triggerDict, argDict, eventType, eventSubtype) 42 | if returnStr != "": 43 | break 44 | 45 | return returnStr, returnNum 46 | 47 | def processEvent_Helper_Arg(root, triggerDict, argDict, eventType, eventSubtype, eventText, eventIndex): 48 | if root.tag == "event_mention_argument": 49 | role = root.attrib["ROLE"] 50 | for child in root: 51 | processEvent_Argument(child, argDict, role, eventText, eventIndex) 52 | else: 53 | for child in root: 54 | processEvent_Helper_Arg(child, triggerDict, argDict, eventType, eventSubtype, eventText, eventIndex) 55 | 56 | def processEvent_Argument(root, argDict, role, eventText, eventIndex): 57 | if root.tag == "charseq": 58 | start = int(root.attrib["START"]) 59 | end = int(root.attrib["END"]) 60 | text = escape(root.text) 61 | 62 | # write the characters to the dict 63 | index = start 64 | 65 | while index <= end: 66 | if index not in argDict: 67 | argDict[index] = [] 68 | if index == start: 69 | argDict[index].append( ("B", role, text, eventText, eventIndex) ) 70 | else: 71 | argDict[index].append( ("I", role, text, eventText, eventIndex) ) 72 | index += 1 73 | else: 74 | for child in root: 75 | processEvent_Argument(child, argDict, role, eventText, eventIndex) 76 | 77 | def processEvent_Anchor(root, triggerDict, eventType, eventSubtype): 78 | if root.tag == "charseq": 79 | start = int(root.attrib["START"]) 80 | end = int(root.attrib["END"]) 81 | text = escape(root.text) 82 | 83 | # write the characters to the dict 84 | index = start 85 | 86 | while index <= end: 87 | triggerDict[index] = (eventType, eventSubtype, text) 88 | index += 1 89 | return text, start 90 | else: 91 | for child in root: 92 | return processEvent_Anchor(child, triggerDict, eventType, eventSubtype) 93 | 94 | def processExtent(root): 95 | for child in root: 96 | if child.tag == "charseq": 97 | start = 
int(child.attrib["START"]) 98 | end = int(child.attrib["END"]) 99 | text = escape(child.text) 100 | 101 | return start, end, text 102 | raise RuntimeError("Improper XML detected.") 103 | 104 | def cleanWhitespace(text): 105 | clean = "" 106 | for char in text: 107 | if char in string.whitespace: 108 | clean += " " 109 | else: 110 | clean += char 111 | return clean 112 | 113 | def processHead(root): 114 | for child in root: 115 | if child.tag == "charseq": 116 | return child.text 117 | raise RuntimeError("Improper XML detected.") 118 | 119 | 120 | def processEntities(root, labelDict): 121 | if root.tag == "entity": 122 | entityType = root.attrib["TYPE"] 123 | entitySubtype = root.attrib["SUBTYPE"] 124 | 125 | corefLabel = len(corefSet) 126 | corefSet.add(corefLabel) 127 | 128 | # process and write each mention to the dict 129 | for mention in root: 130 | if mention.tag == "entity_mention": 131 | start = -1 132 | end = -1 133 | head = "" 134 | text = "" 135 | 136 | for child in mention: 137 | if child.tag == "head": 138 | head = cleanWhitespace(processHead(child)) 139 | elif child.tag == "extent": 140 | start, end, text = processExtent(child) 141 | 142 | 143 | # write the characters to the dict 144 | if start < 0: 145 | raise ValueError('Did not read indexes for entity') 146 | index = start 147 | 148 | while index <= end: 149 | if index not in labelDict: 150 | labelDict[index] = [] 151 | 152 | if index == start: 153 | labelDict[index].append( ("B", entityType, entitySubtype, text, head, corefLabel) ) 154 | else: 155 | labelDict[index].append( ("I", entityType, entitySubtype, text, head, corefLabel) ) 156 | index += 1 157 | 158 | elif root.tag == "timex2": 159 | corefLabel = len(corefSet) 160 | corefSet.add(corefLabel) 161 | 162 | for child in root: 163 | processTime(child, labelDict, corefLabel) 164 | elif root.tag == "value": 165 | corefLabel = len(corefSet) 166 | corefSet.add(corefLabel) 167 | 168 | valueType = root.attrib["TYPE"] 169 | if "SUBTYPE" in root.attrib: 170 | valueSubtype = root.attrib["SUBTYPE"] 171 | else: 172 | valueSubtype = root.attrib["TYPE"] 173 | 174 | for child in root: 175 | processValue(child, labelDict, corefLabel, valueType, valueSubtype) 176 | else: 177 | for child in root: 178 | processEntities(child, labelDict) 179 | 180 | def processEntity_Helper(root, labelDict, entityType, entitySubtype): 181 | if root.tag == "extent": 182 | for child in root: 183 | processExtent(child, labelDict, entityType, entitySubtype) 184 | else: 185 | for child in root: 186 | processEntity_Helper(child, labelDict, entityType, entitySubtype) 187 | 188 | def processTime(root, labelDict, corefLabel): 189 | if root.tag == "charseq": 190 | start = int(root.attrib["START"]) 191 | end = int(root.attrib["END"]) 192 | text = escape(root.text) 193 | 194 | # write the characters to the dict 195 | index = start 196 | 197 | while index <= end: 198 | if index not in labelDict: 199 | labelDict[index] = [] 200 | 201 | # NOTE: timex values don't have heads -- just use the text again 202 | if index == start: 203 | labelDict[index].append( ("B", "TIME", "TIME", text, cleanWhitespace(text), corefLabel) ) 204 | else: 205 | labelDict[index].append( ("I", "TIME", "TIME", text, cleanWhitespace(text), corefLabel) ) 206 | index += 1 207 | else: 208 | for child in root: 209 | processTime(child, labelDict, corefLabel) 210 | 211 | def processValue(root, labelDict, corefLabel, valueType, valueSubtype): 212 | if root.tag == "charseq": 213 | start = int(root.attrib["START"]) 214 | end = int(root.attrib["END"]) 215 | 
text = escape(root.text) 216 | 217 | # write the characters to the dict 218 | index = start 219 | 220 | while index <= end: 221 | if index not in labelDict: 222 | labelDict[index] = [] 223 | 224 | # NOTE: timex values don't have heads -- just use the text again 225 | if index == start: 226 | labelDict[index].append( ("B", valueType, valueSubtype, text, cleanWhitespace(text), corefLabel) ) 227 | else: 228 | labelDict[index].append( ("I", valueType, valueSubtype, text, cleanWhitespace(text), corefLabel) ) 229 | index += 1 230 | else: 231 | for child in root: 232 | processValue(child, labelDict, corefLabel, valueType, valueSubtype) 233 | 234 | 235 | def main(): 236 | if len(sys.argv) != 4: 237 | print "Expect stanford annotations (XML), coreNLP features (extracted), output file." 238 | sys.exit() 239 | 240 | print "Starting document " + sys.argv[3] 241 | 242 | # read the annotation XML 243 | labelDict = dict() # dict from offset -> (B/I, labelType, labelSubtype, fullName) 244 | triggerDict = dict() # dict from offset -> (triggerType, triggerSubtype) 245 | argDict = dict() # dict from offset -> (argument role) 246 | 247 | corefCount = 0 248 | 249 | # read the Stanford mentions 250 | input = open(sys.argv[1], "r") 251 | wordDict = dict() # dict from word count -> (B/I, labelType, labelSubtype, fullName, head) 252 | for line in input: 253 | tokens = line.strip().split('\t') 254 | wordCount = int(tokens[0]) 255 | entityName = tokens[1] 256 | entityType = tokens[2] 257 | 258 | numWords = entityName.count(" ") + 1 259 | lastWord = entityName 260 | if numWords > 1: 261 | start = entityName.rfind(" ") 262 | lastWord = entityName[start+1:] 263 | corefCount += 1 264 | 265 | for index in range(numWords): 266 | if index == 0: 267 | if wordCount not in wordDict: 268 | wordDict[wordCount] = [] 269 | wordDict[wordCount].append( ("B", entityType, entityType, entityName, lastWord, str(corefCount)) ) 270 | else: 271 | if wordCount + index not in wordDict: 272 | wordDict[wordCount + index] = [] 273 | wordDict[wordCount + index].append( ("I", entityType, entityType, entityName, lastWord, str(corefCount)) ) 274 | input.close() 275 | 276 | 277 | input = open(sys.argv[2], "r") 278 | output = open(sys.argv[3], "w") 279 | 280 | 281 | lineCounter = 0 282 | wordCount = 0 283 | for line in input: 284 | lineCounter += 1 285 | 286 | if line.startswith("BEGIN_SENTENCE"): 287 | output.write("\n") 288 | else: 289 | clean = line.strip() 290 | tokens = clean.split("\t") 291 | startOffset = int(tokens[0]) 292 | endOffset = int(tokens[1]) 293 | curWord = tokens[2] 294 | 295 | labelDict[startOffset] = [] 296 | if wordCount in wordDict: 297 | labelDict[startOffset] = wordDict[wordCount] 298 | wordCount += 1 299 | 300 | entityInfo = "EntitesGold[" 301 | 302 | if startOffset in labelDict: 303 | for curTuple in labelDict[startOffset]: 304 | begin = curTuple[0] 305 | entType = curTuple[1] 306 | entSubtype = curTuple[2] 307 | head = curTuple[4]#.encode('utf-8') 308 | 309 | coref = "coref_" + str(curTuple[5]) 310 | 311 | entityInfo += (begin + "|||" + entType + "|||" + entSubtype + "|||" + head + "|||" + coref + ";;;") 312 | 313 | # for debugging only 314 | tupleFullName = curTuple[3]#.encode('utf-8') 315 | 316 | # below: good for verifying alignment, but final version should not contain.
317 | #output.write(clean + "\t" + begin + "-" + entType + "\t" + begin + "-" + entSubtype + "\t" + tupleFullName + "\n") 318 | entityInfo += "]" 319 | 320 | eventType = "not_trigger" 321 | eventSubtype = "not_trigger" 322 | 323 | if startOffset in triggerDict: 324 | curTuple = triggerDict[startOffset] 325 | eventType = curTuple[0] 326 | eventSubtype = curTuple[1] 327 | 328 | argInfo = "ArgsGold[" 329 | if startOffset in argDict: 330 | for curTuple in argDict[startOffset]: 331 | begin = curTuple[0] 332 | argRole = curTuple[1] 333 | eventText = curTuple[3] 334 | eventIndex = curTuple[4] 335 | 336 | eventText = replaceWhiteSpace(eventText) 337 | 338 | argInfo += (begin + "|||" + argRole + "|||" + eventText.encode('utf-8') + "|||" + str(eventIndex) + ";;;") 339 | argInfo += "]" 340 | 341 | output.write(clean + "\t" + entityInfo + "\t" + eventType + "\t" + eventSubtype + "\t" + argInfo + "\n") 342 | output.write("\n") 343 | input.close() 344 | output.close() 345 | 346 | print "Finished processing document! Written to " + sys.argv[3] 347 | 348 | def replaceWhiteSpace(text): 349 | newStr = "" 350 | for character in text: 351 | if character in string.whitespace: 352 | newStr += " " 353 | else: 354 | newStr += character 355 | return newStr 356 | 357 | 358 | main() 359 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/addEntitiesToText.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # clean the directory first 4 | rm tmp_formatted/* 5 | 6 | while read -r line 7 | do 8 | name=$line 9 | python ../../readCoreNLP/read_CoreNLP_XML.py ${2}${name}.out tmp.txt 10 | python addEntitiesToText.py unify/out_processed/${name} tmp.txt tmp_formatted/${name}.mergedAnnotations 11 | python ../../readCoreNLP/write_parsing_from_CoreNLP.py ${2}${name}.out tmp_formatted/${name}.parsingAnnotations 12 | done < ${1} 13 | rm tmp.txt 14 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/unify/processEntities.py: -------------------------------------------------------------------------------- 1 | # script to change from 1-line per word, to 1-line per entity 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Expect list of files, input directory, output directory." 
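# Illustrative invocation (as used in runEntities.sh): python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/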
7 | sys.exit() 8 | 9 | 10 | filenames = [] 11 | input = open(sys.argv[1], "r") 12 | for line in input: 13 | filenames.append(line.strip()) 14 | input.close() 15 | 16 | inDir = sys.argv[2] 17 | outDir = sys.argv[3] 18 | 19 | for filename in filenames: 20 | print "Processing: " + filename 21 | input = open(inDir + filename, "r") 22 | output = open(outDir + filename, "w") 23 | 24 | lines = input.readlines() 25 | 26 | for index in range(len(lines)): 27 | curLine = lines[index] 28 | 29 | tokens = curLine.strip().split("\t") 30 | word = tokens[0] 31 | label = tokens[1] 32 | 33 | 34 | labels = [label] 35 | if ";" in label: 36 | labels = label.split(";") 37 | 38 | for label in labels: 39 | if label.startswith("B"): 40 | # find how long this goes for 41 | entityName = word 42 | 43 | suffix = label[1:] 44 | altIndex = index + 1 45 | while altIndex < len(lines): 46 | altLine = lines[altIndex] 47 | altTokens = altLine.strip().split("\t") 48 | altWord = altTokens[0] 49 | altLabel = altTokens[1] 50 | 51 | altLabels = [altLabel] 52 | 53 | if ";" in altLabel: 54 | altLabels = altLabel.split(";") 55 | found = False 56 | for altLabel in altLabels: 57 | if altLabel.endswith(suffix): 58 | entityName += " " + altWord 59 | found = True 60 | continue 61 | 62 | if not found: 63 | break 64 | else: 65 | altIndex += 1 66 | 67 | output.write(str(index) + "\t" + entityName + "\t" + suffix[1:] + "\n") 68 | input.close() 69 | output.close() 70 | 71 | main() 72 | 73 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/unify/unifyEntities.py: -------------------------------------------------------------------------------- 1 | # script to take the (1-file-per-class) NER output and unify into a single file PER DOCUMENT 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Expect createSetFiles file, list of files, output directory." 
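# Illustrative invocation (as used in runEntities.sh): python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp code/unify/in.txt code/unify/out/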
7 | sys.exit() 8 | 9 | # for each line read, record the word, a set of entity labels, and the documentID 10 | words = [] 11 | labels = [] 12 | documents = [] 13 | 14 | docSet = set() 15 | 16 | files = [] 17 | input = open(sys.argv[2], "r") 18 | for line in input: 19 | files.append(line.strip()) 20 | input.close() 21 | 22 | input = open(sys.argv[1], "r") 23 | for line in input: 24 | if line.strip() != "": 25 | tokens = line.strip().split("\t") 26 | word = tokens[2] 27 | docid = tokens[5] 28 | 29 | words.append(word) 30 | labels.append(set()) 31 | documents.append(docid) 32 | 33 | docSet.add(docid) 34 | input.close() 35 | 36 | for filename in files: 37 | input = open(filename, "r") 38 | count = 0 39 | for line in input: 40 | if line.strip() != "": 41 | tokens = line.strip().split("\t") 42 | word = tokens[0] 43 | label = tokens[2] 44 | 45 | if label != "O": 46 | labels[count].add(label) 47 | 48 | count += 1 49 | input.close() 50 | 51 | outPrefix = sys.argv[3] 52 | 53 | prevDoc = documents[0] 54 | prevSet = set() 55 | output = open(outPrefix + documents[0], "w") 56 | for index in range(len(words)): 57 | curWord = words[index] 58 | curDoc = documents[index] 59 | labelSet = labels[index] 60 | 61 | if curDoc != prevDoc: 62 | output.close() 63 | output = open(outPrefix + curDoc, "w") 64 | 65 | output.write(curWord + "\t") 66 | if len(labelSet) == 0: 67 | output.write("EMPTY\n") 68 | else: 69 | first = True 70 | for label in labelSet: 71 | if first: 72 | if label not in prevSet: 73 | output.write("B-" + label) 74 | else: 75 | output.write("I-" + label) 76 | first = False 77 | else: 78 | if label not in prevSet: 79 | output.write(";B-" + label) 80 | else: 81 | output.write(";I-" + label) 82 | output.write("\n") 83 | 84 | prevDoc = curDoc 85 | prevSet = labelSet 86 | output.close() 87 | 88 | 89 | main() 90 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/convertTestSet.py: -------------------------------------------------------------------------------- 1 | # script to convert from the "createSetFiles" output to the training format for Stanford NER 2 | import sys 3 | import string 4 | 5 | def main(): 6 | if len(sys.argv) != 4: 7 | print "Expect input file, parsing file, output file." 
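# Illustrative invocation (as used in processEnglish.sh, run from entityExtraction/): python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp ../createSetFiles/setFile.noEntities.tmp.parsing entityTestSet.tmp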
8 | sys.exit() 9 | 10 | labelDict = dict() 11 | labelDict["WEA"] = "weapon" 12 | labelDict["Sentence"] = "sentence" 13 | labelDict["Crime"] = "crime" 14 | labelDict["Job-Title"] = "title" 15 | labelDict["VEH"] = "vehicle" 16 | labelDict["TIME"] = "time" 17 | labelDict["Numeric"] = "money" 18 | 19 | sentences = [] 20 | sentencesRelation = [] 21 | governorDict = dict() # wordIndex -> governor word 22 | relationDict = dict() # wordIndex -> dependency relationship 23 | wordCount = 0 24 | input = open(sys.argv[2], "r") 25 | for line in input: 26 | if line.strip() != "": 27 | wordCount += 1 28 | tokens = line.strip().split("|||") 29 | governor = tokens[1] 30 | start = line.strip().rfind("|") + 1 31 | wordIndex = int(line.strip()[start:]) 32 | 33 | governorDict[wordIndex] = governor 34 | 35 | relation = tokens[0] 36 | relationDict[wordIndex] = relation 37 | else: 38 | sentences.append(governorDict) 39 | sentencesRelation.append(relationDict) 40 | wordCount = 0 41 | governorDict = dict() 42 | relationDict = dict() 43 | if len(governorDict) != 0: 44 | sentences.append(governorDict) 45 | sentencesRelation.append(relationDict) 46 | input.close() 47 | 48 | input = open(sys.argv[1], "r") 49 | prefix = sys.argv[3] 50 | 51 | labelSet = set() 52 | 53 | # first, scan the text, figure out how many labels 54 | for line in input: 55 | clean = line.strip() 56 | if clean != "": 57 | tokens = clean.split("\t") 58 | entity = tokens[6] 59 | 60 | if entity != "EntitesGold[]": 61 | start = entity.find("[") + 1 62 | end = entity.find(";;;") 63 | substring = entity[start:end] 64 | 65 | entTokens = substring.split(";;;") 66 | for tok in entTokens: 67 | tmpLabel = substring.split("|||")[1] 68 | if tmpLabel in labelDict: 69 | tmpLabel = labelDict[tmpLabel] 70 | #labelSet.add(substring.split("|||")[1]) 71 | labelSet.add(tmpLabel) 72 | 73 | input.close() 74 | 75 | output = open(sys.argv[3], "w") 76 | 77 | sentenceCount = 0 78 | wordCount = 0 79 | 80 | input = open(sys.argv[1], "r") 81 | prevEmpty = True 82 | for line in input: 83 | clean = line.strip() 84 | if clean != "": 85 | prevEmpty = False 86 | 87 | wordCount += 1 88 | tokens = clean.split("\t") 89 | word = tokens[2] 90 | entity = tokens[6] 91 | 92 | governor = "" 93 | relation = "" 94 | if wordCount in sentences[sentenceCount]: 95 | governor = sentences[sentenceCount][wordCount] 96 | relation = sentencesRelation[sentenceCount][wordCount] 97 | if governor.strip() == "": 98 | governor = "" 99 | if relation.strip() == "": 100 | relation = "" 101 | 102 | output.write(removeWhitespace(word) + "\tO\t" + removeWhitespace(governor) + "_" + removeWhitespace(relation) + "\n") 103 | elif not prevEmpty: 104 | output.write("\n") 105 | wordCount = 0 106 | sentenceCount += 1 107 | prevEmpty = True 108 | 109 | input.close() 110 | output.close() 111 | 112 | 113 | def removeWhitespace(text): 114 | newText = "" 115 | for character in text: 116 | if character not in string.whitespace: 117 | newText += character 118 | else: 119 | newText += "_" 120 | 121 | return newText 122 | 123 | main() 124 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/runEntities.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STANFORD_NER=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 3 | 4 | for i in Contact-Info crime FAC GPE title LOC money ORG PER sentence time weapon vehicle age commodity 5 | do 6 | java -mx16g -cp
"$STANFORD_NER/*:$STANFORD_NER/lib/*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /home/andrew/DEFT_code_testing/dependencies/models/entities/ACE_ERE-English.ner-model.${i}.dependency.full.ser.gz -testfile $1 > code/unify/in/${1}_${i} 7 | done 8 | 9 | ls code/unify/in/* > code/unify/in.txt 10 | python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp code/unify/in.txt code/unify/out/ 11 | ls code/unify/out/ > code/unify/out.txt 12 | python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/ 13 | 14 | cd code/ 15 | ./addEntitiesToText.sh ../../tmp.list ../../CoreNLP_scripts/tmp_Eng/ 16 | cd ../ 17 | 18 | rm code/unify/in.txt 19 | rm code/unify/out.txt 20 | rm -r code/unify/in/ 21 | rm -r code/unify/out/ 22 | rm -r code/unify/out_processed/ 23 | mkdir code/unify/in/ 24 | mkdir code/unify/out/ 25 | mkdir code/unify/out_processed/ 26 | 27 | 28 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/runEntities_Chinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STANFORD_NER=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 3 | 4 | for i in Contact-Info crime FAC GPE title LOC money ORG PER sentence time weapon vehicle age commodity 5 | do 6 | java -mx16g -cp "$STANFORD_NER/*:$STANFORD_NER/lib/*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /home/andrew/DEFT_code_testing/dependencies/models/entities/ACE_ERE-Chinese.ner-model.${i}.dependency.full.ser.gz -testfile $1 > code/unify/in/${1}_${i} 7 | done 8 | 9 | ls code/unify/in/* > code/unify/in.txt 10 | python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp.Chn code/unify/in.txt code/unify/out/ 11 | ls code/unify/out/ > code/unify/out.txt 12 | python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/ 13 | 14 | cd code/ 15 | ./addEntitiesToText.sh ../../tmp.chinese.list ../../CoreNLP_scripts/tmp_Chn/ 16 | cd ../ 17 | 18 | rm code/unify/in.txt 19 | rm code/unify/out.txt 20 | rm -r code/unify/in/ 21 | rm -r code/unify/out/ 22 | rm -r code/unify/out_processed/ 23 | mkdir code/unify/in/ 24 | mkdir code/unify/out/ 25 | mkdir code/unify/out_processed/ 26 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/runEntities_Spanish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STANFORD_NER=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 3 | 4 | for i in crime FAC GPE title LOC ORG PER sentence time weapon vehicle age commodity 5 | do 6 | java -mx16g -cp "$STANFORD_NER/*:$STANFORD_NER/lib/*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /home/andrew/DEFT_code_testing/dependencies/models/entities/RichERE-Spanish.ner-model.${i}.dependency.full.ser.gz -testfile $1 > code/unify/in/${1}_${i} 7 | done 8 | 9 | ls code/unify/in/* > code/unify/in.txt 10 | python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp.Span code/unify/in.txt code/unify/out/ 11 | ls code/unify/out/ > code/unify/out.txt 12 | python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/ 13 | 14 | cd code/ 15 | ./addEntitiesToText.sh ../../tmp.spanish.list ../../CoreNLP_scripts/tmp_Span/ 16 | cd ../ 17 | 18 | rm code/unify/in.txt 19 | rm code/unify/out.txt 20 | rm -r code/unify/in/ 21 | rm -r code/unify/out/ 22 | rm -r 
code/unify/out_processed/ 23 | mkdir code/unify/in/ 24 | mkdir code/unify/out/ 25 | mkdir code/unify/out_processed/ 26 | -------------------------------------------------------------------------------- /preprocessing_2.0/processChinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters. Expect list of files to read (with absolute filepaths)." 5 | exit 1 6 | fi 7 | 8 | # store processed documents in tmp/ 9 | rm CoreNLP_scripts/tmp_Chn/* 10 | ./CoreNLP_scripts/runCoreNLP_Chn.sh $1 11 | 12 | # readCoreNLP 13 | python readCoreNLP/getRootnames.py $1 tmp.chinese.list 14 | cd readCoreNLP/ 15 | rm tmp_formatted_Chn/* 16 | ./convertCoreNLPFormat.sh ../tmp.chinese.list ../CoreNLP_scripts/tmp_Chn/ tmp_formatted_Chn/ 17 | cd ../ 18 | 19 | # createSetFiles 20 | find readCoreNLP/tmp_formatted_Chn/ -name "*.mergedAnnotations" > mergedFilenames.tmp.Chn 21 | rm createSetFiles/*.tmp 22 | rm createSetFiles/*.parsing 23 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.Chn createSetFiles/setFile.noEntities.tmp.Chn 24 | rm mergedFilenames.tmp.Chn 25 | 26 | # add dependency parsing via MaltParser 27 | python MaltParser_scripts/convertToCoNLL.py createSetFiles/setFile.noEntities.tmp.Chn MaltParser_scripts/Chinese.conll.tmp /home/andrew/DEFT_code_testing/dependencies/pos/zh-ctb6.map 28 | cp /home/andrew/DEFT_code_testing/dependencies/models/maltparser/UD.Chinese.model.mco UD.Chinese.model.mco.tmp 29 | mv UD.Chinese.model.mco.tmp UD.Chinese.model.mco 30 | java -jar /home/andrew/NLP_tools/MaltParser/maltparser-1.9.0/maltparser-1.9.0.jar -c UD.Chinese.model.mco -i MaltParser_scripts/Chinese.conll.tmp -o MaltParser_scripts/Chinese.conll.tmp.output -m parse 31 | rm UD.Chinese.model.mco 32 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Chinese.conll.tmp.output createSetFiles/setFile.noEntities.tmp.Chn.parsing 33 | 34 | # entity extraction 35 | cd entityExtraction/ 36 | python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp.Chn ../createSetFiles/setFile.noEntities.tmp.Chn.parsing entityTestSet.tmp.chn 37 | ./runEntities_Chinese.sh entityTestSet.tmp.chn 38 | cd ../ 39 | find entityExtraction/code/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp.chn 40 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.chn createSetFiles/setFile.withEntities.tmp.Chn 41 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Chinese.conll.tmp.output createSetFiles/setFile.withEntities.tmp.Chn.parsing 42 | rm mergedFilenames.tmp.chn 43 | rm tmp.chinese.list 44 | 45 | # predictions 46 | cd ../all_predictions_4.0/ 47 | ./runAll.sh ../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Chn 48 | cd ../preprocessing_2.0/ 49 | 50 | # outputFormatting 51 | python ../outputFormatting/writeDocMap.py $1 52 | cd ../outputFormatting/ 53 | ./Chinese_run.sh 54 | cd ../preprocessing_2.0/ 55 | 56 | 57 | # clear the tmp folders 58 | rm CoreNLP_scripts/tmp_Chn/* 59 | rm createSetFiles/*.Chn 60 | rm createSetFiles/*.parsing 61 | rm entityExtraction/*.chn 62 | rm entityExtraction/code/tmp_formatted/* 63 | rm readCoreNLP/tmp_formatted_Chn/* 64 | rm MaltParser_scripts/Chinese.conll.tmp.output 65 | rm MaltParser_scripts/Chinese.conll.tmp 66 | rm *.tmp 67 | 68 | rm ../all_predictions_4.0/output.* 69 | rm ../all_predictions_4.0/*.easyRead 70 | rm ../all_predictions_4.0/*.entityCoref 71 | rm 
../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions 72 | rm ../all_predictions_4.0/currentPredictionsForArgs/testSet.predictions 73 | rm ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 74 | rm ../all_predictions_4.0/code/test.* 75 | rm ../outputFormatting/formatTriggers/format_andrew_triggers/andrew.triggers.out 76 | -------------------------------------------------------------------------------- /preprocessing_2.0/processEnglish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters. Expect list of files to read (with absolute filepaths)." 5 | exit 1 6 | fi 7 | 8 | # store processed documents in tmp/ 9 | rm CoreNLP_scripts/tmp_Eng/* 10 | ./CoreNLP_scripts/runCoreNLP_Eng.sh $1 11 | 12 | # readCoreNLP 13 | python readCoreNLP/getRootnames.py $1 tmp.list 14 | cd readCoreNLP/ 15 | rm tmp_formatted/* 16 | ./convertCoreNLPFormat.sh ../tmp.list ../CoreNLP_scripts/tmp_Eng/ tmp_formatted/ 17 | cd ../ 18 | 19 | # createSetFiles 20 | find readCoreNLP/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp 21 | rm createSetFiles/*.tmp 22 | rm createSetFiles/*.parsing 23 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp createSetFiles/setFile.noEntities.tmp 24 | rm mergedFilenames.tmp 25 | 26 | # entity extraction 27 | cd entityExtraction/ 28 | python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp ../createSetFiles/setFile.noEntities.tmp.parsing entityTestSet.tmp 29 | ./runEntities.sh entityTestSet.tmp 30 | cd ../ 31 | find entityExtraction/code/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp 32 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp createSetFiles/setFile.withEntities.tmp 33 | rm mergedFilenames.tmp 34 | rm tmp.list 35 | 36 | # predictions 37 | cd ../all_predictions_4.0/ 38 | ./runAll.sh ../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp 39 | cd ../preprocessing_2.0/ 40 | 41 | # outputFormatting 42 | python ../outputFormatting/writeDocMap.py $1 43 | cd ../outputFormatting/ 44 | ./English_run.sh 45 | cd ../preprocessing_2.0/ 46 | 47 | # clear the tmp folders 48 | 49 | #rm CoreNLP_scripts/tmp_Eng/* 50 | #rm createSetFiles/*.tmp 51 | #rm createSetFiles/*.parsing 52 | #rm entityExtraction/*.tmp 53 | #rm entityExtraction/code/tmp_formatted/* 54 | #rm readCoreNLP/tmp_formatted/* 55 | #rm *.tmp 56 | 57 | #rm ../all_predictions_4.0/output.* 58 | #rm ../all_predictions_4.0/*.easyRead 59 | #rm ../all_predictions_4.0/*.entityCoref 60 | #rm ../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions 61 | #rm ../all_predictions_4.0/currentPredictionsForArgs/testSet.predictions 62 | #rm ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 63 | #rm ../all_predictions_4.0/code/test.* 64 | #rm ../outputFormatting/formatTriggers/format_andrew_triggers/andrew.triggers.out 65 | -------------------------------------------------------------------------------- /preprocessing_2.0/processSpanish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters. Expect list of files to read (with absolute filepaths)." 
5 | exit 1 6 | fi 7 | 8 | # store processed documents in tmp/ 9 | rm CoreNLP_scripts/tmp_Span/* 10 | ./CoreNLP_scripts/runCoreNLP_Span.sh $1 11 | 12 | # readCoreNLP 13 | python readCoreNLP/getRootnames.py $1 tmp.spanish.list 14 | cd readCoreNLP/ 15 | rm tmp_formatted_Span/* 16 | ./convertCoreNLPFormat.sh ../tmp.spanish.list ../CoreNLP_scripts/tmp_Span/ tmp_formatted_Span/ 17 | cd ../ 18 | 19 | # createSetFiles 20 | find readCoreNLP/tmp_formatted_Span/ -name "*.mergedAnnotations" > mergedFilenames.tmp.Span 21 | rm createSetFiles/*.tmp 22 | rm createSetFiles/*.parsing 23 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.Span createSetFiles/setFile.noEntities.tmp.Span 24 | rm mergedFilenames.tmp.Span 25 | 26 | # add dependency parsing via MaltParser 27 | python MaltParser_scripts/convertToCoNLL.py createSetFiles/setFile.noEntities.tmp.Span MaltParser_scripts/Spanish.conll.tmp /home/andrew/DEFT_code_testing/dependencies/pos/es-cast3lb.map 28 | cp /home/andrew/DEFT_code_testing/dependencies/models/maltparser/UD.Spanish.model.mco UD.Spanish.model.mco.tmp 29 | mv UD.Spanish.model.mco.tmp UD.Spanish.model.mco 30 | java -jar /home/andrew/NLP_tools/MaltParser/maltparser-1.9.0/maltparser-1.9.0.jar -c UD.Spanish.model.mco -i MaltParser_scripts/Spanish.conll.tmp -o MaltParser_scripts/Spanish.conll.tmp.output -m parse 31 | rm UD.Spanish.model.mco 32 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Spanish.conll.tmp.output createSetFiles/setFile.noEntities.tmp.Span.parsing 33 | 34 | # entity extraction 35 | cd entityExtraction/ 36 | python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp.Span ../createSetFiles/setFile.noEntities.tmp.Span.parsing entityTestSet.tmp.span 37 | ./runEntities_Spanish.sh entityTestSet.tmp.span 38 | cd ../ 39 | find entityExtraction/code/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp.span 40 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.span createSetFiles/setFile.withEntities.tmp.Span 41 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Spanish.conll.tmp.output createSetFiles/setFile.withEntities.tmp.Span.parsing 42 | rm mergedFilenames.tmp.span 43 | rm tmp.spanish.list 44 | 45 | # predictions 46 | cd ../all_predictions_4.0/ 47 | ./runAll.sh ../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Span 48 | cd ../preprocessing_2.0/ 49 | 50 | # outputFormatting 51 | python ../outputFormatting/writeDocMap.py $1 52 | cd ../outputFormatting/ 53 | ./Spanish_run.sh 54 | cd ../preprocessing_2.0/ 55 | 56 | 57 | # clear the tmp folder 58 | rm CoreNLP_scripts/tmp_Span/* 59 | rm createSetFiles/*.Span 60 | rm createSetFiles/*.parsing 61 | rm entityExtraction/*.span 62 | rm entityExtraction/code/tmp_formatted/* 63 | rm readCoreNLP/tmp_formatted_Span/* 64 | rm *.tmp 65 | 66 | rm MaltParser_scripts/Spanish.conll.tmp.output 67 | rm MaltParser_scripts/Spanish.conll.tmp 68 | 69 | 70 | rm ../all_predictions_4.0/output.* 71 | rm ../all_predictions_4.0/*.easyRead 72 | rm ../all_predictions_4.0/*.entityCoref 73 | rm ../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions 74 | rm ../all_predictions_4.0/currentPredictionsForArgs/testSet.predictions 75 | rm ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 76 | rm ../all_predictions_4.0/code/test.* 77 | rm ../outputFormatting/formatTriggers/format_andrew_triggers/andrew.triggers.out 78 | 79 | -------------------------------------------------------------------------------- 
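Note: each of the process*.sh pipelines above takes a single argument, a text file listing the raw input documents by absolute path (one per line), and resolves its helper scripts and tmp/ directories relative to the current directory, so it is meant to be launched from inside preprocessing_2.0/. The sketch below is a hypothetical convenience wrapper, not part of this repository; the filename make_filelist.py and the language flag are assumptions made only for illustration.

# make_filelist.py -- hypothetical helper, not part of this repository.
# Builds a FILELIST of absolute paths (one per line) and dispatches to the
# matching process*.sh script; assumes it is run from inside preprocessing_2.0/.
import os
import subprocess
import sys

def write_filelist(input_dir, list_path):
    # the process*.sh scripts expect absolute filepaths, one per line
    with open(list_path, "w") as out:
        for name in sorted(os.listdir(input_dir)):
            path = os.path.abspath(os.path.join(input_dir, name))
            if os.path.isfile(path):
                out.write(path + "\n")

if __name__ == "__main__":
    # usage: python make_filelist.py RAW_TEXT_DIR FILELIST english|chinese|spanish
    if len(sys.argv) != 4:
        sys.exit("Expect raw text dir, output FILELIST, language.")
    write_filelist(sys.argv[1], sys.argv[2])
    scripts = {"english": "./processEnglish.sh",
               "chinese": "./processChinese.sh",
               "spanish": "./processSpanish.sh"}
    subprocess.call([scripts[sys.argv[3].lower()], os.path.abspath(sys.argv[2])])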
/preprocessing_2.0/readCoreNLP/convertCoreNLPFormat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import xml.etree.ElementTree as ET 3 | import string 4 | from xml.sax.saxutils import escape 5 | 6 | def cleanWhitespace(text): 7 | clean = "" 8 | for char in text: 9 | if char in string.whitespace: 10 | clean += " " 11 | else: 12 | clean += char 13 | return clean 14 | 15 | def main(): 16 | if len(sys.argv) != 3: 17 | print "Expect coreNLP features (extracted), output file." 18 | sys.exit() 19 | 20 | labelDict = dict() # dict from offset -> (B/I, labelType, labelSubtype, fullName) 21 | triggerDict = dict() # dict from offset -> (triggerType, triggerSubtype) 22 | argDict = dict() # dict from offset -> (argument role) 23 | 24 | entityDict = dict() # dict from mentionID -> (start, end, text) 25 | 26 | input = open(sys.argv[1], "r") 27 | output = open(sys.argv[2], "w") 28 | 29 | lineCounter = 0 30 | for line in input: 31 | lineCounter += 1 32 | 33 | if line.startswith("BEGIN_SENTENCE"): 34 | output.write("\n") 35 | else: 36 | clean = line.strip() 37 | tokens = clean.split("\t") 38 | startOffset = int(tokens[0]) 39 | endOffset = int(tokens[1]) 40 | curWord = tokens[2] 41 | 42 | entityInfo = "EntitesGold[" 43 | 44 | if startOffset in labelDict: 45 | for curTuple in labelDict[startOffset]: 46 | begin = curTuple[0] 47 | entType = curTuple[1] 48 | entSubtype = curTuple[2] 49 | head = curTuple[4].encode('utf-8') 50 | 51 | coref = "coref_" + str(curTuple[5]) 52 | 53 | entityInfo += (begin + "|||" + entType + "|||" + entSubtype + "|||" + head + "|||" + coref + ";;;") 54 | 55 | tupleFullName = curTuple[3].encode('utf-8') 56 | entityInfo += "]" 57 | 58 | eventType = "not_trigger" 59 | eventSubtype = "not_trigger" 60 | 61 | if startOffset in triggerDict: 62 | curTuple = triggerDict[startOffset] 63 | eventType = curTuple[0] 64 | eventSubtype = curTuple[1] 65 | 66 | argInfo = "ArgsGold[" 67 | if startOffset in argDict: 68 | for curTuple in argDict[startOffset]: 69 | begin = curTuple[0] 70 | argRole = curTuple[1] 71 | eventText = curTuple[3] 72 | eventIndex = curTuple[4] 73 | argRealis = curTuple[5] 74 | 75 | eventText = replaceWhiteSpace(eventText) 76 | 77 | argInfo += (begin + "|||" + argRole + "|||" + eventText.encode('utf-8') + "|||" + str(eventIndex) + "|||" + argRealis + ";;;") 78 | argInfo += "]" 79 | 80 | output.write(clean + "\t" + entityInfo + "\t" + eventType + "\t" + eventSubtype + "\t" + argInfo + "\n") 81 | output.write("\n") 82 | input.close() 83 | output.close() 84 | 85 | 86 | def replaceWhiteSpace(text): 87 | newStr = "" 88 | for character in text: 89 | if character in string.whitespace: 90 | newStr += " " 91 | else: 92 | newStr += character 93 | return newStr 94 | 95 | 96 | main() 97 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/convertCoreNLPFormat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ "$#" -ne 3 ]; then 3 | echo "Illegal number of parameters" 4 | echo "Use format: './convertCoreNLPFormat.sh [list_of_files] [path_to_files] [output_dir]" 5 | echo "-- list_of_files: text document containing filenames for documents" 6 | echo "-- path_to_files: path to the filenames contained in list_of_files" 7 | echo "-- output_dir: output directory" 8 | exit 1 9 | fi 10 | 11 | while read -r line 12 | do 13 | name=$line 14 | python read_CoreNLP_XML.py ${2}${name}.out tmp.txt 15 | python convertCoreNLPFormat.py tmp.txt 
${3}/${name}.mergedAnnotations 16 | python write_parsing_from_CoreNLP.py ${2}${name}.out ${3}/${name}.parsingAnnotations 17 | done < ${1} 18 | rm tmp.txt 19 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/getRootnames.py: -------------------------------------------------------------------------------- 1 | # script to extract the relative filenames for each file in a list 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 3: 6 | print "Expect an input file, output file." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | 12 | for line in input: 13 | name = line.strip() 14 | if "/" in name: 15 | index = name.rfind("/") + 1 16 | name = name[index:] 17 | output.write(name + "\n") 18 | 19 | input.close() 20 | output.close() 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/read_CoreNLP_XML.py: -------------------------------------------------------------------------------- 1 | # script to read the CoreNLP output 2 | import sys 3 | import xml.etree.ElementTree as ET 4 | 5 | def getRootname(text): 6 | name = text 7 | if "/" in name: 8 | index = name.rfind("/") + 1 9 | name = name[index:] 10 | if ".out" in name: 11 | index = name.find(".out") 12 | name = name[:index] 13 | 14 | # if ".sgm" in name: 15 | # index = name.find(".sgm") 16 | # name = name[:index] 17 | # elif ".mpdf" in name: 18 | # index = name.find(".mpdf") 19 | # name = name[:index] 20 | # elif ".cmp" in name: 21 | # index = name.find(".cmp") 22 | # name = name[:index] 23 | # elif ".txt" in name: 24 | # index = name.find(".txt") 25 | # name = name[:index] 26 | # elif ".xml" in name: 27 | # index = name.find(".xml") 28 | # name = name[:index] 29 | 30 | return name 31 | 32 | 33 | def main(): 34 | if len(sys.argv) != 3: 35 | print "Expect input XML file, output file." 
36 | sys.exit() 37 | 38 | print "Reading: " + sys.argv[1] 39 | tree = ET.parse(sys.argv[1]) 40 | root = tree.getroot() 41 | 42 | rootName = getRootname(sys.argv[1]) 43 | 44 | wordIndexDict = dict() # maps word indexes to character level indexes 45 | 46 | output = open(sys.argv[2], "w") 47 | 48 | # print word, start offset, end offset, POS tag for each character in the data 49 | printWordInfo(root, rootName, wordIndexDict, output, 0) 50 | 51 | output.close() 52 | 53 | def printWordInfo(root, rootName, wordIndexDict, output, sentenceNum): 54 | if root.tag == "coreference": 55 | return 56 | 57 | if root.tag == "sentence": 58 | output.write("BEGIN_SENTENCE\n") 59 | sentenceNum = root.attrib["id"] 60 | 61 | if root.tag == "token": 62 | processWord(root, rootName, wordIndexDict, sentenceNum + "_" + root.attrib["id"], output) 63 | else: 64 | for child in root: 65 | printWordInfo(child, rootName, wordIndexDict, output, sentenceNum) 66 | 67 | def processWord(root, rootName, wordIndexDict, wordID, output): 68 | pos = "" 69 | offsetStart = -1 70 | 71 | word = "" 72 | lemma = "" 73 | # extract the needed info 74 | for child in root: 75 | if child.tag == "POS": 76 | pos = child.text 77 | elif child.tag == "CharacterOffsetBegin": 78 | offsetStart = int(child.text) 79 | wordIndexDict[wordID] = offsetStart 80 | elif child.tag == "word": 81 | word = child.text 82 | elif child.tag == "CharacterOffsetEnd": 83 | offsetEnd = int(child.text) 84 | elif child.tag == "lemma": 85 | lemma = child.text 86 | 87 | outString = str(offsetStart) + "\t" + str(offsetEnd) + "\t" + word.encode('utf-8') + "\t" + lemma.encode('utf-8') + "\t" + pos + "\t" + rootName 88 | output.write(outString + "\n") 89 | 90 | main() 91 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/write_parsing_from_CoreNLP.py: -------------------------------------------------------------------------------- 1 | # script to read the CoreNLP parsing output 2 | import sys 3 | import xml.etree.ElementTree as ET 4 | 5 | def main(): 6 | if len(sys.argv) != 3: 7 | print "Expect input XML file, output file for dependency parsing." 8 | sys.exit() 9 | 10 | tree = ET.parse(sys.argv[1]) 11 | root = tree.getroot() 12 | 13 | output = open(sys.argv[2], "w") 14 | 15 | # output dependency info. Add line of white space between each sentence. 16 | writeInfo(root, output) 17 | 18 | output.close() 19 | 20 | def writeInfo(root, output): 21 | if root.tag == "dependencies" and root.attrib["type"] == "basic-dependencies": 22 | for child in root: 23 | processDependencies(child, output) 24 | output.write("\n") 25 | else: 26 | for child in root: 27 | writeInfo(child, output) 28 | 29 | def processDependencies(root, output): 30 | depType = root.attrib["type"] 31 | 32 | governor = "" 33 | dependent = "" 34 | 35 | govIndex = -1 36 | depIndex = -1 37 | 38 | # extract the needed info 39 | for child in root: 40 | if child.tag == "governor": 41 | governor = child.text.encode('utf-8') 42 | govIndex = child.attrib["idx"] 43 | elif child.tag == "dependent": 44 | dependent = child.text.encode('utf-8') 45 | depIndex = child.attrib["idx"] 46 | 47 | output.write(depType.encode('utf-8')) 48 | output.write("|||") 49 | output.write(governor) 50 | output.write("|||") 51 | output.write(str(govIndex)) 52 | output.write("|||") 53 | output.write(dependent) 54 | output.write("|||") 55 | output.write(str(depIndex)) 56 | output.write("\n") 57 | 58 | main() 59 | --------------------------------------------------------------------------------
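Note: write_parsing_from_CoreNLP.py above emits one "|||"-separated record per basic dependency (relation type, governor word, governor index, dependent word, dependent index) and a blank line after each sentence. The reader below is a minimal sketch of how such a *.parsingAnnotations file could be loaded back into per-sentence tuples; it is illustrative only, and the name read_parsing_annotations is an assumption rather than a function defined in this repository.

# Minimal sketch, not part of this repository: load a *.parsingAnnotations
# file produced by write_parsing_from_CoreNLP.py into per-sentence lists of
# (dep_type, governor, gov_index, dependent, dep_index) tuples.
import sys

def read_parsing_annotations(path):
    sentences = []
    current = []
    with open(path, "r") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line:
                # a blank line marks the end of one sentence's dependencies
                if current:
                    sentences.append(current)
                    current = []
                continue
            dep_type, governor, gov_idx, dependent, dep_idx = line.split("|||")[:5]
            current.append((dep_type, governor, int(gov_idx), dependent, int(dep_idx)))
    if current:
        sentences.append(current)
    return sentences

if __name__ == "__main__":
    for i, deps in enumerate(read_parsing_annotations(sys.argv[1])):
        print("sentence %d: %d dependencies" % (i, len(deps)))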