├── CONFIG.txt ├── LICENSE ├── README.md ├── all_predictions_4.0 ├── code │ ├── convertOutputArgs.py │ ├── convertOutputTriggers.py │ ├── readLargeInput.py │ ├── readLargeInput.pyc │ ├── writeArgGold.py │ ├── writeArgLiblinear.py │ ├── writeRealisGold.py │ ├── writeRealisLiblinear.py │ ├── writeTriggerGold.py │ └── writeTriggerLiblinear.py ├── runAll.sh ├── runAll_providedTriggers.sh ├── runArguments.sh ├── runArguments_providedTriggers.sh ├── runRealis.sh ├── runRealis_providedTriggers.sh └── runTriggers.sh ├── config.py ├── outputFormatting ├── Chinese_run.sh ├── English_run.sh ├── Spanish_run.sh ├── argument_nugget_linking.py ├── finalForm_KBP.py ├── formatTriggers │ ├── format_andrew_triggers │ │ └── format_andrew.py │ ├── format_hector_triggers │ │ └── format_hector.py │ └── format_jun_triggers │ │ └── format_jun.py ├── out │ ├── cleanStore.sh │ ├── mergeSubmissions │ │ └── mergeSubmissions.py │ ├── mergeSubmissions_coreference │ │ ├── argument_nugget_linking.py │ │ └── mergeSubmissions.py │ └── moveToStore.sh ├── stopwords.txt ├── writeDocMap.py └── writeTriggerOutput.py └── preprocessing_2.0 ├── CoreNLP_scripts ├── StanfordCoreNLP-chinese.properties.simple ├── StanfordCoreNLP-spanish.properties.simple ├── prefixLines.py ├── prepareCoreNLP_input.py ├── runCoreNLP_Chn.sh ├── runCoreNLP_Eng.sh └── runCoreNLP_Span.sh ├── MaltParser_scripts ├── convertToCoNLL.py └── convertToParsingFile.py ├── createSetFiles └── writeDataFromFiles.py ├── entityExtraction ├── code │ ├── addEntitiesToText.py │ ├── addEntitiesToText.sh │ └── unify │ │ ├── processEntities.py │ │ └── unifyEntities.py ├── convertTestSet.py ├── runEntities.sh ├── runEntities_Chinese.sh └── runEntities_Spanish.sh ├── processChinese.sh ├── processEnglish.sh ├── processSpanish.sh └── readCoreNLP ├── convertCoreNLPFormat.py ├── convertCoreNLPFormat.sh ├── getRootnames.py ├── read_CoreNLP_XML.py └── write_parsing_from_CoreNLP.py /CONFIG.txt: -------------------------------------------------------------------------------- 1 | WORD_EMBEDDING_DIR=/home/andrew/DEFT_code_testing/dependencies/wordVectors 2 | CORENLP_DIR=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 3 | MALTPARSER_DIR=/home/andrew/NLP_tools/MaltParser/maltparser-1.9.0 4 | NER_DIR=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 5 | MODEL_DIR=/home/andrew/DEFT_code_testing/dependencies/models 6 | LIBLINEAR_DIR=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 7 | POS_DIR=/home/andrew/DEFT_code_testing/dependencies/pos 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 ahsi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | CMU Multilingual Event Extractor 2 | =============================== 3 | 4 | Requirements: 5 | ------------- 6 | - Python 2 7 | - Stanford CoreNLP 8 | - MaltParser 9 | - LIBLINEAR 10 | - Stanford NER 11 | - Model Files (available at http://cairo.lti.cs.cmu.edu/~ahsi/CMUCS_multilingual_event_extraction_models/CMUCS_Multilingual_Event_Extractor_models.tar.gz) 12 | 13 | Installation: 14 | ------------- 15 | Modify the CONFIG.txt file to point to the directories containing the required software and model files, then run "python config.py" from this directory. 16 | 17 | Usage: 18 | ------ 19 | Usage is split by language. Although all model files use multilingual training, each language must be processed separately at test time, because the preprocessing steps differ across languages. 20 | 21 | To run the code, cd into "preprocessing_2.0", then run one of the following: 22 | 23 | - "./processEnglish.sh FILELIST" 24 | - "./processChinese.sh FILELIST" 25 | - "./processSpanish.sh FILELIST" 26 | 27 | where FILELIST is a file listing the raw text files to be processed, one absolute path per line. 28 | 29 | Output files are stored in outputFormatting/out/store/, which contains the following subdirectories: 30 | 31 | arguments/ 32 | corpusLinking/ 33 | linking/ 34 | nuggets/ 35 | 36 | The overall output format closely matches that of the 2016 TAC KBP Event Argument Extraction and Linking (EAL) Task, with some slight modifications. Files in the arguments/ subdirectory follow the EAL format exactly, except for two additional columns at the end of each line giving the starting offset and ID of the associated event nugget. Files in the corpusLinking/ and linking/ subdirectories exactly match the EAL Task specifications. Files in the nuggets/ directory exactly match the format of the 2016 TAC KBP Event Nugget Detection Task. 37 |
-------------------------------------------------------------------------------- /all_predictions_4.0/code/convertOutputArgs.py: --------------------------------------------------------------------------------
1 | # script to convert the liblinear output files to work with my evaluation script 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 5: 6 | print "Expect roles dict, prediction file, easy reading file, output file."
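# Example invocation (hypothetical filenames, mirroring the calls in runArguments.sh / runRealis.sh):
#   python convertOutputArgs.py roles.dict output.test.arguments test.out.easyRead testSet.predictions
# sys.argv[1] is the "ROLE:ID" dictionary written at training time, sys.argv[2] the raw liblinear
# prediction file (an optional leading "labels ..." header is skipped below), sys.argv[3] the
# tab-separated .easyRead file, and sys.argv[4] the output, written as one
# "text|||sent_N|||ROLE" line per prediction.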
7 | sys.exit() 8 | 9 | predictions = [] 10 | labelOnly = True 11 | input = open(sys.argv[2], "r") 12 | for line in input: 13 | if line.startswith("labels"): 14 | labelOnly = False 15 | continue 16 | if labelOnly: 17 | predictions.append(line.strip()) 18 | else: 19 | temp = line.split(" ")[0] 20 | predictions.append(temp) 21 | input.close() 22 | 23 | roleDict = dict() 24 | input = open(sys.argv[1], "r") 25 | for line in input: 26 | tokens = line.strip().split(":") 27 | roleDict[tokens[1]] = tokens[0] 28 | input.close() 29 | 30 | input = open(sys.argv[3], "r") 31 | output = open(sys.argv[4], "w") 32 | index = 0 33 | for line in input: 34 | tokens = line.strip().split("\t") 35 | sentStr = tokens[0] 36 | text = tokens[2] 37 | 38 | predictedRole = predictions[index] 39 | 40 | output.write(text + "|||" + sentStr + "|||" + roleDict[predictedRole] + "\n") 41 | index += 1 42 | input.close() 43 | output.close() 44 | 45 | main() 46 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/convertOutputTriggers.py: -------------------------------------------------------------------------------- 1 | # script to convert the liblinear output files to work with my evaluation script 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 5: 6 | print "Expect roles dict, prediction file, easy reading file, output file." 7 | sys.exit() 8 | 9 | predictions = [] 10 | input = open(sys.argv[2], "r") 11 | for line in input: 12 | predictions.append(line.strip()) 13 | input.close() 14 | 15 | roleDict = dict() 16 | input = open(sys.argv[1], "r") 17 | for line in input: 18 | tokens = line.strip().split(":") 19 | roleDict[tokens[1]] = tokens[0] 20 | input.close() 21 | 22 | input = open(sys.argv[3], "r") 23 | output = open(sys.argv[4], "w") 24 | index = 0 25 | 26 | for index in range(len(predictions)): 27 | predictedRole = roleDict[predictions[index]] 28 | output.write(predictedRole + "\n") 29 | input.close() 30 | output.close() 31 | 32 | main() 33 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/readLargeInput.py: -------------------------------------------------------------------------------- 1 | # file to contain the most up-to-date version of readInput() 2 | 3 | class Sentence: 4 | def __init__(self, wordsArg, lemmasArg, labelsArg, posTagsArg, entityArg, goldArgParam, docIDParam, startParam, endParam, realisArg, allStarts=[]): 5 | self.words = wordsArg 6 | self.lemmas = lemmasArg 7 | self.labels = labelsArg 8 | self.posTags = posTagsArg 9 | self.entities = entityArg 10 | self.goldArgs = goldArgParam 11 | self.depByGovIndex = dict() 12 | self.depByDepIndex = dict() 13 | 14 | self.startOffset = startParam 15 | self.endOffset = endParam 16 | 17 | self.docID = docIDParam 18 | 19 | self.realisLabels = realisArg 20 | 21 | self.offsets = allStarts 22 | 23 | def addDependency(self, dep): 24 | gIndex = dep.gIndex 25 | dIndex = dep.dIndex 26 | 27 | if gIndex not in self.depByGovIndex: 28 | self.depByGovIndex[gIndex] = set() 29 | if dIndex not in self.depByDepIndex: 30 | self.depByDepIndex[dIndex] = set() 31 | 32 | self.depByGovIndex[gIndex].add(dep) 33 | self.depByDepIndex[dIndex].add(dep) 34 | 35 | class Dependency: 36 | def __init__(self, depTypeArg, governorArg, gIndexArg, dependentArg, dIndexArg): 37 | self.depType = depTypeArg 38 | self.governor = governorArg 39 | self.gIndex = gIndexArg 40 | self.dependent = dependentArg 41 | self.dIndex = dIndexArg 42 | 43 | class ArgumentParse: 44 | def __init__(self, 
beginArg, roleArg, triggerArg, triggerIndexArg): 45 | self.begin = beginArg 46 | self.role = roleArg 47 | self.triggerText = triggerArg 48 | self.triggerIndex = triggerIndexArg 49 | 50 | class Argument: 51 | def __init__(self, textParam, roleParam, associatedIndexesParam, triggerTextParam, triggerIndexParam): 52 | self.text = textParam 53 | self.role = roleParam 54 | self.associatedIndexes = associatedIndexesParam 55 | self.triggerText = triggerTextParam 56 | self.triggerIndex = triggerIndexParam 57 | 58 | def minIndex(self): 59 | minVal = -1 60 | for index in self.associatedIndexes: 61 | if index < minVal or minVal == -1: 62 | minVal = index 63 | return minVal 64 | 65 | 66 | # extracts argument info, returns as a list 67 | # FORMAT: "ArgsGold[begin|||ROLE|||triggerText|||triggerStart;]" 68 | def readArguments(line, realisMode): 69 | argList = [] 70 | 71 | start = line.find("[") 72 | 73 | cleaned = line[start+1:] 74 | tokens = cleaned.split(";;;") 75 | for tok in tokens: 76 | if tok != "]": 77 | subparts = tok.split("|||") 78 | if realisMode: 79 | if len(subparts) == 4: 80 | curArg = ArgumentParse(subparts[0], "UNK_REALIS", subparts[2], int(subparts[3])) 81 | else: 82 | curArg = ArgumentParse(subparts[0], subparts[4], subparts[2], int(subparts[3])) 83 | else: 84 | curArg = ArgumentParse(subparts[0], subparts[1], subparts[2], int(subparts[3])) 85 | argList.append(curArg) 86 | 87 | return argList 88 | 89 | # input: list of argument information (one ArgumentParse list per word) 90 | # output: list of Arguments for the sentence 91 | def extractGoldArgs(argLists, words, converter): 92 | outputList = [] 93 | 94 | for index in range(len(argLists)): 95 | curList = argLists[index] 96 | 97 | for parse in curList: 98 | if parse.begin == "B": 99 | textParam, associatedIndexes = extractArgument(argLists, index, parse.role, words, parse.triggerIndex) 100 | 101 | converted = -1 # can be -1 if trigger is not within sentence boundaries (e.g. 
our sentence segmentation is off) 102 | if parse.triggerIndex in converter: 103 | converted = converter[parse.triggerIndex] 104 | 105 | curArg = Argument(textParam, parse.role, associatedIndexes, parse.triggerText, converted) 106 | outputList.append(curArg) 107 | 108 | return outputList 109 | 110 | # extract the words/indexes associated with a particular entity candidate 111 | def extractArgument(argLists, index, role, words, triggerIndex): 112 | text = words[index] 113 | indexes = set() 114 | indexes.add(index) 115 | 116 | altIndex = index + 1 117 | while altIndex < len(words): 118 | curList = argLists[altIndex] 119 | found = False 120 | for parse in curList: 121 | if parse.begin != "I": 122 | continue 123 | if role == parse.role and triggerIndex == parse.triggerIndex: 124 | indexes.add(altIndex) 125 | text += " " + words[altIndex] 126 | found = True 127 | break 128 | if found == False: 129 | break 130 | 131 | altIndex += 1 132 | 133 | return text, indexes 134 | 135 | class EntityParse: 136 | def __init__(self, beginArg, typeArg, subtypeArg, headArg, corefArg): 137 | self.begin = beginArg 138 | self.entType = typeArg 139 | self.subtype = subtypeArg 140 | self.head = headArg 141 | self.corefStr = corefArg 142 | 143 | class Entity: 144 | def __init__(self, textParam, typeParam, subtypeParam, associatedIndexesParam, argRoleParam, argTriggerParam, headParam, corefParam, startParam, endParam): 145 | self.text = textParam 146 | self.entType = typeParam 147 | self.subtype = subtypeParam 148 | self.associatedIndexes = associatedIndexesParam 149 | self.head = headParam 150 | self.corefStr = corefParam 151 | 152 | self.start = startParam 153 | self.end = endParam 154 | 155 | self.argRole = argRoleParam 156 | self.argTrigger = argTriggerParam 157 | 158 | def minIndex(self): 159 | minVal = -1 160 | for index in self.associatedIndexes: 161 | if index < minVal or minVal == -1: 162 | minVal = index 163 | return minVal 164 | 165 | def maxIndex(self): 166 | maxVal = -1 167 | for index in self.associatedIndexes: 168 | if index > maxVal or maxVal == -1: 169 | maxVal = index 170 | return maxVal 171 | 172 | # extracts entity info, returns as a list 173 | # FORMAT: "EntitiesGold[begin|||PER|||Individual|||headWord;]" 174 | def readEntities(line): 175 | entList = [] 176 | 177 | start = line.find("[") 178 | 179 | cleaned = line[start+1:] 180 | tokens = cleaned.split(";;;") 181 | for tok in tokens: 182 | if tok != "]": 183 | subparts = tok.split("|||") 184 | curEnt = EntityParse(subparts[0], subparts[1], subparts[2], subparts[3], subparts[4]) 185 | entList.append(curEnt) 186 | 187 | return entList 188 | 189 | # input: list of entity information (one EntityParse list per word) 190 | # output: list of Entities for the sentence 191 | def extractCandidateArgs(entityLists, words, starts, ends): 192 | outputList = [] 193 | 194 | for index in range(len(entityLists)): 195 | curList = entityLists[index] 196 | 197 | for parse in curList: 198 | if parse.begin == "B": 199 | textParam, associatedIndexes, entStart, entEnd = extractCandidate(entityLists, index, parse.entType, parse.subtype, words, parse.head, starts, ends) 200 | 201 | curEntity = Entity(textParam, parse.entType, parse.subtype, associatedIndexes, "", "", parse.head, parse.corefStr, entStart, entEnd) 202 | outputList.append(curEntity) 203 | 204 | return outputList 205 | 206 | # extract the words/indexes associated with a particular entity candidate 207 | def extractCandidate(entityLists, index, entType, subtype, words, head, starts, ends): 208 | typeName = 
entType + "_" + subtype 209 | text = words[index] 210 | indexes = set() 211 | indexes.add(index) 212 | 213 | entStart = starts[index] 214 | entEnd = ends[index] 215 | 216 | altIndex = index + 1 217 | while altIndex < len(words): 218 | curList = entityLists[altIndex] 219 | found = False 220 | for parse in curList: 221 | if parse.begin != "I": 222 | continue 223 | curName = parse.entType + "_" + parse.subtype 224 | if curName == typeName and parse.head == head: 225 | indexes.add(altIndex) 226 | text += " " + words[altIndex] 227 | found = True 228 | entEnd = ends[altIndex] 229 | break 230 | if found == False: 231 | break 232 | 233 | altIndex += 1 234 | 235 | return text, indexes, entStart, entEnd 236 | 237 | def scanInput(filename, parsingFilename, inputTriggers = None, entityOut = None, realisMode = False): 238 | print "Reading " + filename 239 | input = open(filename, "r") 240 | parsingInput = open(parsingFilename, "r") 241 | 242 | possibleLabels = set() 243 | possibleArgs = set() 244 | possibleArgs.add("NONE") 245 | 246 | words = [] 247 | lemmas = [] 248 | posTags = [] 249 | labels = [] 250 | 251 | docID = "" 252 | 253 | entityInfo = [] 254 | argInfo = [] 255 | 256 | starts = [] 257 | ends = [] 258 | 259 | indexConverter = dict() # converts from character offsets -> within-sentence word indexes 260 | 261 | count = 0 262 | for line in input: 263 | # if empty line 264 | if line.strip() == "": 265 | # if we have data, process and reset 266 | if len(words) > 0: 267 | entCandidates = extractCandidateArgs(entityInfo, words, starts, ends) 268 | 269 | goldArgs = extractGoldArgs(argInfo, words, indexConverter) 270 | for arg in goldArgs: 271 | possibleArgs.add(arg.role) 272 | 273 | words = [] 274 | lemmas = [] 275 | posTags = [] 276 | labels = [] 277 | entityInfo = [] 278 | argInfo = [] 279 | 280 | starts = [] 281 | ends = [] 282 | 283 | realisList = [] 284 | 285 | docID = "" 286 | 287 | indexConverter = dict() 288 | else: 289 | tokens = line.strip().split("\t") 290 | ### How to read input (by token): 291 | ### 0: start index, 1: end index, 2: word, 3: lemma, 4: posTag, 5: docID, 6:gold entities, 7: trigger type, 8: trigger subtype, 9: argument role, 10: trigger realis (optional) 292 | 293 | start = int(tokens[0]) 294 | indexConverter[start] = len(words) 295 | 296 | starts.append(int(tokens[0])) 297 | ends.append(int(tokens[1])) 298 | 299 | curWord = tokens[2] 300 | curPOS = tokens[4] 301 | curLabel = tokens[7] + "_" + tokens[8] 302 | 303 | docID = tokens[5] 304 | 305 | words.append(curWord) 306 | lemmas.append(tokens[3]) 307 | posTags.append(curPOS) 308 | 309 | if inputTriggers != None: 310 | labels.append(inputTriggers[count]) 311 | else: 312 | labels.append(curLabel) 313 | 314 | if len(tokens) >= 11: 315 | realisList.append(tokens[10]) 316 | 317 | curEnt = readEntities(tokens[6]) 318 | entityInfo.append(curEnt) 319 | 320 | curArg = readArguments(tokens[9], realisMode) 321 | argInfo.append(curArg) 322 | 323 | possibleLabels.add(curLabel) 324 | 325 | count += 1 326 | 327 | return possibleLabels, possibleArgs 328 | 329 | def readInput(input, parsingInput, inputTriggers = None, entityOut = None, count=0, realisMode = False): 330 | words = [] 331 | lemmas = [] 332 | posTags = [] 333 | labels = [] 334 | 335 | docID = "" 336 | 337 | entityInfo = [] 338 | argInfo = [] 339 | 340 | starts = [] 341 | ends = [] 342 | 343 | realisList = [] 344 | 345 | indexConverter = dict() # converts from character offsets -> within-sentence word indexes 346 | 347 | eof = False 348 | 349 | sentence = None 350 | 351 | while 
True: 352 | line = input.readline() 353 | eof = line == "" 354 | 355 | # if empty line 356 | if line.strip() == "": 357 | # if we have data, process and reset 358 | if len(words) > 0: 359 | entCandidates = extractCandidateArgs(entityInfo, words, starts, ends) 360 | goldArgs = extractGoldArgs(argInfo, words, indexConverter) 361 | 362 | sentence = Sentence(words, lemmas, labels, posTags, entCandidates, goldArgs, docID, starts[0], ends[len(ends)-1], realisList, starts) 363 | 364 | if entityOut != None: 365 | for ent in entCandidates: 366 | text = ent.text 367 | coref = ent.corefStr + "_" + docID 368 | #print text + "\t" + coref + "\t" + str(ent.start) + "\t" + str(ent.end) 369 | entityOut.write(text + "\t" + coref + "\t" + str(ent.start) + "\t" + str(ent.end) + "\n") 370 | 371 | words = [] 372 | lemmas = [] 373 | posTags = [] 374 | labels = [] 375 | entityInfo = [] 376 | argInfo = [] 377 | 378 | starts = [] 379 | ends = [] 380 | 381 | realisList = [] 382 | 383 | docID = "" 384 | 385 | indexConverter = dict() 386 | else: 387 | tokens = line.strip().split("\t") 388 | ### How to read input (by token): 389 | ### 0: start index, 1: end index, 2: word, 3: lemma, 4: posTag, 5: docID, 6:gold entities, 7: trigger type, 8: trigger subtype, 9: argument role, 10: trigger realis (optional) 390 | 391 | start = int(tokens[0]) 392 | indexConverter[start] = len(words) 393 | 394 | starts.append(int(tokens[0])) 395 | ends.append(int(tokens[1])) 396 | 397 | curWord = tokens[2] 398 | curPOS = tokens[4] 399 | curLabel = tokens[7] + "_" + tokens[8] 400 | 401 | docID = tokens[5] 402 | 403 | words.append(curWord) 404 | lemmas.append(tokens[3]) 405 | posTags.append(curPOS) 406 | 407 | if inputTriggers != None: 408 | labels.append(inputTriggers[count]) 409 | else: 410 | labels.append(curLabel) 411 | 412 | if len(tokens) >= 11: 413 | realisList.append(tokens[10]) 414 | 415 | curEnt = readEntities(tokens[6]) 416 | entityInfo.append(curEnt) 417 | 418 | curArg = readArguments(tokens[9], realisMode) 419 | argInfo.append(curArg) 420 | 421 | count += 1 422 | if eof or sentence != None: 423 | break 424 | 425 | if entityOut != None: 426 | entityOut.close() 427 | 428 | # add dependencies 429 | while True: 430 | line = parsingInput.readline() 431 | clean = line.strip() 432 | 433 | if clean == "": 434 | break 435 | else: 436 | # rare case -- we have a token "|", can't do splitting like normal 437 | if "||||" in clean: 438 | depType, gov, govIndex, dep, depIndex = parseDep_Exception(clean) 439 | else: 440 | tokens = clean.split("|||") 441 | depType = tokens[0] 442 | gov = tokens[1] 443 | govIndex = int(tokens[2]) 444 | dep = tokens[3] 445 | depIndex = int(tokens[len(tokens) - 1]) 446 | ###depIndex = int(tokens[4]) 447 | 448 | # account for off-by-one (in CoreNLP, 0 = Root, rather than first word) 449 | curDependency = Dependency(depType, gov, govIndex - 1, dep, depIndex - 1) 450 | sentence.addDependency(curDependency) 451 | 452 | return sentence, not eof, count 453 | 454 | # method to parse a dependency when one of the words contains "|" at the front or end 455 | def parseDep_Exception(clean): 456 | # depType 457 | tempIndex = clean.find("|||") 458 | depType = clean[:tempIndex] 459 | clean = clean[tempIndex+3:] 460 | 461 | # gov word 462 | tempIndex = clean.find("|||") 463 | altIndex = clean.find("||||") 464 | curWord = "" 465 | while tempIndex == altIndex: 466 | curWord += clean[0] 467 | clean = clean[1:] 468 | 469 | tempIndex = clean.find("|||") 470 | altIndex = clean.find("||||") 471 | curWord += clean[:tempIndex] 472 | gov = 
curWord 473 | clean = clean[tempIndex+3:] 474 | 475 | # govIndex 476 | tempIndex = clean.find("|||") 477 | govIndex = int(clean[:tempIndex]) 478 | clean = clean[tempIndex+3:] 479 | 480 | # dep word 481 | tempIndex = clean.find("|||") 482 | altIndex = clean.find("||||") 483 | curWord = "" 484 | while tempIndex == altIndex: 485 | curWord += clean[0] 486 | clean = clean[1:] 487 | 488 | tempIndex = clean.find("|||") 489 | altIndex = clean.find("||||") 490 | curWord += clean[:tempIndex] 491 | dep = curWord 492 | clean = clean[tempIndex+3:] 493 | 494 | try: 495 | depIndex = int(clean) 496 | except ValueError: 497 | start = clean.rfind("|") 498 | depIndex = int(clean[start+1:]) 499 | 500 | return depType, gov, govIndex, dep, depIndex 501 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/readLargeInput.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahsi/Multilingual_Event_Extraction/eed002e864e16dc06c2b2970267b1465adcf825c/all_predictions_4.0/code/readLargeInput.pyc -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeArgGold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | from nltk.stem.wordnet import WordNetLemmatizer 6 | from readLargeInput import * 7 | 8 | 9 | DEBUG=False 10 | SMALLDEBUG=False 11 | PREDICTION_DEBUG=False 12 | EMPTY_TRIGGER="not_trigger_not_trigger" 13 | EMPTY_ROLE="NONE" 14 | stepSize=1 15 | beamSize=1 16 | maxIters=20 17 | 18 | def processSentence(curSentence, possibleLabels, possibleArgs, output, sentIndex): 19 | writtenSet = set() 20 | for arg in curSentence.goldArgs: 21 | argString = arg.text + "|||sent_" + str(sentIndex) + "|||" + arg.role 22 | if argString not in writtenSet: 23 | output.write(argString + "\n") 24 | writtenSet.add(argString) 25 | 26 | def processEntity(curSentence, entityIndex, possibleArgs, triggerIndex, triggerLabel, output, sentIndex): 27 | curWords = curSentence.words 28 | triggerWord = curWords[triggerIndex] 29 | curEntity = curSentence.entities[entityIndex] 30 | 31 | goldArgRole = "NONE" 32 | 33 | # find what the gold val is for this entity 34 | curGoldArgs = curSentence.goldArgs 35 | foundArg = None 36 | for arg in curGoldArgs: 37 | argText = arg.text 38 | minIndex = arg.minIndex() 39 | argTriggerIndex = arg.triggerIndex 40 | 41 | # if same entity (text and location) and same associated trigger 42 | if argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex and goldArgRole == "NONE": 43 | goldArgRole = arg.role 44 | foundArg = arg 45 | elif argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex: 46 | print "Found duplicate!" 47 | print argText + "\t" + arg.triggerText + "\t" + arg.role 48 | print "Alternate:" 49 | print foundArg.text + "\t" + foundArg.triggerText + "\t" + goldArgRole 50 | sys.exit() 51 | if goldArgRole != EMPTY_ROLE: 52 | output.write(curEntity.text + "|||" + "sent_" + str(sentIndex) + "|||" + goldArgRole + "\n") 53 | 54 | def main(): 55 | if len(sys.argv) != 4: 56 | print "Expect input training data, output args file, output sentences file." 
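# Example call (hypothetical filenames): python writeArgGold.py train.data gold.args gold.sentences
# A dependency file named <input>.parsing is expected next to the input file. The args file gets one
# "text|||sent_N|||ROLE" line per gold argument (duplicates within a sentence are skipped), and the
# sentences file gets each sentence's tokens joined by spaces, one sentence per line.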
57 | sys.exit() 58 | 59 | possibleLabels, possibleArgs = scanInput(sys.argv[1], sys.argv[1] + ".parsing") 60 | 61 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 62 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 63 | 64 | output = open(sys.argv[2], "w") 65 | 66 | # go over each sentence in the training data 67 | # NOTE: in this version, using gold triggers, gold entity mentions 68 | input = open(sys.argv[1], "r") 69 | parsingInput = open(sys.argv[1] + ".parsing", "r") 70 | count = 0 71 | 72 | sentenceOutput = open(sys.argv[3], "w") 73 | while True: 74 | sentence, valid, dummy = readInput(input, parsingInput) 75 | 76 | if count % 1000 == 0: 77 | print "Processing sentence " + str(count) 78 | if DEBUG: 79 | print "Sentence length: " + str(len(sentence.words)) 80 | print "Total entities: " + str(len(sentence.entities)) 81 | 82 | if sentence != None: 83 | processSentence(sentence, possibleLabels, possibleArgs, output, count) 84 | 85 | for word in sentence.words: 86 | sentenceOutput.write(word + " ") 87 | sentenceOutput.write("\n") 88 | 89 | count += 1 90 | if not valid: 91 | break 92 | output.close() 93 | sentenceOutput.close() 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeRealisGold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | from readLargeInput import * 6 | 7 | 8 | DEBUG=False 9 | SMALLDEBUG=False 10 | PREDICTION_DEBUG=False 11 | EMPTY_TRIGGER="not_trigger_not_trigger" 12 | EMPTY_ROLE="NONE" 13 | stepSize=1 14 | beamSize=1 15 | maxIters=20 16 | 17 | def processSentence(curSentence, possibleLabels, possibleArgs, output, sentIndex): 18 | writtenSet = set() 19 | for arg in curSentence.goldArgs: 20 | argString = arg.text + "|||sent_" + str(sentIndex) + "|||" + arg.role 21 | if argString not in writtenSet: 22 | output.write(argString + "\n") 23 | writtenSet.add(argString) 24 | 25 | def processEntity(curSentence, entityIndex, possibleArgs, triggerIndex, triggerLabel, output, sentIndex): 26 | curWords = curSentence.words 27 | triggerWord = curWords[triggerIndex] 28 | curEntity = curSentence.entities[entityIndex] 29 | 30 | goldArgRole = "NONE" 31 | 32 | # find what the gold val is for this entity 33 | curGoldArgs = curSentence.goldArgs 34 | foundArg = None 35 | for arg in curGoldArgs: 36 | argText = arg.text 37 | minIndex = arg.minIndex() 38 | argTriggerIndex = arg.triggerIndex 39 | 40 | # if same entity (text and location) and same associated trigger 41 | if argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex and goldArgRole == "NONE": 42 | goldArgRole = arg.role 43 | foundArg = arg 44 | elif argText == curEntity.text and minIndex == curEntity.minIndex() and argTriggerIndex == triggerIndex: 45 | print "Found duplicate!" 46 | print argText + "\t" + arg.triggerText + "\t" + arg.role 47 | print "Alternate:" 48 | print foundArg.text + "\t" + foundArg.triggerText + "\t" + goldArgRole 49 | sys.exit() 50 | if goldArgRole != EMPTY_ROLE: 51 | output.write(curEntity.text + "|||" + "sent_" + str(sentIndex) + "|||" + goldArgRole + "\n") 52 | 53 | def main(): 54 | if len(sys.argv) != 4: 55 | print "Expect input training data, output args file, output sentences file." 
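# This script is structurally identical to writeArgGold.py; the only difference is that
# scanInput()/readInput() are called with realisMode=True, so the final field of each
# "text|||sent_N|||..." line carries the realis label (or "UNK_REALIS" when the gold annotation
# has no realis column) rather than the argument role.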
56 | sys.exit() 57 | 58 | possibleLabels, possibleArgs = scanInput(sys.argv[1], sys.argv[1] + ".parsing", realisMode = True) 59 | 60 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 61 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 62 | 63 | output = open(sys.argv[2], "w") 64 | 65 | # go over each sentence in the training data 66 | # NOTE: in this version, using gold triggers, gold entity mentions 67 | input = open(sys.argv[1], "r") 68 | parsingInput = open(sys.argv[1] + ".parsing", "r") 69 | count = 0 70 | 71 | sentenceOutput = open(sys.argv[3], "w") 72 | while True: 73 | sentence, valid, dummy = readInput(input, parsingInput, realisMode = True) 74 | 75 | if count % 1000 == 0: 76 | print "Processing sentence " + str(count) 77 | if DEBUG: 78 | print "Sentence length: " + str(len(sentence.words)) 79 | print "Total entities: " + str(len(sentence.entities)) 80 | 81 | if sentence != None: 82 | processSentence(sentence, possibleLabels, possibleArgs, output, count) 83 | 84 | for word in sentence.words: 85 | sentenceOutput.write(word + " ") 86 | sentenceOutput.write("\n") 87 | 88 | count += 1 89 | if not valid: 90 | break 91 | output.close() 92 | sentenceOutput.close() 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeTriggerGold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | from readLargeInput import * 6 | 7 | 8 | DEBUG=False 9 | SMALLDEBUG=False 10 | PREDICTION_DEBUG=False 11 | EMPTY_TRIGGER="not_trigger_not_trigger" 12 | EMPTY_ROLE="NONE" 13 | stepSize=1 14 | beamSize=1 15 | maxIters=20 16 | 17 | def processSentence(curSentence, possibleLabels, possibleArgs, output, sentIndex): 18 | curWords = curSentence.words 19 | curGold = curSentence.labels 20 | for triggerIndex in range(len(curWords)): 21 | word = curWords[triggerIndex] 22 | triggerLabel = curGold[triggerIndex] 23 | 24 | output.write(triggerLabel + "\n") 25 | 26 | def main(): 27 | if len(sys.argv) != 3: 28 | print "Expect input training data, output args file." 
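# Example call (hypothetical filenames): python writeTriggerGold.py test.data triggers.gold
# The output contains one "TYPE_SUBTYPE" trigger label per input token, in corpus order, with the
# EMPTY_TRIGGER value ("not_trigger_not_trigger") presumably marking non-trigger tokens.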
29 | sys.exit() 30 | 31 | possibleLabels, possibleArgs = scanInput(sys.argv[1], sys.argv[1] + ".parsing") 32 | 33 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 34 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 35 | 36 | output = open(sys.argv[2], "w") 37 | 38 | # go over each sentence in the training data 39 | # NOTE: in this version, using gold triggers, gold entity mentions 40 | input = open(sys.argv[1], "r") 41 | parsingInput = open(sys.argv[1] + ".parsing", "r") 42 | count = 0 43 | while True: 44 | sentence, valid, nothing = readInput(input, parsingInput) 45 | 46 | if count % 1000 == 0: 47 | print "Processing sentence " + str(count) 48 | if DEBUG: 49 | print "Sentence length: " + str(len(sentence.words)) 50 | print "Total entities: " + str(len(sentence.entities)) 51 | 52 | if sentence != None: 53 | processSentence(sentence, possibleLabels, possibleArgs, output, count) 54 | 55 | count += 1 56 | if not valid: 57 | break 58 | output.close() 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /all_predictions_4.0/code/writeTriggerLiblinear.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import copy 3 | from random import shuffle 4 | import string 5 | import os 6 | from readLargeInput import * 7 | 8 | 9 | #### FILEPATHS TO BE SET BY USER 10 | WORD_EMBEDDING_PATH="/home/andrew/DEFT_code_testing/dependencies/wordVectors" 11 | UNIV_POS_PATH="/home/andrew/DEFT_code_testing/dependencies/pos" 12 | 13 | 14 | DEBUG=False 15 | SMALLDEBUG=False 16 | PREDICTION_DEBUG=False 17 | # for when the input data is too big, turn this off. 18 | EASY_READ_OUTPUT=False 19 | EMPTY_TRIGGER="not_trigger_not_trigger" 20 | stepSize=1 21 | beamSize=1 22 | maxIters=20 23 | 24 | ### Original word vectors 25 | # English vectors only 26 | WORD2VEC_FILENAME=WORD_EMBEDDING_PATH+"/en-wiki-april-6-2015.word2vec_vectors" 27 | # Chinese vectors only 28 | CHINESE_WORD2VEC_FILENAME=WORD_EMBEDDING_PATH+"/chinese-wiki-20160305.word2vec" 29 | # Spanish vectors only 30 | SPANISH_WORD2VEC_FILENAME=WORD_EMBEDDING_PATH+"/es-wiki-may-2-2016.word2vec_vectors" 31 | # multilingual -- Noah 32 | #MULTI_WORD2VEC_FILENAME="/home/andrew/data/LORELEI/word_vectors_noah/embeddings/andrew_scripts/eng-chn-noah.wordVectors" 33 | 34 | ### ACE subset word vectors 35 | #WORD2VEC_FILENAME="../../multilingualWordVectors/createWordVectorSubset/en-wiki-april-6-2015.word2vec.ACE.subset" 36 | #CHINESE_WORD2VEC_FILENAME="../../multilingualWordVectors/createWordVectorSubset/chinese-wiki-march-5-2016.word2vec.ACE.subset" 37 | #SPANISH_WORD2VEC_FILENAME="empty.txt" 38 | #MULTI_WORD2VEC_FILENAME="../../multilingualWordVectors/vectorAlignment/out/multilingualEngChn.formatted.final" 39 | 40 | ### empty word vectors 41 | #WORD2VEC_FILENAME="empty.txt" 42 | #CHINESE_WORD2VEC_FILENAME="empty.txt" 43 | #SPANISH_WORD2VEC_FILENAME="empty.txt" 44 | MULTI_WORD2VEC_FILENAME="empty.txt" 45 | 46 | ### dictionary based on CEDICT, containing the ACE English trigger words 47 | #TRIGGER_BILINGUAL_DICTIONARY_LIST="../../multilingualWordVectors/extractTriggerWords/ACE.English.triggerWords.translations" 48 | TRIGGER_BILINGUAL_DICTIONARY_LIST="empty.txt" 49 | ### dictionary based on CEDICT, containing the entire ACE English lexicon 50 | #BILINGUAL_DICTIONARY_LIST="../../multilingualWordVectors/extractTriggerWords/ACE.English.lexicon.translations" 51 | 
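# Both dictionary files are parsed in main() as whitespace-separated pairs, one entry per line,
# with a Chinese word in the first column and one English translation in the second (a word may
# appear on several lines, one per translation). Pointing them at an empty file, as is done here
# with "empty.txt", effectively disables the bilingual-dictionary features.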
BILINGUAL_DICTIONARY_LIST="empty.txt" 52 | 53 | wordVecs = dict() # dict: word -> vector 54 | chineseWordVecs = dict() 55 | multiWordVecs = dict() 56 | spanishWordVecs = dict() 57 | bilingualDictionary = dict() 58 | triggerBilingualDictionary = dict() 59 | 60 | univPOS_EngFile=UNIV_POS_PATH+"/en-ptb.map" 61 | univPOS_ChnFile=UNIV_POS_PATH+"/zh-ctb6.map" 62 | univPOS_SpanFile=UNIV_POS_PATH+"/es-cast3lb.map" 63 | universalPOS_converter = dict() 64 | 65 | def readConfigFile(filename): 66 | input = open(filename, "r") 67 | returnPath = "" 68 | for line in input: 69 | if line.startswith("WORD_EMBEDDING_DIR"): 70 | returnPath = line.strip().split("=")[1] 71 | input.close() 72 | 73 | return returnPath 74 | 75 | class State: 76 | def __init__(self, triggerParam = None, entParam = None, scoreParam = 0.0): 77 | if triggerParam == None: 78 | self.triggerStates = [] 79 | else: 80 | self.triggerStates = triggerParam 81 | if entParam == None: 82 | self.entStates = dict() 83 | else: 84 | self.entStates = entParam 85 | 86 | self.score = scoreParam 87 | 88 | def addTrigger(self, trigger): 89 | self.triggerStates.append(trigger) 90 | 91 | def updateScore(self, val): 92 | self.score += val 93 | 94 | def addEntityAssignment(self, arg): 95 | key = arg.text + "|||" + str(arg.minIndex()) + "|||" + str(arg.triggerIndex) 96 | self.entStates[key] = arg.role 97 | 98 | def copy(self): 99 | altScore = self.score 100 | altTriggerStates = [] 101 | altEntStates = dict() 102 | 103 | for state in self.triggerStates: 104 | altTriggerStates.append(state) 105 | for key in self.entStates: 106 | altEntStates[key] = self.entStates[key] 107 | 108 | return State(altTriggerStates, altEntStates, altScore) 109 | 110 | def processSentence(curSentence, output, easyOutput, roleIndexDict, featureIndexDict, sentenceIndex, testMode=False): 111 | # handle each word in the sentence 112 | triggerIndex = 0 113 | curWords = curSentence.words 114 | curGold = curSentence.labels 115 | curEnts = curSentence.entities 116 | 117 | for triggerIndex in range(len(curWords)): 118 | word = curWords[triggerIndex] 119 | triggerLabel = curGold[triggerIndex] 120 | 121 | processWord(curSentence, triggerIndex, triggerLabel, output, easyOutput, roleIndexDict, featureIndexDict, sentenceIndex, testMode) 122 | 123 | def processWord(curSentence, triggerIndex, triggerLabel, output, easyOutput, roleIndexDict, featureIndexDict, sentenceIndex, testMode): 124 | curWords = curSentence.words 125 | triggerWord = curWords[triggerIndex] 126 | 127 | features = genFeatures(curSentence, triggerIndex, triggerLabel) 128 | 129 | if triggerLabel not in roleIndexDict: 130 | roleIndexDict[triggerLabel] = len(roleIndexDict) + 1 131 | argID = roleIndexDict[triggerLabel] 132 | output.write(str(argID)) 133 | if EASY_READ_OUTPUT: 134 | easyOutput.write("sent_" + str(sentenceIndex) + "\tPhrase:\t" + curWords[triggerIndex] + "\tRole:\t" + triggerLabel) 135 | 136 | for feature in features: 137 | easyOutput.write("\t" + feature) 138 | 139 | # place all feature names in here 140 | featureIDs = [] 141 | # place word embedding features in here (i.e. 
non-binary features) 142 | word2vecDict = dict() 143 | 144 | for feature in features: 145 | if testMode and feature not in featureIndexDict and not feature.startswith("WORD2VEC"): 146 | continue 147 | 148 | if feature.startswith("WORD2VEC"): 149 | temp = feature.find("=") 150 | featureName = feature[:temp] 151 | featureVal = feature[temp+1:] 152 | 153 | if featureName not in featureIndexDict: 154 | featureIndexDict[featureName] = len(featureIndexDict) + 1 155 | 156 | # add the corresponding feature ID to the list(s) 157 | featureIDs.append(featureIndexDict[featureName]) 158 | word2vecDict[featureIndexDict[featureName]] = featureVal 159 | else: 160 | if feature not in featureIndexDict: 161 | featureIndexDict[feature] = len(featureIndexDict) + 1 162 | featureID = featureIndexDict[feature] 163 | featureIDs.append(featureID) 164 | 165 | sortedIDs = sorted(featureIDs) 166 | for featureID in sortedIDs: 167 | if featureID in word2vecDict: 168 | val = word2vecDict[featureID] 169 | output.write(" " + str(featureID) + ":" + str(val)) 170 | else: 171 | output.write(" " + str(featureID) + ":1") 172 | output.write("\n") 173 | if EASY_READ_OUTPUT: 174 | easyOutput.write("\n") 175 | 176 | def main(): 177 | if len(sys.argv) != 4 and len(sys.argv) != 6: 178 | print len(sys.argv) 179 | print "Expect mode (train/test), feature file, output liblinear file, and (if test-mode) feature dictionary, role dictionary." 180 | print "Expect input training data, output liblinear file, input dev data, output liblinear file, input test data, output liblinear file." 181 | sys.exit() 182 | 183 | try: 184 | WORD_EMBEDDING_PATH = readConfigFile("../CONFIG.txt") 185 | except: 186 | print "Could not find CONFIG.txt. Terminating..." 187 | sys.exit() 188 | 189 | trainMode = (sys.argv[1] == "train") 190 | if not trainMode and sys.argv[1] != "test": 191 | sys.exit() 192 | 193 | textFile = sys.argv[2] 194 | outputFile = sys.argv[3] 195 | 196 | roleIndexDict = dict() 197 | featureIndexDict = dict() 198 | if not trainMode: 199 | featureIndexDict = readDict(sys.argv[4]) 200 | roleIndexDict = readDict(sys.argv[5]) 201 | 202 | # open UniversalPOS converter 203 | input = open(univPOS_EngFile, "r") 204 | for line in input: 205 | tokens = line.strip().split("\t") 206 | source = tokens[0] 207 | target = tokens[1] 208 | 209 | universalPOS_converter[source] = target 210 | input.close() 211 | input = open(univPOS_ChnFile, "r") 212 | for line in input: 213 | tokens = line.strip().split("\t") 214 | source = tokens[0] 215 | target = tokens[1] 216 | 217 | universalPOS_converter[source] = target 218 | input.close() 219 | input = open(univPOS_SpanFile, "r") 220 | for line in input: 221 | tokens = line.strip().split("\t") 222 | source = tokens[0] 223 | target = tokens[1] 224 | 225 | universalPOS_converter[source] = target 226 | input.close() 227 | 228 | # open the bilingual dictionary 229 | # usage: add a new feature. 
For english words, activates if the word appears; for chinese words, activates any associated translations 230 | input = open(BILINGUAL_DICTIONARY_LIST, "r") 231 | for line in input: 232 | tokens = line.strip().split() 233 | chineseWord = tokens[0] 234 | englishWord = tokens[1] 235 | 236 | if chineseWord not in bilingualDictionary: 237 | bilingualDictionary[chineseWord] = set() 238 | if englishWord not in bilingualDictionary: 239 | bilingualDictionary[englishWord] = set() 240 | bilingualDictionary[chineseWord].add(englishWord) 241 | input.close() 242 | 243 | input = open(TRIGGER_BILINGUAL_DICTIONARY_LIST, "r") 244 | for line in input: 245 | tokens = line.strip().split() 246 | chineseWord = tokens[0] 247 | englishWord = tokens[1] 248 | 249 | if chineseWord not in triggerBilingualDictionary: 250 | triggerBilingualDictionary[chineseWord] = set() 251 | if englishWord not in triggerBilingualDictionary: 252 | triggerBilingualDictionary[englishWord] = set() 253 | triggerBilingualDictionary[chineseWord].add(englishWord) 254 | input.close() 255 | 256 | 257 | # open and read word vectors 258 | input = open(WORD2VEC_FILENAME, "r") 259 | for line in input: 260 | # skip any headers 261 | if line.count(" ") < 5: 262 | continue 263 | else: 264 | index = line.find(" ") 265 | curWord = line[:index] 266 | rest = line[index+1:] 267 | tokens = rest.strip().split(" ") 268 | 269 | numTokens = [] 270 | for tok in tokens: 271 | numTokens.append(float(tok)) 272 | 273 | wordVecs[curWord] = numTokens 274 | input.close() 275 | 276 | input = open(CHINESE_WORD2VEC_FILENAME, "r") 277 | for line in input: 278 | # skip any headers 279 | if line.count(" ") < 5: 280 | continue 281 | else: 282 | index = line.find(" ") 283 | curWord = line[:index] 284 | rest = line[index+1:] 285 | tokens = rest.strip().split(" ") 286 | 287 | numTokens = [] 288 | for tok in tokens: 289 | numTokens.append(float(tok)) 290 | 291 | chineseWordVecs[curWord] = numTokens 292 | input.close() 293 | 294 | input = open(SPANISH_WORD2VEC_FILENAME, "r") 295 | for line in input: 296 | # skip any headers 297 | if line.count(" ") < 5: 298 | continue 299 | else: 300 | index = line.find(" ") 301 | curWord = line[:index] 302 | rest = line[index+1:] 303 | tokens = rest.strip().split(" ") 304 | 305 | numTokens = [] 306 | for tok in tokens: 307 | numTokens.append(float(tok)) 308 | 309 | spanishWordVecs[curWord] = numTokens 310 | input.close() 311 | 312 | input = open(MULTI_WORD2VEC_FILENAME, "r") 313 | for line in input: 314 | # skip any headers 315 | if line.count(" ") < 5: 316 | continue 317 | else: 318 | index = line.find(" ") 319 | curWord = line[:index] 320 | rest = line[index+1:] 321 | tokens = rest.strip().split(" ") 322 | 323 | numTokens = [] 324 | for tok in tokens: 325 | numTokens.append(float(tok)) 326 | 327 | multiWordVecs[curWord] = numTokens 328 | input.close() 329 | 330 | ### Training 331 | if trainMode: 332 | possibleLabels, possibleArgs = scanInput(textFile, textFile + ".parsing") 333 | 334 | print "Total # of trigger labels to predict over: " + str(len(possibleLabels)) 335 | print "Total # of argument roles to predict over: " + str(len(possibleArgs)) 336 | 337 | 338 | # go over each sentence in the training data 339 | # NOTE: in this version, using gold triggers, gold entity mentions 340 | output = open(outputFile, "w") 341 | easyOutput = open(outputFile + ".easyRead", "w") 342 | 343 | input = open(textFile, "r") 344 | parsingInput = open(textFile + ".parsing", "r") 345 | count = 0 346 | 347 | print "Writing training set" 348 | while True: 349 | 
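# readInput() (defined in readLargeInput.py) consumes one blank-line-delimited sentence per call
# and returns a (Sentence-or-None, more-data-remaining flag, running token count) triple, so this
# loop streams the corpus sentence by sentence instead of loading the whole file at once.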
sentence, valid, nothing = readInput(input, parsingInput) 350 | 351 | if count % 1000 == 0: 352 | print "Processing sentence " + str(count) 353 | if DEBUG: 354 | print "Sentence length: " + str(len(sentence.words)) 355 | print "Total entities: " + str(len(sentence.entities)) 356 | 357 | if sentence != None: 358 | processSentence(sentence, output, easyOutput, roleIndexDict, featureIndexDict, count) 359 | 360 | count += 1 361 | 362 | if not valid: 363 | break 364 | output.close() 365 | easyOutput.close() 366 | input.close() 367 | parsingInput.close() 368 | 369 | writeDicts(featureIndexDict, roleIndexDict, "features.dict", "roles.dict") 370 | ### Testing 371 | else: 372 | output = open(outputFile, "w") 373 | easyOutput = open(outputFile + ".easyRead", "w") 374 | 375 | input = open(textFile, "r") 376 | parsingInput = open(textFile + ".parsing", "r") 377 | 378 | count = 0 379 | print "Writing test set" 380 | while True: 381 | sentence, valid, nothing = readInput(input, parsingInput) 382 | 383 | if count % 1000 == 0: 384 | print "Processing sentence " + str(count) 385 | if DEBUG: 386 | print "Sentence length: " + str(len(sentence.words)) 387 | print "Total entities: " + str(len(sentence.entities)) 388 | 389 | if sentence != None: 390 | processSentence(sentence, output, easyOutput, roleIndexDict, featureIndexDict, count, testMode=True) 391 | 392 | count += 1 393 | 394 | if not valid: 395 | break 396 | output.close() 397 | easyOutput.close() 398 | input.close() 399 | parsingInput.close() 400 | 401 | 402 | def writeDicts(featureDict, roleDict, filenameF, filenameR): 403 | output = open(filenameF, "w") 404 | for feature in featureDict: 405 | curID = featureDict[feature] 406 | output.write(feature + ":" + str(curID) + "\n") 407 | output.close() 408 | 409 | output = open(filenameR, "w") 410 | for role in roleDict: 411 | curID = roleDict[role] 412 | output.write(role + ":" + str(curID) + "\n") 413 | output.close() 414 | 415 | def readDict(filename): 416 | input = open(filename, "r") 417 | curDict = dict() 418 | for line in input: 419 | clean = line.strip() 420 | splitPoint = clean.rfind(":") 421 | index = clean[:splitPoint] 422 | val = int(clean[splitPoint+1:]) 423 | curDict[index] = val 424 | return curDict 425 | 426 | def isYear(text): 427 | if len(text) < 4: 428 | return False 429 | for index in range(4): 430 | if text[index] not in string.digits: 431 | return False 432 | # sometimes errors from Stanford segmenter 433 | if len(text) != 4: 434 | if text[4] not in string.punctuation: 435 | return False 436 | return True 437 | 438 | 439 | # returns (absolute value) of distance between entity and trigger 440 | def calcArgTriggerDistance(triggerIndex, start, end): 441 | if triggerIndex < start: 442 | return start - triggerIndex 443 | elif triggerIndex > end: 444 | return triggerIndex - end 445 | else: 446 | return 0 447 | 448 | def toUnivPOS(tag): 449 | # removeNumbers at end of POS tag if needed (Stanford Spanish seems to add this) 450 | tempTag = "" 451 | for character in tag: 452 | if character not in string.digits: 453 | tempTag += character 454 | 455 | copyPOS = tempTag 456 | while len(copyPOS) > 0: 457 | if copyPOS in universalPOS_converter: 458 | return universalPOS_converter[copyPOS] 459 | else: 460 | copyPOS = copyPOS[:-1] 461 | return tempTag 462 | 463 | # assuming each can be represented by an indicator function 464 | # i.e. 
binary features 465 | # GOTOFEATURES 466 | def genFeatures(curSentence, index, proposedLabel): 467 | words = curSentence.words 468 | lemmas = curSentence.lemmas 469 | posTags = curSentence.posTags 470 | 471 | # default 472 | previousWord = "" 473 | prevPOS = "" 474 | previousWord_2 = "" 475 | prevPOS_2 = "" 476 | if index != 0: 477 | previousWord = words[index-1] 478 | prevPOS = posTags[index-1] 479 | if index != 1: 480 | previousWord_2 = words[index-2] 481 | prevPOS_2 = posTags[index-2] 482 | 483 | nextWord = "" 484 | nextPOS = "" 485 | nextWord_2 = "" 486 | nextPOS_2 = "" 487 | if index != len(words) - 1: 488 | nextWord = words[index+1] 489 | nextPOS = posTags[index+1] 490 | if index != len(words) - 2: 491 | nextWord_2 = words[index+2] 492 | nextWord_2 = posTags[index+2] 493 | 494 | word = words[index] 495 | lemma = lemmas[index] 496 | curPOS = posTags[index] 497 | 498 | featureSet = set() 499 | 500 | # if the sentence is for the title (approximate) 501 | foundUnderscore = False 502 | foundNEWS = False 503 | for word in words: 504 | if "_" in word: 505 | foundUnderscore = True 506 | if "NEWS" in word: 507 | foundNEWS = True 508 | isTitle = foundUnderscore and foundNEWS 509 | if isTitle: 510 | featureSet.add("isTitle") 511 | 512 | # length of current word 513 | featureSet.add(str(len(word)) + "_lengthCurWord") 514 | 515 | # unigrams -- words 516 | featureSet.add(word + "_curWord") 517 | featureSet.add(previousWord + "_prevWord") 518 | featureSet.add(nextWord + "_nextWord") 519 | featureSet.add(previousWord_2 + "_prevWord2") 520 | featureSet.add(nextWord_2 + "_nextWord2") 521 | 522 | # unigrams -- words lowercase 523 | featureSet.add(word.lower() + "_curWordLower") 524 | featureSet.add(previousWord.lower() + "_prevWordLower") 525 | featureSet.add(nextWord.lower() + "_nextWordLower") 526 | featureSet.add(previousWord_2.lower() + "_prevWord2Lower") 527 | featureSet.add(nextWord_2.lower() + "_nextWord2Lower") 528 | 529 | # unigrams -- lemma 530 | featureSet.add(lemma + "_curLemma") 531 | 532 | # bigrams -- words 533 | featureSet.add(word + "_curWord" + "|||" + previousWord + "_prevWord") 534 | featureSet.add(word + "_curWord" + "|||" + nextWord + "_nextWord") 535 | featureSet.add(word + "_curWord" + "|||" + previousWord_2 + "_prevWord2") 536 | featureSet.add(word + "_curWord" + "|||" + nextWord_2 + "_nextWord2") 537 | 538 | # bigrams -- words lowercase 539 | featureSet.add(word.lower() + "_curWordLower" + "|||" + previousWord.lower() + "_prevWordLower") 540 | featureSet.add(word.lower() + "_curWordLower" + "|||" + nextWord.lower() + "_nextWordLower") 541 | featureSet.add(word.lower() + "_curWordLower" + "|||" + previousWord_2.lower() + "_prevWordLower2") 542 | featureSet.add(word.lower() + "_curWordLower" + "|||" + nextWord_2.lower() + "_nextWordLower2") 543 | 544 | # bigrams -- word + POS 545 | featureSet.add(word + "_curWord" + "|||" + curPOS + "_curPOS") 546 | featureSet.add(word + "_curWord" + "|||" + toUnivPOS(curPOS) + "_curUNIVPOS") 547 | 548 | 549 | # word-"shape" features 550 | if "_" in word: 551 | featureSet.add("containsUnderscore") 552 | 553 | # if number 554 | number = True 555 | for character in word: 556 | if character not in string.digits: 557 | number = False 558 | break 559 | if number: 560 | featureSet.add("isNumber") 561 | 562 | # capitalized 563 | firstChar = word[0] 564 | if firstChar in string.ascii_uppercase: 565 | featureSet.add("isCapitalized") 566 | else: 567 | featureSet.add("isNotCapitalized") 568 | 569 | # punctuationOnly 570 | punct = True 571 | for character 
in word: 572 | if character not in string.punctuation: 573 | punct = False 574 | break 575 | if punct: 576 | featureSet.add("isPunctuation") 577 | 578 | # POS features 579 | if curPOS.startswith("V"): 580 | featureSet.add("POS_VType") 581 | featureSet.add(curPOS + "_curPOS") 582 | featureSet.add(prevPOS + "_prevPOS") 583 | featureSet.add(nextPOS + "_nextPOS") 584 | featureSet.add(prevPOS_2 + "_prevPOS2") 585 | featureSet.add(nextPOS_2 + "_nextPOS2") 586 | 587 | if toUnivPOS(curPOS).startswith("V"): 588 | featureSet.add("UNIVPOS_VType") 589 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS") 590 | featureSet.add(toUnivPOS(prevPOS) + "_prevUNIVPOS") 591 | featureSet.add(toUnivPOS(nextPOS) + "_nextUNIVPOS") 592 | featureSet.add(toUnivPOS(prevPOS_2) + "_prevUNIVPOS2") 593 | featureSet.add(toUnivPOS(nextPOS_2) + "_nextUNIVPOS2") 594 | 595 | # POS bigrams 596 | featureSet.add(curPOS + "_curPOS" + "|||" + prevPOS + "_prevPOS") 597 | featureSet.add(curPOS + "_curPOS" + "|||" + nextPOS + "_nextPOS") 598 | featureSet.add(curPOS + "_curPOS" + "|||" + prevPOS_2 + "_prevPOS2") 599 | featureSet.add(curPOS + "_curPOS" + "|||" + nextPOS_2 + "_nextPOS2") 600 | 601 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(prevPOS) + "_prevUNIVPOS") 602 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(nextPOS) + "_nextUNIVPOS") 603 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(prevPOS_2) + "_prevUNIVPOS2") 604 | featureSet.add(toUnivPOS(curPOS) + "_curUNIVPOS" + "|||" + toUnivPOS(nextPOS_2) + "_nextUNIVPOS2") 605 | 606 | # dependency parsing features -- governor 607 | if index in curSentence.depByGovIndex: 608 | for dependency in curSentence.depByGovIndex[index]: 609 | depType = dependency.depType 610 | dependent = dependency.dependent 611 | 612 | featureSet.add("ParsingGov_" + depType + "_type") 613 | featureSet.add("ParsingGov_" + dependent + "_dependent") 614 | featureSet.add("ParsingGov_" + depType + "_type" + "|||" + dependent + "_dependent") 615 | 616 | # dependency parsing features -- dependent 617 | if index in curSentence.depByDepIndex: 618 | for dependency in curSentence.depByDepIndex[index]: 619 | depType = dependency.depType 620 | governor = dependency.governor 621 | 622 | featureSet.add("ParsingDep_" + depType + "_type") 623 | featureSet.add("ParsingDep_" + governor + "_governor") 624 | featureSet.add("ParsingDep_" + depType + "_type" + "|||" + governor + "_governor") 625 | 626 | # only include if we have the word as one of our vectors 627 | word = words[index] 628 | if word in wordVecs: 629 | curVector = wordVecs[word] 630 | vecLocation = 0 631 | for tok in curVector: 632 | featureSet.add("WORD2VEC_ENG_" + str(vecLocation) + "=" + str(tok)) 633 | vecLocation += 1 634 | 635 | word = words[index] 636 | if word in chineseWordVecs: 637 | curVector = chineseWordVecs[word] 638 | vecLocation = 0 639 | for tok in curVector: 640 | featureSet.add("WORD2VEC_CHN_" + str(vecLocation) + "=" + str(tok)) 641 | vecLocation += 1 642 | 643 | word = words[index] 644 | if word in spanishWordVecs: 645 | curVector = spanishWordVecs[word] 646 | vecLocation = 0 647 | for tok in curVector: 648 | featureSet.add("WORD2VEC_SPAN_" + str(vecLocation) + "=" + str(tok)) 649 | vecLocation += 1 650 | 651 | word = words[index] 652 | if word in multiWordVecs: 653 | curVector = multiWordVecs[word] 654 | vecLocation = 0 655 | for tok in curVector: 656 | featureSet.add("WORD2VEC_MULTI_" + str(vecLocation) + "=" + str(tok)) 657 | vecLocation += 1 658 | 659 | # bilingual 
dictionary features 660 | word = words[index] 661 | if word in bilingualDictionary: 662 | wordSet = bilingualDictionary[word] 663 | # if English word: 664 | if len(wordSet) == 0: 665 | featureSet.add("WORD_TRANSLATION_" + word) 666 | else: 667 | for translation in wordSet: 668 | featureSet.add("WORD_TRANSLATION_" + translation) 669 | 670 | word = words[index] 671 | if word in triggerBilingualDictionary: 672 | wordSet = triggerBilingualDictionary[word] 673 | # if English word: 674 | if len(wordSet) == 0: 675 | featureSet.add("TRIGGER_WORD_TRANSLATION_" + word) 676 | else: 677 | for translation in wordSet: 678 | featureSet.add("TRIGGER_WORD_TRANSLATION_" + translation) 679 | 680 | return featureSet 681 | 682 | if __name__ == "__main__": 683 | main() 684 | -------------------------------------------------------------------------------- /all_predictions_4.0/runAll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Start TRIGGERS" 4 | ./runTriggers.sh 1 $1 5 | echo "START ARGUMENTS" 6 | ./runArguments.sh 1 $1 7 | echo "Start REALIS" 8 | ./runRealis.sh 1 $1 9 | -------------------------------------------------------------------------------- /all_predictions_4.0/runAll_providedTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "START ARGUMENTS" 4 | ./runArguments_providedTriggers.sh 1 $1 5 | echo "Start REALIS" 6 | ./runRealis_providedTriggers.sh 1 $1 7 | -------------------------------------------------------------------------------- /all_predictions_4.0/runArguments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeArgLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../currentPredictionsForTriggers/testSet.predictions 14 | 15 | cd ../ 16 | # running on the test set 17 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.model output.test.arguments 18 | 19 | # report results on the data 20 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict output.test.arguments code/test.out.easyRead currentPredictionsForArgs/testSet.predictions 21 | # record trigger easyRead data 22 | cp code/test.out.easyRead arguments.out.easyRead 23 | cp code/test.out.entityCoref arguments.out.entityCoref 24 | -------------------------------------------------------------------------------- /all_predictions_4.0/runArguments_providedTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeArgLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.features.dict 
/home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict NONE 14 | 15 | cd ../ 16 | # running on the test set 17 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.model output.test.arguments 18 | 19 | # report results on the data 20 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict output.test.arguments code/test.out.easyRead currentPredictionsForArgs/testSet.predictions 21 | # record trigger easyRead data 22 | cp code/test.out.easyRead arguments.out.easyRead 23 | cp code/test.out.entityCoref arguments.out.entityCoref 24 | -------------------------------------------------------------------------------- /all_predictions_4.0/runRealis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeRealisLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict ../currentPredictionsForTriggers/testSet.predictions 14 | 15 | cd ../ 16 | 17 | # testing on the training/validation/testing sets 18 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.model output.test.realis 19 | 20 | # report results on the data 21 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict output.test.realis code/test.out.easyRead currentPredictionsForRealis/testSet.predictions 22 | cp code/test.out.easyRead realis.out.easyRead 23 | -------------------------------------------------------------------------------- /all_predictions_4.0/runRealis_providedTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeRealisLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict NONE 14 | 15 | cd ../ 16 | 17 | # testing on the training/validation/testing sets 18 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.model output.test.realis 19 | 20 | # report results on the data 21 | python code/convertOutputArgs.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/realis.roles.dict output.test.realis code/test.out.easyRead currentPredictionsForRealis/testSet.predictions 22 | cp code/test.out.easyRead realis.out.easyRead 23 | -------------------------------------------------------------------------------- /all_predictions_4.0/runTriggers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LIBLINEAR_PATH=/home/andrew/ML_tools/LIBLINEAR/liblinear-1.94 4 | 5 | if [ "$#" -ne 2 ]; 
then 6 | echo "Illegal number of parameters, provide a value for C parameter, test file" 7 | exit 1 8 | fi 9 | 10 | # create the input liblinear files 11 | cd code/ 12 | # write English Liblinear 13 | python writeTriggerLiblinear.py test ../$2 test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.features.dict /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.roles.dict 14 | 15 | cd ../ 16 | # running on the test set 17 | ${LIBLINEAR_PATH}/predict code/test.out /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.model output.test.triggers 18 | 19 | # report results on the data 20 | python code/convertOutputTriggers.py /home/andrew/DEFT_code_testing/dependencies/models/liblinear/triggers.roles.dict output.test.triggers code/test.out.easyRead currentPredictionsForTriggers/testSet.predictions 21 | # record trigger easyRead data 22 | cp code/test.out.easyRead triggers.out.easyRead 23 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # script to adjust the filepaths as needed 2 | import sys 3 | 4 | def parseConfigFile(filename): 5 | input = open(filename, "r") 6 | 7 | embeddingPath = "" 8 | corenlpPath = "" 9 | parserPath = "" 10 | nerPath = "" 11 | for line in input: 12 | if line.startswith("WORD_EMBEDDING_DIR"): 13 | embeddingPath = line.strip().split("=")[1] 14 | elif line.startswith("CORENLP_DIR"): 15 | corenlpPath = line.strip().split("=")[1] 16 | elif line.startswith("MALTPARSER_DIR"): 17 | parserPath = line.strip().split("=")[1] 18 | elif line.startswith("NER_DIR"): 19 | nerPath = line.strip().split("=")[1] 20 | elif line.startswith("MODEL_DIR"): 21 | modelPath = line.strip().split("=")[1] 22 | elif line.startswith("LIBLINEAR_DIR"): 23 | liblinearPath = line.strip().split("=")[1] 24 | elif line.startswith("POS_DIR"): 25 | posPath = line.strip().split("=")[1] 26 | input.close() 27 | 28 | return corenlpPath, embeddingPath, parserPath, nerPath, modelPath, liblinearPath, posPath 29 | 30 | def main(): 31 | try: 32 | corenlpDir, embeddingDir, parserDir, nerDir, modelDir, liblinearDir, posDir = parseConfigFile("CONFIG.txt") 33 | except: 34 | print "Please run in same directory as CONFIG.txt." 
35 | sys.exit() 36 | 37 | # update the Chinese code for running Maltparser 38 | curInput = open("preprocessing_2.0/processChinese.sh", "r") 39 | lines = [] 40 | for line in curInput: 41 | lines.append(line) 42 | curInput.close() 43 | output = open("preprocessing_2.0/processChinese.sh", "w") 44 | for line in lines: 45 | if line.startswith("python") or line.startswith("java"): 46 | tokens = line.strip().split() 47 | for tok in tokens: 48 | if tok.startswith("python") or tok.startswith("java"): 49 | output.write(tok) 50 | elif tok.endswith(".map"): 51 | if "/" not in tok: 52 | output.write(" " + posDir + "/" + tok) 53 | else: 54 | tmpToks = tok.split("/") 55 | output.write(" " + posDir + "/" + tmpToks[len(tmpToks)-1]) 56 | elif tok.endswith(".jar"): 57 | if "/" not in tok: 58 | output.write(" " + parserDir + "/" + tok) 59 | else: 60 | tmpToks = tok.split("/") 61 | output.write(" " + parserDir + "/" + tmpToks[len(tmpToks)-1]) 62 | else: 63 | output.write(" " + tok) 64 | output.write("\n") 65 | elif line.startswith("cp"): 66 | tokens = line.strip().split() 67 | for tok in tokens: 68 | if tok == "cp": 69 | output.write(tok) 70 | elif tok.endswith(".mco"): 71 | if "/" not in tok: 72 | output.write(" " + modelDir + "/maltparser/" + tok) 73 | else: 74 | tmpToks = tok.split("/") 75 | output.write(" " + modelDir + "/maltparser/" + tmpToks[len(tmpToks)-1]) 76 | else: 77 | output.write(" " + tok) 78 | output.write("\n") 79 | else: 80 | output.write(line) 81 | output.close() 82 | 83 | # update the Spanish code for running Maltparser 84 | curInput = open("preprocessing_2.0/processSpanish.sh", "r") 85 | lines = [] 86 | for line in curInput: 87 | lines.append(line) 88 | curInput.close() 89 | output = open("preprocessing_2.0/processSpanish.sh", "w") 90 | for line in lines: 91 | if line.startswith("python") or line.startswith("java"): 92 | tokens = line.strip().split() 93 | for tok in tokens: 94 | if tok.startswith("python") or tok.startswith("java"): 95 | output.write(tok) 96 | elif tok.endswith(".map"): 97 | if "/" not in tok: 98 | output.write(" " + posDir + "/" + tok) 99 | else: 100 | tmpToks = tok.split("/") 101 | output.write(" " + posDir + "/" + tmpToks[len(tmpToks)-1]) 102 | elif tok.endswith(".jar"): 103 | if "/" not in tok: 104 | output.write(" " + parserDir + "/" + tok) 105 | else: 106 | tmpToks = tok.split("/") 107 | output.write(" " + parserDir + "/" + tmpToks[len(tmpToks)-1]) 108 | else: 109 | output.write(" " + tok) 110 | output.write("\n") 111 | elif line.startswith("cp"): 112 | tokens = line.strip().split() 113 | for tok in tokens: 114 | if tok == "cp": 115 | output.write(tok) 116 | elif tok.endswith(".mco"): 117 | if "/" not in tok: 118 | output.write(" " + modelDir + "/maltparser/" + tok) 119 | else: 120 | tmpToks = tok.split("/") 121 | output.write(" " + modelDir + "/maltparser/" + tmpToks[len(tmpToks)-1]) 122 | else: 123 | output.write(" " + tok) 124 | output.write("\n") 125 | else: 126 | output.write(line) 127 | output.close() 128 | 129 | 130 | # update the output formatting code 131 | filenames = ["outputFormatting/English_run.sh", "outputFormatting/Chinese_run.sh", "outputFormatting/Spanish_run.sh"] 132 | for filename in filenames: 133 | curInput = open(filename, "r") 134 | lines = [] 135 | for line in curInput: 136 | lines.append(line) 137 | curInput.close() 138 | output = open(filename, "w") 139 | for line in lines: 140 | if line.startswith("python"): 141 | tokens = line.strip().split() 142 | for tok in tokens: 143 | if tok.startswith("python"): 144 | output.write(tok) 145 | elif 
tok.endswith(".dict"): 146 | if "/" not in tok: 147 | output.write(" " + modelDir + "/liblinear/" + tok) 148 | else: 149 | tmpToks = tok.split("/") 150 | output.write(" " + modelDir + "/liblinear/" + tmpToks[len(tmpToks)-1]) 151 | else: 152 | output.write(" " + tok) 153 | output.write("\n") 154 | else: 155 | output.write(line) 156 | output.close() 157 | 158 | # update the liblinear files 159 | filenames = ["runArguments.sh", "runArguments_providedTriggers.sh", "runRealis_providedTriggers.sh", "runRealis.sh", "runTriggers.sh"] 160 | 161 | curInput = open("all_predictions_4.0/code/writeTriggerLiblinear.py", "r") 162 | lines = [] 163 | for line in curInput: 164 | lines.append(line) 165 | curInput.close() 166 | output = open("all_predictions_4.0/code/writeTriggerLiblinear.py", "w") 167 | for line in lines: 168 | if line.startswith("WORD_EMBEDDING_PATH="): 169 | output.write("WORD_EMBEDDING_PATH=\"" + embeddingDir + "\"\n") 170 | elif line.startswith("UNIV_POS_PATH="): 171 | output.write("UNIV_POS_PATH=\"" + posDir + "\"\n") 172 | else: 173 | output.write(line) 174 | output.close() 175 | 176 | for filename in filenames: 177 | liblinearInput = open("all_predictions_4.0/" + filename, "r") 178 | lines = [] 179 | for line in liblinearInput: 180 | lines.append(line) 181 | liblinearInput.close() 182 | output = open("all_predictions_4.0/" + filename, "w") 183 | for line in lines: 184 | if line.startswith("LIBLINEAR_PATH="): 185 | output.write("LIBLINEAR_PATH=" + liblinearDir + "\n") 186 | elif line.startswith("${LIBLINEAR_PATH}") or line.startswith("python"): 187 | tokens = line.strip().split() 188 | for tok in tokens: 189 | if tok.startswith("${LIBLINEAR_PATH}") or tok.startswith("python"): 190 | output.write(tok) 191 | elif tok.endswith(".model") or tok.endswith(".dict"): 192 | if "/" not in tok: 193 | output.write(" " + modelDir + "/liblinear/" + tok) 194 | else: 195 | tmpToks = tok.split("/") 196 | output.write(" " + modelDir + "/liblinear/" + tmpToks[len(tmpToks)-1]) 197 | else: 198 | output.write(" " + tok) 199 | output.write("\n") 200 | else: 201 | output.write(line) 202 | output.close() 203 | 204 | # update the CoreNLP filepath - English 205 | corenlpInput = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Eng.sh", "r") 206 | lines = [] 207 | for line in corenlpInput: 208 | lines.append(line) 209 | corenlpInput.close() 210 | output = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Eng.sh", "w") 211 | for line in lines: 212 | if line.startswith("STANFORD_CORENLP"): 213 | output.write("STANFORD_CORENLP=" + corenlpDir + "\n") 214 | else: 215 | output.write(line) 216 | output.close() 217 | 218 | # update the NER filepath - English 219 | corenlpInput = open("preprocessing_2.0/entityExtraction/runEntities.sh", "r") 220 | lines = [] 221 | for line in corenlpInput: 222 | lines.append(line) 223 | corenlpInput.close() 224 | output = open("preprocessing_2.0/entityExtraction/runEntities.sh", "w") 225 | for line in lines: 226 | if line.startswith("STANFORD_NER"): 227 | output.write("STANFORD_NER=" + nerDir + "\n") 228 | elif line.startswith("\tjava -mx16g -cp"): 229 | tokens = line.strip().split() 230 | output.write("\t") 231 | for tok in tokens: 232 | if tok.endswith(".gz"): 233 | if "/" not in tok: 234 | output.write(" " + modelDir + "/entities/" + tok) 235 | else: 236 | tmpToks = tok.split("/") 237 | output.write(" " + modelDir + "/entities/" + tmpToks[len(tmpToks)-1]) 238 | elif tok == "java": 239 | output.write(tok) 240 | else: 241 | output.write(" " + tok) 242 | output.write("\n") 243 | else: 
244 | output.write(line) 245 | output.close() 246 | 247 | # update the CoreNLP filepath - Chinese 248 | corenlpInput = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Chn.sh", "r") 249 | lines = [] 250 | for line in corenlpInput: 251 | lines.append(line) 252 | corenlpInput.close() 253 | output = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Chn.sh", "w") 254 | for line in lines: 255 | if line.startswith("STANFORD_CORENLP"): 256 | output.write("STANFORD_CORENLP=" + corenlpDir + "\n") 257 | else: 258 | output.write(line) 259 | output.close() 260 | 261 | # update the NER filepath - Chinese 262 | corenlpInput = open("preprocessing_2.0/entityExtraction/runEntities_Chinese.sh", "r") 263 | lines = [] 264 | for line in corenlpInput: 265 | lines.append(line) 266 | corenlpInput.close() 267 | output = open("preprocessing_2.0/entityExtraction/runEntities_Chinese.sh", "w") 268 | for line in lines: 269 | if line.startswith("STANFORD_NER"): 270 | output.write("STANFORD_NER=" + nerDir + "\n") 271 | elif line.startswith("\tjava -mx16g -cp"): 272 | tokens = line.strip().split() 273 | output.write("\t") 274 | for tok in tokens: 275 | if tok.endswith(".gz"): 276 | if "/" not in tok: 277 | output.write(" " + modelDir + "/entities/" + tok) 278 | else: 279 | tmpToks = tok.split("/") 280 | output.write(" " + modelDir + "/entities/" + tmpToks[len(tmpToks)-1]) 281 | elif tok == "java": 282 | output.write(tok) 283 | else: 284 | output.write(" " + tok) 285 | output.write("\n") 286 | else: 287 | output.write(line) 288 | output.close() 289 | 290 | # update the CoreNLP filepath - Spanish 291 | corenlpInput = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Span.sh", "r") 292 | lines = [] 293 | for line in corenlpInput: 294 | lines.append(line) 295 | corenlpInput.close() 296 | output = open("preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Span.sh", "w") 297 | for line in lines: 298 | if line.startswith("STANFORD_CORENLP"): 299 | output.write("STANFORD_CORENLP=" + corenlpDir + "\n") 300 | else: 301 | output.write(line) 302 | output.close() 303 | 304 | # update the NER filepath - Spanish 305 | corenlpInput = open("preprocessing_2.0/entityExtraction/runEntities_Spanish.sh", "r") 306 | lines = [] 307 | for line in corenlpInput: 308 | lines.append(line) 309 | corenlpInput.close() 310 | output = open("preprocessing_2.0/entityExtraction/runEntities_Spanish.sh", "w") 311 | for line in lines: 312 | if line.startswith("STANFORD_NER"): 313 | output.write("STANFORD_NER=" + nerDir + "\n") 314 | elif line.startswith("\tjava -mx16g -cp"): 315 | tokens = line.strip().split() 316 | output.write("\t") 317 | for tok in tokens: 318 | if tok.endswith(".gz"): 319 | if "/" not in tok: 320 | output.write(" " + modelDir + "/entities/" + tok) 321 | else: 322 | tmpToks = tok.split("/") 323 | output.write(" " + modelDir + "/entities/" + tmpToks[len(tmpToks)-1]) 324 | elif tok == "java": 325 | output.write(tok) 326 | else: 327 | output.write(" " + tok) 328 | output.write("\n") 329 | else: 330 | output.write(line) 331 | output.close() 332 | 333 | main() 334 | -------------------------------------------------------------------------------- /outputFormatting/Chinese_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # extract nugget output 4 | cd formatTriggers/format_andrew_triggers/ 5 | python format_andrew.py ../../../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions ../../../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Chn andrew.triggers.out 6 | 
cd ../../ 7 | 8 | # arguments: 1.) test.out file, from ../argumentPrediction/ 2.) Easy-read arguments file 3.) Roles dictionary 4.) Entity coref output 5.) docmap file 6.) stopwords file 7.) realis file 9 | python finalForm_KBP.py ../all_predictions_4.0/output.test.arguments ../all_predictions_4.0/arguments.out.easyRead /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../all_predictions_4.0/arguments.out.entityCoref ../preprocessing_2.0/documents.paths.tmp stopwords.txt ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 10 | 11 | # write nugget output, one per document 12 | python writeTriggerOutput.py formatTriggers/format_andrew_triggers/andrew.triggers.out 13 | 14 | # connect arguments and nuggets together 15 | python argument_nugget_linking.py ../preprocessing_2.0/documents.rootnames.tmp 16 | 17 | cd out 18 | ./moveToStore.sh 19 | -------------------------------------------------------------------------------- /outputFormatting/English_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # extract nugget output 4 | cd formatTriggers/format_andrew_triggers/ 5 | python format_andrew.py ../../../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions ../../../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp andrew.triggers.out 6 | cd ../../ 7 | 8 | # arguments: 1.) test.out file, from ../argumentPrediction/ 2.) Easy-read arguments file 3.) Roles dictionary 4.) Entity coref output 5.) docmap file 6.) stopwords file 7.) realis file 9 | python finalForm_KBP.py ../all_predictions_4.0/output.test.arguments ../all_predictions_4.0/arguments.out.easyRead /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../all_predictions_4.0/arguments.out.entityCoref ../preprocessing_2.0/documents.paths.tmp stopwords.txt ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 10 | 11 | # write nugget output, one per document 12 | python writeTriggerOutput.py formatTriggers/format_andrew_triggers/andrew.triggers.out 13 | 14 | # connect arguments and nuggets together 15 | python argument_nugget_linking.py ../preprocessing_2.0/documents.rootnames.tmp 16 | 17 | cd out 18 | ./moveToStore.sh 19 | -------------------------------------------------------------------------------- /outputFormatting/Spanish_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # extract nugget output 4 | cd formatTriggers/format_andrew_triggers/ 5 | python format_andrew.py ../../../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions ../../../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Span andrew.triggers.out 6 | cd ../../ 7 | 8 | # arguments: 1.) test.out file, from ../argumentPrediction/ 2.) Easy-read arguments file 3.) Roles dictionary 4.) Entity coref output 5.) docmap file 6.) stopwords file 7.) 
realis file 9 | python finalForm_KBP.py ../all_predictions_4.0/output.test.arguments ../all_predictions_4.0/arguments.out.easyRead /home/andrew/DEFT_code_testing/dependencies/models/liblinear/arguments.roles.dict ../all_predictions_4.0/arguments.out.entityCoref ../preprocessing_2.0/documents.paths.tmp stopwords.txt ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 10 | 11 | # write nugget output, one per document 12 | python writeTriggerOutput.py formatTriggers/format_andrew_triggers/andrew.triggers.out 13 | 14 | # connect arguments and nuggets together 15 | python argument_nugget_linking.py ../preprocessing_2.0/documents.rootnames.tmp 16 | 17 | cd out 18 | ./moveToStore.sh 19 | -------------------------------------------------------------------------------- /outputFormatting/argument_nugget_linking.py: -------------------------------------------------------------------------------- 1 | # script to link together the argument and nugget files 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Expect list of filenames." 7 | sys.exit() 8 | 9 | argDir = "out/arguments/" 10 | argModDir = "out/linked_arguments/" 11 | nuggetDir = "out/nuggets/" 12 | linkingDir = "out/linking/" 13 | corpusLinking = "out/corpusLinking/corpusLinking" 14 | 15 | filenames = [] 16 | input = open(sys.argv[1], "r") 17 | for line in input: 18 | filenames.append(line.strip()) 19 | input.close() 20 | 21 | corpusID = 1 22 | corpusOut = open(corpusLinking, "w") 23 | 24 | for filename in filenames: 25 | nuggetDict = dict() # eventType -> offset -> ID 26 | nuggetNameDict = dict() # eventType -> offset -> nugget_string 27 | argumentDict = dict() # nuggetID -> attached_arguments 28 | 29 | try: 30 | input = open(nuggetDir + filename, "r") 31 | for line in input: 32 | if not line.startswith("#") and not line.startswith("@"): 33 | tokens = line.strip().split("\t") 34 | eventType = tokens[5] 35 | startOffset = tokens[3].split(",")[0] 36 | nuggetID = tokens[2] 37 | nuggetName = tokens[4] 38 | 39 | if eventType not in nuggetDict: 40 | nuggetDict[eventType] = dict() 41 | nuggetNameDict[eventType] = dict() 42 | nuggetDict[eventType][startOffset] = nuggetID 43 | nuggetNameDict[eventType][startOffset] = nuggetName 44 | 45 | input.close() 46 | except: 47 | print "No nugget file found for " + filename + "; continuing..." 
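# For each line in this document's arguments file, look up the matching nugget by (event type, trigger offset), replace the seventh column with the nugget's character span, and append the nugget ID; arguments are also grouped by nugget ID for the linking output written below.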
48 | 49 | input = open(argDir + filename, "r") 50 | output = open(argModDir + filename, "w") 51 | for line in input: 52 | tokens = line.strip().split("\t") 53 | triggerOffset = tokens[11] 54 | eventType = tokens[2] 55 | argumentID = tokens[0] 56 | 57 | 58 | # rewrite the arguments/ file 59 | nuggetID = nuggetDict[eventType][triggerOffset] 60 | nuggetSpan = triggerOffset + "-" + str(len(nuggetNameDict[eventType][triggerOffset]) + int(triggerOffset) - 1) 61 | count = 0 62 | for tok in tokens: 63 | if count == 6: 64 | output.write(nuggetSpan + "\t") 65 | else: 66 | output.write(tok + "\t") 67 | count += 1 68 | output.write(nuggetID + "\n") 69 | 70 | # store information for linking file 71 | if nuggetID not in argumentDict: 72 | argumentDict[nuggetID] = [] 73 | argumentDict[nuggetID].append(argumentID) 74 | input.close() 75 | output.close() 76 | 77 | # reread coreference and write linking file 78 | seenNuggets = set() 79 | linkingID = 1 80 | 81 | output = open(linkingDir + filename, "w") 82 | 83 | try: 84 | input = open(nuggetDir + filename, "r") 85 | for line in input: 86 | if line.startswith("@"): 87 | tokens = line.strip().split("\t") 88 | nuggets = tokens[2].split(",") 89 | first = True 90 | 91 | outputArgs = [] 92 | 93 | for nuggetID in nuggets: 94 | if nuggetID in argumentDict: 95 | seenNuggets.add(nuggetID) 96 | 97 | argumentList = argumentDict[nuggetID] 98 | for arg in argumentList: 99 | outputArgs.append(arg) 100 | 101 | 102 | if len(outputArgs) > 0: 103 | output.write(str(linkingID) + "\t") 104 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 105 | corpusID += 1 106 | 107 | for index in range(len(outputArgs)): 108 | if index == 0: 109 | output.write(outputArgs[index]) 110 | else: 111 | output.write(" " + outputArgs[index]) 112 | output.write("\n") 113 | 114 | linkingID += 1 115 | 116 | # now, write any singleton nuggets 117 | for nugget in argumentDict: 118 | if nugget not in seenNuggets: 119 | output.write(str(linkingID) + "\t") 120 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 121 | corpusID += 1 122 | 123 | argumentList = argumentDict[nugget] 124 | for index in range(len(argumentList)): 125 | if index == 0: 126 | output.write(argumentList[index]) 127 | linkingID += 1 128 | else: 129 | output.write(" " + argumentList[index]) 130 | output.write("\n") 131 | 132 | input.close() 133 | except: 134 | print "Skipping empty nugget file again..." 
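# Close this document's linking file; the corpus-level corpusLinking file stays open until every document has been processed.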
135 | 136 | output.close() 137 | 138 | 139 | corpusOut.close() 140 | 141 | main() 142 | -------------------------------------------------------------------------------- /outputFormatting/finalForm_KBP.py: -------------------------------------------------------------------------------- 1 | # script to change the format to the TAC KBP 2015 required formatting 2 | import string 3 | import sys 4 | 5 | responseIDs = set() 6 | eventIDs = dict() # dict from eventString -> ID 7 | 8 | seenResponses = set() 9 | 10 | stopwordSet = set() 11 | 12 | corefClusters = dict() # dict from corefID -> set of strings 13 | 14 | docDict = dict() # dict from docID -> filename 15 | 16 | # example below: 17 | #sent_1 Phrase: they CorefStr: coref_2 Role: Person Trigger: arriving EventType: Movement_Transport EntityType: PER.Group DOCID: CNNHL_ENG_20030416_133739.9 START: 127 END: 131 18 | class ProcessedArgument: 19 | def __init__(self, easyReadLine, assignedRole, confArg, curRealis): 20 | tokens = easyReadLine.strip().split("\t") 21 | sentStr = tokens[0] 22 | self.text = tokens[2] 23 | 24 | self.corefID = tokens[4] + "_" + tokens[14] 25 | self.triggerText = tokens[8] 26 | 27 | eventTokens = tokens[10].split("_") 28 | self.eventType = convertEventType(eventTokens[0])+ "." + convertEventType(eventTokens[1]) 29 | 30 | self.realis = curRealis 31 | if self.realis == "UNK_REALIS": 32 | self.realis = "ACTUAL" 33 | 34 | self.entityType = convertEntityType(tokens[12].split(".")[0]) 35 | 36 | self.role = assignedRole 37 | 38 | self.JET_role = tokens[6] 39 | 40 | self.confidence = confArg 41 | self.docID = tokens[14] 42 | if self.docID.endswith(".mpdf"): 43 | self.docID = self.docID[:-5] 44 | 45 | if self.docID.startswith("CMN"): 46 | tmp = removeWhitespace(self.text) 47 | self.text = tmp 48 | 49 | self.baseStart = tokens[16] 50 | self.baseEnd = str(int(tokens[18]) - 1) # note that we should be one less -- ACE vs KBP differences 51 | 52 | self.sentStart = tokens[20] 53 | self.sentEnd = str(int(tokens[22]) - 1) # note that we should be one less -- ACE vs KBP differences 54 | 55 | # use this to integrate with the event nugget data (Feb. 
10, 2017) 56 | self.triggerOffset = tokens[24] 57 | 58 | # ACE -> KBP changes 59 | if self.eventType == "Contact.Phone-Write": 60 | self.eventType = "Contact.Correspondence" 61 | 62 | if self.eventType == "Transaction.Transfer-Ownership": 63 | if self.role == "Artifact": 64 | self.role = "Thing" # ACE "Artifact" maps to the KBP "Thing" role 65 | 66 | if self.eventType == "Movement.Transport": 67 | if self.role == "Artifact": 68 | self.eventType = "Movement.Transport-Artifact" 69 | else: 70 | self.eventType = "Movement.Transport-Person" 71 | 72 | if self.role.startswith("Time"): 73 | self.role = "Time" 74 | 75 | if self.JET_role.startswith("Time"): 76 | self.JET_role = "Time" 77 | 78 | def activateJET(self): 79 | self.role = self.JET_role 80 | 81 | def removeWhitespace(arg): 82 | alt = "" 83 | for character in arg: 84 | if character not in string.whitespace: 85 | alt += character 86 | return alt 87 | 88 | def convertWhitespace(arg): 89 | alt = "" 90 | for character in arg: 91 | if character in string.whitespace: 92 | alt += " " 93 | else: 94 | alt += character 95 | return alt 96 | 97 | def isYear(text): 98 | if len(text) != 4: 99 | return False 100 | for index in range(4): 101 | if text[index] not in string.digits: 102 | return False 103 | return True 104 | 105 | def isNumber(text): 106 | for character in text: 107 | if character not in string.digits: 108 | return False 109 | return True 110 | 111 | def isDay(text, prevMonth): 112 | if prevMonth and (len(text) == 1 or len(text) == 2): 113 | if isNumber(text): 114 | num = int(text) 115 | if num >= 1 and num <= 31: 116 | if len(text) == 1: 117 | return "0" + str(num) 118 | else: 119 | return str(num) 120 | return "" 121 | if len(text) == 3: 122 | numPart = text[0] 123 | if isNumber(numPart): 124 | num = int(numPart) 125 | if num >=1 and num <= 31: 126 | return "0" + str(num) 127 | return "" 128 | if len(text) == 4: 129 | numPart = text[0] + text[1] 130 | if isNumber(numPart): 131 | num = int(numPart) 132 | if num >=1 and num <= 31: 133 | return str(num) 134 | return "" 135 | 136 | return "" 137 | 138 | def isMonth(text): 139 | temp = text.lower() 140 | if temp == "january" or temp == "jan" or temp == "jan.": 141 | return "01" 142 | elif temp == "february" or temp == "feb" or temp == "feb.": 143 | return "02" 144 | elif temp == "march" or temp == "mar" or temp == "mar.": 145 | return "03" 146 | elif temp == "april" or temp == "apr" or temp == "apr.": 147 | return "04" 148 | elif temp == "may": 149 | return "05" 150 | elif temp == "june" or temp == "jun" or temp == "jun.": 151 | return "06" 152 | elif temp == "july" or temp == "jul" or temp == "jul.": 153 | return "07" 154 | elif temp == "august" or temp == "aug" or temp == "aug.": 155 | return "08" 156 | elif temp == "september" or temp == "sept" or temp == "sept."
or temp == "sep" or temp == "sep.": 157 | return "09" 158 | elif temp == "october" or temp == "oct" or temp == "oct.": 159 | return "10" 160 | elif temp == "november" or temp == "nov" or temp == "nov.": 161 | return "11" 162 | elif temp == "december" or temp == "dec" or temp == "dec.": 163 | return "12" 164 | else: 165 | return "" 166 | 167 | def timeNormalization(timeString): 168 | year = "XXXX" 169 | month = "XX" 170 | day = "XX" 171 | 172 | prevMonth = False 173 | 174 | tokens = convertWhitespace(timeString).split(" ") 175 | for tok in tokens: 176 | if isYear(tok): 177 | year = tok 178 | continue 179 | monthStr = isMonth(tok) 180 | if monthStr != "": 181 | month = monthStr 182 | prevMonth = True 183 | continue 184 | 185 | dayStr = isDay(tok, prevMonth) 186 | if dayStr != "": 187 | day = dayStr 188 | continue 189 | prevMonth = False 190 | 191 | finalString = year + "-" + month + "-" + day 192 | return finalString 193 | 194 | def validEntityType(argument): 195 | role = argument.role 196 | entityType = argument.entityType 197 | 198 | # if we don't know the entity type, assume valid 199 | if entityType == "NULL": 200 | return True 201 | 202 | validSet = set() 203 | 204 | if role == "Adjudicator": 205 | validSet.add("PER") 206 | validSet.add("ORG") 207 | validSet.add("GPE") 208 | elif role == "Agent": 209 | validSet.add("PER") 210 | validSet.add("ORG") 211 | validSet.add("GPE") 212 | validSet.add("FAC") 213 | elif role == "Artifact": 214 | validSet.add("VEH") 215 | validSet.add("WEA") 216 | validSet.add("FAC") 217 | validSet.add("ORG") 218 | validSet.add("COM") 219 | elif role == "Attacker": 220 | validSet.add("PER") 221 | validSet.add("ORG") 222 | validSet.add("GPE") 223 | elif role == "Beneficiary": 224 | validSet.add("PER") 225 | validSet.add("ORG") 226 | validSet.add("GPE") 227 | elif role == "Buyer": 228 | validSet.add("PER") 229 | validSet.add("ORG") 230 | validSet.add("GPE") 231 | elif role == "Crime": 232 | validSet.add("CRIME") 233 | elif role == "Defendant": 234 | validSet.add("PER") 235 | validSet.add("ORG") 236 | validSet.add("GPE") 237 | elif role == "Destination": 238 | validSet.add("GPE") 239 | validSet.add("LOC") 240 | validSet.add("FAC") 241 | elif role == "Entity": 242 | validSet.add("ORG") 243 | validSet.add("GPE") 244 | validSet.add("PER") 245 | elif role == "Giver": 246 | validSet.add("ORG") 247 | validSet.add("GPE") 248 | validSet.add("PER") 249 | elif role == "Instrument": 250 | validSet.add("WEA") 251 | validSet.add("VEH") 252 | elif role == "Money": 253 | validSet.add("MONEY") 254 | validSet.add("NUM") 255 | elif role == "Org": 256 | validSet.add("ORG") 257 | elif role == "Origin": 258 | validSet.add("GPE") 259 | validSet.add("LOC") 260 | validSet.add("FAC") 261 | elif role == "Person": 262 | validSet.add("PER") 263 | elif role == "Place": 264 | validSet.add("GPE") 265 | validSet.add("LOC") 266 | validSet.add("FAC") 267 | elif role == "Plaintiff": 268 | validSet.add("PER") 269 | validSet.add("ORG") 270 | validSet.add("GPE") 271 | elif role == "Position": 272 | validSet.add("JOB") 273 | elif role == "Price": 274 | validSet.add("MONEY") 275 | validSet.add("NUM") 276 | elif role == "Prosecutor": 277 | validSet.add("PER") 278 | validSet.add("ORG") 279 | validSet.add("GPE") 280 | elif role == "Recipient": 281 | validSet.add("PER") 282 | validSet.add("ORG") 283 | validSet.add("GPE") 284 | elif role == "Seller": 285 | validSet.add("PER") 286 | validSet.add("ORG") 287 | validSet.add("GPE") 288 | elif role == "Sentence": 289 | validSet.add("SENTENCE") 290 | elif role == 
"Target": 291 | validSet.add("PER") 292 | validSet.add("ORG") 293 | validSet.add("VEH") 294 | validSet.add("FAC") 295 | validSet.add("WEA") 296 | elif role == "Vehicle": 297 | validSet.add("VEH") 298 | elif role == "Victim": 299 | validSet.add("PER") 300 | elif role.startswith("Time"): 301 | validSet.add("TIME") 302 | elif role == "Audience": 303 | validSet.add("PER") 304 | validSet.add("ORG") 305 | validSet.add("GPE") 306 | elif role == "Thing": 307 | validSet.add("VEH") 308 | validSet.add("WEA") 309 | validSet.add("ORG") 310 | validSet.add("FAC") 311 | else: 312 | print "Don't recognize this role: " + role 313 | return False 314 | 315 | if entityType not in validSet: 316 | return False 317 | return True 318 | 319 | def validRole(argument): 320 | eventType = argument.eventType 321 | role = argument.role 322 | 323 | validSet = set() 324 | 325 | notKBP2016_set = set(["Business.Mergeorg", "Business.Startorg", "Business.Endorg", "Life.Beborn", "Business.Declarebankruptcy", "Justice.Releaseparole", "Justice.Chargeindict", "Justice.Trialhearing", "Business.Declare-Bankruptcy", "Business.Merge-Org", "Life.Marry", "Life.Divorce", "Personnel.Nominate", "Justice.Release-Parole", "Justice.Trial-Hearing", "Justice.Sentence", "Justice.Fine", "Justice.Charge-Indict", "Justice.Sue", "Justice.Extradite", "Justice.Acquit", "Justice.Convict", "Justice.Appeal", "Justice.Execute", "Justice.Pardon", "Manufacture.Artifact"]) 326 | if eventType in notKBP2016_set: 327 | return False 328 | 329 | 330 | # if eventType == "Business.Declare-Bankruptcy": 331 | # validSet.add("Org") 332 | # elif eventType == "Business.Merge-Org": 333 | # validSet.add("Org") 334 | if eventType == "Conflict.Attack": 335 | validSet.add("Attacker") 336 | validSet.add("Target") 337 | validSet.add("Instrument") 338 | elif eventType == "Conflict.Demonstrate": 339 | validSet.add("Entity") 340 | elif eventType == "Contact.Meet": 341 | validSet.add("Entity") 342 | elif eventType == "Contact.Correspondence": 343 | validSet.add("Entity") 344 | elif eventType == "Contact.Contact": 345 | validSet.add("Entity") 346 | elif eventType == "Contact.Broadcast": 347 | validSet.add("Audience") 348 | validSet.add("Entity") 349 | # elif eventType == "Life.Marry": 350 | # validSet.add("Person") 351 | # elif eventType == "Life.Divorce": 352 | # validSet.add("Person") 353 | elif eventType == "Life.Injure": 354 | validSet.add("Agent") 355 | validSet.add("Victim") 356 | validSet.add("Instrument") 357 | elif eventType == "Life.Die": 358 | validSet.add("Agent") 359 | validSet.add("Victim") 360 | validSet.add("Instrument") 361 | elif eventType == "Movement.Transport-Person": 362 | validSet.add("Agent") 363 | validSet.add("Person") 364 | validSet.add("Instrument") 365 | validSet.add("Origin") 366 | validSet.add("Destination") 367 | elif eventType == "Movement.Transport-Artifact": 368 | validSet.add("Agent") 369 | validSet.add("Artifact") 370 | validSet.add("Instrument") 371 | validSet.add("Origin") 372 | validSet.add("Destination") 373 | elif eventType == "Personnel.Start-Position": 374 | validSet.add("Person") 375 | validSet.add("Entity") 376 | validSet.add("Position") 377 | elif eventType == "Personnel.End-Position": 378 | validSet.add("Person") 379 | validSet.add("Entity") 380 | validSet.add("Position") 381 | # elif eventType == "Personnel.Nominate": 382 | # validSet.add("Agent") 383 | # validSet.add("Person") 384 | # validSet.add("Position") 385 | elif eventType == "Personnel.Elect": 386 | validSet.add("Person") 387 | validSet.add("Agent") 388 | 
validSet.add("Position") 389 | elif eventType == "Transaction.Transaction": 390 | validSet.add("Giver") 391 | validSet.add("Recipient") 392 | validSet.add("Beneficiary") 393 | elif eventType == "Transaction.Transfer-Ownership": 394 | validSet.add("Giver") 395 | validSet.add("Recipient") 396 | validSet.add("Beneficiary") 397 | validSet.add("Thing") 398 | elif eventType == "Transaction.Transfer-Money": 399 | validSet.add("Giver") 400 | validSet.add("Recipient") 401 | validSet.add("Beneficiary") 402 | validSet.add("Money") 403 | elif eventType == "Justice.Arrest-Jail": 404 | validSet.add("Agent") 405 | validSet.add("Person") 406 | validSet.add("Crime") 407 | # elif eventType == "Justice.Release-Parole": 408 | # validSet.add("Entity") 409 | # validSet.add("Person") 410 | # validSet.add("Crime") 411 | # elif eventType == "Justice.Trial-Hearing": 412 | # validSet.add("Prosecutor") 413 | # validSet.add("Adjudicator") 414 | # validSet.add("Defendant") 415 | # validSet.add("Crime") 416 | # elif eventType == "Justice.Sentence": 417 | # validSet.add("Adjudicator") 418 | # validSet.add("Defendant") 419 | # validSet.add("Sentence") 420 | # validSet.add("Crime") 421 | # elif eventType == "Justice.Fine": 422 | # validSet.add("Adjudicator") 423 | # validSet.add("Entity") 424 | # validSet.add("Money") 425 | # validSet.add("Crime") 426 | # elif eventType == "Justice.Charge-Indict": 427 | # validSet.add("Prosecutor") 428 | # validSet.add("Adjudicator") 429 | # validSet.add("Defendant") 430 | # validSet.add("Crime") 431 | # elif eventType == "Justice.Sue": 432 | # validSet.add("Plantiff") 433 | # validSet.add("Adjudicator") 434 | # validSet.add("Defendant") 435 | # validSet.add("Crime") 436 | # elif eventType == "Justice.Extradite": 437 | # validSet.add("Agent") 438 | # validSet.add("Person") 439 | # validSet.add("Origin") 440 | # validSet.add("Destination") 441 | # validSet.add("Crime") 442 | # elif eventType == "Justice.Acquit": 443 | # validSet.add("Adjudicator") 444 | # validSet.add("Defendant") 445 | # validSet.add("Crime") 446 | # elif eventType == "Justice.Convict": 447 | # validSet.add("Adjudicator") 448 | # validSet.add("Defendant") 449 | # validSet.add("Crime") 450 | # elif eventType == "Justice.Appeal": 451 | # validSet.add("Prosecutor") 452 | # validSet.add("Adjudicator") 453 | # validSet.add("Defendant") 454 | # validSet.add("Crime") 455 | # elif eventType == "Justice.Execute": 456 | # validSet.add("Agent") 457 | # validSet.add("Person") 458 | # validSet.add("Crime") 459 | # elif eventType == "Justice.Pardon": 460 | # validSet.add("Adjudicator") 461 | # validSet.add("Defendant") 462 | # validSet.add("Crime") 463 | # elif eventType == "Manufacture.Artifact": 464 | # validSet.add("Agent") 465 | # validSet.add("Artifact") 466 | # validSet.add("Instrument") 467 | else: 468 | print "Don't recognize this event type: " + eventType 469 | return False 470 | 471 | if role == "Place" and eventType.startswith("Movement"): 472 | return False 473 | 474 | if role == "Place" or role.startswith("Time"): 475 | return True 476 | 477 | if role not in validSet: 478 | return False 479 | return True 480 | 481 | 482 | 483 | def main(): 484 | if len(sys.argv) != 8: 485 | print "Expect predictions file, easyRead file, roles dict, coref file, docID dictionary file, stopwords list, realisOutput." 486 | print "Output to be placed in out/arguments/ and out/linking" 487 | sys.exit() 488 | 489 | # first, write an empty file for each docID. 
At least make sure we have a file, even if we don't find any arguments 490 | input = open(sys.argv[6], "r") 491 | for line in input: 492 | word = line.strip() 493 | stopwordSet.add(word) 494 | input.close() 495 | 496 | input = open(sys.argv[5], "r") 497 | for line in input: 498 | tokens = line.strip().split("\t") 499 | key = tokens[0] 500 | 501 | if key.endswith(".mpdf"): 502 | key = key[:-5] 503 | 504 | 505 | filename = tokens[1] 506 | docDict[key] = filename 507 | 508 | output = open("out/arguments/" + key, "w") 509 | output.close() 510 | output = open("out/linking/" + key, "w") 511 | output.close() 512 | output = open("out/corpusLinking/corpusLinking", "w") 513 | 514 | input.close() 515 | 516 | predictionsRaw = [] 517 | confidence = [] 518 | input = open(sys.argv[1], "r") 519 | labelOnly = True 520 | for line in input: 521 | if line.startswith("labels"): 522 | labelOnly = False 523 | continue 524 | if labelOnly: 525 | predictionsRaw.append(line.strip()) 526 | confidence.append("0.5") 527 | else: 528 | tempTokens = line.split(" ") 529 | temp = tempTokens[0] 530 | predictionsRaw.append(temp) 531 | confidence.append(tempTokens[int(temp)]) 532 | input.close() 533 | 534 | input = open(sys.argv[4], "r") 535 | for line in input: 536 | tokens = line.strip().split("\t") 537 | text = tokens[0] 538 | corefID = tokens[1] 539 | start = tokens[2] 540 | end = str(int(tokens[3]) - 1) 541 | 542 | if corefID not in corefClusters: 543 | corefClusters[corefID] = set() 544 | corefClusters[corefID].add(text + "|||" + start + "|||" + end) 545 | input.close() 546 | 547 | roleDict = dict() 548 | input = open(sys.argv[3], "r") 549 | for line in input: 550 | tokens = line.strip().split(":") 551 | 552 | ### 2016 -- convert labels to correct format 553 | roleDict[tokens[1]] = convertRoleLabels(tokens[0]) 554 | input.close() 555 | 556 | # read the realis labels 557 | realis = [] 558 | input = open(sys.argv[7], "r") 559 | for line in input: 560 | start = line.strip().rfind("|") 561 | realis.append(line.strip()[start+1:]) 562 | input.close() 563 | 564 | predictions = [] 565 | input = open(sys.argv[2], "r") 566 | index = 0 567 | for line in input: 568 | predictedRole = roleDict[predictionsRaw[index]] 569 | curConf = confidence[index] 570 | curRealis = realis[index] 571 | 572 | arg = ProcessedArgument(line, predictedRole, curConf, curRealis) 573 | predictions.append(arg) 574 | 575 | index += 1 576 | input.close() 577 | 578 | docDict_Args = dict() # dict from docID -> set of string 579 | docDict_Linking = dict() # dict from docID -> dict:{eventID -> set of responseIDs} 580 | 581 | # the system predicted ones 582 | for arg in predictions: 583 | if arg.role == "NONE": 584 | continue 585 | 586 | if not validRole(arg) or not validEntityType(arg): 587 | continue 588 | 589 | 590 | argString, docID, eventID, responseID, responseString = readArgument(arg) 591 | 592 | if responseString in seenResponses: 593 | continue 594 | 595 | seenResponses.add(responseString) 596 | 597 | if docID not in docDict_Args: 598 | docDict_Args[docID] = set() 599 | docDict_Linking[docID] = dict() 600 | docDict_Args[docID].add(argString) 601 | 602 | if arg.realis != "GENERIC": 603 | if eventID not in docDict_Linking[docID]: 604 | docDict_Linking[docID][eventID] = set() 605 | docDict_Linking[docID][eventID].add(responseID) 606 | 607 | for docID in docDict_Args: 608 | output = open("out/arguments/" + docID, "w") 609 | for line in docDict_Args[docID]: 610 | output.write(line) 611 | output.close() 612 | 613 | corpusOutput = 
open("out/corpusLinking/corpusLinking", "w") 614 | corpusCount = 1 615 | for docID in docDict_Linking: 616 | output = open("out/linking/" + docID, "w") 617 | eventCount = 1 618 | for eventID in docDict_Linking[docID]: 619 | output.write(str(eventCount) + "\t") 620 | corpusOutput.write(str(corpusCount) + "\t" + docID + "-" + str(eventCount) + "\n") 621 | 622 | eventCount += 1 623 | corpusCount += 1 624 | 625 | idSet = docDict_Linking[docID][eventID] 626 | line = "" 627 | for item in idSet: 628 | line += str(item) + " " 629 | line = line.strip() 630 | output.write(line.strip() + "\n") 631 | output.close() 632 | corpusOutput.close() 633 | 634 | def properNoun(text): 635 | tokens = text.split(" ") 636 | proper = False 637 | for tok in tokens: 638 | if tok.lower() in stopwordSet: 639 | continue 640 | elif tok.lower() != tok: 641 | proper = True 642 | return proper 643 | 644 | def canonicalForm(stringSet): 645 | best = "" 646 | bestStart = -1 647 | bestEnd = -1 648 | bestCapital = False 649 | containsComma = False 650 | for item in stringSet: 651 | tokens = item.split("|||") 652 | text = tokens[0] 653 | start = tokens[1] 654 | end = tokens[2] 655 | 656 | proper = properNoun(text) 657 | 658 | if best == "": 659 | best = text 660 | bestStart = start 661 | bestEnd = end 662 | if "," in text: 663 | containsComma = True 664 | if proper: 665 | bestCapital = True 666 | elif proper: 667 | if not bestCapital: 668 | best = text 669 | bestStart = start 670 | bestEnd = end 671 | bestCapital = True 672 | if "," in text: 673 | containsComma = True 674 | elif "," not in text and (len(text) > len(best) or containsComma): 675 | best = text 676 | bestStart = start 677 | bestEnd = end 678 | bestCapital = True 679 | elif not bestCapital and "," not in text and (len(text) > len(best) or containsComma): 680 | best = text 681 | bestStart = start 682 | bestEnd = end 683 | 684 | return best, bestStart, bestEnd 685 | 686 | def convertOffset(value, docID): 687 | ### counting the XML now in offsets, don't need below 688 | return value 689 | 690 | #filename = docDict[docID] 691 | #input = open(filename, "r") 692 | #nonXML_Index = 0 693 | #withXML_Index = 0 694 | 695 | #debug = "" 696 | 697 | #inXML = False 698 | #broke = False 699 | #for line in input: 700 | # for character in line: 701 | # if nonXML_Index == value: 702 | # broke = True 703 | # break 704 | 705 | # withXML_Index += 1 706 | # if character == "<": 707 | # inXML = True 708 | # elif character == ">": 709 | # inXML = False 710 | # elif not inXML: 711 | # nonXML_Index += 1 712 | # debug += character 713 | 714 | #input.close() 715 | 716 | #if not broke: 717 | # print nonXML_Index 718 | # print value 719 | # print filename 720 | # print "ERROR!!!!" 
721 | # sys.exit() 722 | 723 | #print "\t\t" + debug 724 | 725 | #return withXML_Index 726 | 727 | def readArgument(inputArg): 728 | responseID = len(responseIDs) 729 | responseIDs.add(responseID) 730 | 731 | docID = inputArg.docID 732 | ### NEW -- remove .xml extension 733 | if docID.endswith(".xml"): 734 | docID = docID[:-4] 735 | 736 | 737 | eventType = inputArg.eventType 738 | role = inputArg.role 739 | 740 | CAS_String, CAS_start, CAS_end = canonicalForm(corefClusters[inputArg.corefID]) 741 | # adjust whitespace 742 | temp = convertWhitespace(CAS_String) 743 | CAS_String = temp 744 | 745 | if role == "Time": 746 | alternate_CAS_String = timeNormalization(CAS_String) 747 | CAS_String = alternate_CAS_String 748 | 749 | #offsets = CAS_start + "-" + CAS_end 750 | adjusted_CAS_start = convertOffset(int(CAS_start), docID) 751 | adjusted_CAS_end = convertOffset(int(CAS_end), docID) 752 | 753 | if adjusted_CAS_end < adjusted_CAS_start: 754 | adjusted_CAS_end = adjusted_CAS_start 755 | 756 | offsets = str(adjusted_CAS_start) + "-" + str(adjusted_CAS_end) 757 | 758 | adjusted_sentStart = convertOffset(int(inputArg.sentStart), docID) 759 | adjusted_sentEnd = convertOffset(int(inputArg.sentEnd), docID) 760 | justificationOffset = str(adjusted_sentStart) + "-" + str(adjusted_sentEnd) 761 | 762 | baseFiller = inputArg.text 763 | adjusted_baseStart = convertOffset(int(inputArg.baseStart), docID) 764 | adjusted_baseEnd = convertOffset(int(inputArg.baseEnd), docID) 765 | 766 | ### Linking with nuggets -- ColdStart++ 767 | triggerOffset = inputArg.triggerOffset 768 | 769 | ### KBP2016 -- no entity coref 770 | CAS_String = baseFiller 771 | CAS_start = adjusted_baseStart 772 | CAS_end = adjusted_baseEnd 773 | offsets = str(CAS_start) + "-" + str(CAS_end) 774 | 775 | ### KBP2016 -- justification must be < 200 characters 776 | while adjusted_sentEnd - adjusted_sentStart >= 200: 777 | if adjusted_baseEnd != adjusted_sentEnd: 778 | adjusted_sentEnd -= 1 779 | elif adjusted_baseStart != adjusted_sentStart: 780 | adjusted_sentStart += 1 781 | else: 782 | adjusted_sentEnd -= 1 783 | justificationOffset = str(adjusted_sentStart) + "-" + str(adjusted_sentEnd) 784 | 785 | if adjusted_baseEnd < adjusted_baseStart: 786 | adjusted_baseEnd = adjusted_baseStart 787 | baseFillerOffsets = str(adjusted_baseStart) + "-" + str(adjusted_baseEnd) 788 | 789 | argJustificationOffsets = "NIL" 790 | realis = inputArg.realis 791 | confidence = inputArg.confidence # [0-1] 792 | 793 | # 2016 -- link things together if they have the same docID and same eventType 794 | eventString = docID + "_" + eventType 795 | if eventString not in eventIDs: 796 | eventIDs[eventString] = len(eventIDs) 797 | 798 | 799 | ### original version below (before ColdStart++, used for TAC KBP 2016) 800 | #outputString = str(responseID) + "\t" + docID + "\t" + eventType + "\t" + role + "\t" + CAS_String + "\t" + offsets + "\t" + justificationOffset + "\t" + baseFillerOffsets + "\t" + argJustificationOffsets + "\t" + realis + "\t" + confidence + "\n" 801 | # new version -- used for ColdStart++ merging with nuggets 802 | outputString = str(responseID) + "\t" + docID + "\t" + eventType + "\t" + role + "\t" + CAS_String + "\t" + offsets + "\t" + justificationOffset + "\t" + baseFillerOffsets + "\t" + argJustificationOffsets + "\t" + realis + "\t" + confidence + "\t" + triggerOffset + "\n" 803 | 804 | # below: not for output, but for identifying arguments that end up having the same ID (e.g. 
both play the AGENT role of some trigger in the same sentence) 805 | responseString = docID + "\t" + eventType + "\t" + role + "\t" + CAS_String + "\t" + offsets + "\t" + justificationOffset + "\t" + baseFillerOffsets + "\t" + argJustificationOffsets + "\t" + realis + "\n" 806 | 807 | return outputString, docID, eventIDs[eventString], responseID, responseString 808 | 809 | def convertRoleLabels(label): 810 | newLabel = "" 811 | prevChar = "" 812 | first = True 813 | for character in label: 814 | if first: 815 | newLabel += character.upper() 816 | first = False 817 | elif prevChar in string.punctuation: 818 | newLabel += character.upper() 819 | else: 820 | newLabel += character 821 | 822 | prevChar = character 823 | 824 | return newLabel 825 | 826 | def convertEventType(text): 827 | tmp = convertRoleLabels(text) 828 | 829 | if tmp == "Transportperson": 830 | return "Transport-Person" 831 | elif tmp == "Transportartifact": 832 | return "Transport-Artifact" 833 | elif tmp == "Endposition": 834 | return "End-Position" 835 | elif tmp == "Startposition": 836 | return "Start-Position" 837 | elif tmp == "Arrestjail": 838 | return "Arrest-Jail" 839 | elif tmp == "Transfermoney": 840 | return "Transfer-Money" 841 | elif tmp == "Transferownership": 842 | return "Transfer-Ownership" 843 | else: 844 | return tmp 845 | 846 | def convertEntityType(text): 847 | if text == "weapon": 848 | return "WEA" 849 | elif text == "vehicle": 850 | return "VEH" 851 | elif text == "sentence": 852 | return "Sentence" 853 | elif text == "crime": 854 | return "CRIME" 855 | elif text == "title": 856 | return "Title" 857 | elif text == "money": 858 | return "MONEY" 859 | elif text == "time": 860 | return "TIME" 861 | return text 862 | 863 | main() 864 | -------------------------------------------------------------------------------- /outputFormatting/formatTriggers/format_andrew_triggers/format_andrew.py: -------------------------------------------------------------------------------- 1 | # script to convert my output files to the Event Nugget Output format 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Need output triggers (with role names), createSetFiles file, output file." 
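# Each nugget line written below is tab-separated: system ID, document ID, mention ID, "start,end" character offsets, trigger text, event type, realis (always "Actual"), and confidence (fixed at 0.5).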
7 | sys.exit() 8 | 9 | triggers = [] 10 | input = open(sys.argv[1], "r") 11 | for line in input: 12 | triggers.append(line.strip().lower()) 13 | input.close() 14 | 15 | input = open(sys.argv[2], "r") 16 | output = open(sys.argv[3], "w") 17 | 18 | curDoc = "" 19 | mentionID = 1 20 | index = 0 21 | 22 | for line in input: 23 | clean = line.strip() 24 | if clean != "": 25 | tokens = line.strip().split("\t") 26 | docID = tokens[5] 27 | 28 | if docID.endswith(".xml"): 29 | docID = docID[:-4] 30 | 31 | if docID != curDoc: 32 | if curDoc != "": 33 | output.write("#EndOfDocument\n") 34 | output.write("#BeginOfDocument " + docID + "\n") 35 | curDoc = docID 36 | mentionID = 1 37 | 38 | startOffset = tokens[0] 39 | endOffset = tokens[1] 40 | word = tokens[2] 41 | 42 | # skip rest if the word isn't a trigger 43 | if triggers[index] == "not_trigger_not_trigger": 44 | index += 1 45 | continue 46 | 47 | output.write("andrewSystem\t" + curDoc + "\t" + str(mentionID) + "\t" + startOffset + "," + endOffset + "\t" + word + "\t" + triggers[index] + "\tActual" + "\t0.5" + "\n") 48 | mentionID += 1 49 | index += 1 50 | 51 | output.write("#EndOfDocument\n") 52 | input.close() 53 | output.close() 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | main() 62 | -------------------------------------------------------------------------------- /outputFormatting/formatTriggers/format_hector_triggers/format_hector.py: -------------------------------------------------------------------------------- 1 | # script to convert my output files to the Event Nugget Output format 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 3: 6 | print "Need nuggets from Jun, output file." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | 12 | docID = "" 13 | for line in input: 14 | if line.startswith("#BeginOfDocument"): 15 | tokens = line.strip().split() 16 | name = tokens[1] 17 | if name.endswith(".xml"): 18 | name = name[:-4] 19 | output.write(tokens[0] + " " + name + "\n") 20 | elif line.startswith("#EndOfDocument"): 21 | output.write(line) 22 | elif line.startswith("@Coreference"): 23 | output.write(line) 24 | else: 25 | tokens = line.strip().split("\t") 26 | 27 | if tokens[5] == "OUTSIDE": 28 | continue 29 | 30 | # labelTokens = tokens[5].split(".") 31 | labelTokens = tokens[5].split("_") 32 | label = labelTokens[0] + "_" + labelTokens[1] 33 | 34 | # output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + tokens[5].lower() + "\t" + tokens[6] + "\n") 35 | # output.write("hectorSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\n") 36 | output.write("hectorSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\t" + "0.5" + "\n") 37 | input.close() 38 | output.close() 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | main() 47 | -------------------------------------------------------------------------------- /outputFormatting/formatTriggers/format_jun_triggers/format_jun.py: -------------------------------------------------------------------------------- 1 | # script to convert my output files to the Event Nugget Output format 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 3: 6 | print "Need nuggets from Jun, output file." 
7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | 12 | docID = "" 13 | for line in input: 14 | if line.startswith("#BeginOfDocument"): 15 | tokens = line.strip().split() 16 | name = tokens[1] 17 | if name.endswith(".xml"): 18 | name = name[:-4] 19 | output.write(tokens[0] + " " + name + "\n") 20 | elif line.startswith("#EndOfDocument"): 21 | output.write(line) 22 | elif line.startswith("@Coreference"): 23 | output.write(line) 24 | else: 25 | tokens = line.strip().split("\t") 26 | 27 | labelTokens = tokens[5].split(".") 28 | label = labelTokens[0] + "_" + labelTokens[1] 29 | 30 | # output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + tokens[5].lower() + "\t" + tokens[6] + "\n") 31 | # output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\n") 32 | output.write("junSystem" + "\t" + tokens[1] + "\t" + tokens[2] + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + label.lower() + "\t" + tokens[6] + "\t" + tokens[8] + "\n") 33 | input.close() 34 | output.close() 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | main() 43 | -------------------------------------------------------------------------------- /outputFormatting/out/cleanStore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # move to store and clean again 4 | cd store 5 | for file in arguments/*; do rm "$file" ; done 6 | for file in linking/*; do rm "$file" ; done 7 | for file in nuggets/*; do rm "$file" ; done 8 | rm corpusLinking/* 9 | 10 | # return to original directory 11 | cd ../ 12 | -------------------------------------------------------------------------------- /outputFormatting/out/mergeSubmissions/mergeSubmissions.py: -------------------------------------------------------------------------------- 1 | # script to merge the submissions from three sources into a single directory 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Expect a list of files to read. 
Assuming directories are under andrew/ hector/ and jun/" 7 | sys.exit() 8 | 9 | filenames = [] 10 | input = open(sys.argv[1], "r") 11 | for line in input: 12 | filenames.append(line.strip()) 13 | 14 | dirList = ["andrew", "hector", "jun"] 15 | 16 | corpusOutput = open("all/corpusLinking/corpusLinking", "w") 17 | corpusID = 1 18 | 19 | for filename in filenames: 20 | idCount = 1 21 | linesToWrite = [] 22 | output = open("all/arguments/" + filename, "w") 23 | 24 | # for making the linking file -- link together all arguments with the same event type 25 | idsByEventType = dict() # eventType -> list of ids 26 | 27 | for inDir in dirList: 28 | input = open(inDir + "/arguments/" + filename, "r") 29 | 30 | for line in input: 31 | start = line.find("\t") + 1 32 | data = line[start:] 33 | 34 | if data not in linesToWrite: 35 | linesToWrite.append(data) 36 | output.write(str(idCount) + "\t" + data) 37 | 38 | tokens = line.strip().split("\t") 39 | eventType = tokens[2] 40 | 41 | if eventType not in idsByEventType: 42 | idsByEventType[eventType] = [] 43 | idsByEventType[eventType].append(idCount) 44 | 45 | idCount += 1 46 | input.close() 47 | 48 | output.close() 49 | 50 | output = open("all/linking/" + filename, "w") 51 | linkingID = 1 52 | 53 | for eventType in idsByEventType: 54 | idList = idsByEventType[eventType] 55 | output.write(str(linkingID) + "\t") 56 | corpusOutput.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 57 | 58 | for index in range(len(idList)): 59 | if index == len(idList) - 1: 60 | output.write(str(idList[index]) + "\n") 61 | else: 62 | output.write(str(idList[index]) + " ") 63 | 64 | linkingID += 1 65 | corpusID += 1 66 | 67 | 68 | corpusOutput.close() 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | main() 83 | -------------------------------------------------------------------------------- /outputFormatting/out/mergeSubmissions_coreference/argument_nugget_linking.py: -------------------------------------------------------------------------------- 1 | # script to link together the argument and nugget files 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Expect list of filenames." 
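# NOTE: this linking script appears to expect the merged all/arguments/ and all/nuggets/ directories written by the companion mergeSubmissions.py in this folder; the input/output paths are hard-coded just below.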
7 | sys.exit() 8 | 9 | argDir = "all/arguments/" 10 | argModDir = "all/linked_arguments/" 11 | nuggetDir = "all/nuggets/" 12 | linkingDir = "all/linking/" 13 | corpusLinking = "all/corpusLinking/corpusLinking" 14 | 15 | filenames = [] 16 | input = open(sys.argv[1], "r") 17 | for line in input: 18 | filenames.append(line.strip()) 19 | input.close() 20 | 21 | corpusID = 1 22 | corpusOut = open(corpusLinking, "w") 23 | 24 | for filename in filenames: 25 | print filename 26 | nuggetDict = dict() # eventType -> offset -> ID 27 | argumentDict = dict() # nuggetID -> attached_arguments 28 | 29 | input = open(nuggetDir + filename, "r") 30 | for line in input: 31 | if not line.startswith("#") and not line.startswith("@"): 32 | tokens = line.strip().split("\t") 33 | eventType = tokens[5] 34 | startOffset = tokens[3].split(",")[0] 35 | nuggetID = tokens[2] 36 | 37 | if eventType not in nuggetDict: 38 | nuggetDict[eventType] = dict() 39 | nuggetDict[eventType][startOffset] = nuggetID 40 | input.close() 41 | 42 | input = open(argDir + filename, "r") 43 | output = open(argModDir + filename, "w") 44 | for line in input: 45 | print line 46 | tokens = line.strip().split("\t") 47 | print len(tokens) 48 | triggerOffset = tokens[11] 49 | eventType = tokens[2] 50 | argumentID = tokens[0] 51 | 52 | 53 | # rewrite the arguments/ file 54 | nuggetID = nuggetDict[eventType][triggerOffset] 55 | for tok in tokens: 56 | output.write(tok + "\t") 57 | output.write(nuggetID + "\n") 58 | 59 | # store information for linking file 60 | if nuggetID not in argumentDict: 61 | argumentDict[nuggetID] = [] 62 | argumentDict[nuggetID].append(argumentID) 63 | input.close() 64 | output.close() 65 | 66 | # reread coreference and write linking file 67 | seenNuggets = set() 68 | linkingID = 1 69 | 70 | input = open(nuggetDir + filename, "r") 71 | output = open(linkingDir + filename, "w") 72 | for line in input: 73 | if line.startswith("@"): 74 | tokens = line.strip().split("\t") 75 | nuggets = tokens[2].split(",") 76 | first = True 77 | 78 | outputArgs = [] 79 | 80 | for nuggetID in nuggets: 81 | if nuggetID in argumentDict: 82 | seenNuggets.add(nuggetID) 83 | 84 | argumentList = argumentDict[nuggetID] 85 | for arg in argumentList: 86 | outputArgs.append(arg) 87 | 88 | 89 | if len(outputArgs) > 0: 90 | output.write(str(linkingID) + "\t") 91 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 92 | corpusID += 1 93 | 94 | for index in range(len(outputArgs)): 95 | if index == 0: 96 | output.write(outputArgs[index]) 97 | else: 98 | output.write(" " + outputArgs[index]) 99 | output.write("\n") 100 | 101 | linkingID += 1 102 | 103 | # now, write any singleton nuggets 104 | for nugget in argumentDict: 105 | if nugget not in seenNuggets: 106 | output.write(str(linkingID) + "\t") 107 | corpusOut.write(str(corpusID) + "\t" + filename + "-" + str(linkingID) + "\n") 108 | corpusID += 1 109 | 110 | argumentList = argumentDict[nugget] 111 | for index in range(len(argumentList)): 112 | if index == 0: 113 | output.write(argumentList[index]) 114 | linkingID += 1 115 | else: 116 | output.write(" " + argumentList[index]) 117 | output.write("\n") 118 | 119 | 120 | output.close() 121 | input.close() 122 | 123 | 124 | corpusOut.close() 125 | 126 | main() 127 | -------------------------------------------------------------------------------- /outputFormatting/out/mergeSubmissions_coreference/mergeSubmissions.py: -------------------------------------------------------------------------------- 1 | # script to merge the submissions 
from three sources into a single directory 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 2: 6 | print "Writes the nuggets/ and arguments/ files. Use another script for linking." 7 | print "Expect a list of files to read. Assuming directories are under andrew/ hector/ and jun/" 8 | sys.exit() 9 | 10 | filenames = [] 11 | input = open(sys.argv[1], "r") 12 | for line in input: 13 | filenames.append(line.strip()) 14 | 15 | dirList = ["hector", "jun", "andrew"] 16 | 17 | corpusID = 1 18 | for filename in filenames: 19 | # Begin Nugget Writing 20 | # process nuggets first 21 | output = open("all/nuggets/" + filename, "w") 22 | output.write("#BeginOfDocument" + " " + filename + "\n") 23 | 24 | writtenNuggetKeys = set() 25 | writtenNuggets = set() 26 | nuggetID_toKey = dict() 27 | corefDict = dict() # dict from key -> set of coreferent nugget ids 28 | for inDir in dirList: 29 | try: 30 | input = open(inDir + "/nuggets/" + filename, "r") 31 | except: 32 | continue 33 | 34 | for line in input: 35 | if line.startswith("#"): 36 | continue 37 | elif line.startswith("@"): 38 | tokens = line.strip().split("\t")[2].split(",") 39 | first = inDir + "_" + tokens[0] 40 | firstKey = nuggetID_toKey[first] 41 | rest = tokens[1:] 42 | 43 | # add the remaining nuggets to the corefSet of the first nugget 44 | for tok in rest: 45 | curID = inDir + "_" + tok 46 | corefDict[firstKey].add(curID) 47 | 48 | 49 | # maybe I don't need this? Merging later may take care of it 50 | ''' 51 | # delete the corefSets of the other nuggets 52 | curKey = nuggetID_toKey[curID] 53 | if curKey in corefDict: 54 | del corefDict[curKey] 55 | ''' 56 | 57 | 58 | 59 | else: 60 | tokens = line.strip().split("\t") 61 | key = tokens[3] + "_" + tokens[5] # key = offset_label 62 | nuggetID = inDir + "_" + tokens[2] 63 | 64 | # if we haven't seen the key yet, add it 65 | if key not in writtenNuggetKeys: 66 | output.write("mergedSystem\t" + filename + "\t" + nuggetID + "\t" + tokens[3] + "\t" + tokens[4] + "\t" + tokens[5] + "\t" + tokens[6] + "\n") 67 | writtenNuggetKeys.add(key) 68 | writtenNuggets.add(nuggetID) 69 | curSet = set() 70 | curSet.add(nuggetID) 71 | corefDict[key] = curSet 72 | 73 | nuggetID_toKey[nuggetID] = key 74 | # if we have seen it, then need to add to right corefSet 75 | else: 76 | corefDict[key].add(nuggetID) 77 | nuggetID_toKey[nuggetID] = key 78 | 79 | input.close() 80 | 81 | # before writing coref, merge together any overlapping coref sets 82 | done = True 83 | first = True 84 | while not done or first: 85 | first = False 86 | done = True 87 | 88 | removeKey = None 89 | # if we find overlap, break out and start over again 90 | for key in corefDict: 91 | curSet = corefDict[key] 92 | for altKey in corefDict: 93 | if key == altKey: 94 | continue 95 | 96 | altSet = corefDict[altKey] 97 | overlap = False 98 | 99 | for nugget in curSet: 100 | if nugget in altSet: 101 | overlap = True 102 | done = False 103 | break 104 | 105 | if overlap: 106 | for nugget in altSet: 107 | corefDict[key].add(nugget) 108 | 109 | removeKey = altKey 110 | break 111 | 112 | if removeKey != None: 113 | break 114 | if removeKey != None: 115 | print removeKey 116 | del corefDict[removeKey] 117 | 118 | corefID = 1 119 | for key in corefDict: 120 | writeList = [] 121 | for nugget in corefDict[key]: 122 | if nugget in writtenNuggets: 123 | writeList.append(nugget) 124 | 125 | if len(writeList) > 1: 126 | output.write("@Coreference\tR" + str(corefID) + "\t") 127 | first = True 128 | for nugget in writeList: 129 | if first: 130 | 
output.write(nugget) 131 | first = False 132 | else: 133 | output.write("," + nugget) 134 | output.write("\n") 135 | corefID += 1 136 | output.write("#EndOfDocument\n") 137 | ### End Nugget writing 138 | 139 | ### Begin argument writing 140 | idCount = 1 141 | linesToWrite = [] 142 | output = open("all/arguments/" + filename, "w") 143 | 144 | # write everything EXCEPT the last column. Use the other script to get that. 145 | for inDir in dirList: 146 | input = open(inDir + "/arguments/" + filename, "r") 147 | 148 | for line in input: 149 | start = line.find("\t") + 1 150 | end = line.rfind("\t") 151 | data = line[start:end] 152 | 153 | if data not in linesToWrite: 154 | linesToWrite.append(data) 155 | output.write(str(idCount) + "\t" + data + "\n") 156 | 157 | idCount += 1 158 | input.close() 159 | 160 | output.close() 161 | 162 | main() 163 | -------------------------------------------------------------------------------- /outputFormatting/out/moveToStore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./cleanStore.sh 3 | 4 | cp linked_arguments/* arguments/. 5 | 6 | cd store 7 | 8 | cp -r ../arguments/ . 9 | cp -r ../linking/ . 10 | cp -r ../corpusLinking/ . 11 | cp -r ../nuggets/ . 12 | 13 | cd ../ 14 | for file in arguments/*; do rm "$file" ; done 15 | for file in linking/*; do rm "$file" ; done 16 | for file in nuggets/*; do rm "$file" ; done 17 | for file in linked_arguments/*; do rm "$file" ; done 18 | rm corpusLinking/* 19 | -------------------------------------------------------------------------------- /outputFormatting/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | 
five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 
448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero 572 | -------------------------------------------------------------------------------- /outputFormatting/writeDocMap.py: -------------------------------------------------------------------------------- 1 | # script to write the docmap file 2 | import sys 3 | 4 | def getRootname(line): 5 | # first, remove any absolute path 6 | text = line 7 | if "/" in text: 8 | start = text.rfind("/") + 1 9 | text = text[start:] 10 | 11 | # remove the extension 12 | if "." in text: 13 | end = text.rfind(".") 14 | text = text[:end] 15 | 16 | return text 17 | 18 | def main(): 19 | if len(sys.argv) != 2: 20 | print "Expect list of documents with absolute paths." 21 | sys.exit() 22 | 23 | input = open(sys.argv[1], "r") 24 | lines = [] 25 | for line in input: 26 | lines.append(line.strip()) 27 | input.close() 28 | 29 | output = open("documents.paths.tmp", "w") 30 | for line in lines: 31 | rootname = getRootname(line) 32 | output.write(rootname + "\t" + line + "\n") 33 | output.close() 34 | 35 | output = open("documents.rootnames.tmp", "w") 36 | for line in lines: 37 | rootname = getRootname(line) 38 | output.write(rootname + "\n") 39 | output.close() 40 | 41 | 42 | main() 43 | -------------------------------------------------------------------------------- /outputFormatting/writeTriggerOutput.py: -------------------------------------------------------------------------------- 1 | # script to write the trigger output -- one file per document 2 | import sys 3 | import string 4 | 5 | def main(): 6 | if len(sys.argv) != 2: 7 | print "Expect list of triggers." 
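# The trigger list is expected in the Event Nugget format parsed below: #BeginOfDocument / @Coreference / #EndOfDocument markers plus tab-separated nugget lines (system, docID, mentionID, offsets, word, label, realis, confidence).
# Illustrative example of the conversion performed here: a label such as "movement_transportperson" is rewritten to "Movement.Transport-Person" by convertEventType().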
8 | sys.exit() 9 | 10 | storeDir = "out/nuggets/" 11 | 12 | input = open(sys.argv[1], "r") 13 | curDoc = "" 14 | for line in input: 15 | if line.startswith("#BeginOfDocument"): 16 | tokens = line.strip().split() 17 | name = tokens[1] 18 | if name.endswith(".xml"): 19 | name = name[:-4] 20 | 21 | curDoc = name 22 | output = open(storeDir + curDoc, "w") 23 | 24 | output.write(tokens[0] + " " + name + "\n") 25 | elif line.startswith("#EndOfDocument"): 26 | output.write(line) 27 | output.close() 28 | elif line.startswith("@Coreference"): 29 | output.write(line) 30 | else: 31 | tokens = line.strip().split("\t") 32 | 33 | sysName = tokens[0] 34 | docID = tokens[1] 35 | mentionID = tokens[2] 36 | offsets = tokens[3] 37 | word = tokens[4] 38 | label = tokens[5] 39 | realis = tokens[6] 40 | 41 | confidence = tokens[7] 42 | 43 | eventTokens = label.split("_") 44 | eventType = convertEventType(eventTokens[0])+ "." + convertEventType(eventTokens[1]) 45 | 46 | if eventType == "Contact.Phone-Write": 47 | eventType = "Contact.Correspondence" 48 | 49 | if eventType == "Movement.Transport": 50 | eventType = "Movement.Transport-Person" 51 | 52 | output.write(sysName + "\t" + docID + "\t" + mentionID + "\t" + offsets + "\t" + word + "\t" + eventType + "\t" + realis + "\t" + confidence + "\n") 53 | 54 | 55 | input.close() 56 | 57 | def convertRoleLabels(label): 58 | newLabel = "" 59 | prevChar = "" 60 | first = True 61 | for character in label: 62 | if first: 63 | newLabel += character.upper() 64 | first = False 65 | elif prevChar in string.punctuation: 66 | newLabel += character.upper() 67 | else: 68 | newLabel += character 69 | 70 | prevChar = character 71 | 72 | return newLabel 73 | 74 | def convertEventType(text): 75 | tmp = convertRoleLabels(text) 76 | 77 | if tmp == "Transportperson": 78 | return "Transport-Person" 79 | elif tmp == "Transportartifact": 80 | return "Transport-Artifact" 81 | elif tmp == "Endposition": 82 | return "End-Position" 83 | elif tmp == "Startposition": 84 | return "Start-Position" 85 | elif tmp == "Arrestjail": 86 | return "Arrest-Jail" 87 | elif tmp == "Transfermoney": 88 | return "Transfer-Money" 89 | elif tmp == "Transferownership": 90 | return "Transfer-Ownership" 91 | else: 92 | return tmp 93 | 94 | main() 95 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/StanfordCoreNLP-chinese.properties.simple: -------------------------------------------------------------------------------- 1 | annotators = segment, ssplit, pos 2 | 3 | customAnnotatorClass.segment = edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator 4 | 5 | segment.model = edu/stanford/nlp/models/segmenter/chinese/ctb.gz 6 | segment.sighanCorporaDict = edu/stanford/nlp/models/segmenter/chinese 7 | segment.serDictionary = edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz 8 | segment.sighanPostProcessing = true 9 | 10 | ssplit.boundaryTokenRegex = [.]|[!?]+|[。]|[！？]+ 11 | 12 | pos.model = edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger 13 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/StanfordCoreNLP-spanish.properties.simple: -------------------------------------------------------------------------------- 1 | annotators = tokenize, ssplit, pos 2 | 3 | tokenize.language = es 4 | 5 | pos.model = edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger 6 | --------------------------------------------------------------------------------
/preprocessing_2.0/CoreNLP_scripts/prefixLines.py: -------------------------------------------------------------------------------- 1 | # script that takes a txt file and prepends the given string to each line 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Expect text file, string to prepend, output file." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[3], "w") 11 | 12 | prefix = sys.argv[2] 13 | 14 | for line in input: 15 | output.write(prefix + line) 16 | input.close() 17 | output.close() 18 | 19 | main() 20 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/prepareCoreNLP_input.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def main(): 4 | 5 | if len(sys.argv) != 4: 6 | print "Expect input, output, $PWD." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | pwd = sys.argv[3] 12 | 13 | for line in input: 14 | output.write(pwd + line[1:]) 15 | input.close() 16 | 17 | output.close() 18 | 19 | main() 20 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Chn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STANFORD_CORENLP=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 4 | INPUTS=$1 5 | 6 | CURRENT_PATH=${PWD} 7 | 8 | echo "Call Stanford CoreNLP..." 9 | java -cp "$STANFORD_CORENLP/*" -Xmx16g edu.stanford.nlp.pipeline.StanfordCoreNLP -filelist $INPUTS -props ${CURRENT_PATH}/CoreNLP_scripts/StanfordCoreNLP-chinese.properties.simple -threads 8 -outputDirectory ${CURRENT_PATH}/CoreNLP_scripts/tmp_Chn/ -outputExtension .out 10 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Eng.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STANFORD_CORENLP=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 4 | INPUTS=$1 5 | 6 | CURRENT_PATH=${PWD} 7 | 8 | 9 | echo "Call Stanford CoreNLP..." 10 | java -cp "$STANFORD_CORENLP/*" -Xmx16g edu.stanford.nlp.pipeline.StanfordCoreNLP -filelist $INPUTS -annotators tokenize,ssplit,pos,lemma,ner,parse -threads 8 -outputDirectory ${CURRENT_PATH}/CoreNLP_scripts/tmp_Eng/ -outputExtension .out 11 | -------------------------------------------------------------------------------- /preprocessing_2.0/CoreNLP_scripts/runCoreNLP_Span.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | STANFORD_CORENLP=/home/andrew/NLP_tools/CoreNLP/stanford-corenlp-full-2016-10-31 4 | INPUTS=$1 5 | 6 | CURRENT_PATH=${PWD} 7 | 8 | echo "Call Stanford CoreNLP..." 
9 | java -cp "$STANFORD_CORENLP/*" -Xmx16g edu.stanford.nlp.pipeline.StanfordCoreNLP -filelist $INPUTS -props ${CURRENT_PATH}/CoreNLP_scripts/StanfordCoreNLP-spanish.properties.simple -threads 8 -outputDirectory ${CURRENT_PATH}/CoreNLP_scripts/tmp_Span/ -outputExtension .out 10 | -------------------------------------------------------------------------------- /preprocessing_2.0/MaltParser_scripts/convertToCoNLL.py: -------------------------------------------------------------------------------- 1 | # script to convert from createSetFiles file to CoNLL format 2 | import sys 3 | import string 4 | 5 | def convertPOS(pos, converter): 6 | newPOS = "" 7 | for character in pos: 8 | if character not in string.digits: 9 | newPOS += character 10 | 11 | copyPOS = newPOS 12 | while len(copyPOS) > 0: 13 | if copyPOS in converter: 14 | return converter[copyPOS], copyPOS 15 | else: 16 | copyPOS = copyPOS[:-1] 17 | 18 | return newPOS, newPOS 19 | 20 | def removeDigits(pos): 21 | newPOS = "" 22 | for character in pos: 23 | if character not in string.digits: 24 | newPOS += character 25 | return newPOS 26 | 27 | def main(): 28 | if len(sys.argv) != 4: 29 | print "Expect createSetFiles data, output file, universal POS Tag file." 30 | sys.exit() 31 | posConverter = dict() 32 | 33 | input = open(sys.argv[3], "r") 34 | for line in input: 35 | tokens = line.strip().split("\t") 36 | posConverter[tokens[0]] = tokens[1] 37 | input.close() 38 | 39 | input = open(sys.argv[1], "r") 40 | output = open(sys.argv[2], "w") 41 | 42 | prevBlank = True 43 | wordCount = 0 44 | for line in input: 45 | if line.strip() != "": 46 | tokens = line.strip().split("\t") 47 | if wordCount == 0: 48 | output.write("# " + tokens[5] + "\n") 49 | 50 | wordCount += 1 51 | word = tokens[2] 52 | pos = tokens[4] 53 | 54 | newPOS, originalPOS = convertPOS(pos, posConverter) 55 | 56 | if len(newPOS) == 0: 57 | newPOS = "_" 58 | originalPOS = "_" 59 | 60 | output.write(str(wordCount) + "\t" + word + "\t_\t" + newPOS + "\t" + originalPOS + "\t_\t_\t_\t_\t_\n") 61 | prevBlank = False 62 | else: 63 | wordCount = 0 64 | if not prevBlank: 65 | output.write("\n") 66 | prevBlank = True 67 | 68 | input.close() 69 | output.close() 70 | 71 | main() 72 | -------------------------------------------------------------------------------- /preprocessing_2.0/MaltParser_scripts/convertToParsingFile.py: -------------------------------------------------------------------------------- 1 | # script to convert from the CoNLL output back to my createSetFiles output 2 | import sys 3 | 4 | def printDep(dep, output, wordDict): 5 | relation = dep[0] 6 | relationIndex = dep[1] 7 | wordIndex = dep[2] 8 | 9 | output.write(relation + "|||" + wordDict[relationIndex] + "|||" + relationIndex + "|||" + wordDict[wordIndex] + "|||" + wordIndex + "\n") 10 | 11 | def main(): 12 | if len(sys.argv) != 3: 13 | print "Expect input file, output file." 
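# Illustrative invocation (as used in processChinese.sh): python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Chinese.conll.tmp.output createSetFiles/setFile.noEntities.tmp.Chn.parsing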
14 | sys.exit() 15 | 16 | input = open(sys.argv[1], "r") 17 | output = open(sys.argv[2], "w") 18 | 19 | wordDict = dict() 20 | wordDict["0"] = "ROOT" 21 | depList = [] 22 | for line in input: 23 | if line.strip() == "": 24 | # print out dependencies if needed 25 | if len(wordDict) != 0: 26 | for dep in depList: 27 | printDep(dep, output, wordDict) 28 | wordDict = dict() 29 | wordDict["0"] = "ROOT" 30 | depList = [] 31 | output.write("\n") 32 | elif not line.startswith("#"): 33 | tokens = line.strip().split("\t") 34 | wordIndex = tokens[0] 35 | word = tokens[1] 36 | relation = tokens[7] 37 | relationIndex = tokens[6] 38 | 39 | wordDict[wordIndex] = word 40 | 41 | depList.append( (relation, relationIndex, wordIndex) ) 42 | 43 | if len(wordDict) != 0: 44 | for dep in depList: 45 | printDep(dep, output, wordDict) 46 | wordDict = dict() 47 | depList = [] 48 | output.write("\n") 49 | 50 | input.close() 51 | output.close() 52 | 53 | main() 54 | -------------------------------------------------------------------------------- /preprocessing_2.0/createSetFiles/writeDataFromFiles.py: -------------------------------------------------------------------------------- 1 | # reads each line, write the data from those files to a single file 2 | 3 | import sys 4 | 5 | def main(): 6 | if len(sys.argv) != 3: 7 | print "Expect list of files, output file." 8 | sys.exit() 9 | 10 | files = [] 11 | input = open(sys.argv[1], "r") 12 | for line in input: 13 | files.append(line.strip()) 14 | input.close() 15 | 16 | output = open(sys.argv[2], "w") 17 | parsingOutput = open(sys.argv[2] + ".parsing", "w") 18 | for filename in files: 19 | tempIndex = filename.find(".mergedAnnotations") 20 | tempName = filename[:tempIndex] 21 | parsingFilename = tempName + ".parsingAnnotations" 22 | 23 | input = open(filename, "r") 24 | for line in input: 25 | output.write(line) 26 | input.close() 27 | 28 | try: 29 | input = open(parsingFilename, "r") 30 | for line in input: 31 | parsingOutput.write(line) 32 | input.close() 33 | except IOError: 34 | print "Could not open file: " + parsingFilename 35 | print "Continuing..."
36 | output.close() 37 | parsingOutput.close() 38 | 39 | main() 40 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/addEntitiesToText.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import xml.etree.ElementTree as ET 3 | import string 4 | from xml.sax.saxutils import escape 5 | 6 | # set to maintain unique coref labels across all entities 7 | corefSet = set() 8 | 9 | def processEvents(root, triggerDict, argDict): 10 | if root.tag == "event": 11 | eventType = root.attrib["TYPE"] 12 | eventSubtype = root.attrib["SUBTYPE"] 13 | 14 | for child in root: 15 | if child.tag == "event_mention": 16 | processEventMentions(child, triggerDict, argDict, eventType, eventSubtype) 17 | 18 | else: 19 | for child in root: 20 | processEvents(child, triggerDict, argDict) 21 | 22 | def processEventMentions(root, triggerDict, argDict, eventType, eventSubtype): 23 | anchorText = "" 24 | anchorIndex = -1 25 | for child in root: 26 | anchorText, anchorIndex = processEvent_Helper_Anchor(child, triggerDict, argDict, eventType, eventSubtype) 27 | if anchorText != "": 28 | break 29 | 30 | for child in root: 31 | processEvent_Helper_Arg(child, triggerDict, argDict, eventType, eventSubtype, anchorText, anchorIndex) 32 | 33 | def processEvent_Helper_Anchor(root, triggerDict, argDict, eventType, eventSubtype): 34 | if root.tag == "anchor": 35 | for child in root: 36 | return processEvent_Anchor(child, triggerDict, eventType, eventSubtype) 37 | else: 38 | returnStr = "" 39 | returnNum = -1 40 | for child in root: 41 | returnStr, returnNum = processEvent_Helper_Anchor(child, triggerDict, argDict, eventType, eventSubtype) 42 | if returnStr != "": 43 | break 44 | 45 | return returnStr, returnNum 46 | 47 | def processEvent_Helper_Arg(root, triggerDict, argDict, eventType, eventSubtype, eventText, eventIndex): 48 | if root.tag == "event_mention_argument": 49 | role = root.attrib["ROLE"] 50 | for child in root: 51 | processEvent_Argument(child, argDict, role, eventText, eventIndex) 52 | else: 53 | for child in root: 54 | processEvent_Helper_Arg(child, triggerDict, argDict, eventType, eventSubtype, eventText, eventIndex) 55 | 56 | def processEvent_Argument(root, argDict, role, eventText, eventIndex): 57 | if root.tag == "charseq": 58 | start = int(root.attrib["START"]) 59 | end = int(root.attrib["END"]) 60 | text = escape(root.text) 61 | 62 | # write the characters to the dict 63 | index = start 64 | 65 | while index <= end: 66 | if index not in argDict: 67 | argDict[index] = [] 68 | if index == start: 69 | argDict[index].append( ("B", role, text, eventText, eventIndex) ) 70 | else: 71 | argDict[index].append( ("I", role, text, eventText, eventIndex) ) 72 | index += 1 73 | else: 74 | for child in root: 75 | processEvent_Argument(child, argDict, role, eventText, eventIndex) 76 | 77 | def processEvent_Anchor(root, triggerDict, eventType, eventSubtype): 78 | if root.tag == "charseq": 79 | start = int(root.attrib["START"]) 80 | end = int(root.attrib["END"]) 81 | text = escape(root.text) 82 | 83 | # write the characters to the dict 84 | index = start 85 | 86 | while index <= end: 87 | triggerDict[index] = (eventType, eventSubtype, text) 88 | index += 1 89 | return text, start 90 | else: 91 | for child in root: 92 | return processEvent_Anchor(child, triggerDict, eventType, eventSubtype) 93 | 94 | def processExtent(root): 95 | for child in root: 96 | if child.tag == "charseq": 97 | start = 
int(child.attrib["START"]) 98 | end = int(child.attrib["END"]) 99 | text = escape(child.text) 100 | 101 | return start, end, text 102 | raise RuntimeError("Improper XML detected.") 103 | 104 | def cleanWhitespace(text): 105 | clean = "" 106 | for char in text: 107 | if char in string.whitespace: 108 | clean += " " 109 | else: 110 | clean += char 111 | return clean 112 | 113 | def processHead(root): 114 | for child in root: 115 | if child.tag == "charseq": 116 | return child.text 117 | raise RuntimeError("Improper XML detected.") 118 | 119 | 120 | def processEntities(root, labelDict): 121 | if root.tag == "entity": 122 | entityType = root.attrib["TYPE"] 123 | entitySubtype = root.attrib["SUBTYPE"] 124 | 125 | corefLabel = len(corefSet) 126 | corefSet.add(corefLabel) 127 | 128 | # process and write each mention to the dict 129 | for mention in root: 130 | if mention.tag == "entity_mention": 131 | start = -1 132 | end = -1 133 | head = "" 134 | text = "" 135 | 136 | for child in mention: 137 | if child.tag == "head": 138 | head = cleanWhitespace(processHead(child)) 139 | elif child.tag == "extent": 140 | start, end, text = processExtent(child) 141 | 142 | 143 | # write the characters to the dict 144 | if start < 0: 145 | raise ValueError('Did not read indexes for entity') 146 | index = start 147 | 148 | while index <= end: 149 | if index not in labelDict: 150 | labelDict[index] = [] 151 | 152 | if index == start: 153 | labelDict[index].append( ("B", entityType, entitySubtype, text, head, corefLabel) ) 154 | else: 155 | labelDict[index].append( ("I", entityType, entitySubtype, text, head, corefLabel) ) 156 | index += 1 157 | 158 | elif root.tag == "timex2": 159 | corefLabel = len(corefSet) 160 | corefSet.add(corefLabel) 161 | 162 | for child in root: 163 | processTime(child, labelDict, corefLabel) 164 | elif root.tag == "value": 165 | corefLabel = len(corefSet) 166 | corefSet.add(corefLabel) 167 | 168 | valueType = root.attrib["TYPE"] 169 | if "SUBTYPE" in root.attrib: 170 | valueSubtype = root.attrib["SUBTYPE"] 171 | else: 172 | valueSubtype = root.attrib["TYPE"] 173 | 174 | for child in root: 175 | processValue(child, labelDict, corefLabel, valueType, valueSubtype) 176 | else: 177 | for child in root: 178 | processEntities(child, labelDict) 179 | 180 | def processEntity_Helper(root, labelDict, entityType, entitySubtype): 181 | if root.tag == "extent": 182 | for child in root: 183 | processExtent(child, labelDict, entityType, entitySubtype) 184 | else: 185 | for child in root: 186 | processEntity_Helper(child, labelDict, entityType, entitySubtype) 187 | 188 | def processTime(root, labelDict, corefLabel): 189 | if root.tag == "charseq": 190 | start = int(root.attrib["START"]) 191 | end = int(root.attrib["END"]) 192 | text = escape(root.text) 193 | 194 | # write the characters to the dict 195 | index = start 196 | 197 | while index <= end: 198 | if index not in labelDict: 199 | labelDict[index] = [] 200 | 201 | # NOTE: timex values don't have heads -- just use the text again 202 | if index == start: 203 | labelDict[index].append( ("B", "TIME", "TIME", text, cleanWhitespace(text), corefLabel) ) 204 | else: 205 | labelDict[index].append( ("I", "TIME", "TIME", text, cleanWhitespace(text), corefLabel) ) 206 | index += 1 207 | else: 208 | for child in root: 209 | processTime(child, labelDict, corefLabel) 210 | 211 | def processValue(root, labelDict, corefLabel, valueType, valueSubtype): 212 | if root.tag == "charseq": 213 | start = int(root.attrib["START"]) 214 | end = int(root.attrib["END"]) 215 | 
text = escape(root.text) 216 | 217 | # write the characters to the dict 218 | index = start 219 | 220 | while index <= end: 221 | if index not in labelDict: 222 | labelDict[index] = [] 223 | 224 | # NOTE: timex values don't have heads -- just use the text again 225 | if index == start: 226 | labelDict[index].append( ("B", valueType, valueSubtype, text, cleanWhitespace(text), corefLabel) ) 227 | else: 228 | labelDict[index].append( ("I", valueType, valueSubtype, text, cleanWhitespace(text), corefLabel) ) 229 | index += 1 230 | else: 231 | for child in root: 232 | processValue(child, labelDict, corefLabel, valueType, valueSubtype) 233 | 234 | 235 | def main(): 236 | if len(sys.argv) != 4: 237 | print "Expect stanford annotations (XML), coreNLP features (extracted), output file." 238 | sys.exit() 239 | 240 | print "Starting document " + sys.argv[3] 241 | 242 | # read the annotation XML 243 | labelDict = dict() # dict from offset -> (B/I, labelType, labelSubtype, fullName) 244 | triggerDict = dict() # dict from offset -> (triggerType, triggerSubtype) 245 | argDict = dict() # dict from offset -> (argument role) 246 | 247 | corefCount = 0 248 | 249 | # read the Stanford mentions 250 | input = open(sys.argv[1], "r") 251 | wordDict = dict() # dict from word count -> (B/I, labelType, labelSubtype, fullName, head) 252 | for line in input: 253 | tokens = line.strip().split('\t') 254 | wordCount = int(tokens[0]) 255 | entityName = tokens[1] 256 | entityType = tokens[2] 257 | 258 | numWords = entityName.count(" ") + 1 259 | lastWord = entityName 260 | if numWords > 1: 261 | start = entityName.rfind(" ") 262 | lastWord = entityName[start+1:] 263 | corefCount += 1 264 | 265 | for index in range(numWords): 266 | if index == 0: 267 | if wordCount not in wordDict: 268 | wordDict[wordCount] = [] 269 | wordDict[wordCount].append( ("B", entityType, entityType, entityName, lastWord, str(corefCount)) ) 270 | else: 271 | if wordCount + index not in wordDict: 272 | wordDict[wordCount + index] = [] 273 | wordDict[wordCount + index].append( ("I", entityType, entityType, entityName, lastWord, str(corefCount)) ) 274 | input.close() 275 | 276 | 277 | input = open(sys.argv[2], "r") 278 | output = open(sys.argv[3], "w") 279 | 280 | 281 | lineCounter = 0 282 | wordCount = 0 283 | for line in input: 284 | lineCounter += 1 285 | 286 | if line.startswith("BEGIN_SENTENCE"): 287 | output.write("\n") 288 | else: 289 | clean = line.strip() 290 | tokens = clean.split("\t") 291 | startOffset = int(tokens[0]) 292 | endOffset = int(tokens[1]) 293 | curWord = tokens[2] 294 | 295 | labelDict[startOffset] = [] 296 | if wordCount in wordDict: 297 | labelDict[startOffset] = wordDict[wordCount] 298 | wordCount += 1 299 | 300 | entityInfo = "EntitesGold[" 301 | 302 | if startOffset in labelDict: 303 | for curTuple in labelDict[startOffset]: 304 | begin = curTuple[0] 305 | entType = curTuple[1] 306 | entSubtype = curTuple[2] 307 | head = curTuple[4]#.encode('utf-8') 308 | 309 | coref = "coref_" + str(curTuple[5]) 310 | 311 | entityInfo += (begin + "|||" + entType + "|||" + entSubtype + "|||" + head + "|||" + coref + ";;;") 312 | 313 | # for debugging only 314 | tupleFullName = curTuple[3]#.encode('utf-8') 315 | 316 | # below: good for verifying alignment, but final version should not contain.
317 | #output.write(clean + "\t" + begin + "-" + entType + "\t" + begin + "-" + entSubtype + "\t" + tupleFullName + "\n") 318 | entityInfo += "]" 319 | 320 | eventType = "not_trigger" 321 | eventSubtype = "not_trigger" 322 | 323 | if startOffset in triggerDict: 324 | curTuple = triggerDict[startOffset] 325 | eventType = curTuple[0] 326 | eventSubtype = curTuple[1] 327 | 328 | argInfo = "ArgsGold[" 329 | if startOffset in argDict: 330 | for curTuple in argDict[startOffset]: 331 | begin = curTuple[0] 332 | argRole = curTuple[1] 333 | eventText = curTuple[3] 334 | eventIndex = curTuple[4] 335 | 336 | eventText = replaceWhiteSpace(eventText) 337 | 338 | argInfo += (begin + "|||" + argRole + "|||" + eventText.encode('utf-8') + "|||" + str(eventIndex) + ";;;") 339 | argInfo += "]" 340 | 341 | output.write(clean + "\t" + entityInfo + "\t" + eventType + "\t" + eventSubtype + "\t" + argInfo + "\n") 342 | output.write("\n") 343 | input.close() 344 | output.close() 345 | 346 | print "Finished processing document! Written to " + sys.argv[3] 347 | 348 | def replaceWhiteSpace(text): 349 | newStr = "" 350 | for character in text: 351 | if character in string.whitespace: 352 | newStr += " " 353 | else: 354 | newStr += character 355 | return newStr 356 | 357 | 358 | main() 359 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/addEntitiesToText.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # clean the directory first 4 | rm tmp_formatted/* 5 | 6 | while read -r line 7 | do 8 | name=$line 9 | python ../../readCoreNLP/read_CoreNLP_XML.py ${2}${name}.out tmp.txt 10 | python addEntitiesToText.py unify/out_processed/${name} tmp.txt tmp_formatted/${name}.mergedAnnotations 11 | python ../../readCoreNLP/write_parsing_from_CoreNLP.py ${2}${name}.out tmp_formatted/${name}.parsingAnnotations 12 | done < ${1} 13 | rm tmp.txt 14 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/unify/processEntities.py: -------------------------------------------------------------------------------- 1 | # script to change from 1-line per word, to 1-line per entity 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Expect list of files, input directory, output directory." 
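# Illustrative invocation (as used in runEntities.sh): python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/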
7 | sys.exit() 8 | 9 | 10 | filenames = [] 11 | input = open(sys.argv[1], "r") 12 | for line in input: 13 | filenames.append(line.strip()) 14 | input.close() 15 | 16 | inDir = sys.argv[2] 17 | outDir = sys.argv[3] 18 | 19 | for filename in filenames: 20 | print "Processing: " + filename 21 | input = open(inDir + filename, "r") 22 | output = open(outDir + filename, "w") 23 | 24 | lines = input.readlines() 25 | 26 | for index in range(len(lines)): 27 | curLine = lines[index] 28 | 29 | tokens = curLine.strip().split("\t") 30 | word = tokens[0] 31 | label = tokens[1] 32 | 33 | 34 | labels = [label] 35 | if ";" in label: 36 | labels = label.split(";") 37 | 38 | for label in labels: 39 | if label.startswith("B"): 40 | # find how long this goes for 41 | entityName = word 42 | 43 | suffix = label[1:] 44 | altIndex = index + 1 45 | while altIndex < len(lines): 46 | altLine = lines[altIndex] 47 | altTokens = altLine.strip().split("\t") 48 | altWord = altTokens[0] 49 | altLabel = altTokens[1] 50 | 51 | altLabels = [altLabel] 52 | 53 | if ";" in altLabel: 54 | altLabels = altLabel.split(";") 55 | found = False 56 | for altLabel in altLabels: 57 | if altLabel.endswith(suffix): 58 | entityName += " " + altWord 59 | found = True 60 | continue 61 | 62 | if not found: 63 | break 64 | else: 65 | altIndex += 1 66 | 67 | output.write(str(index) + "\t" + entityName + "\t" + suffix[1:] + "\n") 68 | input.close() 69 | output.close() 70 | 71 | main() 72 | 73 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/code/unify/unifyEntities.py: -------------------------------------------------------------------------------- 1 | # script to take the (1-file-per-class) NER output and unify into a single file PER DOCUMENT 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 4: 6 | print "Expect createSetFiles file, list of files, output directory." 
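# Illustrative invocation (as used in runEntities.sh): python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp code/unify/in.txt code/unify/out/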
7 | sys.exit() 8 | 9 | # for each line read, record the word, a set of entity labels, and the documentID 10 | words = [] 11 | labels = [] 12 | documents = [] 13 | 14 | docSet = set() 15 | 16 | files = [] 17 | input = open(sys.argv[2], "r") 18 | for line in input: 19 | files.append(line.strip()) 20 | input.close() 21 | 22 | input = open(sys.argv[1], "r") 23 | for line in input: 24 | if line.strip() != "": 25 | tokens = line.strip().split("\t") 26 | word = tokens[2] 27 | docid = tokens[5] 28 | 29 | words.append(word) 30 | labels.append(set()) 31 | documents.append(docid) 32 | 33 | docSet.add(docid) 34 | input.close() 35 | 36 | for filename in files: 37 | input = open(filename, "r") 38 | count = 0 39 | for line in input: 40 | if line.strip() != "": 41 | tokens = line.strip().split("\t") 42 | word = tokens[0] 43 | label = tokens[2] 44 | 45 | if label != "O": 46 | labels[count].add(label) 47 | 48 | count += 1 49 | input.close() 50 | 51 | outPrefix = sys.argv[3] 52 | 53 | prevDoc = documents[0] 54 | prevSet = set() 55 | output = open(outPrefix + documents[0], "w") 56 | for index in range(len(words)): 57 | curWord = words[index] 58 | curDoc = documents[index] 59 | labelSet = labels[index] 60 | 61 | if curDoc != prevDoc: 62 | output.close() 63 | output = open(outPrefix + curDoc, "w") 64 | 65 | output.write(curWord + "\t") 66 | if len(labelSet) == 0: 67 | output.write("EMPTY\n") 68 | else: 69 | first = True 70 | for label in labelSet: 71 | if first: 72 | if label not in prevSet: 73 | output.write("B-" + label) 74 | else: 75 | output.write("I-" + label) 76 | first = False 77 | else: 78 | if label not in prevSet: 79 | output.write(";B-" + label) 80 | else: 81 | output.write(";I-" + label) 82 | output.write("\n") 83 | 84 | prevDoc = curDoc 85 | prevSet = labelSet 86 | output.close() 87 | 88 | 89 | main() 90 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/convertTestSet.py: -------------------------------------------------------------------------------- 1 | # script to convert from the "createSetFiles" output to the training format for Stanford NER 2 | import sys 3 | import string 4 | 5 | def main(): 6 | if len(sys.argv) != 4: 7 | print "Expect input file, parsing file, output file." 
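# Illustrative invocation (as used in processEnglish.sh, run from entityExtraction/): python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp ../createSetFiles/setFile.noEntities.tmp.parsing entityTestSet.tmp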
8 | sys.exit() 9 | 10 | labelDict = dict() 11 | labelDict["WEA"] = "weapon" 12 | labelDict["Sentence"] = "sentence" 13 | labelDict["Crime"] = "crime" 14 | labelDict["Job-Title"] = "title" 15 | labelDict["VEH"] = "vehicle" 16 | labelDict["TIME"] = "time" 17 | labelDict["Numeric"] = "money" 18 | 19 | sentences = [] 20 | sentencesRelation = [] 21 | governorDict = dict() # wordIndex -> governor word 22 | relationDict = dict() # wordIndex -> dependency relationship 23 | wordCount = 0 24 | input = open(sys.argv[2], "r") 25 | for line in input: 26 | if line.strip() != "": 27 | wordCount += 1 28 | tokens = line.strip().split("|||") 29 | governor = tokens[1] 30 | start = line.strip().rfind("|") + 1 31 | wordIndex = int(line.strip()[start:]) 32 | 33 | governorDict[wordIndex] = governor 34 | 35 | relation = tokens[0] 36 | relationDict[wordIndex] = relation 37 | else: 38 | sentences.append(governorDict) 39 | sentencesRelation.append(relationDict) 40 | wordCount = 0 41 | governorDict = dict() 42 | relationDict = dict() 43 | if len(governorDict) != 0: 44 | sentences.append(governorDict) 45 | sentencesRelation.append(relationDict) 46 | input.close() 47 | 48 | input = open(sys.argv[1], "r") 49 | prefix = sys.argv[3] 50 | 51 | labelSet = set() 52 | 53 | # first, scan the text, figure out how many labels 54 | for line in input: 55 | clean = line.strip() 56 | if clean != "": 57 | tokens = clean.split("\t") 58 | entity = tokens[6] 59 | 60 | if entity != "EntitesGold[]": 61 | start = entity.find("[") + 1 62 | end = entity.find(";;;") 63 | substring = entity[start:end] 64 | 65 | entTokens = substring.split(";;;") 66 | for tok in entTokens: 67 | tmpLabel = substring.split("|||")[1] 68 | if tmpLabel in labelDict: 69 | tmpLabel = labelDict[tmpLabel] 70 | #labelSet.add(substring.split("|||")[1]) 71 | labelSet.add(tmpLabel) 72 | 73 | input.close() 74 | 75 | output = open(sys.argv[3], "w") 76 | 77 | sentenceCount = 0 78 | wordCount = 0 79 | 80 | input = open(sys.argv[1], "r") 81 | prevEmpty = True 82 | for line in input: 83 | clean = line.strip() 84 | if clean != "": 85 | prevEmpty = False 86 | 87 | wordCount += 1 88 | tokens = clean.split("\t") 89 | word = tokens[2] 90 | entity = tokens[6] 91 | 92 | governor = "" 93 | relation = "" 94 | if wordCount in sentences[sentenceCount]: 95 | governor = sentences[sentenceCount][wordCount] 96 | relation = sentencesRelation[sentenceCount][wordCount] 97 | if governor.strip() == "": 98 | governor = "" 99 | if relation.strip() == "": 100 | relation = "" 101 | 102 | output.write(removeWhitespace(word) + "\tO\t" + removeWhitespace(governor) + "_" + removeWhitespace(relation) + "\n") 103 | elif not prevEmpty: 104 | output.write("\n") 105 | wordCount = 0 106 | sentenceCount += 1 107 | prevEmpty = True 108 | 109 | input.close() 110 | output.close() 111 | 112 | 113 | def removeWhitespace(text): 114 | newText = "" 115 | for character in text: 116 | if character not in string.whitespace: 117 | newText += character 118 | else: 119 | newText += "_" 120 | 121 | return newText 122 | 123 | main() 124 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/runEntities.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STANFORD_NER=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 3 | 4 | for i in Contact-Info crime FAC GPE title LOC money ORG PER sentence time weapon vehicle age commodity 5 | do 6 | java -mx16g -cp
"$STANFORD_NER/*:$STANFORD_NER/lib/*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /home/andrew/DEFT_code_testing/dependencies/models/entities/ACE_ERE-English.ner-model.${i}.dependency.full.ser.gz -testfile $1 > code/unify/in/${1}_${i} 7 | done 8 | 9 | ls code/unify/in/* > code/unify/in.txt 10 | python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp code/unify/in.txt code/unify/out/ 11 | ls code/unify/out/ > code/unify/out.txt 12 | python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/ 13 | 14 | cd code/ 15 | ./addEntitiesToText.sh ../../tmp.list ../../CoreNLP_scripts/tmp_Eng/ 16 | cd ../ 17 | 18 | rm code/unify/in.txt 19 | rm code/unify/out.txt 20 | rm -r code/unify/in/ 21 | rm -r code/unify/out/ 22 | rm -r code/unify/out_processed/ 23 | mkdir code/unify/in/ 24 | mkdir code/unify/out/ 25 | mkdir code/unify/out_processed/ 26 | 27 | 28 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/runEntities_Chinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STANFORD_NER=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 3 | 4 | for i in Contact-Info crime FAC GPE title LOC money ORG PER sentence time weapon vehicle age commodity 5 | do 6 | java -mx16g -cp "$STANFORD_NER/*:$STANFORD_NER/lib/*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /home/andrew/DEFT_code_testing/dependencies/models/entities/ACE_ERE-Chinese.ner-model.${i}.dependency.full.ser.gz -testfile $1 > code/unify/in/${1}_${i} 7 | done 8 | 9 | ls code/unify/in/* > code/unify/in.txt 10 | python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp.Chn code/unify/in.txt code/unify/out/ 11 | ls code/unify/out/ > code/unify/out.txt 12 | python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/ 13 | 14 | cd code/ 15 | ./addEntitiesToText.sh ../../tmp.chinese.list ../../CoreNLP_scripts/tmp_Chn/ 16 | cd ../ 17 | 18 | rm code/unify/in.txt 19 | rm code/unify/out.txt 20 | rm -r code/unify/in/ 21 | rm -r code/unify/out/ 22 | rm -r code/unify/out_processed/ 23 | mkdir code/unify/in/ 24 | mkdir code/unify/out/ 25 | mkdir code/unify/out_processed/ 26 | -------------------------------------------------------------------------------- /preprocessing_2.0/entityExtraction/runEntities_Spanish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | STANFORD_NER=/home/andrew/NLP_tools/StanfordNER/stanford-ner-2016-10-31 3 | 4 | for i in crime FAC GPE title LOC ORG PER sentence time weapon vehicle age commodity 5 | do 6 | java -mx16g -cp "$STANFORD_NER/*:$STANFORD_NER/lib/*" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier /home/andrew/DEFT_code_testing/dependencies/models/entities/RichERE-Spanish.ner-model.${i}.dependency.full.ser.gz -testfile $1 > code/unify/in/${1}_${i} 7 | done 8 | 9 | ls code/unify/in/* > code/unify/in.txt 10 | python code/unify/unifyEntities.py ../createSetFiles/setFile.noEntities.tmp.Span code/unify/in.txt code/unify/out/ 11 | ls code/unify/out/ > code/unify/out.txt 12 | python code/unify/processEntities.py code/unify/out.txt code/unify/out/ code/unify/out_processed/ 13 | 14 | cd code/ 15 | ./addEntitiesToText.sh ../../tmp.spanish.list ../../CoreNLP_scripts/tmp_Span/ 16 | cd ../ 17 | 18 | rm code/unify/in.txt 19 | rm code/unify/out.txt 20 | rm -r code/unify/in/ 21 | rm -r code/unify/out/ 22 | rm -r 
code/unify/out_processed/ 23 | mkdir code/unify/in/ 24 | mkdir code/unify/out/ 25 | mkdir code/unify/out_processed/ 26 | -------------------------------------------------------------------------------- /preprocessing_2.0/processChinese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters. Expect list of files to read (with absolute filepaths)." 5 | exit 1 6 | fi 7 | 8 | # store processed documents in tmp/ 9 | rm CoreNLP_scripts/tmp_Chn/* 10 | ./CoreNLP_scripts/runCoreNLP_Chn.sh $1 11 | 12 | # readCoreNLP 13 | python readCoreNLP/getRootnames.py $1 tmp.chinese.list 14 | cd readCoreNLP/ 15 | rm tmp_formatted_Chn/* 16 | ./convertCoreNLPFormat.sh ../tmp.chinese.list ../CoreNLP_scripts/tmp_Chn/ tmp_formatted_Chn/ 17 | cd ../ 18 | 19 | # createSetFiles 20 | find readCoreNLP/tmp_formatted_Chn/ -name "*.mergedAnnotations" > mergedFilenames.tmp.Chn 21 | rm createSetFiles/*.tmp 22 | rm createSetFiles/*.parsing 23 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.Chn createSetFiles/setFile.noEntities.tmp.Chn 24 | rm mergedFilenames.tmp.Chn 25 | 26 | # add dependency parsing via MaltParser 27 | python MaltParser_scripts/convertToCoNLL.py createSetFiles/setFile.noEntities.tmp.Chn MaltParser_scripts/Chinese.conll.tmp /home/andrew/DEFT_code_testing/dependencies/pos/zh-ctb6.map 28 | cp /home/andrew/DEFT_code_testing/dependencies/models/maltparser/UD.Chinese.model.mco UD.Chinese.model.mco.tmp 29 | mv UD.Chinese.model.mco.tmp UD.Chinese.model.mco 30 | java -jar /home/andrew/NLP_tools/MaltParser/maltparser-1.9.0/maltparser-1.9.0.jar -c UD.Chinese.model.mco -i MaltParser_scripts/Chinese.conll.tmp -o MaltParser_scripts/Chinese.conll.tmp.output -m parse 31 | rm UD.Chinese.model.mco 32 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Chinese.conll.tmp.output createSetFiles/setFile.noEntities.tmp.Chn.parsing 33 | 34 | # entity extraction 35 | cd entityExtraction/ 36 | python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp.Chn ../createSetFiles/setFile.noEntities.tmp.Chn.parsing entityTestSet.tmp.chn 37 | ./runEntities_Chinese.sh entityTestSet.tmp.chn 38 | cd ../ 39 | find entityExtraction/code/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp.chn 40 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.chn createSetFiles/setFile.withEntities.tmp.Chn 41 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Chinese.conll.tmp.output createSetFiles/setFile.withEntities.tmp.Chn.parsing 42 | rm mergedFilenames.tmp.chn 43 | rm tmp.chinese.list 44 | 45 | # predictions 46 | cd ../all_predictions_4.0/ 47 | ./runAll.sh ../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Chn 48 | cd ../preprocessing_2.0/ 49 | 50 | # outputFormatting 51 | python ../outputFormatting/writeDocMap.py $1 52 | cd ../outputFormatting/ 53 | ./Chinese_run.sh 54 | cd ../preprocessing_2.0/ 55 | 56 | 57 | # clear the tmp folders 58 | rm CoreNLP_scripts/tmp_Chn/* 59 | rm createSetFiles/*.Chn 60 | rm createSetFiles/*.parsing 61 | rm entityExtraction/*.chn 62 | rm entityExtraction/code/tmp_formatted/* 63 | rm readCoreNLP/tmp_formatted_Chn/* 64 | rm MaltParser_scripts/Chinese.conll.tmp.output 65 | rm MaltParser_scripts/Chinese.conll.tmp 66 | rm *.tmp 67 | 68 | rm ../all_predictions_4.0/output.* 69 | rm ../all_predictions_4.0/*.easyRead 70 | rm ../all_predictions_4.0/*.entityCoref 71 | rm 
../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions 72 | rm ../all_predictions_4.0/currentPredictionsForArgs/testSet.predictions 73 | rm ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 74 | rm ../all_predictions_4.0/code/test.* 75 | rm ../outputFormatting/formatTriggers/format_andrew_triggers/andrew.triggers.out 76 | -------------------------------------------------------------------------------- /preprocessing_2.0/processEnglish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters. Expect list of files to read (with absolute filepaths)." 5 | exit 1 6 | fi 7 | 8 | # store processed documents in tmp/ 9 | rm CoreNLP_scripts/tmp_Eng/* 10 | ./CoreNLP_scripts/runCoreNLP_Eng.sh $1 11 | 12 | # readCoreNLP 13 | python readCoreNLP/getRootnames.py $1 tmp.list 14 | cd readCoreNLP/ 15 | rm tmp_formatted/* 16 | ./convertCoreNLPFormat.sh ../tmp.list ../CoreNLP_scripts/tmp_Eng/ tmp_formatted/ 17 | cd ../ 18 | 19 | # createSetFiles 20 | find readCoreNLP/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp 21 | rm createSetFiles/*.tmp 22 | rm createSetFiles/*.parsing 23 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp createSetFiles/setFile.noEntities.tmp 24 | rm mergedFilenames.tmp 25 | 26 | # entity extraction 27 | cd entityExtraction/ 28 | python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp ../createSetFiles/setFile.noEntities.tmp.parsing entityTestSet.tmp 29 | ./runEntities.sh entityTestSet.tmp 30 | cd ../ 31 | find entityExtraction/code/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp 32 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp createSetFiles/setFile.withEntities.tmp 33 | rm mergedFilenames.tmp 34 | rm tmp.list 35 | 36 | # predictions 37 | cd ../all_predictions_4.0/ 38 | ./runAll.sh ../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp 39 | cd ../preprocessing_2.0/ 40 | 41 | # outputFormatting 42 | python ../outputFormatting/writeDocMap.py $1 43 | cd ../outputFormatting/ 44 | ./English_run.sh 45 | cd ../preprocessing_2.0/ 46 | 47 | # clear the tmp folders 48 | 49 | #rm CoreNLP_scripts/tmp_Eng/* 50 | #rm createSetFiles/*.tmp 51 | #rm createSetFiles/*.parsing 52 | #rm entityExtraction/*.tmp 53 | #rm entityExtraction/code/tmp_formatted/* 54 | #rm readCoreNLP/tmp_formatted/* 55 | #rm *.tmp 56 | 57 | #rm ../all_predictions_4.0/output.* 58 | #rm ../all_predictions_4.0/*.easyRead 59 | #rm ../all_predictions_4.0/*.entityCoref 60 | #rm ../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions 61 | #rm ../all_predictions_4.0/currentPredictionsForArgs/testSet.predictions 62 | #rm ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 63 | #rm ../all_predictions_4.0/code/test.* 64 | #rm ../outputFormatting/formatTriggers/format_andrew_triggers/andrew.triggers.out 65 | -------------------------------------------------------------------------------- /preprocessing_2.0/processSpanish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ "$#" -ne 1 ]; then 4 | echo "Illegal number of parameters. Expect list of files to read (with absolute filepaths)." 
5 | exit 1 6 | fi 7 | 8 | # store processed documents in tmp/ 9 | rm CoreNLP_scripts/tmp_Span/* 10 | ./CoreNLP_scripts/runCoreNLP_Span.sh $1 11 | 12 | # readCoreNLP 13 | python readCoreNLP/getRootnames.py $1 tmp.spanish.list 14 | cd readCoreNLP/ 15 | rm tmp_formatted_Span/* 16 | ./convertCoreNLPFormat.sh ../tmp.spanish.list ../CoreNLP_scripts/tmp_Span/ tmp_formatted_Span/ 17 | cd ../ 18 | 19 | # createSetFiles 20 | find readCoreNLP/tmp_formatted_Span/ -name "*.mergedAnnotations" > mergedFilenames.tmp.Span 21 | rm createSetFiles/*.tmp 22 | rm createSetFiles/*.parsing 23 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.Span createSetFiles/setFile.noEntities.tmp.Span 24 | rm mergedFilenames.tmp.Span 25 | 26 | # add dependency parsing via MaltParser 27 | python MaltParser_scripts/convertToCoNLL.py createSetFiles/setFile.noEntities.tmp.Span MaltParser_scripts/Spanish.conll.tmp /home/andrew/DEFT_code_testing/dependencies/pos/es-cast3lb.map 28 | cp /home/andrew/DEFT_code_testing/dependencies/models/maltparser/UD.Spanish.model.mco UD.Spanish.model.mco.tmp 29 | mv UD.Spanish.model.mco.tmp UD.Spanish.model.mco 30 | java -jar /home/andrew/NLP_tools/MaltParser/maltparser-1.9.0/maltparser-1.9.0.jar -c UD.Spanish.model.mco -i MaltParser_scripts/Spanish.conll.tmp -o MaltParser_scripts/Spanish.conll.tmp.output -m parse 31 | rm UD.Spanish.model.mco 32 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Spanish.conll.tmp.output createSetFiles/setFile.noEntities.tmp.Span.parsing 33 | 34 | # entity extraction 35 | cd entityExtraction/ 36 | python convertTestSet.py ../createSetFiles/setFile.noEntities.tmp.Span ../createSetFiles/setFile.noEntities.tmp.Span.parsing entityTestSet.tmp.span 37 | ./runEntities_Spanish.sh entityTestSet.tmp.span 38 | cd ../ 39 | find entityExtraction/code/tmp_formatted/ -name "*.mergedAnnotations" > mergedFilenames.tmp.span 40 | python createSetFiles/writeDataFromFiles.py mergedFilenames.tmp.span createSetFiles/setFile.withEntities.tmp.Span 41 | python MaltParser_scripts/convertToParsingFile.py MaltParser_scripts/Spanish.conll.tmp.output createSetFiles/setFile.withEntities.tmp.Span.parsing 42 | rm mergedFilenames.tmp.span 43 | rm tmp.spanish.list 44 | 45 | # predictions 46 | cd ../all_predictions_4.0/ 47 | ./runAll.sh ../preprocessing_2.0/createSetFiles/setFile.withEntities.tmp.Span 48 | cd ../preprocessing_2.0/ 49 | 50 | # outputFormatting 51 | python ../outputFormatting/writeDocMap.py $1 52 | cd ../outputFormatting/ 53 | ./Spanish_run.sh 54 | cd ../preprocessing_2.0/ 55 | 56 | 57 | # clear the tmp folder 58 | rm CoreNLP_scripts/tmp_Span/* 59 | rm createSetFiles/*.Span 60 | rm createSetFiles/*.parsing 61 | rm entityExtraction/*.span 62 | rm entityExtraction/code/tmp_formatted/* 63 | rm readCoreNLP/tmp_formatted_Span/* 64 | rm *.tmp 65 | 66 | rm MaltParser_scripts/Spanish.conll.tmp.output 67 | rm MaltParser_scripts/Spanish.conll.tmp 68 | 69 | 70 | rm ../all_predictions_4.0/output.* 71 | rm ../all_predictions_4.0/*.easyRead 72 | rm ../all_predictions_4.0/*.entityCoref 73 | rm ../all_predictions_4.0/currentPredictionsForTriggers/testSet.predictions 74 | rm ../all_predictions_4.0/currentPredictionsForArgs/testSet.predictions 75 | rm ../all_predictions_4.0/currentPredictionsForRealis/testSet.predictions 76 | rm ../all_predictions_4.0/code/test.* 77 | rm ../outputFormatting/formatTriggers/format_andrew_triggers/andrew.triggers.out 78 | 79 | -------------------------------------------------------------------------------- 
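Note: each of the process*.sh pipelines above takes a single argument, a text file listing the raw input documents by absolute path (one per line), and resolves its helper scripts and tmp/ directories relative to the current directory, so it is meant to be launched from inside preprocessing_2.0/. The sketch below is a hypothetical convenience wrapper, not part of this repository; the filename make_filelist.py and the language flag are assumptions made only for illustration.

# make_filelist.py -- hypothetical helper, not part of this repository.
# Builds a FILELIST of absolute paths (one per line) and dispatches to the
# matching process*.sh script; assumes it is run from inside preprocessing_2.0/.
import os
import subprocess
import sys

def write_filelist(input_dir, list_path):
    # the process*.sh scripts expect absolute filepaths, one per line
    with open(list_path, "w") as out:
        for name in sorted(os.listdir(input_dir)):
            path = os.path.abspath(os.path.join(input_dir, name))
            if os.path.isfile(path):
                out.write(path + "\n")

if __name__ == "__main__":
    # usage: python make_filelist.py RAW_TEXT_DIR FILELIST english|chinese|spanish
    if len(sys.argv) != 4:
        sys.exit("Expect raw text dir, output FILELIST, language.")
    write_filelist(sys.argv[1], sys.argv[2])
    scripts = {"english": "./processEnglish.sh",
               "chinese": "./processChinese.sh",
               "spanish": "./processSpanish.sh"}
    subprocess.call([scripts[sys.argv[3].lower()], os.path.abspath(sys.argv[2])])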
/preprocessing_2.0/readCoreNLP/convertCoreNLPFormat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import xml.etree.ElementTree as ET 3 | import string 4 | from xml.sax.saxutils import escape 5 | 6 | def cleanWhitespace(text): 7 | clean = "" 8 | for char in text: 9 | if char in string.whitespace: 10 | clean += " " 11 | else: 12 | clean += char 13 | return clean 14 | 15 | def main(): 16 | if len(sys.argv) != 3: 17 | print "Expect coreNLP features (extracted), output file." 18 | sys.exit() 19 | 20 | labelDict = dict() # dict from offset -> (B/I, labelType, labelSubtype, fullName) 21 | triggerDict = dict() # dict from offset -> (triggerType, triggerSubtype) 22 | argDict = dict() # dict from offset -> (argument role) 23 | 24 | entityDict = dict() # dict from mentionID -> (start, end, text) 25 | 26 | input = open(sys.argv[1], "r") 27 | output = open(sys.argv[2], "w") 28 | 29 | lineCounter = 0 30 | for line in input: 31 | lineCounter += 1 32 | 33 | if line.startswith("BEGIN_SENTENCE"): 34 | output.write("\n") 35 | else: 36 | clean = line.strip() 37 | tokens = clean.split("\t") 38 | startOffset = int(tokens[0]) 39 | endOffset = int(tokens[1]) 40 | curWord = tokens[2] 41 | 42 | entityInfo = "EntitesGold[" 43 | 44 | if startOffset in labelDict: 45 | for curTuple in labelDict[startOffset]: 46 | begin = curTuple[0] 47 | entType = curTuple[1] 48 | entSubtype = curTuple[2] 49 | head = curTuple[4].encode('utf-8') 50 | 51 | coref = "coref_" + str(curTuple[5]) 52 | 53 | entityInfo += (begin + "|||" + entType + "|||" + entSubtype + "|||" + head + "|||" + coref + ";;;") 54 | 55 | tupleFullName = curTuple[3].encode('utf-8') 56 | entityInfo += "]" 57 | 58 | eventType = "not_trigger" 59 | eventSubtype = "not_trigger" 60 | 61 | if startOffset in triggerDict: 62 | curTuple = triggerDict[startOffset] 63 | eventType = curTuple[0] 64 | eventSubtype = curTuple[1] 65 | 66 | argInfo = "ArgsGold[" 67 | if startOffset in argDict: 68 | for curTuple in argDict[startOffset]: 69 | begin = curTuple[0] 70 | argRole = curTuple[1] 71 | eventText = curTuple[3] 72 | eventIndex = curTuple[4] 73 | argRealis = curTuple[5] 74 | 75 | eventText = replaceWhiteSpace(eventText) 76 | 77 | argInfo += (begin + "|||" + argRole + "|||" + eventText.encode('utf-8') + "|||" + str(eventIndex) + "|||" + argRealis + ";;;") 78 | argInfo += "]" 79 | 80 | output.write(clean + "\t" + entityInfo + "\t" + eventType + "\t" + eventSubtype + "\t" + argInfo + "\n") 81 | output.write("\n") 82 | input.close() 83 | output.close() 84 | 85 | 86 | def replaceWhiteSpace(text): 87 | newStr = "" 88 | for character in text: 89 | if character in string.whitespace: 90 | newStr += " " 91 | else: 92 | newStr += character 93 | return newStr 94 | 95 | 96 | main() 97 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/convertCoreNLPFormat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ "$#" -ne 3 ]; then 3 | echo "Illegal number of parameters" 4 | echo "Use format: './convertCoreNLPFormat.sh [list_of_files] [path_to_files] [output_dir]" 5 | echo "-- list_of_files: text document containing filenames for documents" 6 | echo "-- path_to_files: path to the filenames contained in list_of_files" 7 | echo "-- output_dir: output directory" 8 | exit 1 9 | fi 10 | 11 | while read -r line 12 | do 13 | name=$line 14 | python read_CoreNLP_XML.py ${2}${name}.out tmp.txt 15 | python convertCoreNLPFormat.py tmp.txt 
${3}/${name}.mergedAnnotations 16 | python write_parsing_from_CoreNLP.py ${2}${name}.out ${3}/${name}.parsingAnnotations 17 | done < ${1} 18 | rm tmp.txt 19 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/getRootnames.py: -------------------------------------------------------------------------------- 1 | # script to extract the relative filenames for each file in a list 2 | import sys 3 | 4 | def main(): 5 | if len(sys.argv) != 3: 6 | print "Expect an input file, output file." 7 | sys.exit() 8 | 9 | input = open(sys.argv[1], "r") 10 | output = open(sys.argv[2], "w") 11 | 12 | for line in input: 13 | name = line.strip() 14 | if "/" in name: 15 | index = name.rfind("/") + 1 16 | name = name[index:] 17 | output.write(name + "\n") 18 | 19 | input.close() 20 | output.close() 21 | 22 | 23 | if __name__ == "__main__": 24 | main() 25 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/read_CoreNLP_XML.py: -------------------------------------------------------------------------------- 1 | # script to read the CoreNLP output 2 | import sys 3 | import xml.etree.ElementTree as ET 4 | 5 | def getRootname(text): 6 | name = text 7 | if "/" in name: 8 | index = name.rfind("/") + 1 9 | name = name[index:] 10 | if ".out" in name: 11 | index = name.find(".out") 12 | name = name[:index] 13 | 14 | # if ".sgm" in name: 15 | # index = name.find(".sgm") 16 | # name = name[:index] 17 | # elif ".mpdf" in name: 18 | # index = name.find(".mpdf") 19 | # name = name[:index] 20 | # elif ".cmp" in name: 21 | # index = name.find(".cmp") 22 | # name = name[:index] 23 | # elif ".txt" in name: 24 | # index = name.find(".txt") 25 | # name = name[:index] 26 | # elif ".xml" in name: 27 | # index = name.find(".xml") 28 | # name = name[:index] 29 | 30 | return name 31 | 32 | 33 | def main(): 34 | if len(sys.argv) != 3: 35 | print "Expect input XML file, output file." 
36 | sys.exit() 37 | 38 | print "Reading: " + sys.argv[1] 39 | tree = ET.parse(sys.argv[1]) 40 | root = tree.getroot() 41 | 42 | rootName = getRootname(sys.argv[1]) 43 | 44 | wordIndexDict = dict() # maps word indexes to character level indexes 45 | 46 | output = open(sys.argv[2], "w") 47 | 48 | # print word, start offset, end offset, POS tag for each character in the data 49 | printWordInfo(root, rootName, wordIndexDict, output, 0) 50 | 51 | output.close() 52 | 53 | def printWordInfo(root, rootName, wordIndexDict, output, sentenceNum): 54 | if root.tag == "coreference": 55 | return 56 | 57 | if root.tag == "sentence": 58 | output.write("BEGIN_SENTENCE\n") 59 | sentenceNum = root.attrib["id"] 60 | 61 | if root.tag == "token": 62 | processWord(root, rootName, wordIndexDict, sentenceNum + "_" + root.attrib["id"], output) 63 | else: 64 | for child in root: 65 | printWordInfo(child, rootName, wordIndexDict, output, sentenceNum) 66 | 67 | def processWord(root, rootName, wordIndexDict, wordID, output): 68 | pos = "" 69 | offsetStart = -1 70 | 71 | word = "" 72 | lemma = "" 73 | # extract the needed info 74 | for child in root: 75 | if child.tag == "POS": 76 | pos = child.text 77 | elif child.tag == "CharacterOffsetBegin": 78 | offsetStart = int(child.text) 79 | wordIndexDict[wordID] = offsetStart 80 | elif child.tag == "word": 81 | word = child.text 82 | elif child.tag == "CharacterOffsetEnd": 83 | offsetEnd = int(child.text) 84 | elif child.tag == "lemma": 85 | lemma = child.text 86 | 87 | outString = str(offsetStart) + "\t" + str(offsetEnd) + "\t" + word.encode('utf-8') + "\t" + lemma.encode('utf-8') + "\t" + pos + "\t" + rootName 88 | output.write(outString + "\n") 89 | 90 | main() 91 | -------------------------------------------------------------------------------- /preprocessing_2.0/readCoreNLP/write_parsing_from_CoreNLP.py: -------------------------------------------------------------------------------- 1 | # script to read the CoreNLP parsing output 2 | import sys 3 | import xml.etree.ElementTree as ET 4 | 5 | def main(): 6 | if len(sys.argv) != 3: 7 | print "Expect input XML file, output file for dependency parsing." 8 | sys.exit() 9 | 10 | tree = ET.parse(sys.argv[1]) 11 | root = tree.getroot() 12 | 13 | output = open(sys.argv[2], "w") 14 | 15 | # output dependency info. Add line of white space between each sentence. 16 | writeInfo(root, output) 17 | 18 | output.close() 19 | 20 | def writeInfo(root, output): 21 | if root.tag == "dependencies" and root.attrib["type"] == "basic-dependencies": 22 | for child in root: 23 | processDependencies(child, output) 24 | output.write("\n") 25 | else: 26 | for child in root: 27 | writeInfo(child, output) 28 | 29 | def processDependencies(root, output): 30 | depType = root.attrib["type"] 31 | 32 | governor = "" 33 | dependent = "" 34 | 35 | govIndex = -1 36 | depIndex = -1 37 | 38 | # extract the needed info 39 | for child in root: 40 | if child.tag == "governor": 41 | governor = child.text.encode('utf-8') 42 | govIndex = child.attrib["idx"] 43 | elif child.tag == "dependent": 44 | dependent = child.text.encode('utf-8') 45 | depIndex = child.attrib["idx"] 46 | 47 | output.write(depType.encode('utf-8')) 48 | output.write("|||") 49 | output.write(governor) 50 | output.write("|||") 51 | output.write(str(govIndex)) 52 | output.write("|||") 53 | output.write(dependent) 54 | output.write("|||") 55 | output.write(str(depIndex)) 56 | output.write("\n") 57 | 58 | main() 59 | --------------------------------------------------------------------------------
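Note: write_parsing_from_CoreNLP.py above emits one "|||"-separated record per basic dependency (relation type, governor word, governor index, dependent word, dependent index) and a blank line after each sentence. The reader below is a minimal sketch of how such a *.parsingAnnotations file could be loaded back into per-sentence tuples; it is illustrative only, and the name read_parsing_annotations is an assumption rather than a function defined in this repository.

# Minimal sketch, not part of this repository: load a *.parsingAnnotations
# file produced by write_parsing_from_CoreNLP.py into per-sentence lists of
# (dep_type, governor, gov_index, dependent, dep_index) tuples.
import sys

def read_parsing_annotations(path):
    sentences = []
    current = []
    with open(path, "r") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line:
                # a blank line marks the end of one sentence's dependencies
                if current:
                    sentences.append(current)
                    current = []
                continue
            dep_type, governor, gov_idx, dependent, dep_idx = line.split("|||")[:5]
            current.append((dep_type, governor, int(gov_idx), dependent, int(dep_idx)))
    if current:
        sentences.append(current)
    return sentences

if __name__ == "__main__":
    for i, deps in enumerate(read_parsing_annotations(sys.argv[1])):
        print("sentence %d: %d dependencies" % (i, len(deps)))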