├── .gitignore ├── .idea ├── checkstyle-idea.xml └── dictionaries │ └── vads.xml ├── Base.py ├── Event.py ├── Features.py ├── LexicalFeatures.py ├── Main.py ├── Project Description-Linguistic Learners copy.pdf ├── Project_F16_NLP6320.pdf ├── README.md ├── README.txt ├── SemanticFeatures.py ├── SyntacticFeatures.py ├── Utilities.py ├── project_ideas.txt ├── project_notes.txt ├── sampleInput.csv ├── sampleInput.txt └── timex.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.log 3 | *iml 4 | .idea 5 | baseOutput.txt 6 | sampleOutput.txt 7 | -------------------------------------------------------------------------------- /.idea/checkstyle-idea.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | -------------------------------------------------------------------------------- /.idea/dictionaries/vads.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Base.py: -------------------------------------------------------------------------------- 1 | import Utilities 2 | import re, sys 3 | from tabulate import tabulate 4 | 5 | 6 | # Predefined strings. 
# NOTE: the long alternations are wrapped with implicit string concatenation.
# A backslash continuation *inside* a string literal keeps the blank before
# the backslash (and the next line's indentation) in the pattern, which made
# "eleven", "eighteen", "ninety" and "october" unmatchable.
numbers = (r"(^a(?=\s)|one|two|three|four|five|six|seven|eight|nine|ten|"
           r"eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
           r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|"
           r"ninety|hundred|thousand)")
day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = ("(january|february|march|april|may|june|july|august|september|"
         "october|november|december)")
dmy = "(year|day|week|month)"
rel_day = "(today|tomorrow|tonight|tonite)"
exp1 = "(after)"
exp2 = "(this)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
year = r"((?<=\s)\d{4}|^\d{4})"
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)


def getCommandLineArgs():
    """Return ``(inputFileName, outputFileName)`` taken from ``sys.argv``.

    Exits with status 1 and a usage message when either argument is missing,
    instead of failing with a bare IndexError.
    """
    if len(sys.argv) < 3:
        print("ERROR: Usage: Base.py <inputFile> <outputFile>")
        sys.exit(1)

    return sys.argv[1], sys.argv[2]


def isRequiredEvent(line):
    """Return ``(True, keyword)`` for the first event keyword found in *line*.

    The match is a case-insensitive substring test; ``(False, "")`` is
    returned when no keyword occurs.
    """
    lowered = line.lower()
    for word in ['marriage', 'birthday', 'meeting', 'anniversary', 'seminar']:
        if word in lowered:
            return True, word

    return False, ""


def preProcessData(input):
    """Parse the input file named by *input* and return its feature objects.

    The parameter is now actually used; previously the function read the
    script-level global ``inputFileName`` and only worked when run as a script.
    """
    return Utilities.parseInputFile(input)


def extractDate(text):
    # Initialization
    temporalExpressionFound = []

    # re.findall() finds all the substring matches, keep only the full
    # matching string. Captures expressions such as 'number of days' ago, etc.
61 | found = reg1.findall(text) 62 | found = [a[0] for a in found if len(a) > 1] 63 | for timex in found: 64 | temporalExpressionFound.append(timex) 65 | 66 | # Variations of this thursday, next year, etc 67 | found = reg2.findall(text) 68 | found = [a[0] for a in found if len(a) > 1] 69 | for timex in found: 70 | temporalExpressionFound.append(timex) 71 | 72 | # today, tomorrow, etc 73 | found = reg3.findall(text) 74 | for timex in found: 75 | temporalExpressionFound.append(timex) 76 | 77 | # ISO 78 | found = reg4.findall(text) 79 | for timex in found: 80 | temporalExpressionFound.append(timex) 81 | 82 | # Year 83 | found = reg5.findall(text) 84 | for timex in found: 85 | temporalExpressionFound.append(timex) 86 | 87 | # print "temporal expressions: {}".format(temporalExpressionFound) 88 | if temporalExpressionFound: 89 | return ",".join(temporalExpressionFound) 90 | else: 91 | return "" 92 | 93 | 94 | 95 | def initialize(): 96 | Utilities.setupLog() 97 | 98 | if __name__ == '__main__': 99 | initialize() 100 | 101 | # read commmand line parameters 102 | inputFileName, outputFileName = getCommandLineArgs() 103 | 104 | # Preprocess input data 105 | lines = preProcessData(inputFileName) 106 | 107 | result = [] 108 | for line in lines: 109 | isRequired, eventType = isRequiredEvent(line.getText()) 110 | if isRequired: 111 | # print "line : {}".format(line) 112 | eventDate = extractDate(line.getText()) 113 | if eventDate: 114 | # print "eventdate: ".format(eventDate) 115 | if line.getActual() == "yes": 116 | Utilities.incrementTP() 117 | 118 | line.setPredict("yes") 119 | result.append([eventType, eventDate, "", line.getText()]) 120 | else: 121 | Utilities.writeLog("INFO [NAIVE APPROACH]: Event Detected but is identified as past event :" + line.getText()) 122 | else: 123 | Utilities.writeLog("INFO [NAIVE APPROACH]: Event Detected but event type did not match with required events :" + line.getText()) 124 | 125 | Utilities.writeOutput(outputFileName, ["Event", "When", 
"Where", "Text"]) 126 | [ Utilities.writeOutput(outputFileName, x) for x in result ] 127 | # Utilities.writeOutput(outputFileName, tabulate(result, headers=["Event", "When", "Where", "Text"], tablefmt="grid")) 128 | 129 | Utilities.computeRecall(lines) 130 | Utilities.printMetrics() -------------------------------------------------------------------------------- /Event.py: -------------------------------------------------------------------------------- 1 | 2 | class Event(object): 3 | 4 | def __init__(self, type, date, location): 5 | self.type = type 6 | self.date = date 7 | self.location = location 8 | 9 | def format(self): 10 | formattedResult = "" 11 | if self.location != "": 12 | formattedResult = "Event : {}, when: {}, where: {}".format(self.type, self.date, self.location) 13 | else: 14 | formattedResult = "Event : {}, when: {}".format(self.type, self.date) 15 | 16 | return formattedResult -------------------------------------------------------------------------------- /Features.py: -------------------------------------------------------------------------------- 1 | from SyntacticFeatures import SyntacticFeatures 2 | from SemanticFeatures import SemanticFeatures 3 | from LexicalFeatures import LexicalFeatures 4 | 5 | class Features(object): 6 | 7 | def __init__(self, text, actual): 8 | self.text = text 9 | self.syntacticFeatures = SyntacticFeatures() 10 | self.lexicalFeatures = LexicalFeatures() 11 | self.semanticFeatures = SemanticFeatures() 12 | self.event = None 13 | self.actual = actual 14 | self.predict = "no" 15 | 16 | def setPredict(self, predict): 17 | self.predict = predict 18 | 19 | def getPredicted(self): 20 | return self.predict 21 | 22 | def getActual(self): 23 | return self.actual 24 | 25 | def setEvent(self, event): 26 | self.event = event 27 | 28 | def setText(self, text): 29 | self.text = text 30 | 31 | def getText(self): 32 | return self.text 33 | 34 | def getEvent(self): 35 | return self.event 36 | 37 | def getLexicalFeatures(self): 38 | 
return self.lexicalFeatures 39 | 40 | def getSyntacticFeatures(self): 41 | return self.syntacticFeatures 42 | 43 | def getSemanticFeatures(self): 44 | return self.semanticFeatures -------------------------------------------------------------------------------- /LexicalFeatures.py: -------------------------------------------------------------------------------- 1 | 2 | class LexicalFeatures(object): 3 | 4 | def __init__(self): 5 | self.tokens = [] 6 | self.spellCorrection = "" 7 | 8 | def setTokens(self, tokens): 9 | self.tokens = tokens 10 | 11 | def setSpellCorrection(self, sentence): 12 | self.spellCorrection = sentence 13 | 14 | def getSpellCorrection(self): 15 | return self.spellCorrection 16 | 17 | def getTokens(self): 18 | return self.tokens -------------------------------------------------------------------------------- /Main.py: -------------------------------------------------------------------------------- 1 | import nltk, sys, re 2 | from nltk.corpus import wordnet 3 | from enchant.checker import SpellChecker 4 | from autocorrect import spell 5 | import timex, Utilities 6 | from Event import Event 7 | from nltk.tag import StanfordNERTagger 8 | from Features import Features 9 | 10 | KEYWORDS = ['marriage', 'birthday', 'meeting', 'anniversary', 'seminar'] 11 | SYNONYMS_FOR_KEYWORDS = {} 12 | PAST_TENSE_TAGS = ['VBD','VBN'] 13 | TIMEX_TAG = "" 14 | #STANFORD_NER_ROOT = "/Users/vads/Downloads/stanford-ner-2014-06-16/" 15 | STANFORD_NER_ROOT = "/home/ram/Downloads/stanford-ner-2014-06-16/" 16 | STANFORD_NER_PATH = STANFORD_NER_ROOT + 'stanford-ner.jar' 17 | RESULT = [] 18 | RESULT_HEADER = ["Event", "When", "Where", "Original Text", "Lexical-Tokens", "Lexical-SpellCorrection", "Syntactic-POS tags", "Syntactic-Temporal tag", "Semantic-Synonym", "Semantic-Location" ] 19 | TIMEX_TAG_REGEX = r'.+?' 
def initialize():
    """Build the keyword synonym table and configure logging."""
    setupKeywords()
    # Treat "lecture" as an additional trigger word for seminars.
    SYNONYMS_FOR_KEYWORDS['seminar'].append('lecture')
    Utilities.setupLog()


def performSpellCorrection(featureObj):
    """Spell-correct the object's text and store it in its lexical features.

    Returns the same *featureObj* so calls can be chained.
    """
    checker = SpellChecker("en_US", featureObj.getText())
    for word in checker:
        # Each flagged word is replaced by autocorrect's suggestion.
        word.replace(spell(word.word))

    featureObj.getLexicalFeatures().setSpellCorrection(checker.get_text())

    return featureObj


def getSynonyms(word):
    """Return the distinct WordNet lemma names of *word* (underscores -> spaces)."""
    lemmas = set()
    for sense in wordnet.synsets(word):
        lemmas.update(lemma.name().replace("_", " ") for lemma in sense.lemmas())
    return list(lemmas)


def setupKeywords():
    """Fill SYNONYMS_FOR_KEYWORDS with the synonyms of every keyword."""
    for word in KEYWORDS:
        SYNONYMS_FOR_KEYWORDS[word] = getSynonyms(word)


def isRequiredEvent(obj, dict):
    """Return ``(True, keyword)`` if any synonym of a keyword occurs in obj's text.

    *dict* maps each keyword to its list of synonyms. On a hit, the matching
    keyword's whole synonym list is recorded (as a string) in the object's
    semantic features. Returns ``(False, "")`` otherwise.
    """
    lowered = obj.getText().lower()
    for word in dict:
        for synonym in dict[word]:
            if synonym in lowered:
                obj.getSemanticFeatures().setSynonym(str(dict[word]))
                return True, word
    return False, ""


def getCommandLineArgs():
    """Return ``(inputFileName, outputFileName)`` taken from ``sys.argv``.

    Both arguments are required: argv must hold at least three entries. The
    previous ``< 2`` check let a single-argument call through to an IndexError.
    """
    if len(sys.argv) < 3:
        print("ERROR: Usage: Main.py <inputFile> <outputFile>")
        sys.exit(1)

    return sys.argv[1], sys.argv[2]


def preProcessData(input):
    """Parse the file named by *input* and spell-correct every record.

    The parameter is now actually used; previously the function read the
    script-level global ``inputFileName``.
    """
    inputObjects = Utilities.parseInputFile(input)
    return [performSpellCorrection(obj) for obj in inputObjects]


def performTagging(featureObjects):
    """Attach the first grounded temporal tag (if any) to every feature object.

    Tagging is best-effort: a failure inside the timex module is logged and
    the object is left without a temporal tag.
    """
    taggedLines = []
    for obj in featureObjects:
        try:
            taggedLine = timex.tag(obj.getLexicalFeatures().getSpellCorrection().lower())
            taggedLine = timex.ground(taggedLine, timex.gmt())
        except Exception as error:
            # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
            Utilities.writeLog("WARN [IMPROVED APPROACH]: Temporal tagging failed ("
                               + str(error) + ") :" + obj.getText())
            taggedLine = ""

        if not Utilities.isEmpty(taggedLine):
            obj.getSyntacticFeatures().setTemporalTag(
                Utilities.firstMatching(TIMEX_TAG_REGEX, taggedLine))
        # NOTE(review): indentation is lost in SOURCE; every object is kept
        # here and untagged ones are dropped later by Utilities.filter.
        # Confirm against the original layout.
        taggedLines.append(obj)

    return taggedLines


def isEventPast(obj):
    """Return True if any token of obj's text is POS-tagged as past tense.

    Side effects: stores the raw tokens in the lexical features and the POS
    tags in the syntactic features of *obj*.
    """
    initialTokens = Utilities.split(obj.getText().lower(), " ")
    obj.getLexicalFeatures().setTokens(initialTokens)

    # Splitting on single blanks yields empty tokens for repeated spaces.
    tokens = [token for token in initialTokens if not Utilities.isEmpty(token)]

    taggedWords = nltk.pos_tag(tokens)
    obj.getSyntacticFeatures().setPOSTags(taggedWords)

    return any(tag in PAST_TENSE_TAGS for (_word, tag) in taggedWords)


def parseLocation(obj):
    """Run Stanford NER over the spell-corrected text and store the entities.

    Every token whose NER label is not 'O' is collected (each prefixed with a
    blank), stored as the object's semantic location and returned. NER is
    best-effort: on failure the error is logged and "" is returned.
    """
    # NOTE(review): SOURCE shows re.sub("|", ...), which only matches the
    # empty string. The tag names appear to have been lost, so TIMEX2 markup
    # is stripped here. Confirm against the original file.
    event = re.sub(r"</?TIMEX2[^>]*>", "", obj.getLexicalFeatures().getSpellCorrection())

    entities = []
    try:
        nerTagger = StanfordNERTagger(
            STANFORD_NER_ROOT + '/classifiers/english.muc.7class.distsim.crf.ser.gz',
            STANFORD_NER_PATH)
        entities = nerTagger.tag(event.split())
    except Exception as error:
        Utilities.writeLog("WARN [IMPROVED APPROACH]: NER tagging failed (" + str(error) + ")")

    result = "".join(" {}".format(entity[0]) for entity in entities if entity[1] != 'O')

    obj.getSemanticFeatures().setLocation(result)
    return result


def setupEvent(obj, eventType):
    """Build an Event from obj's temporal tag, its NER entities and *eventType*."""
    eventDate = Utilities.parseDate(obj.getSyntacticFeatures().getTemporalTag())
    eventLocation = parseLocation(obj)
    return Event(eventType, eventDate, eventLocation)


if __name__ == '__main__':
    #initialize variables
    initialize()

    #read command line parameters
    inputFileName, outputFileName = getCommandLineArgs()

    #preprocess input data
    featureObjects = preProcessData(inputFileName)

    #perform temporal expression tagging
    taggedLines 
= performTagging(featureObjects) 151 | 152 | #select lines which have tag 153 | eventsList = Utilities.filter(taggedLines, TIMEX_TAG) 154 | 155 | #for lines identified as events, check each whether any word matches with synonyms for keywords 156 | for obj in eventsList: 157 | #print "event: {}".format(event) 158 | isRequired, eventType = isRequiredEvent(obj, SYNONYMS_FOR_KEYWORDS) 159 | if isRequired: 160 | eventObj = setupEvent(obj, eventType) 161 | obj.setEvent(eventObj) 162 | if not isEventPast(obj): 163 | #["Original Text", "Lexical-Tokens", "Lexical-SpellCorrection", "Syntactic-POS tags", "Syntactic-Temporal tag", "Semantic-Synonym", "Semantic-Location" ] 164 | Utilities.computePrecision(obj) 165 | obj.setPredict("yes") 166 | RESULT.append([obj.getEvent().type, 167 | obj.getEvent().date, 168 | obj.getEvent().location, 169 | obj.getText(), 170 | str(obj.getLexicalFeatures().getTokens()), 171 | obj.getLexicalFeatures().getSpellCorrection(), 172 | str(obj.getSyntacticFeatures().getPOSTags()), 173 | obj.getSyntacticFeatures().getTemporalTag(), 174 | obj.getSemanticFeatures().getSynonym(), 175 | obj.getSemanticFeatures().getLocation()]) 176 | else: 177 | if Utilities.isDateInFuture(obj.getSyntacticFeatures().getTemporalTag()): 178 | obj.setPredict("yes") 179 | Utilities.computePrecision(obj) 180 | RESULT.append([obj.getEvent().type, 181 | obj.getEvent().date, 182 | obj.getEvent().location, 183 | obj.getText(), 184 | str(obj.getLexicalFeatures().getTokens()), 185 | obj.getLexicalFeatures().getSpellCorrection(), 186 | str(obj.getSyntacticFeatures().getPOSTags()), 187 | obj.getSyntacticFeatures().getTemporalTag(), 188 | obj.getSemanticFeatures().getSynonym(), 189 | obj.getSemanticFeatures().getLocation()]) 190 | else: 191 | Utilities.writeLog("INFO [IMPROVED APPROACH]: Event Detected but is identified as past event :" + obj.getText()) 192 | else: 193 | Utilities.writeLog("INFO [IMPROVED APPROACH]: Event Detected but event type did not match with required events :" + 
obj.getText()) 194 | 195 | 196 | Utilities.writeOutput(outputFileName, RESULT_HEADER) 197 | for feature in RESULT: 198 | Utilities.writeOutput(outputFileName, feature) 199 | 200 | Utilities.computeRecall(featureObjects) 201 | 202 | Utilities.printMetrics() 203 | 204 | -------------------------------------------------------------------------------- /Project Description-Linguistic Learners copy.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helloram52/automated-event-extraction/03e19ec515724a774e3adc90f410012ee568a3ee/Project Description-Linguistic Learners copy.pdf -------------------------------------------------------------------------------- /Project_F16_NLP6320.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helloram52/automated-event-extraction/03e19ec515724a774e3adc90f410012ee568a3ee/Project_F16_NLP6320.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **Description** # 2 | Identify future calendar events with date and time from text. Five events – Marriage, Birthday Party, Meeting Anniversary, Seminar will be included in scope. 3 | 4 | ## **NLP Modules** ## 5 | * NLTK – WordNet, Spell correction, Timex, Named Entity Recognition 6 | 7 | Pre-requisites: 8 | --------------- 9 | - Install nltk, pyenchant and autocorrect. 
10 | - pip install nltk 11 | - pip install pyenchant 12 | - pip install autocorrect 13 | - Stanford NER 14 | - Download jar files from http://nlp.stanford.edu/software/stanford-ner-2014-06-16.zip 15 | 16 | Input Data Format: 17 | ------------------ 18 | - Text (.txt) format 19 | 20 | References: 21 | ----------- 22 | - Named Entity Recognition 23 | https://gist.github.com/onyxfish/322906 24 | - Stanford NER 25 | http://textminingonline.com/how-to-use-stanford-named-entity-recognizer-ner-in-python-nltk-and-other-programming-languages 26 | 27 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Pre-requisites: 2 | --------------- 3 | - Install nltk, pyenchant and autocorrect. 4 | - pip install nltk 5 | - pip install pyenchant 6 | - pip install autocorrect 7 | - Stanford NER 8 | - Download jar files from http://nlp.stanford.edu/software/stanford-ner-2014-06-16.zip 9 | 10 | Input Data Format: 11 | ------------------ 12 | - An input is a single sentence. 
13 | 14 | References: 15 | ----------- 16 | - Named Entity Recognition 17 | https://gist.github.com/onyxfish/322906 18 | - Stanford NER 19 | http://textminingonline.com/how-to-use-stanford-named-entity-recognizer-ner-in-python-nltk-and-other-programming-languages 20 | -------------------------------------------------------------------------------- /SemanticFeatures.py: -------------------------------------------------------------------------------- 1 | 2 | class SemanticFeatures(object): 3 | 4 | def __init__(self): 5 | self.synonym = "" 6 | self.location = "" 7 | 8 | def setSynonym(self, word): 9 | self.synonym = word 10 | 11 | def setLocation(self, location): 12 | self.location = location 13 | 14 | def getSynonym(self): 15 | return self.synonym 16 | 17 | def getLocation(self): 18 | return self.location -------------------------------------------------------------------------------- /SyntacticFeatures.py: -------------------------------------------------------------------------------- 1 | 2 | class SyntacticFeatures(object): 3 | 4 | def __init__(self): 5 | self.POSTags = [] 6 | self.temporalTag = "" 7 | 8 | def setPOSTags(self, tags): 9 | self.POSTags = tags 10 | 11 | def setTemporalTag(self, tag): 12 | self.temporalTag = tag 13 | 14 | def getTemporalTag(self): 15 | return self.temporalTag 16 | 17 | def getPOSTags(self): 18 | return self.POSTags -------------------------------------------------------------------------------- /Utilities.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import logging, random, copy, re, csv 3 | import logging.config 4 | from Features import Features 5 | 6 | TRUE_POSITIVE = 0.0 7 | FALSE_POSITIVE = 0.0 8 | FALSE_NEGATIVE = 0.0 9 | TRUE_NEGATIVE = 0.0 10 | 11 | TIMEX_TAG = "" 12 | TIMEX_TAG_REGEX = r'.+?' 
def days(d1, d2):
    """Return the absolute number of days between two 'YYYY-MM-DD' strings."""
    first = datetime.strptime(d1, "%Y-%m-%d")
    second = datetime.strptime(d2, "%Y-%m-%d")
    return abs((first - second).days)


def split(sentence, delimiter):
    """Split *sentence* on *delimiter* (thin wrapper around str.split)."""
    return sentence.split(delimiter)


def isEmpty(string):
    """Return True when *string* is None or the empty string."""
    return string is None or string == ''


def parseInputFileText(inputFileName):
    """Return the whole file as one string, each line stripped and concatenated."""
    with open(inputFileName, 'r') as inputFile:
        return "".join(line.strip() for line in inputFile)


def incrementTP():
    """Add one to the module-level true-positive counter."""
    global TRUE_POSITIVE
    TRUE_POSITIVE += 1


#parse input file - read all the input lines
def parseInputFile(inputFileName):
    """Read a ``text,label`` CSV file and return one Features object per row.

    Rows with fewer than two columns (blank lines in particular) are skipped;
    previously they aborted the run with an IndexError.
    """
    featureObjects = []
    with open(inputFileName, 'r') as inputFile:
        for line in csv.reader(inputFile):
            if len(line) < 2:
                continue
            featureObjects.append(Features(line[0], line[1]))

    return featureObjects


def setupLog():
    """Send log records to 'eventDetector.log', overwriting it on every run."""
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        filename='eventDetector.log',
                        filemode='w')


def parseDate(line):
    """Return the text of the first TIMEX-tagged expression in *line*.

    The enclosing TIMEX2 tags are stripped. Returns "" when the line carries
    no tag or the tag regex finds nothing (the latter used to raise
    IndexError).
    """
    if TIMEX_TAG in line:
        dates = re.findall(TIMEX_TAG_REGEX, line)
        if dates:
            return re.sub(r"<\/?TIMEX2([^<.]+)?>", "", dates[0])

    return ""


def filter(taggedLines, searchString):
    """Return the objects whose temporal tag contains *searchString*."""
    return [taggedLine for taggedLine in taggedLines
            if searchString in taggedLine.getSyntacticFeatures().getTemporalTag()]


def firstMatching(pattern, string):
    """Return the first ``re.findall`` result for *pattern* in *string*, or ""."""
    results = re.findall(pattern, string)
    return results[0] if results else ""


def remove(pattern, string):
    """Return *string* with every match of *pattern* deleted."""
    return re.sub(pattern, "", string)

#check
# whether date is in future
def isDateInFuture(event):
    """Return True if the ``val=`` attribute of a TIMEX2 tag lies in the future.

    Three value shapes are recognised by length: 'YYYY' (strictly later year),
    'YYYY-MM-DD' (strictly later day) and 'YYYYWww' (this ISO week or later).
    Anything else, including a missing or unparsable value, yields False.
    """
    found = re.findall(r'val=.+>', event)
    date = found[0] if found else ""
    # Strip the trailing ">text</TIMEX2>", the "val=" token and any quotes.
    date = re.sub(r"(>.+\/?TIMEX2>)|(val=)|'|\"", "", date)

    now = datetime.now()
    try:
        if len(date) == 4:
            return now.year < int(date)
        if len(date) == 10:
            # Compare signed dates. The former abs() day difference
            # reported every past date as "future".
            return datetime.strptime(date, "%Y-%m-%d").date() > now.date()
        if len(date) == 7 and 'W' in date:
            # Compare (year, week) so a week number from an earlier year
            # is not mistaken for an upcoming one.
            isoYear, isoWeek = now.isocalendar()[0], now.isocalendar()[1]
            return (isoYear, isoWeek) <= (int(date[:4]), int(date[5:]))
    except ValueError:
        return False
    return False


#write log message
def writeLog(line):
    """Record *line* in the log at WARNING level."""
    # logging.warn is a deprecated alias of logging.warning.
    logging.warning(line)


def computePrecision(obj):
    """Count a predicted event as true or false positive from its actual label."""
    global TRUE_POSITIVE, FALSE_POSITIVE
    if obj.getActual() == "yes":
        TRUE_POSITIVE += 1
    else:
        FALSE_POSITIVE += 1


def computeRecall(featureObjects):
    """Count the false and true negatives among *featureObjects*.

    A false negative is an actual event ("yes") that was predicted "no".
    The previous code counted actual "no" / predicted "yes" here, which is a
    false positive and is already tallied by computePrecision.
    """
    global FALSE_NEGATIVE, TRUE_NEGATIVE
    for obj in featureObjects:
        if obj.getPredicted() != "no":
            continue
        if obj.getActual() == "yes":
            FALSE_NEGATIVE += 1
        elif obj.getActual() == "no":
            TRUE_NEGATIVE += 1


#write data to output
def writeOutput(outputFileName, row):
    """Append *row* as one CSV record to *outputFileName*."""
    with open(outputFileName, 'a') as outputFile:
        csv.writer(outputFile).writerow(row)


def _safeRatio(numerator, denominator):
    """Return numerator/denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def printMetrics():
    """Print the confusion-matrix counters with precision and recall.

    Each ratio is reported as 0.0 when its denominator is zero instead of
    raising ZeroDivisionError.
    """
    print("TP: {}, FP : {}, FN: {}, TN: {}".format(TRUE_POSITIVE, FALSE_POSITIVE, FALSE_NEGATIVE, TRUE_NEGATIVE))
    print("Precision : {}".format(_safeRatio(TRUE_POSITIVE, TRUE_POSITIVE + FALSE_POSITIVE)))
    print("Recall : {}".format(_safeRatio(TRUE_POSITIVE, TRUE_POSITIVE + FALSE_NEGATIVE)))

# --------------------------------------------------------------------------------
# /project_ideas.txt:
# --------------------------------------------------------------------------------
# 1 | Language detection:
# 2 |
http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html 3 | 4 | Picking out good dishes from Yelp 5 | http://nlp.stanford.edu/courses/cs224n/2015/reports/4.pdf 6 | 7 | Entity Based Sentiment Analysis on Twitter 8 | http://nlp.stanford.edu/courses/cs224n/2010/reports/drao-sidbatra.pdf 9 | 10 | -------------------------------------------------------------------------------- /project_notes.txt: -------------------------------------------------------------------------------- 1 | Email dataset: 2 | http://www.cs.cmu.edu/~enron/ 3 | 4 | programming tools: 5 | porter stemmer package in python 6 | 7 | named entity recognition 8 | https://gist.github.com/onyxfish/322906 9 | 10 | stemming.porter2 11 | 12 | A Simple Bayesian Modelling Approach to Event Extraction from Twitter 13 | https://aclweb.org/anthology/P/P14/P14-2114.xhtml 14 | 15 | 16 | nltk named entity extraction 17 | https://gist.github.com/onyxfish/322906 18 | 19 | temporal parsing 20 | https://github.com/cnorthwood/ternip 21 | 22 | Context-dependent Semantic Parsing for Time Expressions 23 | http://homes.cs.washington.edu/~kentonl/pub/ladz-acl.2014.pdf 24 | 25 | Parsing Time: Learning to Interpret Time Expressions 26 | http://web.stanford.edu/~jurafsky/2012-naacl-temporal.pdf 27 | 28 | https://github.com/nltk/nltk_contrib/blob/master/nltk_contrib/timex.py 29 | 30 | https://github.com/yifange/event_extraction 31 | http://nlp.stanford.edu/courses/cs224n/2004/jblack-final-report.pdf 32 | 33 | http://people.tamu.edu/~zyue1105/yzhuo/yzhuo_files/IR_final_report.pdf 34 | 35 | http://web.mit.edu/6.863/www/fall2012/projects12.pdf 36 | http://nlp.stanford.edu/courses/cs224n/ 37 | 38 | picking out good foods from yelp 39 | https://bitbucket.org/anjoola/food/overview 40 | 41 | http://nlp.stanford.edu/courses/cs224n/2015/reports/4.pdf 42 | 43 | https://trevor.shinyapps.io/InteractiveApp/ 44 | http://www.goodfoodbadservice.com/ 45 | 46 | 47 | 48 | 49 | 50 | O/P Format: 51 | ----------- 52 | Event Format: 53 | EVENT_TYPE 
YYYY-MM-DD at 07:00 AM/PM [in EVENT_PLACE] 54 | where EVENT_TYPE is of 55 | - Marriage, 56 | - Birthday party, 57 | - Meeting, 58 | - Anniversary, 59 | - Seminar 60 | 61 | TODO: 62 | ----- 63 | - Prepare a 100 volume dataset for the above five events with the below info. 64 | - w/o events(40%) 65 | - event word is present but no time. (5) 66 | - time is present but not relevant to the above 5 categories (10) 67 | - eg.: historic events from wikipedia 68 | - no events(25) 69 | - with events(60%) 70 | - past/future events of the above 5 categories.(60) 71 | 72 | - Modules 73 | - Bag of words approach 74 | - Pattern matching using regex 75 | - eg: starts at .* \d+:\d+(am|pm) 76 | - Tokenization 77 | - Spell correction 78 | - Explore spell correction in nltk 79 | - POS Tagger 80 | - temporal tagging and pos 81 | - Syntactic pattern(remove events in past tense) 82 | - Synonymy for 5 events 83 | - Named entity recognition 84 | - Workflow 85 | - Perform tokenization, syntactic and then semantic. 86 | - List the event iff a date/time is present and is one of the above 5 events. 87 | - optional: location(NER) 88 | 89 | - Report 90 | 91 | 92 | Dependency on the mxDateTime package, needed for the Timex module 93 | http://www.egenix.com/products/python/mxBase/#Download 94 | 95 | possible improvements for report 96 | - "may" could mean a month or a verb 97 | - he may do this or in May of 2011, the minister resigned. 
98 | - include date ranges 99 | - oct 5-6 100 | 101 | Data format: 102 | ------------ 103 | 104 | 105 | 106 | 107 | For Demo and Report: 108 | --------------------- 109 | Identify event for each line (not multiple lines at once) 110 | 111 | -------------------------------------------------------------------------------- /sampleInput.csv: -------------------------------------------------------------------------------- 1 | Lecture starts tomorrow,yes 2 | His wedding Anniversary was over last week,no 3 | Homework is due tomrrow,no 4 | Board meeting is scheduled next week,yes 5 | There is a seminar next week,yes 6 | " Hitler joined the German Workers' Party, the prcursor of the NSDAP, in 1919 and became leader of the NSDAP in 1921",no 7 | Please gather in the auditorium tomorrow morning,yes 8 | Can we meet at starbucks tomorrow evening?,yes 9 | Will you be joning today's team meeting?,yes 10 | The lecture starts tomorrow morning in the Auditorium,yes 11 | Elizabeth II died during great war of Britain in 1933,no 12 | Hitler commmited suicide in 1945,no 13 | John got married in 1988,no 14 | Rajinikanth's next movie will be released on 2018,no 15 | Abdul kalam's next Anniversary will be held in Chennai on August 2017,yes 16 | I will be there for your son's birthday party tomorrow,yes 17 | I will be late for the meeting by 10 minutes,yes 18 | Do you expect more guests for the get togather tomorrow at Dallas?,yes 19 | Independence day is a day of remembrance for all citizens,yes 20 | He is planning to participate in marathon next year,no 21 | There is a annual meeting for Agriculture development conducted by the Dallas Agro Development Deparment,yes 22 | No new movies are released tomorrow,no 23 | " Yesterday, he said he would attend the meeting, but he didnt",no 24 | More people attended Tom's Anniversary last week,no 25 | I will definitely attend his marriage next month at San Fransisco,yes 26 | Anniversaries are hard to remember these days,no 27 | I just got a text from 
office that there is a meeting scheduled tomorrow morning,yes 28 | I forgot that I had a meeting to attend,no 29 | We are planning a surprise gift for our parents' wedding Anniversary,no 30 | My sister's marriage is happening tomorrow in Temple,yes 31 | Are you available tomorrow for a short meeting?,yes 32 | My boss said we need to have a one to one meeting tomorrow,yes 33 | I invite all of you for my son's birthday next week at my place,yes 34 | Will you attend my birthday party today?,yes 35 | I am very happy to hear about your wedding,no 36 | There is a meeting scheduled tomorrow to discuss about recent happenings,yes 37 | We have a gathering at our place next monday,yes 38 | Tom's wedding is happening next month,yes 39 | His marriage was celebrated in a grand manner,no 40 | Lot of activities are planned for his 56th Anniversary at his house,no 41 | He is not going to attend the meeting tomorrow,yes 42 | He is not attending the birthday party tomorrow,yes 43 | He is not attending Tom's birthday party tomorrow,yes 44 | He is not attending the lecture next week,yes 45 | He is not attending the seminar next month,yes 46 | What can we do if he doesn't attend the meeting next month,yes 47 | Tomorrow's board meeting is cancelled,yes 48 | Please dont come to my son's wedding next week,yes 49 | Annual meeting is cancelled tomorrow,yes 50 | Nivas will not be attending any meeting tomorrow,yes 51 | Nivas is getting married next year,yes 52 | Nivas's birthday party is planned tomorrow evening,yes 53 | He will not go to movie tomorrow,no 54 | He is very sick,no 55 | Do we have class tomorrow?,no 56 | How to train your dragon within a week?,no 57 | She was on phone the whole night,no 58 | -------------------------------------------------------------------------------- /sampleInput.txt: -------------------------------------------------------------------------------- 1 | Hitler was born in Austria, then part of Austria-Hungary, and raised near Linz.He moved to Germany in 1913 
and was decorated during his married couple in the German Army in World War I. 2 | He joined the German Workers' Party, the prcursor of the NSDAP, in 1919 and became leader of the NSDAP in 1921. In 1923, he attempted a coup in Munich to seize power. 3 | The failed coup resulted in Hitler's imprisonment, during which time he dictated the first volume of his autobiography and political manifesto Mein Kampf ("My Struggle"). 4 | After his release in 1924, Hitler gained popular support by attcking the Traty of Versailles and promoting Pan-Germanism, anti-Semitism, and anti-communism with charismatic oratory and Nazi propaganda. 5 | Hitler frequently denounced international capitalism and communism as being part of a Jewish conspracy. Lecture starts at 3 PM. Anniversary was over last week. Homework is due tomorrow. 6 | Board meeting is scheduled tomorrow.There is a seminar tomorrow at 11:00 AM at the Auditorium in University of Texas at Dallas. 7 | Do you expect more guests for the get together tomorrow at Dallas? 8 | -------------------------------------------------------------------------------- /timex.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import os 4 | import sys 5 | 6 | # Requires eGenix.com mx Base Distribution 7 | # http://www.egenix.com/products/python/mxBase/ 8 | try: 9 | from mx.DateTime import * 10 | except ImportError: 11 | print """ 12 | Requires eGenix.com mx Base Distribution 13 | http://www.egenix.com/products/python/mxBase/""" 14 | 15 | # Predefined strings. 
# Regular-expression fragments used to recognise temporal expressions.
# The literals are built with implicit string concatenation: a backslash
# continuation inside a string literal would splice the next line's
# indentation into the alternation and make the words on that line unmatchable.
# Longer number words come first so that, e.g., "sixteen" is never consumed
# as "six" by a caller that does not backtrack.
numbers = (r"(^a(?=\s)|eleven|twelve|thirteen|fourteen|fifteen|sixteen|"
           r"seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|"
           r"seventy|eighty|ninety|hundred|thousand|one|two|three|four|five|"
           r"six|seven|eight|nine|ten)")
day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = r"(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (r"(january|february|march|april|may|june|july|august|september|"
         r"october|november|december)")
dmy = r"(year|day|week|month)"
rel_day = r"(today|yesterday|tomorrow|tonight|tonite)"
exp1 = r"(before|after|earlier|later|ago)"
exp2 = r"(this|next|last)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
year = r"((?<=\s)\d{4}|^\d{4})"
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)


def tag(text):
    """Wrap every temporal expression found in *text* in TIMEX2 tags.

    Five kinds of expression are recognised: counted offsets such as
    "3 days ago" (reg1), "this/next/last" + unit, weekday or month (reg2),
    relative days such as "tomorrow" (reg3), ISO-like timestamps (reg4) and
    four-digit years (reg5).

    Returns the text with each expression rewritten as
    ``<TIMEX2>expression</TIMEX2>``; text without any temporal expression is
    returned unchanged.
    """
    timex_found = []

    # reg1 and reg2 contain several groups, so findall() yields tuples whose
    # first element (the outermost group) is the complete match.
    timex_found.extend(groups[0] for groups in reg1.findall(text))
    timex_found.extend(groups[0] for groups in reg2.findall(text))

    # reg3/reg5 have a single group and reg4 has none, so findall() already
    # yields the matched strings.
    timex_found.extend(reg3.findall(text))
    timex_found.extend(reg4.findall(text))
    timex_found.extend(reg5.findall(text))

    def wrap(match):
        return '<TIMEX2>' + match.group(0) + '</TIMEX2>'

    # Tag only temporal expressions which haven't been tagged: the negative
    # lookahead skips occurrences already followed by a closing tag, so an
    # expression that occurs (and is collected) twice is not wrapped twice.
    # The expression is escaped because it is literal text, not a pattern.
    for timex in timex_found:
        text = re.sub(re.escape(timex) + '(?!</TIMEX2>)', wrap, text)

    return text


# Hash function for week days to simplify the grounding task.
# [Mon..Sun] -> [0..6]
hashweekdays = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6}

# Hash function for months to simplify the grounding task.
# [Jan..Dec] -> [1..12]
hashmonths = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12}

# Value of every number word understood by hashnum().
_NUMBER_VALUES = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11,
    'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
    'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19,
    'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50, 'sixty': 60,
    'seventy': 70, 'eighty': 80, 'ninety': 90, 'hundred': 100,
    'thousand': 1000}

# Longest words first, so that "sixteen" is never read as "six".
_NUMBER_WORDS = '|'.join(
    sorted(_NUMBER_VALUES, key=lambda word: (-len(word), word)))
_NUMBER_PREFIX_RE = re.compile('(' + _NUMBER_WORDS + r')|a\b', re.IGNORECASE)
_NUMBER_TOKEN_RE = re.compile(r'^a\b|(?:' + _NUMBER_WORDS + ')',
                              re.IGNORECASE)

# Month names in calendar order, as a regex group.
_MONTH_NAMES = '(' + '|'.join(sorted(hashmonths, key=hashmonths.get)) + ')'

# Offset (in weeks, months or years) implied by each relative qualifier.
_RELATIVE_STEP = {'last': -1, 'this': 0, 'next': 1}

# Qualifiers that move a counted offset into the past.
_PAST_WORDS = ('ago', 'earlier', 'before')


# Hash number in words into the corresponding integer value
def hashnum(number):
    """Return the integer named by the number word that starts *number*.

    Matching is case-insensitive and anchored at the start of the string;
    the longest number word wins, so "sixteen" gives 16 rather than 6.
    A leading stand-alone "a" counts as 1. Returns None when *number* does
    not start with a known number word.
    """
    found = _NUMBER_PREFIX_RE.match(number)
    if found is None:
        return None
    word = found.group(1)
    if word is None:
        # Only the article "a" matched.
        return 1
    return _NUMBER_VALUES[word.lower()]


def _words_to_int(tokens):
    """Combine number words into one integer ("two hundred five" -> 205)."""
    total = 0
    current = 0
    for token in tokens:
        value = hashnum(token)
        if value == 100:
            # "hundred" multiplies what precedes it; bare "hundred" is 100.
            current = max(current, 1) * 100
        elif value == 1000:
            total += max(current, 1) * 1000
            current = 0
        else:
            current += value
    return total + current


def _iso_week_label(moment):
    """Format *moment* as '<year>W<week>' using the week of its iso_week."""
    # iso_week is a triple (year, week, day); only the week is used.
    return str(moment.year) + 'W' + str(moment.iso_week[1])


def _shift_month(base_date, delta):
    """Return '<year>-<month>' for the month *delta* months from base_date."""
    # Counting months from year 0 lets divmod handle the year boundary.
    months = base_date.year * 12 + (base_date.month - 1) + delta
    new_year, month_index = divmod(months, 12)
    return str(new_year) + '-' + str(month_index + 1)


def _timex_value(timex, base_date):
    """Return the grounded value of one timex, or 'UNKNOWN'."""
    ignore_case = re.IGNORECASE

    # ISO format: drop the 'time' part and reorder the 'date' part.
    if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex):
        date_parts = re.split(r'/|-', re.split(r'\s', timex)[0])
        return date_parts[2] + '-' + date_parts[1] + '-' + date_parts[0]

    # Specific year. Anchored so that "1000 days ago" is not taken as a year.
    if re.match(r'\d{4}$', timex):
        return timex

    # Counted offsets: "<n> <unit>s <qualifier>".
    found = re.match(
        r'(\d+) (day|week|month|year)s? (ago|earlier|before|later|after)',
        timex, ignore_case)
    if found:
        amount = int(found.group(1))
        if found.group(3).lower() in _PAST_WORDS:
            amount = -amount
        unit = found.group(2).lower()
        if unit == 'day':
            return str(base_date + RelativeDateTime(days=amount))
        if unit == 'week':
            return _iso_week_label(base_date + RelativeDateTime(weeks=amount))
        if unit == 'month':
            return _shift_month(base_date, amount)
        return str(base_date.year + amount)

    # Last, this, next month / year.
    found = re.match(r'(last|this|next) (month|year)', timex, ignore_case)
    if found:
        step = _RELATIVE_STEP[found.group(1).lower()]
        if found.group(2).lower() == 'month':
            return _shift_month(base_date, step)
        return str(base_date.year + step)

    # Named month in the previous, current or following year.
    found = re.match(r'(last|this|next) ' + _MONTH_NAMES, timex, ignore_case)
    if found:
        step = _RELATIVE_STEP[found.group(1).lower()]
        # hashmonths keys are capitalised; the match is case-insensitive.
        month_number = hashmonths[found.group(2).capitalize()]
        return str(base_date.year + step) + '-' + str(month_number)

    # Relative days.
    if re.match(r'tonight|tonite|today', timex, ignore_case):
        return str(base_date)
    if re.match(r'yesterday', timex, ignore_case):
        return str(base_date + RelativeDateTime(days=-1))
    if re.match(r'tomorrow', timex, ignore_case):
        return str(base_date + RelativeDateTime(days=+1))

    # Last, this, next week.
    found = re.match(r'(last|this|next) week', timex, ignore_case)
    if found:
        step = _RELATIVE_STEP[found.group(1).lower()]
        return _iso_week_label(base_date + RelativeDateTime(weeks=step))

    # Weekday in the previous, current or following week.
    found = re.match(r'(last|this|next) ' + week_day, timex, ignore_case)
    if found:
        step = _RELATIVE_STEP[found.group(1).lower()]
        weekday = hashweekdays[found.group(2).capitalize()]
        return str(base_date + RelativeDateTime(weeks=step,
                                                weekday=(weekday, 0)))

    return 'UNKNOWN'


# Given a timex_tagged_text and a Date object set to base_date,
# returns timex_grounded_text
def ground(tagged_text, base_date):
    """Add a ``val`` attribute to every ``<TIMEX2>`` tag in *tagged_text*.

    tagged_text -- text whose temporal expressions are wrapped in
                   ``<TIMEX2>...</TIMEX2>``.
    base_date   -- reference date for relative expressions. The code reads
                   its ``year`` and ``month``, adds ``RelativeDateTime``
                   offsets to it and reads ``iso_week`` from the result.
                   NOTE(review): presumably an mx.DateTime date -- confirm
                   against callers.

    Returns the text with each ``<TIMEX2>x</TIMEX2>`` rewritten as
    ``<TIMEX2 val="...">x</TIMEX2>``; expressions that cannot be resolved get
    the value 'UNKNOWN'.
    """
    # Find all identified timex (tag contents only).
    timex_found = re.findall(r'<TIMEX2>(.*?)</TIMEX2>', tagged_text,
                             re.DOTALL)

    for timex_ori in timex_found:
        timex = timex_ori

        # If numbers are given in words, hash them into corresponding numbers.
        # eg. twenty five days ago --> 25 days ago
        if _NUMBER_TOKEN_RE.search(timex):
            pieces = re.split(r'\s(?=days?|months?|years?|weeks?)', timex,
                              maxsplit=1, flags=re.IGNORECASE)
            # Leave the timex alone when no unit follows the number words.
            if len(pieces) == 2:
                amount = _words_to_int(_NUMBER_TOKEN_RE.findall(pieces[0]))
                timex = str(amount) + ' ' + pieces[1]

        timex_val = _timex_value(timex, base_date)

        # Remove 'time' from timex_val.
        # For example, If timex_val = 2000-02-20 12:23:34.45, then
        # timex_val = 2000-02-20
        timex_val = re.sub(r'\s.*', '', timex_val)

        # Substitute tag+timex in the text with grounded tag+timex.
        tagged_text = tagged_text.replace(
            '<TIMEX2>' + timex_ori + '</TIMEX2>',
            '<TIMEX2 val="' + timex_val + '">' + timex_ori + '</TIMEX2>')

    return tagged_text

####


def demo():
    """Tag the first 10000 characters of NLTK's ABC 'rural.txt' and print."""
    import nltk
    text = nltk.corpus.abc.raw('rural.txt')[:10000]
    print(tag(text))


if __name__ == '__main__':
    demo()