├── .gitignore
├── .idea
├── checkstyle-idea.xml
└── dictionaries
│ └── vads.xml
├── Base.py
├── Event.py
├── Features.py
├── LexicalFeatures.py
├── Main.py
├── Project Description-Linguistic Learners copy.pdf
├── Project_F16_NLP6320.pdf
├── README.md
├── README.txt
├── SemanticFeatures.py
├── SyntacticFeatures.py
├── Utilities.py
├── project_ideas.txt
├── project_notes.txt
├── sampleInput.csv
├── sampleInput.txt
└── timex.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.log
3 | *iml
4 | .idea
5 | baseOutput.txt
6 | sampleOutput.txt
7 |
--------------------------------------------------------------------------------
/.idea/checkstyle-idea.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/dictionaries/vads.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/Base.py:
--------------------------------------------------------------------------------
1 | import Utilities
2 | import re, sys
3 | from tabulate import tabulate
4 |
5 |
# Predefined regular-expression fragments used to spot temporal expressions.
#
# The fragments are raw strings so that regex escapes such as \s and \d reach
# the regex engine untouched.  Long alternations are split with adjacent
# string literals rather than backslash-newline continuations: a continuation
# inside a string literal keeps the surrounding whitespace, which would turn
# alternatives such as "eleven" or "october" into " eleven" / " october".
numbers = (
    r"(^a(?=\s)|one|two|three|four|five|six|seven|eight|nine|ten|"
    r"eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
    r"eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|"
    r"ninety|hundred|thousand)"
)
day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
week_day = "(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"
month = (
    "(january|february|march|april|may|june|july|august|september|"
    "october|november|december)"
)
dmy = "(year|day|week|month)"
rel_day = "(today|tomorrow|tonight|tonite)"
exp1 = "(after)"
exp2 = "(this)"
iso = r"\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+"
year = r"((?<=\s)\d{4}|^\d{4})"

# "<count> <unit>(s) after", e.g. "3 days after" or "twenty-one weeks after".
# The outermost group wraps the whole expression, so it is element 0 of each
# tuple returned by findall().
regxp1 = r"((\d+|(" + numbers + r"[-\s]?)+) " + dmy + "s? " + exp1 + ")"
# "this <unit | weekday | month>", e.g. "this friday" or "this october".
regxp2 = "(" + exp2 + " (" + dmy + "|" + week_day + "|" + month + "))"

reg1 = re.compile(regxp1, re.IGNORECASE)
reg2 = re.compile(regxp2, re.IGNORECASE)
reg3 = re.compile(rel_day, re.IGNORECASE)
reg4 = re.compile(iso)
reg5 = re.compile(year)
29 |
def getCommandLineArgs():
    """Return the first two command line arguments as a pair.

    The main script unpacks the pair as (input file name, output file name).
    Raises IndexError when fewer than two arguments were supplied.
    """
    arguments = sys.argv
    firstArg = arguments[1]
    secondArg = arguments[2]
    return firstArg, secondArg
32 |
def isRequiredEvent(line):
    """Tell whether ``line`` mentions one of the supported event keywords.

    The comparison is case-insensitive and substring based.  Returns
    ``(True, keyword)`` for the first keyword (in declaration order) that
    occurs in the line, otherwise ``(False, "")``.
    """
    requiredEvents = ('marriage', 'birthday', 'meeting', 'anniversary', 'seminar')
    lowered = line.lower()
    matched = next((keyword for keyword in requiredEvents if keyword in lowered), None)
    if matched is None:
        return False, ""
    return True, matched
39 |
def preProcessData(input):
    """Parse the given input file into the objects the main loop iterates over.

    Parameters:
        input: path of the file to read.  The name shadows the builtin but is
            kept so existing callers keep working.

    Returns:
        Whatever ``Utilities.parseInputFile`` returns for that path.
    """
    # Use the argument rather than the script-level ``inputFileName`` global,
    # which only exists when this module is run as a script.
    return Utilities.parseInputFile(input)
54 |
def extractDate(text):
    """Collect the temporal expressions found in ``text``.

    Patterns are applied in a fixed order (reg1, reg2, reg3, reg4, reg5) and
    the matches are returned as one comma-separated string; the result is
    the empty string when nothing matched.
    """
    matches = []

    # reg1 and reg2 contain several groups, so findall() yields tuples whose
    # first element is the outermost group, i.e. the complete expression.
    for pattern in (reg1, reg2):
        matches.extend(groups[0] for groups in pattern.findall(text) if len(groups) > 1)

    # reg3 (today/tomorrow/...), reg4 (ISO timestamp) and reg5 (year) yield
    # plain strings from findall().
    for pattern in (reg3, reg4, reg5):
        matches.extend(pattern.findall(text))

    # Joining an empty list gives "", which is the "nothing found" result.
    return ",".join(matches)
92 |
93 |
94 |
def initialize():
    """Prepare the run by setting up the log through ``Utilities.setupLog``."""
    Utilities.setupLog()
97 |
if __name__ == '__main__':
    # Naive baseline: a line is reported as an event when it contains one of
    # the required keywords and at least one temporal expression.
    initialize()

    # Read command line parameters.
    inputFileName, outputFileName = getCommandLineArgs()

    # Preprocess input data.
    lines = preProcessData(inputFileName)

    result = []
    for line in lines:
        text = line.getText()

        isRequired, eventType = isRequiredEvent(text)
        if not isRequired:
            Utilities.writeLog("INFO [NAIVE APPROACH]: Event Detected but event type did not match with required events :" + text)
            continue

        eventDate = extractDate(text)
        if not eventDate:
            Utilities.writeLog("INFO [NAIVE APPROACH]: Event Detected but is identified as past event :" + text)
            continue

        # Count a true positive when the labelled answer agrees.
        if line.getActual() == "yes":
            Utilities.incrementTP()

        line.setPredict("yes")
        # The "Where" column is left empty by this baseline.
        result.append([eventType, eventDate, "", text])

    # Header row first, then one row per detected event.  A plain loop is used
    # because writeOutput is called only for its side effect.
    Utilities.writeOutput(outputFileName, ["Event", "When", "Where", "Text"])
    for row in result:
        Utilities.writeOutput(outputFileName, row)

    Utilities.computeRecall(lines)
    Utilities.printMetrics()
--------------------------------------------------------------------------------
/Event.py:
--------------------------------------------------------------------------------
1 |
class Event(object):
    """An extracted event described by its type, date and location."""

    def __init__(self, type, date, location):
        # Parameter names (including ``type``) are part of the public
        # signature and are kept as-is.
        self.type = type
        self.date = date
        self.location = location

    def format(self):
        """Return a one-line description of the event.

        The "where" part is included only when the location is not the
        empty string.
        """
        parts = ["Event : {}".format(self.type), "when: {}".format(self.date)]
        if self.location != "":
            parts.append("where: {}".format(self.location))
        return ", ".join(parts)
--------------------------------------------------------------------------------
/Features.py:
--------------------------------------------------------------------------------
1 | from SyntacticFeatures import SyntacticFeatures
2 | from SemanticFeatures import SemanticFeatures
3 | from LexicalFeatures import LexicalFeatures
4 |
class Features(object):
    """Per-sentence record: the text, its feature groups, the extracted
    event and the actual/predicted labels."""

    def __init__(self, text, actual):
        self.text = text
        self.actual = actual
        # Every record starts out predicted as "no" with no event attached.
        self.predict = "no"
        self.event = None
        self.lexicalFeatures = LexicalFeatures()
        self.syntacticFeatures = SyntacticFeatures()
        self.semanticFeatures = SemanticFeatures()

    # --- text -------------------------------------------------------------
    def getText(self):
        """Return the sentence text."""
        return self.text

    def setText(self, text):
        """Replace the sentence text."""
        self.text = text

    # --- labels -----------------------------------------------------------
    def getActual(self):
        """Return the actual label given at construction time."""
        return self.actual

    def getPredicted(self):
        """Return the predicted label ("no" until setPredict is called)."""
        return self.predict

    def setPredict(self, predict):
        """Store the predicted label."""
        self.predict = predict

    # --- event ------------------------------------------------------------
    def getEvent(self):
        """Return the attached event, or None when none was set."""
        return self.event

    def setEvent(self, event):
        """Attach an event to this record."""
        self.event = event

    # --- feature groups ---------------------------------------------------
    def getLexicalFeatures(self):
        """Return the lexical feature holder."""
        return self.lexicalFeatures

    def getSyntacticFeatures(self):
        """Return the syntactic feature holder."""
        return self.syntacticFeatures

    def getSemanticFeatures(self):
        """Return the semantic feature holder."""
        return self.semanticFeatures
--------------------------------------------------------------------------------
/LexicalFeatures.py:
--------------------------------------------------------------------------------
1 |
class LexicalFeatures(object):
    """Holder for lexical features: a token list and a spell-corrected
    version of the sentence."""

    def __init__(self):
        # Defaults: no spell-corrected text and no tokens yet.
        self.spellCorrection = ""
        self.tokens = []

    def getTokens(self):
        """Return the stored tokens."""
        return self.tokens

    def setTokens(self, tokens):
        """Store the tokens."""
        self.tokens = tokens

    def getSpellCorrection(self):
        """Return the stored spell-corrected sentence."""
        return self.spellCorrection

    def setSpellCorrection(self, sentence):
        """Store the spell-corrected sentence."""
        self.spellCorrection = sentence
--------------------------------------------------------------------------------
/Main.py:
--------------------------------------------------------------------------------
1 | import nltk, sys, re
2 | from nltk.corpus import wordnet
3 | from enchant.checker import SpellChecker
4 | from autocorrect import spell
5 | import timex, Utilities
6 | from Event import Event
7 | from nltk.tag import StanfordNERTagger
8 | from Features import Features
9 |
# Event keywords, and the keyword -> synonyms table that setupKeywords() fills.
KEYWORDS = ['marriage', 'birthday', 'meeting', 'anniversary', 'seminar']
SYNONYMS_FOR_KEYWORDS = dict()

PAST_TENSE_TAGS = ['VBD', 'VBN']
TIMEX_TAG = ""
# NOTE(review): this pattern lazily matches any non-empty run of characters;
# it looks like surrounding tag delimiters may be missing - confirm.
TIMEX_TAG_REGEX = r'.+?'

# Location of the Stanford NER distribution (machine specific).
# Alternative used on another machine:
#STANFORD_NER_ROOT = "/Users/vads/Downloads/stanford-ner-2014-06-16/"
STANFORD_NER_ROOT = "/home/ram/Downloads/stanford-ner-2014-06-16/"
STANFORD_NER_PATH = "{}{}".format(STANFORD_NER_ROOT, 'stanford-ner.jar')

# Accumulated result rows and the header naming their columns.
RESULT = list()
RESULT_HEADER = [
    "Event",
    "When",
    "Where",
    "Original Text",
    "Lexical-Tokens",
    "Lexical-SpellCorrection",
    "Syntactic-POS tags",
    "Syntactic-Temporal tag",
    "Semantic-Synonym",
    "Semantic-Location",
]
20 |
def initialize():
    """Build the keyword synonym table, then set up logging."""
    setupKeywords()
    # Register 'lecture' by hand as an additional synonym for 'seminar'.
    SYNONYMS_FOR_KEYWORDS['seminar'] += ['lecture']
    Utilities.setupLog()
25 |
26 |
# Spell-correct the text of a feature object.
def performSpellCorrection(featureObj):
    """Spell-correct ``featureObj``'s text and record the corrected text.

    The corrected sentence is stored through the object's lexical features
    (``setSpellCorrection``); the same ``featureObj`` is returned.
    """
    spellChecker = SpellChecker("en_US", featureObj.getText())

    # Each item produced by iterating the checker exposes the current word
    # as ``.word`` and is replaced in place with ``spell``'s suggestion.
    for flagged in spellChecker:
        suggestion = spell(flagged.word)
        flagged.replace(suggestion)

    correctedText = spellChecker.get_text()
    featureObj.getLexicalFeatures().setSpellCorrection(correctedText)
    return featureObj
36 |
# Look up the synonyms of a word.
def getSynonyms(word):
    """Return the distinct WordNet lemma names found for ``word``.

    Lemma names from every synset of the word are collected, underscores are
    replaced by spaces, and duplicates are dropped.  The order of the
    returned list is unspecified.
    """
    uniqueNames = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            uniqueNames.add(re.sub("_", " ", lemma.name()))
    return list(uniqueNames)
44 |
def setupKeywords():
    """Fill SYNONYMS_FOR_KEYWORDS with the synonym list of every keyword."""
    # The module-level dict is mutated in place, keyword by keyword.
    SYNONYMS_FOR_KEYWORDS.update(
        (keyword, getSynonyms(keyword)) for keyword in KEYWORDS
    )
50 |
def isRequiredEvent(obj, dict):
    """Tell whether ``obj``'s text mentions a synonym of any keyword.

    Parameters:
        obj: object providing ``getText()`` and ``getSemanticFeatures()``.
        dict: mapping of keyword -> list of synonyms (the name shadows the
            builtin but is part of the existing signature).

    Returns:
        ``(True, keyword)`` for the first keyword with a synonym occurring
        (case-insensitive substring test) in the text; the keyword's whole
        synonym list is then recorded, as a string, on the object's semantic
        features.  ``(False, "")`` when nothing matches.
    """
    for keyword, synonyms in dict.items():
        if any(candidate in obj.getText().lower() for candidate in synonyms):
            obj.getSemanticFeatures().setSynonym(str(synonyms))
            return True, keyword
    return False, ""
58 |
59 | def getCommandLineArgs():
60 | if len(sys.argv) < 2:
61 | print "ERROR: Usage: Main.py