├── .gitignore ├── README ├── kilgoretrout └── extract.py ├── requirements.txt └── samples ├── techcrunch_facebook_ipo.txt ├── ulysses.pickle ├── ulysses.short.txt └── ulysses.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.sw* 3 | .*project 4 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Extract person-location and person-GPE relationships. 2 | 3 | Usage: python extract.py filename 4 | -------------------------------------------------------------------------------- /kilgoretrout/extract.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections import defaultdict 3 | import logging 4 | import nltk 5 | from nltk.sem import relextract 6 | import operator 7 | import re 8 | import sys 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | NE_TYPES = ( 13 | "ORGANIZATION", # Georgia-Pacific Corp., WHO 14 | "PERSON", # Eddy Bonte, President Obama 15 | "LOCATION", # Murray River, Mount Everest 16 | "DATE", # June, 2008-06-29 17 | "TIME", # two fifty a m, 1:30 p.m. 18 | "MONEY", # 175 million Canadian Dollars, GBP 10.40 19 | "PERCENT", # twenty pct, 18.75 % 20 | "FACILITY", # Washington Monument, Stonehenge 21 | "GPE", # South East Asia, Midlothian 22 | ) 23 | 24 | 25 | def is_single_item(item): 26 | return not hasattr(item, "__iter__") 27 | 28 | 29 | def iter_nodes(tree): 30 | for elem in tree: 31 | try: 32 | yield elem.node 33 | except AttributeError: 34 | pass 35 | 36 | 37 | class NECorpus: 38 | def __init__(self, text): 39 | self._sents = self.tokenize_sentences(text) 40 | self._sents = self.tokenize_words(self._sents) 41 | self._sents = self.tag_nes(self._sents) 42 | self._postprocess() 43 | 44 | 45 | def tokenize_sentences(cls, text): 46 | tokenizer_url = 'nltk:tokenizers/punkt/english.pickle' 47 | sentence_tokenizer = nltk.data.load(tokenizer_url) 48 | sents = sentence_tokenizer.tokenize(text) 49 | return sents 50 | 51 | 52 | def tokenize_words(cls, sents): 53 | word_tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer() 54 | tokenized_sents = [word_tokenizer.tokenize(sent) for sent in sents] 55 | return tokenized_sents 56 | 57 | 58 | def tag_nes(cls, tokenized_sents): 59 | tagger_url = 'nltk:taggers/maxent_treebank_pos_tagger/english.pickle' 60 | tagger = nltk.data.load(tagger_url) 61 | tagged = tagger.batch_tag(tokenized_sents) 62 | 63 | ne_chunker_url = 'nltk:chunkers/maxent_ne_chunker/english_ace_multiclass.pickle' 64 | ne_chunker = nltk.data.load(ne_chunker_url) 65 | nes = ne_chunker.batch_parse(tagged) 66 | return nes 67 | 68 | 69 | def _postprocess(self): 70 | """Perform postprocessing techniques to increase accuracy and recall.""" 71 | # normalize choices of NE throughout the text, e.g. if "Billy Flannigan" 72 | # is usually a PERSON, make him always a PERSON 73 | def symbolize(tree): 74 | return '_'.join(word.lower() for word, tag in tree.leaves()) 75 | 76 | nes = self.nes() 77 | 78 | counts = defaultdict(lambda: defaultdict(int)) 79 | for sentence_no, ne in nes: 80 | sym = symbolize(ne) 81 | choice = ne.node 82 | counts[sym][choice] += 1 83 | 84 | normalized = dict() 85 | for sym, choices in counts.iteritems(): 86 | if len(choices) < 2: 87 | continue 88 | 89 | choice, _ = max(choices.iteritems(), key=operator.itemgetter(1)) 90 | normalized[sym] = choice 91 | LOG.debug("Normalizing NE '%s' from choices %s => %s" % (sym, choices.items(), choice)) 92 | 93 | for sentence_no, ne in nes: 94 | sym = symbolize(ne) 95 | ne.node = normalized.get(sym, ne.node) 96 | 97 | 98 | def rejoin_sent(cls, tree): 99 | return ' '.join(word for word, tag in tree.leaves()) 100 | 101 | 102 | def sents(self): 103 | """Get the sentences as split by this corpus. Note the original text 104 | is not saved, so rejoining the tokenized version may have slight 105 | differences. 106 | 107 | >>> text = 'The Project Gutenberg EBook of Ulysses, by James Joyce. Use this with care.' 108 | >>> corpus = NECorpus(text) 109 | >>> corpus.sents() 110 | ['The Project Gutenberg EBook of Ulysses , by James Joyce .', 'Use this with care .'] 111 | """ 112 | return [self.rejoin_sent(sent) for sent in self._sents] 113 | 114 | 115 | def parsed_sents(self): 116 | """Get sentences as parsed, tokenized, and tagged by this corpus.""" 117 | return self._sents 118 | 119 | 120 | def nes(self, nes=NE_TYPES): 121 | """Get all NEs of the specified types in the text, or every NE if 122 | not specified, and the sentence they occur in.""" 123 | if is_single_item(nes): 124 | nes = [nes] 125 | nes = set(nes) 126 | result = [] 127 | for index, sent in enumerate(self._sents): 128 | for elem in sent: 129 | try: 130 | if elem.node in nes: 131 | result.append((index, elem)) 132 | except AttributeError: 133 | pass 134 | 135 | return result 136 | 137 | 138 | def ne_sents(self, nes=NE_TYPES, match_all=False): 139 | """Get sentences containing any of the specified NEs, or any NE if not specified. 140 | 141 | >>> text = 'The Project Gutenberg EBook of Ulysses, by James Joyce. Use this with care.' 142 | >>> corpus = NECorpus(text) 143 | >>> corpus.ne_sents('PERSON') 144 | ['The Project Gutenberg EBook of Ulysses , by James Joyce .'] 145 | """ 146 | return [self.rejoin_sent(sent) for sent in self.ne_parsed_sents(nes, match_all)] 147 | 148 | 149 | def ne_parsed_sents(self, nes=NE_TYPES, match_all=False): 150 | """Get parsed sentences containing any of the specified NEs, or any NE if not specified.""" 151 | if is_single_item(nes): 152 | nes = [nes] 153 | nes = set(nes) 154 | filterfn = self._contains_all_nodes if match_all else self._contains_any_node 155 | return [sent for sent in self._sents if filterfn(sent, nes)] 156 | 157 | 158 | def _contains_all_nodes(cls, tree, nodes): 159 | nes = set(iter_nodes(tree)) 160 | return nodes.issuperset(nes) 161 | 162 | 163 | def _contains_any_node(cls, tree, nodes): 164 | nes = set(iter_nodes(tree)) 165 | return nodes.intersection(nes) 166 | 167 | 168 | def extract_rels(self, subj, obj): 169 | """Extract relationships of the given named entity subj and obj 170 | type.""" 171 | return self._naive_extract(subj, obj) 172 | 173 | 174 | def _naive_extract(self, subj, obj): 175 | """Get sentences containing both subj and obj named entities.""" 176 | # Duplicating self.ne_parsed_sents([subj, obj]) ... 177 | cond = set((subj, obj)) 178 | result = [] 179 | for index, sent in enumerate(self._sents): 180 | nes = [elem for elem in sent if hasattr(elem, 'node')] 181 | nodes = set(elem.node for elem in nes) 182 | if cond.issubset(nodes): 183 | matching_nes = [elem for elem in nes if elem.node in cond] 184 | result.append((index, matching_nes)) 185 | 186 | return result 187 | 188 | 189 | def _nltk_extract(self, subj, obj): 190 | """Use NLTK's built-in relationship extractor to get subj and obj 191 | named entity relationships and context.""" 192 | re_location = re.compile(".*") 193 | result = [] 194 | for sent in self._sents: 195 | extraction = relextract.extract_rels( 196 | subj, 197 | obj, 198 | sent, 199 | pattern=re_location, 200 | ) 201 | 202 | if extraction: 203 | result.append(extraction) 204 | 205 | return result 206 | 207 | 208 | if __name__ == "__main__": 209 | parser = argparse.ArgumentParser(description='Extract person-location and person-GPE relationships.') 210 | parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), help='filename to process, or - for stdin') 211 | parser.add_argument('--tests', dest='run_tests', action='store_true', help='run tests') 212 | args = parser.parse_args() 213 | 214 | logging.basicConfig(level=logging.DEBUG) 215 | 216 | did_something = False 217 | 218 | if args.run_tests: 219 | did_something = True 220 | import doctest 221 | doctest.testmod() 222 | 223 | text = args.infile.read() if args.infile else '' 224 | if not text: 225 | if not did_something: 226 | parser.print_usage() 227 | sys.exit() 228 | 229 | corpus = NECorpus(text) 230 | a = corpus.extract_rels('PERSON', 'LOCATION') 231 | b = corpus.extract_rels('PERSON', 'GPE') 232 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==3.09 2 | argparse==1.2.1 3 | nltk==2.0.1rc1 4 | numpy==1.5.1 5 | wsgiref==0.1.2 6 | -------------------------------------------------------------------------------- /samples/techcrunch_facebook_ipo.txt: -------------------------------------------------------------------------------- 1 | Today, LinkedIn officially started trading its shares on the New York Stock Exchange, Facebook may not be too far behind in its own public offering. Facebook COO Sheryl Sandberg spoke briefly about the possibility of a Facebook IPO at Reuters Global Technology Summit today, saying that a public offering of Facebook shares is “inevitable.” Reuters reports that Sandberg declined to comment on when an IPO would take place. As stated inReuters’ account of her remarks, Sandberg said: “It’s a process that all companies go through. It’s an inevitable process for us, the next thing that happens…People used to ask us if we were going to get sold. People have stopped asking that question — we’re not … No one is buying us, we’re going public.” She also said that LinkedIn’s public offering “validated the importance” of the social networking business. Facebook is reportedly meeting with bankers to discuss IPO size and time frame for an offering. It’s been thought that the social network will go public by April 2012, but it could happen before this date. 2 | 3 | -------------------------------------------------------------------------------- /samples/ulysses.short.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-kurkowski/Kilgore-Trout/66cef67becf18713765b39e8376ee1cd6f5c4c5a/samples/ulysses.short.txt -------------------------------------------------------------------------------- /samples/ulysses.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/john-kurkowski/Kilgore-Trout/66cef67becf18713765b39e8376ee1cd6f5c4c5a/samples/ulysses.txt --------------------------------------------------------------------------------