├── README.md ├── brown.pos ├── catchunked.py ├── chapter1.py ├── chapter2.py ├── chapter3.py ├── chapter4.py ├── chapter5.py ├── chapter6.py ├── chapter7.py ├── chapter8.py ├── chapter9.py ├── chunkers.py ├── classification.py ├── conll.iob ├── corpus.py ├── dist_featx.py ├── encoding.py ├── featx.py ├── heading_text.txt ├── mongoreader.py ├── movie_neg.txt ├── movie_pos.txt ├── mywords.txt ├── plists.py ├── rediscollections.py ├── redisprob.py ├── remote_chunk.py ├── remote_double.py ├── remote_tag.py ├── remote_word_count.py ├── replacers.py ├── requirements.txt ├── tag_util.py ├── taggers.py ├── transforms.py ├── treebank.chunk └── wordlist /README.md: -------------------------------------------------------------------------------- 1 | nltk3-cookbook 2 | ============== 3 | 4 | Python 3 code and corpus examples for the [Python 3 Text-Processing with NLTK 3 Cookbook](http://www.amazon.com/gp/product/B00N2RWMJU/ref=as_li_tl?ie=UTF8&camp=1789&creative=390957&creativeASIN=B00N2RWMJU&linkCode=as2&tag=streamhacker-20&linkId=4PEY2DGQOVOOEIDP). 5 | -------------------------------------------------------------------------------- /brown.pos: -------------------------------------------------------------------------------- 1 | The/at-tl expense/nn and/cc time/nn involved/vbn are/ber astronomical/jj ./. -------------------------------------------------------------------------------- /catchunked.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus.reader import CategorizedCorpusReader, ChunkedCorpusReader 2 | from nltk.corpus.reader import ConllCorpusReader, ConllChunkCorpusReader 3 | 4 | class CategorizedChunkedCorpusReader(CategorizedCorpusReader, ChunkedCorpusReader): 5 | """ 6 | A reader for chunked corpora whose documents are divided into categories 7 | based on their file identifiers. 
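    Example usage, mirroring the doctest in chapter3.py (path is the result of
    nltk.data.find('corpora/treebank/tagged')):

        reader = CategorizedChunkedCorpusReader(path, r'wsj_.*\.pos',
                                                cat_pattern=r'wsj_(.*)\.pos')
        reader.chunked_sents(categories=['0001'])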
8 | """ 9 | # code adapted from CategorizedTaggedCorpusReader 10 | def __init__(self, *args, **kwargs): 11 | CategorizedCorpusReader.__init__(self, kwargs) 12 | ChunkedCorpusReader.__init__(self, *args, **kwargs) 13 | 14 | def _resolve(self, fileids, categories): 15 | if fileids is not None and categories is not None: 16 | raise ValueError('Specify fileids or categories, not both') 17 | if categories is not None: 18 | return self.fileids(categories) 19 | else: 20 | return fileids 21 | 22 | def raw(self, fileids=None, categories=None): 23 | return ChunkedCorpusReader.raw(self, self._resolve(fileids, categories)) 24 | 25 | def words(self, fileids=None, categories=None): 26 | return ChunkedCorpusReader.words(self, self._resolve(fileids, categories)) 27 | 28 | def sents(self, fileids=None, categories=None): 29 | return ChunkedCorpusReader.sents(self, self._resolve(fileids, categories)) 30 | 31 | def paras(self, fileids=None, categories=None): 32 | return ChunkedCorpusReader.paras(self, self._resolve(fileids, categories)) 33 | 34 | def tagged_words(self, fileids=None, categories=None): 35 | return ChunkedCorpusReader.tagged_words(self, self._resolve(fileids, categories)) 36 | 37 | def tagged_sents(self, fileids=None, categories=None): 38 | return ChunkedCorpusReader.tagged_sents(self, self._resolve(fileids, categories)) 39 | 40 | def tagged_paras(self, fileids=None, categories=None): 41 | return ChunkedCorpusReader.tagged_paras(self, self._resolve(fileids, categories)) 42 | 43 | def chunked_words(self, fileids=None, categories=None): 44 | return ChunkedCorpusReader.chunked_words( 45 | self, self._resolve(fileids, categories)) 46 | 47 | def chunked_sents(self, fileids=None, categories=None): 48 | return ChunkedCorpusReader.chunked_sents( 49 | self, self._resolve(fileids, categories)) 50 | 51 | def chunked_paras(self, fileids=None, categories=None): 52 | return ChunkedCorpusReader.chunked_paras( 53 | self, self._resolve(fileids, categories)) 54 | 55 | class CategorizedConllChunkCorpusReader(CategorizedCorpusReader, ConllChunkCorpusReader): 56 | """ 57 | A reader for conll chunked corpora whose documents are divided into 58 | categories based on their file identifiers. 
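    Example usage, mirroring the doctest in chapter3.py (path is the result of
    nltk.data.find('corpora/conll2000')):

        reader = CategorizedConllChunkCorpusReader(path, r'.*\.txt',
                                                   ('NP', 'VP', 'PP'),
                                                   cat_pattern=r'(.*)\.txt')
        reader.chunked_sents(categories=['test'])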
59 | """ 60 | def __init__(self, *args, **kwargs): 61 | # NOTE: in addition to cat_pattern, ConllChunkCorpusReader also requires 62 | # chunk_types as third argument, which defaults to ('NP','VP','PP') 63 | CategorizedCorpusReader.__init__(self, kwargs) 64 | ConllChunkCorpusReader.__init__(self, *args, **kwargs) 65 | 66 | def _resolve(self, fileids, categories): 67 | if fileids is not None and categories is not None: 68 | raise ValueError('Specify fileids or categories, not both') 69 | if categories is not None: 70 | return self.fileids(categories) 71 | else: 72 | return fileids 73 | 74 | def raw(self, fileids=None, categories=None): 75 | return ConllCorpusReader.raw(self, self._resolve(fileids, categories)) 76 | 77 | def words(self, fileids=None, categories=None): 78 | return ConllCorpusReader.words(self, self._resolve(fileids, categories)) 79 | 80 | def sents(self, fileids=None, categories=None): 81 | return ConllCorpusReader.sents(self, self._resolve(fileids, categories)) 82 | 83 | def tagged_words(self, fileids=None, categories=None): 84 | return ConllCorpusReader.tagged_words(self, self._resolve(fileids, categories)) 85 | 86 | def tagged_sents(self, fileids=None, categories=None): 87 | return ConllCorpusReader.tagged_sents(self, self._resolve(fileids, categories)) 88 | 89 | def chunked_words(self, fileids=None, categories=None, chunk_types=None): 90 | return ConllCorpusReader.chunked_words( 91 | self, self._resolve(fileids, categories), chunk_types) 92 | 93 | def chunked_sents(self, fileids=None, categories=None, chunk_types=None): 94 | return ConllCorpusReader.chunked_sents( 95 | self, self._resolve(fileids, categories), chunk_types) 96 | 97 | def parsed_sents(self, fileids=None, categories=None, pos_in_tree=None): 98 | return ConllCorpusReader.parsed_sents( 99 | self, self._resolve(fileids, categories), pos_in_tree) 100 | 101 | def srl_spans(self, fileids=None, categories=None): 102 | return ConllCorpusReader.srl_spans(self, self._resolve(fileids, categories)) 103 | 104 | def srl_instances(self, fileids=None, categories=None, pos_in_tree=None, flatten=True): 105 | return ConllCorpusReader.srl_instances( 106 | self, self._resolve(fileids, categories), pos_in_tree, flatten) 107 | 108 | def iob_words(self, fileids=None, categories=None): 109 | return ConllCorpusReader.iob_words(self, self._resolve(fileids, categories)) 110 | 111 | def iob_sents(self, fileids=None, categories=None): 112 | return ConllCorpusReader.iob_sents(self, self._resolve(fileids, categories)) -------------------------------------------------------------------------------- /chapter1.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================== 3 | Tokenizing Text into Sentences 4 | ============================== 5 | 6 | >>> para = "Hello World. It's good to see you. Thanks for buying this book." 7 | >>> from nltk.tokenize import sent_tokenize 8 | >>> sent_tokenize(para) 9 | ['Hello World.', "It's good to see you.", 'Thanks for buying this book.'] 10 | 11 | >>> import nltk.data 12 | >>> tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle') 13 | >>> tokenizer.tokenize(para) 14 | ['Hello World.', "It's good to see you.", 'Thanks for buying this book.'] 15 | 16 | >>> spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle') 17 | >>> spanish_tokenizer.tokenize('Hola amigo. 
Estoy bien.') 18 | ['Hola amigo.', 'Estoy bien.'] 19 | 20 | 21 | =============================== 22 | Tokenizing Sentences into Words 23 | =============================== 24 | 25 | >>> from nltk.tokenize import word_tokenize 26 | >>> word_tokenize('Hello World.') 27 | ['Hello', 'World', '.'] 28 | 29 | >>> from nltk.tokenize import TreebankWordTokenizer 30 | >>> tokenizer = TreebankWordTokenizer() 31 | >>> tokenizer.tokenize('Hello World.') 32 | ['Hello', 'World', '.'] 33 | 34 | >>> word_tokenize("can't") 35 | ['ca', "n't"] 36 | 37 | >>> from nltk.tokenize import PunktWordTokenizer 38 | >>> tokenizer = PunktWordTokenizer() 39 | >>> tokenizer.tokenize("Can't is a contraction.") 40 | ['Can', "'t", 'is', 'a', 'contraction.'] 41 | 42 | >>> from nltk.tokenize import WordPunctTokenizer 43 | >>> tokenizer = WordPunctTokenizer() 44 | >>> tokenizer.tokenize("Can't is a contraction.") 45 | ['Can', "'", 't', 'is', 'a', 'contraction', '.'] 46 | 47 | 48 | ============================================== 49 | Tokenizing Sentences using Regular Expressions 50 | ============================================== 51 | 52 | >>> from nltk.tokenize import RegexpTokenizer 53 | >>> tokenizer = RegexpTokenizer("[\w']+") 54 | >>> tokenizer.tokenize("Can't is a contraction.") 55 | ["Can't", 'is', 'a', 'contraction'] 56 | 57 | >>> from nltk.tokenize import regexp_tokenize 58 | >>> regexp_tokenize("Can't is a contraction.", "[\w']+") 59 | ["Can't", 'is', 'a', 'contraction'] 60 | 61 | >>> tokenizer = RegexpTokenizer('\s+', gaps=True) 62 | >>> tokenizer.tokenize("Can't is a contraction.") 63 | ["Can't", 'is', 'a', 'contraction.'] 64 | 65 | 66 | ============================= 67 | Training a Sentence Tokenizer 68 | ============================= 69 | 70 | >>> from nltk.tokenize import PunktSentenceTokenizer 71 | >>> from nltk.corpus import webtext 72 | >>> text = webtext.raw('overheard.txt') 73 | >>> sent_tokenizer = PunktSentenceTokenizer(text) 74 | >>> sents1 = sent_tokenizer.tokenize(text) 75 | >>> sents1[0] 76 | 'White guy: So, do you have any plans for this evening?' 77 | >>> from nltk.tokenize import sent_tokenize 78 | >>> sents2 = sent_tokenize(text) 79 | >>> sents2[0] 80 | 'White guy: So, do you have any plans for this evening?' 81 | >>> sents1[678] 82 | 'Girl: But you already have a Big Mac...' 83 | >>> sents2[678] 84 | 'Girl: But you already have a Big Mac...\\nHobo: Oh, this is all theatrical.' 85 | 86 | >>> with open('/usr/share/nltk_data/corpora/webtext/overheard.txt', encoding='ISO-8859-2') as f: 87 | ... text = f.read() 88 | >>> sent_tokenizer = PunktSentenceTokenizer(text) 89 | >>> sents = sent_tokenizer.tokenize(text) 90 | >>> sents[0] 91 | 'White guy: So, do you have any plans for this evening?' 92 | >>> sents[678] 93 | 'Girl: But you already have a Big Mac...' 
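A minimal sketch, not part of the original recipe: a trained PunktSentenceTokenizer
can be saved and reloaded with pickle, the same way chapter4.py pickles a trained
tagger. The filename is hypothetical and the lines carry no doctest prompts, so
they are not executed:

    import pickle
    with open('sent_tokenizer.pickle', 'wb') as out:  # hypothetical filename
        pickle.dump(sent_tokenizer, out)
    with open('sent_tokenizer.pickle', 'rb') as saved:
        sent_tokenizer = pickle.load(saved)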
94 | 95 | 96 | =========================================== 97 | Filtering Stopwords in a Tokenized Sentence 98 | =========================================== 99 | 100 | >>> from nltk.corpus import stopwords 101 | >>> english_stops = set(stopwords.words('english')) 102 | >>> words = ["Can't", 'is', 'a', 'contraction'] 103 | >>> [word for word in words if word not in english_stops] 104 | ["Can't", 'contraction'] 105 | 106 | >>> stopwords.fileids() 107 | ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish'] 108 | 109 | >>> stopwords.words('dutch') 110 | ['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', 'ze', 'zich', 'bij', 'ook', 'tot', 'je', 'mij', 'uit', 'der', 'daar', 'haar', 'naar', 'heb', 'hoe', 'heeft', 'hebben', 'deze', 'u', 'want', 'nog', 'zal', 'me', 'zij', 'nu', 'ge', 'geen', 'omdat', 'iets', 'worden', 'toch', 'al', 'waren', 'veel', 'meer', 'doen', 'toen', 'moet', 'ben', 'zonder', 'kan', 'hun', 'dus', 'alles', 'onder', 'ja', 'eens', 'hier', 'wie', 'werd', 'altijd', 'doch', 'wordt', 'wezen', 'kunnen', 'ons', 'zelf', 'tegen', 'na', 'reeds', 'wil', 'kon', 'niets', 'uw', 'iemand', 'geweest', 'andere'] 111 | 112 | ========================================= 113 | Looking up a Synset for a Word in WordNet 114 | ========================================= 115 | 116 | >>> from nltk.corpus import wordnet 117 | >>> syn = wordnet.synsets('cookbook')[0] 118 | >>> syn.name() 119 | 'cookbook.n.01' 120 | >>> syn.definition() 121 | 'a book of recipes and cooking directions' 122 | 123 | >>> wordnet.synset('cookbook.n.01') 124 | Synset('cookbook.n.01') 125 | 126 | >>> wordnet.synsets('cooking')[0].examples() 127 | ['cooking can be a great art', 'people are needed who have experience in cookery', 'he left the preparation of meals to his wife'] 128 | 129 | >>> syn.hypernyms() 130 | [Synset('reference_book.n.01')] 131 | >>> syn.hypernyms()[0].hyponyms() 132 | [Synset('annual.n.02'), Synset('atlas.n.02'), Synset('cookbook.n.01'), Synset('directory.n.01'), Synset('encyclopedia.n.01'), Synset('handbook.n.01'), Synset('instruction_book.n.01'), Synset('source_book.n.01'), Synset('wordbook.n.01')] 133 | >>> syn.root_hypernyms() 134 | [Synset('entity.n.01')] 135 | 136 | >>> syn.hypernym_paths() 137 | [[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('creation.n.02'), Synset('product.n.02'), Synset('work.n.02'), Synset('publication.n.01'), Synset('book.n.01'), Synset('reference_book.n.01'), Synset('cookbook.n.01')]] 138 | 139 | >>> syn.pos() 140 | 'n' 141 | 142 | >>> len(wordnet.synsets('great')) 143 | 7 144 | >>> len(wordnet.synsets('great', pos='n')) 145 | 1 146 | >>> len(wordnet.synsets('great', pos='a')) 147 | 6 148 | 149 | 150 | ========================================= 151 | Looking up Lemmas and Synonyms in WordNet 152 | ========================================= 153 | 154 | >>> from nltk.corpus import wordnet 155 | >>> syn = wordnet.synsets('cookbook')[0] 156 | >>> lemmas = syn.lemmas() 157 | >>> len(lemmas) 158 | 2 159 | >>> lemmas[0].name() 160 | 'cookbook' 161 | >>> lemmas[1].name() 162 | 'cookery_book' 163 | >>> lemmas[0].synset() == lemmas[1].synset() 164 | True 165 | 166 | >>> [lemma.name() for lemma in syn.lemmas()] 167 | 
['cookbook', 'cookery_book'] 168 | 169 | >>> synonyms = [] 170 | >>> for syn in wordnet.synsets('book'): 171 | ... for lemma in syn.lemmas(): 172 | ... synonyms.append(lemma.name()) 173 | >>> len(synonyms) 174 | 38 175 | 176 | >>> len(set(synonyms)) 177 | 25 178 | 179 | >>> gn2 = wordnet.synset('good.n.02') 180 | >>> gn2.definition() 181 | 'moral excellence or admirableness' 182 | >>> evil = gn2.lemmas()[0].antonyms()[0] 183 | >>> evil.name() 184 | 'evil' 185 | >>> evil.synset().definition() 186 | 'the quality of being morally wrong in principle or practice' 187 | >>> ga1 = wordnet.synset('good.a.01') 188 | >>> ga1.definition() 189 | 'having desirable or positive qualities especially those suitable for a thing specified' 190 | >>> bad = ga1.lemmas()[0].antonyms()[0] 191 | >>> bad.name() 192 | 'bad' 193 | >>> bad.synset().definition() 194 | 'having undesirable or negative qualities' 195 | 196 | 197 | ===================================== 198 | Calculating WordNet Synset Similarity 199 | ===================================== 200 | 201 | >>> from nltk.corpus import wordnet 202 | >>> cb = wordnet.synset('cookbook.n.01') 203 | >>> ib = wordnet.synset('instruction_book.n.01') 204 | >>> cb.wup_similarity(ib) 205 | 0.9166666666666666 206 | 207 | >>> ref = cb.hypernyms()[0] 208 | >>> cb.shortest_path_distance(ref) 209 | 1 210 | >>> ib.shortest_path_distance(ref) 211 | 1 212 | >>> cb.shortest_path_distance(ib) 213 | 2 214 | 215 | >>> dog = wordnet.synsets('dog')[0] 216 | >>> dog.wup_similarity(cb) 217 | 0.38095238095238093 218 | 219 | >>> sorted(dog.common_hypernyms(cb)) 220 | [Synset('entity.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('whole.n.02')] 221 | 222 | >>> cook = wordnet.synset('cook.v.01') 223 | >>> bake = wordnet.synset('bake.v.02') 224 | >>> cook.wup_similarity(bake) 225 | 0.6666666666666666 226 | 227 | >>> cb.path_similarity(ib) 228 | 0.3333333333333333 229 | >>> cb.path_similarity(dog) 230 | 0.07142857142857142 231 | >>> cb.lch_similarity(ib) 232 | 2.538973871058276 233 | >>> cb.lch_similarity(dog) 234 | 0.9985288301111273 235 | 236 | 237 | ============================= 238 | Discovering Word Collocations 239 | ============================= 240 | 241 | >>> from nltk.corpus import webtext 242 | >>> from nltk.collocations import BigramCollocationFinder 243 | >>> from nltk.metrics import BigramAssocMeasures 244 | >>> words = [w.lower() for w in webtext.words('grail.txt')] 245 | >>> bcf = BigramCollocationFinder.from_words(words) 246 | >>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4) 247 | [("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')] 248 | 249 | >>> from nltk.corpus import stopwords 250 | >>> stopset = set(stopwords.words('english')) 251 | >>> filter_stops = lambda w: len(w) < 3 or w in stopset 252 | >>> bcf.apply_word_filter(filter_stops) 253 | >>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4) 254 | [('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')] 255 | 256 | >>> from nltk.collocations import TrigramCollocationFinder 257 | >>> from nltk.metrics import TrigramAssocMeasures 258 | >>> words = [w.lower() for w in webtext.words('singles.txt')] 259 | >>> tcf = TrigramCollocationFinder.from_words(words) 260 | >>> tcf.apply_word_filter(filter_stops) 261 | >>> tcf.apply_freq_filter(3) 262 | >>> tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4) 263 | [('long', 'term', 'relationship')] 264 | """ 265 | 266 | if __name__ == '__main__': 267 | import doctest 268 | doctest.testmod() 
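# A sketch, not part of the original file: the collocation finders in the
# "Discovering Word Collocations" recipe can also return every ngram with its
# score instead of only the top n, assuming bcf and BigramAssocMeasures are
# built as in the docstring above:
#
#     scored = bcf.score_ngrams(BigramAssocMeasures.likelihood_ratio)
#     best_bigram, best_score = scored[0]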
-------------------------------------------------------------------------------- /chapter2.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============== 3 | Stemming Words 4 | ============== 5 | 6 | >>> from nltk.stem import PorterStemmer 7 | >>> stemmer = PorterStemmer() 8 | >>> stemmer.stem('cooking') 9 | 'cook' 10 | >>> stemmer.stem('cookery') 11 | 'cookeri' 12 | 13 | >>> from nltk.stem import LancasterStemmer 14 | >>> stemmer = LancasterStemmer() 15 | >>> stemmer.stem('cooking') 16 | 'cook' 17 | >>> stemmer.stem('cookery') 18 | 'cookery' 19 | 20 | >>> from nltk.stem import RegexpStemmer 21 | >>> stemmer = RegexpStemmer('ing') 22 | >>> stemmer.stem('cooking') 23 | 'cook' 24 | >>> stemmer.stem('cookery') 25 | 'cookery' 26 | >>> stemmer.stem('ingleside') 27 | 'leside' 28 | 29 | >>> from nltk.stem import SnowballStemmer 30 | >>> SnowballStemmer.languages 31 | ('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish') 32 | >>> spanish_stemmer = SnowballStemmer('spanish') 33 | >>> spanish_stemmer.stem('hola') 34 | 'hol' 35 | 36 | 37 | ============================== 38 | Lemmatising Words with WordNet 39 | ============================== 40 | 41 | >>> from nltk.stem import WordNetLemmatizer 42 | >>> lemmatizer = WordNetLemmatizer() 43 | >>> lemmatizer.lemmatize('cooking') 44 | 'cooking' 45 | >>> lemmatizer.lemmatize('cooking', pos='v') 46 | 'cook' 47 | >>> lemmatizer.lemmatize('cookbooks') 48 | 'cookbook' 49 | 50 | >>> from nltk.stem import PorterStemmer 51 | >>> stemmer = PorterStemmer() 52 | >>> stemmer.stem('believes') 53 | 'believ' 54 | >>> lemmatizer.lemmatize('believes') 55 | 'belief' 56 | 57 | >>> stemmer.stem('buses') 58 | 'buse' 59 | >>> lemmatizer.lemmatize('buses') 60 | 'bus' 61 | >>> stemmer.stem('bus') 62 | 'bu' 63 | 64 | 65 | ============================================ 66 | Replacing Words Matching Regular Expressions 67 | ============================================ 68 | 69 | >>> from replacers import RegexpReplacer 70 | >>> replacer = RegexpReplacer() 71 | >>> replacer.replace("can't is a contraction") 72 | 'cannot is a contraction' 73 | >>> replacer.replace("I should've done that thing I didn't do") 74 | 'I should have done that thing I did not do' 75 | 76 | >>> from nltk.tokenize import word_tokenize 77 | >>> from replacers import RegexpReplacer 78 | >>> replacer = RegexpReplacer() 79 | >>> word_tokenize("can't is a contraction") 80 | ['ca', "n't", 'is', 'a', 'contraction'] 81 | >>> word_tokenize(replacer.replace("can't is a contraction")) 82 | ['can', 'not', 'is', 'a', 'contraction'] 83 | 84 | 85 | ============================= 86 | Removing Repeating Characters 87 | ============================= 88 | 89 | >>> from replacers import RepeatReplacer 90 | >>> replacer = RepeatReplacer() 91 | >>> replacer.replace('looooove') 92 | 'love' 93 | >>> replacer.replace('oooooh') 94 | 'ooh' 95 | >>> replacer.replace('goose') 96 | 'goose' 97 | 98 | 99 | ================================ 100 | Spelling Correction with Enchant 101 | ================================ 102 | 103 | >>> from replacers import SpellingReplacer 104 | >>> replacer = SpellingReplacer() 105 | >>> replacer.replace('cookbok') 106 | 'cookbook' 107 | 108 | >>> import enchant 109 | >>> d = enchant.Dict('en') 110 | >>> d.suggest('languege') 111 | ['language', 'languages', 'languor', "language's"] 112 | 113 | >>> from nltk.metrics import edit_distance 114 | >>> 
edit_distance('language', 'languege') 115 | 1 116 | >>> edit_distance('language', 'languor') 117 | 3 118 | 119 | >>> enchant.list_languages() 120 | ['en', 'en_CA', 'en_GB', 'en_US'] 121 | 122 | >>> dUS = enchant.Dict('en_US') 123 | >>> dUS.check('theater') 124 | True 125 | >>> dGB = enchant.Dict('en_GB') 126 | >>> dGB.check('theater') 127 | False 128 | >>> us_replacer = SpellingReplacer('en_US') 129 | >>> us_replacer.replace('theater') 130 | 'theater' 131 | >>> gb_replacer = SpellingReplacer('en_GB') 132 | >>> gb_replacer.replace('theater') 133 | 'theatre' 134 | 135 | >>> d = enchant.Dict('en_US') 136 | >>> d.check('nltk') 137 | False 138 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 139 | >>> d.check('nltk') 140 | True 141 | 142 | >>> from replacers import CustomSpellingReplacer 143 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 144 | >>> replacer = CustomSpellingReplacer(d) 145 | >>> replacer.replace('nltk') 146 | 'nltk' 147 | 148 | ================================= 149 | Replacing Negations with Antonyms 150 | ================================= 151 | 152 | >>> from replacers import AntonymReplacer 153 | >>> replacer = AntonymReplacer() 154 | >>> replacer.replace('good') 155 | >>> replacer.replace('uglify') 156 | 'beautify' 157 | >>> sent = ["let's", 'not', 'uglify', 'our', 'code'] 158 | >>> replacer.replace_negations(sent) 159 | ["let's", 'beautify', 'our', 'code'] 160 | 161 | >>> from replacers import AntonymWordReplacer 162 | >>> replacer = AntonymWordReplacer({'evil': 'good'}) 163 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 164 | ['good', 'is', 'good'] 165 | """ 166 | 167 | if __name__ == '__main__': 168 | import doctest 169 | doctest.testmod() -------------------------------------------------------------------------------- /chapter3.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================== 3 | Setting up a Custom Corpus 4 | ========================== 5 | 6 | >>> import os, os.path 7 | >>> path = os.path.expanduser('~/nltk_data') 8 | >>> if not os.path.exists(path): 9 | ... 
os.mkdir(path) 10 | >>> os.path.exists(path) 11 | True 12 | >>> import nltk.data 13 | >>> path in nltk.data.path 14 | True 15 | 16 | >>> nltk.data.load('corpora/cookbook/mywords.txt', format='raw') 17 | b'nltk\\n' 18 | 19 | >>> nltk.data.load('corpora/cookbook/synonyms.yaml') 20 | {'bday': 'birthday'} 21 | 22 | 23 | =========================== 24 | Creating a Word List Corpus 25 | =========================== 26 | 27 | >>> from nltk.corpus.reader import WordListCorpusReader 28 | >>> reader = WordListCorpusReader('.', ['wordlist']) 29 | >>> reader.words() 30 | ['nltk', 'corpus', 'corpora', 'wordnet'] 31 | >>> reader.fileids() 32 | ['wordlist'] 33 | 34 | >>> reader.raw() 35 | 'nltk\\ncorpus\\ncorpora\\nwordnet\\n' 36 | >>> from nltk.tokenize import line_tokenize 37 | >>> line_tokenize(reader.raw()) 38 | ['nltk', 'corpus', 'corpora', 'wordnet'] 39 | 40 | >>> from nltk.corpus import names 41 | >>> names.fileids() 42 | ['female.txt', 'male.txt'] 43 | >>> len(names.words('female.txt')) 44 | 5001 45 | >>> len(names.words('male.txt')) 46 | 2943 47 | 48 | >>> from nltk.corpus import words 49 | >>> words.fileids() 50 | ['en', 'en-basic'] 51 | >>> len(words.words('en-basic')) 52 | 850 53 | >>> len(words.words('en')) 54 | 234936 55 | 56 | 57 | ============================================ 58 | Creating a Part-of-Speech Tagged Word Corpus 59 | ============================================ 60 | 61 | >>> from nltk.corpus.reader import TaggedCorpusReader 62 | >>> reader = TaggedCorpusReader('.', r'.*\.pos') 63 | >>> reader.words() 64 | ['The', 'expense', 'and', 'time', 'involved', 'are', ...] 65 | >>> reader.tagged_words() 66 | [('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...] 67 | >>> reader.sents() 68 | [['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']] 69 | >>> reader.tagged_sents() 70 | [[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]] 71 | >>> reader.paras() 72 | [[['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']]] 73 | >>> reader.tagged_paras() 74 | [[[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]]] 75 | 76 | >>> from nltk.tokenize import SpaceTokenizer 77 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', word_tokenizer=SpaceTokenizer()) 78 | >>> reader.words() 79 | ['The', 'expense', 'and', 'time', 'involved', 'are', ...] 80 | 81 | >>> from nltk.tokenize import LineTokenizer 82 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', sent_tokenizer=LineTokenizer()) 83 | >>> reader.sents() 84 | [['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']] 85 | 86 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', tagset='en-brown') 87 | >>> reader.tagged_words(tagset='universal') 88 | [('The', 'DET'), ('expense', 'NOUN'), ('and', 'CONJ'), ...] 89 | 90 | >>> from nltk.corpus import treebank 91 | >>> treebank.tagged_words() 92 | [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...] 93 | >>> treebank.tagged_words(tagset='universal') 94 | [('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...] 95 | >>> treebank.tagged_words(tagset='brown') 96 | [('Pierre', 'UNK'), ('Vinken', 'UNK'), (',', 'UNK'), ...] 
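For reference, the TaggedCorpusReader at the start of this recipe is parsing the
single line of brown.pos, which stores space-separated word/tag pairs:

    The/at-tl expense/nn and/cc time/nn involved/vbn are/ber astronomical/jj ./.

Each token is split on '/' (the reader's default separator) and the tag comes
back upper-cased, which is why 'at-tl' appears above as 'AT-TL'.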
97 | 98 | 99 | ================================ 100 | Creating a Chunked Phrase Corpus 101 | ================================ 102 | 103 | >>> from nltk.corpus.reader import ChunkedCorpusReader 104 | >>> reader = ChunkedCorpusReader('.', r'.*\.chunk') 105 | >>> reader.chunked_words() 106 | [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ...] 107 | >>> reader.chunked_sents() 108 | [Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])] 109 | >>> reader.chunked_paras() 110 | [[Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])]] 111 | 112 | >>> from nltk.corpus.reader import ConllChunkCorpusReader 113 | >>> conllreader = ConllChunkCorpusReader('.', r'.*\.iob', ('NP', 'VP', 'PP')) 114 | >>> conllreader.chunked_words() 115 | [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), ...] 116 | >>> conllreader.chunked_sents() 117 | [Tree('S', [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), Tree('NP', [('executive', 'JJ'), ('vice', 'NN'), ('president', 'NN')]), Tree('PP', [('of', 'IN')]), Tree('NP', [('Balcor', 'NNP')]), ('.', '.')])] 118 | >>> conllreader.iob_words() 119 | [('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ...] 120 | >>> conllreader.iob_sents() 121 | [[('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ('had', 'VBD', 'B-VP'), ('been', 'VBN', 'I-VP'), ('executive', 'JJ', 'B-NP'), ('vice', 'NN', 'I-NP'), ('president', 'NN', 'I-NP'), ('of', 'IN', 'B-PP'), ('Balcor', 'NNP', 'B-NP'), ('.', '.', 'O')]] 122 | 123 | >>> reader.chunked_words()[0].leaves() 124 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')] 125 | >>> reader.chunked_sents()[0].leaves() 126 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')] 127 | >>> reader.chunked_paras()[0][0].leaves() 128 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')] 129 | 130 | 131 | ================================== 132 | Creating a Categorized Text Corpus 133 | ================================== 134 | 135 | >>> from nltk.corpus import brown 136 | >>> brown.categories() 137 | ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] 138 | 139 | >>> from nltk.corpus.reader import CategorizedPlaintextCorpusReader 140 | >>> reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt') 141 | >>> reader.categories() 142 | ['neg', 'pos'] 143 | >>> reader.fileids(categories=['neg']) 144 | ['movie_neg.txt'] 145 | >>> reader.fileids(categories=['pos']) 146 | ['movie_pos.txt'] 147 | 148 | >>> reader = CategorizedPlaintextCorpusReader('.', 
r'movie_.*\.txt', cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']}) 149 | >>> reader.categories() 150 | ['neg', 'pos'] 151 | 152 | 153 | =================================== 154 | Creating a Categorized Chunk Corpus 155 | =================================== 156 | 157 | >>> import nltk.data 158 | >>> from catchunked import CategorizedChunkedCorpusReader 159 | >>> path = nltk.data.find('corpora/treebank/tagged') 160 | >>> reader = CategorizedChunkedCorpusReader(path, r'wsj_.*\.pos', cat_pattern=r'wsj_(.*)\.pos') 161 | >>> len(reader.categories()) == len(reader.fileids()) 162 | True 163 | >>> len(reader.chunked_sents(categories=['0001'])) 164 | 16 165 | 166 | >>> import nltk.data 167 | >>> from catchunked import CategorizedConllChunkCorpusReader 168 | >>> path = nltk.data.find('corpora/conll2000') 169 | >>> reader = CategorizedConllChunkCorpusReader(path, r'.*\.txt', ('NP','VP','PP'), cat_pattern=r'(.*)\.txt') 170 | >>> reader.categories() 171 | ['test', 'train'] 172 | >>> reader.fileids() 173 | ['test.txt', 'train.txt'] 174 | >>> len(reader.chunked_sents(categories=['test'])) 175 | 2012 176 | 177 | 178 | =================== 179 | Lazy Corpus Loading 180 | =================== 181 | 182 | >>> from nltk.corpus.util import LazyCorpusLoader 183 | >>> from nltk.corpus.reader import WordListCorpusReader 184 | >>> reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist']) 185 | >>> isinstance(reader, LazyCorpusLoader) 186 | True 187 | >>> reader.fileids() 188 | ['wordlist'] 189 | >>> isinstance(reader, LazyCorpusLoader) 190 | False 191 | >>> isinstance(reader, WordListCorpusReader) 192 | True 193 | 194 | 195 | ============================= 196 | Creating a Custom Corpus View 197 | ============================= 198 | 199 | >>> from nltk.corpus.reader import PlaintextCorpusReader 200 | >>> plain = PlaintextCorpusReader('.', ['heading_text.txt']) 201 | >>> len(plain.paras()) 202 | 4 203 | >>> from corpus import IgnoreHeadingCorpusReader 204 | >>> reader = IgnoreHeadingCorpusReader('.', ['heading_text.txt']) 205 | >>> len(reader.paras()) 206 | 3 207 | """ 208 | 209 | if __name__ == '__main__': 210 | import doctest 211 | doctest.testmod() -------------------------------------------------------------------------------- /chapter4.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============== 3 | Default Tagging 4 | =============== 5 | 6 | >>> from nltk.tag import DefaultTagger 7 | >>> tagger = DefaultTagger('NN') 8 | >>> tagger.tag(['Hello', 'World']) 9 | [('Hello', 'NN'), ('World', 'NN')] 10 | 11 | >>> from nltk.corpus import treebank 12 | >>> test_sents = treebank.tagged_sents()[3000:] 13 | >>> tagger.evaluate(test_sents) 14 | 0.14331966328512843 15 | 16 | >>> tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']]) 17 | [[('Hello', 'NN'), ('world', 'NN'), ('.', 'NN')], [('How', 'NN'), ('are', 'NN'), ('you', 'NN'), ('?', 'NN')]] 18 | 19 | >>> from nltk.tag import untag 20 | >>> untag([('Hello', 'NN'), ('World', 'NN')]) 21 | ['Hello', 'World'] 22 | 23 | 24 | ======================================== 25 | Training a Unigram Part-of-Speech Tagger 26 | ======================================== 27 | 28 | >>> from nltk.tag import UnigramTagger 29 | >>> from nltk.corpus import treebank 30 | >>> train_sents = treebank.tagged_sents()[:3000] 31 | >>> tagger = UnigramTagger(train_sents) 32 | >>> treebank.sents()[0] 33 | ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 
'nonexecutive', 'director', 'Nov.', '29', '.'] 34 | >>> tagger.tag(treebank.sents()[0]) 35 | [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')] 36 | 37 | >>> tagger.evaluate(test_sents) 38 | 0.8588819339520829 39 | 40 | >>> tagger = UnigramTagger(model={'Pierre': 'NN'}) 41 | >>> tagger.tag(treebank.sents()[0]) 42 | [('Pierre', 'NN'), ('Vinken', None), (',', None), ('61', None), ('years', None), ('old', None), (',', None), ('will', None), ('join', None), ('the', None), ('board', None), ('as', None), ('a', None), ('nonexecutive', None), ('director', None), ('Nov.', None), ('29', None), ('.', None)] 43 | 44 | >>> tagger = UnigramTagger(train_sents, cutoff=3) 45 | >>> tagger.evaluate(test_sents) 46 | 0.7757392618173969 47 | 48 | 49 | ====================================== 50 | Combining Taggers with Backoff Tagging 51 | ====================================== 52 | 53 | >>> tagger1 = DefaultTagger('NN') 54 | >>> tagger2 = UnigramTagger(train_sents, backoff=tagger1) 55 | >>> tagger2.evaluate(test_sents) 56 | 0.8758471832505935 57 | 58 | >>> tagger1._taggers == [tagger1] 59 | True 60 | >>> tagger2._taggers == [tagger2, tagger1] 61 | True 62 | 63 | >>> import pickle 64 | >>> f = open('tagger.pickle', 'wb') 65 | >>> pickle.dump(tagger, f) 66 | >>> f.close() 67 | >>> f = open('tagger.pickle', 'rb') 68 | >>> tagger = pickle.load(f) 69 | 70 | 71 | ==================================== 72 | Training and Combining Ngram Taggers 73 | ==================================== 74 | 75 | >>> from nltk.tag import BigramTagger, TrigramTagger 76 | >>> bitagger = BigramTagger(train_sents) 77 | >>> bitagger.evaluate(test_sents) 78 | 0.11310166199007123 79 | >>> tritagger = TrigramTagger(train_sents) 80 | >>> tritagger.evaluate(test_sents) 81 | 0.0688107058061731 82 | 83 | >>> from tag_util import backoff_tagger 84 | >>> backoff = DefaultTagger('NN') 85 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff) 86 | >>> tagger.evaluate(test_sents) 87 | 0.8806820634578028 88 | 89 | >>> tagger._taggers[-1] == backoff 90 | True 91 | >>> isinstance(tagger._taggers[0], TrigramTagger) 92 | True 93 | >>> isinstance(tagger._taggers[1], BigramTagger) 94 | True 95 | 96 | >>> from nltk.tag import NgramTagger 97 | >>> quadtagger = NgramTagger(4, train_sents) 98 | >>> quadtagger.evaluate(test_sents) 99 | 0.058234405352903085 100 | 101 | >>> from taggers import QuadgramTagger 102 | >>> quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger], backoff=backoff) 103 | >>> quadtagger.evaluate(test_sents) 104 | 0.8806388948845241 105 | 106 | 107 | ==================================== 108 | Creating a Model of Likely Word Tags 109 | ==================================== 110 | 111 | >>> from tag_util import word_tag_model 112 | >>> from nltk.corpus import treebank 113 | >>> model = word_tag_model(treebank.words(), treebank.tagged_words()) 114 | >>> tagger = UnigramTagger(model=model) 115 | >>> tagger.evaluate(test_sents) 116 | 0.559680552557738 117 | 118 | >>> default_tagger = DefaultTagger('NN') 119 | >>> likely_tagger = UnigramTagger(model=model, backoff=default_tagger) 120 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=likely_tagger) 121 | >>> 
tagger.evaluate(test_sents) 122 | 0.8806820634578028 123 | 124 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger) 125 | >>> likely_tagger = UnigramTagger(model=model, backoff=tagger) 126 | >>> likely_tagger.evaluate(test_sents) 127 | 0.8824088063889488 128 | 129 | 130 | ================================ 131 | Tagging with Regular Expressions 132 | ================================ 133 | 134 | >>> from tag_util import patterns 135 | >>> from nltk.tag import RegexpTagger 136 | >>> tagger = RegexpTagger(patterns) 137 | >>> tagger.evaluate(test_sents) 138 | 0.037470321605870924 139 | 140 | 141 | ============= 142 | Affix Tagging 143 | ============= 144 | 145 | >>> from nltk.tag import AffixTagger 146 | >>> tagger = AffixTagger(train_sents) 147 | >>> tagger.evaluate(test_sents) 148 | 0.27558817181092166 149 | 150 | >>> prefix_tagger = AffixTagger(train_sents, affix_length=3) 151 | >>> prefix_tagger.evaluate(test_sents) 152 | 0.23587308439456076 153 | 154 | >>> suffix_tagger = AffixTagger(train_sents, affix_length=-2) 155 | >>> suffix_tagger.evaluate(test_sents) 156 | 0.31940427368875457 157 | 158 | >>> pre3_tagger = AffixTagger(train_sents, affix_length=3) 159 | >>> pre3_tagger.evaluate(test_sents) 160 | 0.23587308439456076 161 | >>> pre2_tagger = AffixTagger(train_sents, affix_length=2, backoff=pre3_tagger) 162 | >>> pre2_tagger.evaluate(test_sents) 163 | 0.29786315562270665 164 | >>> suf2_tagger = AffixTagger(train_sents, affix_length=-2, backoff=pre2_tagger) 165 | >>> suf2_tagger.evaluate(test_sents) 166 | 0.32467083962875026 167 | >>> suf3_tagger = AffixTagger(train_sents, affix_length=-3, backoff=suf2_tagger) 168 | >>> suf3_tagger.evaluate(test_sents) 169 | 0.3590761925318368 170 | 171 | 172 | ======================= 173 | Training a Brill Tagger 174 | ======================= 175 | 176 | >>> default_tagger = DefaultTagger('NN') 177 | >>> initial_tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger) 178 | >>> initial_tagger.evaluate(test_sents) 179 | 0.8806820634578028 180 | >>> from tag_util import train_brill_tagger 181 | >>> brill_tagger = train_brill_tagger(initial_tagger, train_sents) 182 | >>> brill_tagger.evaluate(test_sents) 183 | 0.8827541549751781 184 | 185 | 186 | ===================== 187 | Training a TnT Tagger 188 | ===================== 189 | 190 | >>> from nltk.tag import tnt 191 | >>> tnt_tagger = tnt.TnT() 192 | >>> tnt_tagger.train(train_sents) 193 | >>> tnt_tagger.evaluate(test_sents) 194 | 0.8756313403842003 195 | 196 | >>> from nltk.tag import DefaultTagger 197 | >>> unk = DefaultTagger('NN') 198 | >>> tnt_tagger = tnt.TnT(unk=unk, Trained=True) 199 | >>> tnt_tagger.train(train_sents) 200 | >>> tnt_tagger.evaluate(test_sents) 201 | 0.892467083962875 202 | 203 | >>> tnt_tagger = tnt.TnT(N=100) 204 | >>> tnt_tagger.train(train_sents) 205 | >>> tnt_tagger.evaluate(test_sents) 206 | 0.8756313403842003 207 | 208 | 209 | ========================= 210 | Using WordNet for Tagging 211 | ========================= 212 | 213 | >>> from taggers import WordNetTagger 214 | >>> wn_tagger = WordNetTagger() 215 | >>> wn_tagger.evaluate(train_sents) 216 | 0.17914876598160262 217 | 218 | >>> from tag_util import backoff_tagger 219 | >>> from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger 220 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=wn_tagger) 221 | >>> tagger.evaluate(test_sents) 222 | 0.8848262464925534 223 | 
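The source of taggers.WordNetTagger is not shown here; a rough sketch of the idea
(the repository's implementation may differ) is to pick the most common WordNet
part of speech for a word and map it to a treebank tag, returning None so a
backoff tagger can take over for unknown words. The lines carry no doctest
prompts, so they are not executed:

    from nltk.corpus import wordnet
    from nltk.probability import FreqDist
    from nltk.tag import SequentialBackoffTagger

    class SketchWordNetTagger(SequentialBackoffTagger):
        # map WordNet POS ('n', 'v', adjective, satellite, adverb) to treebank tags
        mapping = {'n': 'NN', 'v': 'VB', 'a': 'JJ', 's': 'JJ', 'r': 'RB'}

        def choose_tag(self, tokens, index, history):
            fd = FreqDist(syn.pos() for syn in wordnet.synsets(tokens[index]))
            return self.mapping.get(fd.max()) if fd else None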
224 | 225 | ======================== 226 | Classifier Based Tagging 227 | ======================== 228 | 229 | >>> from nltk.tag.sequential import ClassifierBasedPOSTagger 230 | >>> tagger = ClassifierBasedPOSTagger(train=train_sents) 231 | >>> tagger.evaluate(test_sents) 232 | 0.9309734513274336 233 | 234 | >>> from nltk.classify import MaxentClassifier 235 | >>> me_tagger = ClassifierBasedPOSTagger(train=train_sents, classifier_builder=MaxentClassifier.train) 236 | ==> Training (100 iterations) 237 | 238 | Iteration Log Likelihood Accuracy 239 | --------------------------------------- 240 | 1 -3.82864 0.008 241 | 2 -0.76859 0.957 242 | Final nan 0.984 243 | >>> me_tagger.evaluate(test_sents) 244 | 0.9258363911072739 245 | 246 | >>> from nltk.tag.sequential import ClassifierBasedTagger 247 | >>> from tag_util import unigram_feature_detector 248 | >>> tagger = ClassifierBasedTagger(train=train_sents, feature_detector=unigram_feature_detector) 249 | >>> tagger.evaluate(test_sents) 250 | 0.8733865745737104 251 | 252 | >>> default = DefaultTagger('NN') 253 | >>> tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3) 254 | >>> tagger.evaluate(test_sents) 255 | 0.9311029570472696 256 | """ 257 | 258 | if __name__ == '__main__': 259 | import doctest 260 | doctest.testmod() -------------------------------------------------------------------------------- /chapter5.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================================== 3 | Chunking and Chinking with Regular Expressions 4 | ============================================== 5 | 6 | >>> from nltk.chunk.regexp import tag_pattern2re_pattern 7 | >>> tag_pattern2re_pattern('
<DT>?<NN.*>+') 8 | '(<(DT)>)?(<(NN[^\\\{\\\}<>]*)>)+' 9 | 10 | >>> from nltk.chunk import RegexpParser 11 | >>> chunker = RegexpParser(r''' 12 | ... NP: 13 | ... {<DT><NN.*>
<.*>*<NN.*>} 14 | ... }<VB.*>{ 15 | ... ''') 16 | >>> chunker.parse([('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]) 17 | Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')]), ('has', 'VBZ'), Tree('NP', [('many', 'JJ'), ('chapters', 'NNS')])]) 18 | 19 | >>> from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule 20 | >>> from nltk.tree import Tree 21 | >>> t = Tree('S', [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]) 22 | >>> cs = ChunkString(t) 23 | >>> cs 24 | <ChunkString: '<DT><NN><VBZ><JJ><NNS>'> 25 | >>> ur = ChunkRule('<DT><NN.*>
<.*>*<NN.*>', 'chunk determiners and nouns') 26 | >>> ur.apply(cs) 27 | >>> cs 28 | <ChunkString: '{<DT><NN>}<VBZ><JJ><NNS>'> 29 | >>> ir = ChinkRule('<VB.*>', 'chink verbs') 30 | >>> ir.apply(cs) 31 | >>> cs 32 | <ChunkString: '{<DT><NN>}<VBZ>{<JJ><NNS>}'> 33 | >>> cs.to_chunkstruct() 34 | Tree('S', [Tree('CHUNK', [('the', 'DT'), ('book', 'NN')]), ('has', 'VBZ'), Tree('CHUNK', [('many', 'JJ'), ('chapters', 'NNS')])]) 35 | 36 | >>> from nltk.chunk import RegexpChunkParser 37 | >>> chunker = RegexpChunkParser([ur, ir]) 38 | >>> chunker.parse(t) 39 | Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')]), ('has', 'VBZ'), Tree('NP', [('many', 'JJ'), ('chapters', 'NNS')])]) 40 | 41 | >>> from nltk.chunk import RegexpChunkParser 42 | >>> chunker = RegexpChunkParser([ur, ir], chunk_label='CP') 43 | >>> chunker.parse(t) 44 | Tree('S', [Tree('CP', [('the', 'DT'), ('book', 'NN')]), ('has', 'VBZ'), Tree('CP', [('many', 'JJ'), ('chapters', 'NNS')])]) 45 | 46 | >>> chunker = RegexpParser(r''' 47 | ... NP: 48 | ... {<DT><NN.*>
} 49 | ... {<JJ><NN.*>} 50 | ... ''') 51 | >>> chunker.parse(t) 52 | Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')]), ('has', 'VBZ'), Tree('NP', [('many', 'JJ'), ('chapters', 'NNS')])]) 53 | 54 | >>> chunker = RegexpParser(r''' 55 | ... NP: 56 | ... {(<DT>
|<JJ>)<NN.*>} 57 | ... ''') 58 | >>> chunker.parse(t) 59 | Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')]), ('has', 'VBZ'), Tree('NP', [('many', 'JJ'), ('chapters', 'NNS')])]) 60 | 61 | >>> from nltk.chunk.regexp import ChunkRuleWithContext 62 | >>> ctx = ChunkRuleWithContext('<DT>
', '<NN.*>', '<.*>', 'chunk nouns only after determiners') 63 | >>> cs = ChunkString(t) 64 | >>> cs 65 | <ChunkString: '<DT><NN><VBZ><JJ><NNS>'> 66 | >>> ctx.apply(cs) 67 | >>> cs 68 | <ChunkString: '<DT>{<NN>}<VBZ><JJ><NNS>'> 69 | >>> cs.to_chunkstruct() 70 | Tree('S', [('the', 'DT'), Tree('CHUNK', [('book', 'NN')]), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]) 71 | 72 | >>> chunker = RegexpParser(r''' 73 | ... NP: 74 | ... <DT>
{<NN.*>} 75 | ... ''') 76 | >>> chunker.parse(t) 77 | Tree('S', [('the', 'DT'), Tree('NP', [('book', 'NN')]), ('has', 'VBZ'), ('many', 'JJ'), ('chapters', 'NNS')]) 78 | 79 | 80 | ===================================================== 81 | Merging and Splitting Chunks with Regular Expressions 82 | ===================================================== 83 | 84 | >>> chunker = RegexpParser(r''' 85 | ... NP: 86 | ... {<DT>
<.*>*<NN.*>} 87 | ... <NN.*>}{<.*> 88 | ... <.*>}{<DT>
89 | ... <NN.*>{}<NN.*> 90 | ... ''') 91 | >>> sent = [('the', 'DT'), ('sushi', 'NN'), ('roll', 'NN'), ('was', 'VBD'), ('filled', 'VBN'), ('with', 'IN'), ('the', 'DT'), ('fish', 'NN')] 92 | >>> chunker.parse(sent) 93 | Tree('S', [Tree('NP', [('the', 'DT'), ('sushi', 'NN'), ('roll', 'NN')]), Tree('NP', [('was', 'VBD'), ('filled', 'VBN'), ('with', 'IN')]), Tree('NP', [('the', 'DT'), ('fish', 'NN')])]) 94 | 95 | >>> from nltk.chunk.regexp import MergeRule, SplitRule 96 | >>> cs = ChunkString(Tree('S', sent)) 97 | >>> cs 98 |
<ChunkString: '<DT><NN><NN><VBD><VBN><IN><DT><NN>'> 99 | >>> ur = ChunkRule('<DT>
<.*>*<NN.*>', 'chunk determiner to noun') 100 | >>> ur.apply(cs) 101 | >>> cs 102 |
<ChunkString: '{<DT><NN><NN><VBD><VBN><IN><DT><NN>}'> 103 | >>> sr1 = SplitRule('<NN.*>', '<.*>', 'split after noun') 104 | >>> sr1.apply(cs) 105 | >>> cs 106 | <ChunkString: '{<DT><NN>}{<NN>}{<VBD><VBN><IN><DT><NN>
}'> 107 | >>> sr2 = SplitRule('<.*>', '<DT>
', 'split before determiner') 108 | >>> sr2.apply(cs) 109 | >>> cs 110 | <ChunkString: '{<DT><NN>}{<NN>}{<VBD><VBN><IN>}{<DT><NN>
}'> 111 | >>> mr = MergeRule('<NN.*>', '<NN.*>', 'merge nouns') 112 | >>> mr.apply(cs) 113 | >>> cs 114 | <ChunkString: '{<DT><NN><NN>}{<VBD><VBN><IN>}{<DT><NN>
}'> 115 | >>> cs.to_chunkstruct() 116 | Tree('S', [Tree('CHUNK', [('the', 'DT'), ('sushi', 'NN'), ('roll', 'NN')]), Tree('CHUNK', [('was', 'VBD'), ('filled', 'VBN'), ('with', 'IN')]), Tree('CHUNK', [('the', 'DT'), ('fish', 'NN')])]) 117 | 118 | >>> from nltk.chunk.regexp import RegexpChunkRule 119 | >>> RegexpChunkRule.fromstring('{<DT><NN.*>
<.*>*<NN.*>}') 120 | <ChunkRule: '<DT><NN.*><.*>*<NN.*>'> 121 | >>> RegexpChunkRule.fromstring('<.*>}{<DT>
') 122 | <SplitRule: '<.*>', '<DT>
'> 123 | >>> RegexpChunkRule.fromstring('<NN.*>{}<NN.*>') 124 | <MergeRule: '<NN.*>', '<NN.*>'> 125 | 126 | >>> RegexpChunkRule.fromstring('{<DT><NN.*>
<.*>*<NN.*>} # chunk everything').descr() 127 | 'chunk everything' 128 | >>> RegexpChunkRule.fromstring('{<DT><NN.*>
<.*>*<NN.*>}').descr() 129 | '' 130 | 131 | 132 | ====================================================== 133 | Expanding and Removing Chunks with Regular Expressions 134 | ====================================================== 135 | 136 | >>> from nltk.chunk.regexp import ChunkRule, ExpandLeftRule, ExpandRightRule, UnChunkRule 137 | >>> from nltk.chunk import RegexpChunkParser 138 | >>> ur = ChunkRule('<NN>', 'single noun') 139 | >>> el = ExpandLeftRule('<DT>
', '<NN>', 'get left determiner') 140 | >>> er = ExpandRightRule('<NN>', '<NNS>', 'get right plural noun') 141 | >>> un = UnChunkRule('<DT><NN.*>
*', 'unchunk everything') 142 | >>> chunker = RegexpChunkParser([ur, el, er, un]) 143 | >>> sent = [('the', 'DT'), ('sushi', 'NN'), ('rolls', 'NNS')] 144 | >>> chunker.parse(sent) 145 | Tree('S', [('the', 'DT'), ('sushi', 'NN'), ('rolls', 'NNS')]) 146 | 147 | >>> from nltk.chunk.regexp import ChunkString 148 | >>> from nltk.tree import Tree 149 | >>> cs = ChunkString(Tree('S', sent)) 150 | >>> cs 151 | <ChunkString: '<DT><NN><NNS>'> 152 | >>> ur.apply(cs) 153 | >>> cs 154 | <ChunkString: '<DT>{<NN>}<NNS>'> 155 | >>> el.apply(cs) 156 | >>> cs 157 | <ChunkString: '{<DT><NN>}<NNS>'> 158 | >>> er.apply(cs) 159 | >>> cs 160 | <ChunkString: '{<DT><NN><NNS>}'> 161 | >>> un.apply(cs) 162 | >>> cs 163 | <ChunkString: '<DT><NN><NNS>'> 164 | 165 | 166 | ======================================== 167 | Partial Parsing with Regular Expressions 168 | ======================================== 169 | 170 | >>> chunker = RegexpParser(r''' 171 | ... NP: 172 | ... {<DT>
?<NN.*>+} # chunk optional determiner with nouns 173 | ... <JJ>{}<NN.*> # merge adjective with noun chunk 174 | ... PP: 175 | ... {<IN>} # chunk preposition 176 | ... VP: 177 | ... {<MD>?<VB.*>} # chunk optional modal with verb 178 | ... ''') 179 | >>> from nltk.corpus import conll2000 180 | >>> score = chunker.evaluate(conll2000.chunked_sents()) 181 | >>> score.accuracy() 182 | 0.6148573545757688 183 | 184 | >>> from nltk.corpus import treebank_chunk 185 | >>> treebank_score = chunker.evaluate(treebank_chunk.chunked_sents()) 186 | >>> treebank_score.accuracy() 187 | 0.49033970276008493 188 | 189 | >>> score.precision() 190 | 0.60201948127375 191 | >>> score.recall() 192 | 0.606072502505847 193 | 194 | >>> len(score.missed()) 195 | 47161 196 | >>> len(score.incorrect()) 197 | 47967 198 | >>> len(score.correct()) 199 | 119720 200 | >>> len(score.guessed()) 201 | 120526 202 | 203 | 204 | =============================== 205 | Training a Tagger Based Chunker 206 | =============================== 207 | 208 | >>> from nltk.chunk.util import tree2conlltags, conlltags2tree 209 | >>> from nltk.tree import Tree 210 | >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]) 211 | >>> tree2conlltags(t) 212 | [('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')] 213 | >>> conlltags2tree([('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')]) 214 | Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]) 215 | 216 | >>> from chunkers import TagChunker 217 | >>> from nltk.corpus import treebank_chunk 218 | >>> train_chunks = treebank_chunk.chunked_sents()[:3000] 219 | >>> test_chunks = treebank_chunk.chunked_sents()[3000:] 220 | >>> chunker = TagChunker(train_chunks) 221 | >>> score = chunker.evaluate(test_chunks) 222 | >>> score.accuracy() 223 | 0.9732039335251428 224 | >>> score.precision() 225 | 0.9166534370535006 226 | >>> score.recall() 227 | 0.9465573770491803 228 | 229 | >>> from nltk.corpus import conll2000 230 | >>> conll_train = conll2000.chunked_sents('train.txt') 231 | >>> conll_test = conll2000.chunked_sents('test.txt') 232 | >>> chunker = TagChunker(conll_train) 233 | >>> score = chunker.evaluate(conll_test) 234 | >>> score.accuracy() 235 | 0.8950545623403762 236 | >>> score.precision() 237 | 0.8114841974355675 238 | >>> score.recall() 239 | 0.8644191676944863 240 | 241 | >>> from nltk.tag import UnigramTagger 242 | >>> uni_chunker = TagChunker(train_chunks, tagger_classes=[UnigramTagger]) 243 | >>> score = uni_chunker.evaluate(test_chunks) 244 | >>> score.accuracy() 245 | 0.9674925924335466 246 | 247 | 248 | ============================= 249 | Classification Based Chunking 250 | ============================= 251 | 252 | >>> from chunkers import ClassifierChunker 253 | >>> chunker = ClassifierChunker(train_chunks) 254 | >>> score = chunker.evaluate(test_chunks) 255 | >>> score.accuracy() 256 | 0.9721733155838022 257 | >>> score.precision() 258 | 0.9258838793383068 259 | >>> score.recall() 260 | 0.9359016393442623 261 | 262 | >>> chunker = ClassifierChunker(conll_train) 263 | >>> score = chunker.evaluate(conll_test) 264 | >>> score.accuracy() 265 | 0.9264622074002153 266 | >>> score.precision() 267 | 0.8737924310910219 268 | >>> score.recall() 269 | 0.9007354620620346 270 | 271 | >>> from nltk.classify import MaxentClassifier 272 | >>> builder = lambda toks: MaxentClassifier.train(toks, trace=0, max_iter=10, min_lldelta=0.01) 273 | >>> me_chunker = ClassifierChunker(train_chunks, classifier_builder=builder) 274 | >>> score = me_chunker.evaluate(test_chunks) 275 | >>> score.accuracy() 276 | 0.9743204362949285 277 | >>>
score.precision() 278 | 0.9334423548650859 279 | >>> score.recall() 280 | 0.9357377049180328 281 | 282 | 283 | ========================= 284 | Extracting Named Entities 285 | ========================= 286 | 287 | >>> from nltk.chunk import ne_chunk 288 | >>> ne_chunk(treebank_chunk.tagged_sents()[0]) 289 | Tree('S', [Tree('PERSON', [('Pierre', 'NNP')]), Tree('ORGANIZATION', [('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]) 290 | 291 | >>> tree = ne_chunk(treebank_chunk.tagged_sents()[0]) 292 | >>> from chunkers import sub_leaves 293 | >>> sub_leaves(tree, 'PERSON') 294 | [[('Pierre', 'NNP')]] 295 | >>> sub_leaves(tree, 'ORGANIZATION') 296 | [[('Vinken', 'NNP')]] 297 | 298 | >>> from nltk.chunk import ne_chunk_sents 299 | >>> trees = ne_chunk_sents(treebank_chunk.tagged_sents()[:10]) 300 | >>> [sub_leaves(t, 'ORGANIZATION') for t in trees] 301 | [[[('Vinken', 'NNP')]], [[('Elsevier', 'NNP')]], [[('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP')]], [], [], [[('Inc.', 'NNP')], [('Micronite', 'NN')]], [[('New', 'NNP'), ('England', 'NNP'), ('Journal', 'NNP')]], [[('Lorillard', 'NNP')]], [], []] 302 | 303 | >>> ne_chunk(treebank_chunk.tagged_sents()[0], binary=True) 304 | Tree('S', [Tree('NE', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]) 305 | 306 | >>> sub_leaves(ne_chunk(treebank_chunk.tagged_sents()[0], binary=True), 'NE') 307 | [[('Pierre', 'NNP'), ('Vinken', 'NNP')]] 308 | 309 | 310 | =============================== 311 | Extracting Proper Noun Entities 312 | =============================== 313 | 314 | >>> chunker = RegexpParser(r''' 315 | ... NAME: 316 | ... {<NNP>+} 317 | ...
''') 318 | >>> sub_leaves(chunker.parse(treebank_chunk.tagged_sents()[0]), 'NAME') 319 | [[('Pierre', 'NNP'), ('Vinken', 'NNP')], [('Nov.', 'NNP')]] 320 | 321 | 322 | =============================== 323 | Training a Named Entity Chunker 324 | =============================== 325 | 326 | >>> from chunkers import ieer_chunked_sents 327 | >>> ieer_chunks = list(ieer_chunked_sents()) 328 | >>> len(ieer_chunks) 329 | 94 330 | >>> chunker = ClassifierChunker(ieer_chunks[:80]) 331 | >>> chunker.parse(treebank_chunk.tagged_sents()[0]) 332 | Tree('S', [Tree('LOCATION', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('DURATION', [('61', 'CD'), ('years', 'NNS')]), Tree('MEASURE', [('old', 'JJ')]), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), Tree('DATE', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 333 | 334 | >>> score = chunker.evaluate(ieer_chunks[80:]) 335 | >>> score.accuracy() 336 | 0.8829018388070625 337 | >>> score.precision() 338 | 0.4088717454194793 339 | >>> score.recall() 340 | 0.5053635280095352 341 | 342 | >>> from nltk.corpus import ieer 343 | >>> ieer.parsed_docs()[0].headline 344 | Tree('DOCUMENT', ['Kenyans', 'protest', 'tax', 'hikes']) 345 | 346 | """ 347 | 348 | if __name__ == '__main__': 349 | import doctest 350 | doctest.testmod() -------------------------------------------------------------------------------- /chapter6.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ============================= 3 | Filtering Insignificant Words 4 | ============================= 5 | 6 | >>> from transforms import filter_insignificant 7 | >>> filter_insignificant([('your', 'PRP$'), ('book', 'NN'), ('is', 'VBZ'), ('great', 'JJ')], tag_suffixes=['PRP', 'PRP$']) 8 | [('book', 'NN'), ('is', 'VBZ'), ('great', 'JJ')] 9 | 10 | 11 | ===================== 12 | Swapping Verb Phrases 13 | ===================== 14 | 15 | >>> from transforms import swap_verb_phrase 16 | >>> swap_verb_phrase(filter_insignificant([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')])) 17 | [('fantastic', 'JJ'), ('gripping', 'VBG'), ('book', 'NN')] 18 | >>> filter_insignificant(swap_verb_phrase([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')])) 19 | [('fantastic', 'JJ'), ('gripping', 'VBG'), ('book', 'NN')] 20 | 21 | 22 | ============================== 23 | Chaining Chunk Transformations 24 | ============================== 25 | 26 | >>> from transforms import transform_chunk 27 | >>> transform_chunk([('the', 'DT'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')], trace=1) 28 | filter_insignificant : [('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')] 29 | swap_verb_phrase : [('delicious', 'JJ'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS')] 30 | swap_infinitive_phrase : [('delicious', 'JJ'), ('recipes', 'NNS'), ('book', 'NN')] 31 | singularize_plural_noun : [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 32 | [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 33 | 34 | 35 | =============================== 36 | Converting a Chunk Tree to Text 37 | =============================== 38 | 39 | >>> from nltk.corpus import treebank_chunk 40 | >>> tree = treebank_chunk.chunked_sents()[0] 41 | >>> ' '.join([w for w, t in tree.leaves()]) 42 | 'Pierre Vinken , 61 years old , will join the board as a nonexecutive 
director Nov. 29 .' 43 | 44 | >>> from transforms import chunk_tree_to_sent 45 | >>> chunk_tree_to_sent(tree) 46 | 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.' 47 | 48 | 49 | ===================== 50 | Flattening Deep Trees 51 | ===================== 52 | 53 | >>> from nltk.corpus import treebank 54 | >>> from transforms import flatten_deeptree 55 | >>> flatten_deeptree(treebank.parsed_sents()[0]) 56 | Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 57 | 58 | >>> from nltk.tree import Tree 59 | >>> Tree('NNP', ['Pierre']).height() 60 | 2 61 | 62 | >>> Tree('NNP', ['Pierre']).pos() 63 | [('Pierre', 'NNP')] 64 | 65 | >>> Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).height() 66 | 3 67 | 68 | >>> Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).pos() 69 | [('Pierre', 'NNP'), ('Vinken', 'NNP')] 70 | 71 | >>> from nltk.chunk.util import tree2conlltags 72 | >>> tree2conlltags(treebank.parsed_sents()[0]) 73 | Traceback (most recent call last): 74 | File "", line 1, in 75 | File "/usr/local/lib/python2.6/dist-packages/nltk/chunk/util.py", line 417, in tree2conlltags 76 | raise ValueError, "Tree is too deeply nested to be printed in CoNLL format" 77 | ValueError: Tree is too deeply nested to be printed in CoNLL format 78 | 79 | >>> tree2conlltags(flatten_deeptree(treebank.parsed_sents()[0])) 80 | [('Pierre', 'NNP', 'B-NP'), ('Vinken', 'NNP', 'I-NP'), (',', ',', 'O'), ('61', 'CD', 'B-NP'), ('years', 'NNS', 'I-NP'), ('old', 'JJ', 'O'), (',', ',', 'O'), ('will', 'MD', 'O'), ('join', 'VB', 'O'), ('the', 'DT', 'B-NP'), ('board', 'NN', 'I-NP'), ('as', 'IN', 'O'), ('a', 'DT', 'B-NP'), ('nonexecutive', 'JJ', 'I-NP'), ('director', 'NN', 'I-NP'), ('Nov.', 'NNP', 'B-NP-TMP'), ('29', 'CD', 'I-NP-TMP'), ('.', '.', 'O')] 81 | 82 | >>> from nltk.corpus import cess_esp 83 | >>> cess_esp.parsed_sents()[0].height() 84 | 22 85 | >>> flatten_deeptree(cess_esp.parsed_sents()[0]).height() 86 | 3 87 | 88 | 89 | ======================= 90 | Creating a Shallow Tree 91 | ======================= 92 | 93 | >>> from transforms import shallow_tree 94 | >>> shallow_tree(treebank.parsed_sents()[0]) 95 | Tree('S', [Tree('NP-SBJ', [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ',')]), Tree('VP', [('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 96 | 97 | >>> treebank.parsed_sents()[0].height() 98 | 7 99 | >>> shallow_tree(treebank.parsed_sents()[0]).height() 100 | 3 101 | 102 | 103 | ====================== 104 | Converting Tree Labels 105 | ====================== 106 | 107 | >>> from transforms import convert_tree_labels 108 | >>> mapping = {'NP-SBJ': 'NP', 'NP-TMP': 'NP'} 109 | >>> convert_tree_labels(treebank.parsed_sents()[0], mapping) 110 | Tree('S', [Tree('NP', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', 
['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]) 111 | ''' 112 | # TODO: also do a task on converting tree nodes so NP-TMP -> NP 113 | 114 | if __name__ == '__main__': 115 | import doctest 116 | doctest.testmod() -------------------------------------------------------------------------------- /chapter7.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ================================= 3 | Training a Naive Bayes Classifier 4 | ================================= 5 | 6 | >>> from nltk.corpus import movie_reviews 7 | >>> from featx import label_feats_from_corpus, split_label_feats 8 | >>> movie_reviews.categories() 9 | ['neg', 'pos'] 10 | >>> lfeats = label_feats_from_corpus(movie_reviews) 11 | >>> lfeats.keys() 12 | dict_keys(['neg', 'pos']) 13 | >>> train_feats, test_feats = split_label_feats(lfeats) 14 | >>> len(train_feats) 15 | 1500 16 | >>> len(test_feats) 17 | 500 18 | 19 | >>> from nltk.classify import NaiveBayesClassifier 20 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats) 21 | >>> nb_classifier.labels() 22 | ['neg', 'pos'] 23 | 24 | >>> from featx import bag_of_words 25 | >>> negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous']) 26 | >>> nb_classifier.classify(negfeat) 27 | 'neg' 28 | >>> posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible']) 29 | >>> nb_classifier.classify(posfeat) 30 | 'pos' 31 | 32 | >>> from nltk.classify.util import accuracy 33 | >>> accuracy(nb_classifier, test_feats) 34 | 0.728 35 | 36 | >>> probs = nb_classifier.prob_classify(test_feats[0][0]) 37 | >>> probs.samples() 38 | dict_keys(['neg', 'pos']) 39 | >>> probs.max() 40 | 'pos' 41 | >>> probs.prob('pos') 42 | 0.9999999646430913 43 | >>> probs.prob('neg') 44 | 3.535688969240647e-08 45 | 46 | >>> nb_classifier.most_informative_features(n=5) 47 | [('magnificent', True), ('outstanding', True), ('insulting', True), ('vulnerable', True), ('ludicrous', True)] 48 | 49 | >>> from nltk.probability import LaplaceProbDist 50 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist) 51 | >>> accuracy(nb_classifier, test_feats) 52 | 0.716 53 | 54 | >>> from nltk.probability import DictionaryProbDist 55 | >>> label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5}) 56 | >>> true_probdist = DictionaryProbDist({True: 1}) 57 | >>> feature_probdist = {('pos', 'yes'): true_probdist, ('neg', 'no'): true_probdist} 58 | >>> classifier = NaiveBayesClassifier(label_probdist, feature_probdist) 59 | >>> classifier.classify({'yes': True}) 60 | 'pos' 61 | >>> classifier.classify({'no': True}) 62 | 'neg' 63 | 64 | 65 | =================================== 66 | Training a Decision Tree Classifier 67 | =================================== 68 | 69 | >>> from nltk.classify import DecisionTreeClassifier 70 | >>> dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30) 71 | >>> accuracy(dt_classifier, test_feats) 72 | 0.688 73 | 74 | >>> from nltk.probability import FreqDist, MLEProbDist, entropy 75 | >>> fd = FreqDist({'pos': 30, 'neg': 10}) 76 | >>> entropy(MLEProbDist(fd)) 77 | 0.8112781244591328 78 | >>> fd['neg'] = 25 79 | >>> entropy(MLEProbDist(fd)) 80 | 0.9940302114769565 81 | >>> fd['neg'] = 30 82 | >>> entropy(MLEProbDist(fd)) 83 | 1.0 84 | >>> fd['neg'] = 1 85 | >>> 
entropy(MLEProbDist(fd)) 86 | 0.20559250818508304 87 | 88 | 89 | ===================================== 90 | Training a Maximum Entropy Classifier 91 | ===================================== 92 | 93 | >>> from nltk.classify import MaxentClassifier 94 | >>> me_classifier = MaxentClassifier.train(train_feats, trace=0, max_iter=1, min_lldelta=0.5) 95 | >>> accuracy(me_classifier, test_feats) 96 | 0.5 97 | 98 | >>> me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5) 99 | >>> accuracy(me_classifier, test_feats) 100 | 0.722 101 | 102 | ================================= 103 | Training Scikit-Learn Classifiers 104 | ================================= 105 | 106 | >>> from nltk.classify.scikitlearn import SklearnClassifier 107 | >>> from sklearn.naive_bayes import MultinomialNB 108 | >>> sk_classifier = SklearnClassifier(MultinomialNB()) 109 | >>> sk_classifier.train(train_feats) 110 | 111 | >>> accuracy(sk_classifier, test_feats) 112 | 0.83 113 | 114 | >>> from sklearn.naive_bayes import BernoulliNB 115 | >>> sk_classifier = SklearnClassifier(BernoulliNB()) 116 | >>> sk_classifier.train(train_feats) 117 | 118 | >>> accuracy(sk_classifier, test_feats) 119 | 0.812 120 | 121 | >>> from sklearn.linear_model import LogisticRegression 122 | >>> sk_classifier = SklearnClassifier(LogisticRegression()).train(train_feats) 123 | >>> accuracy(sk_classifier, test_feats) 124 | 0.892 125 | 126 | >>> from sklearn.svm import SVC 127 | >>> sk_classifier = SklearnClassifier(SVC()).train(train_feats) 128 | >>> accuracy(sk_classifier, test_feats) 129 | 0.69 130 | 131 | >>> from sklearn.svm import LinearSVC 132 | >>> sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats) 133 | >>> accuracy(sk_classifier, test_feats) 134 | 0.864 135 | 136 | >>> from sklearn.svm import NuSVC 137 | >>> sk_classifier = SklearnClassifier(NuSVC()).train(train_feats) 138 | >>> accuracy(sk_classifier, test_feats) 139 | 0.882 140 | 141 | ============================================== 142 | Measuring Precision and Recall of a Classifier 143 | ============================================== 144 | 145 | >>> from classification import precision_recall 146 | >>> nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) 147 | >>> nb_precisions['pos'] 148 | 0.6413612565445026 149 | >>> nb_precisions['neg'] 150 | 0.9576271186440678 151 | >>> nb_recalls['pos'] 152 | 0.98 153 | >>> nb_recalls['neg'] 154 | 0.452 155 | 156 | >>> me_precisions, me_recalls = precision_recall(me_classifier, test_feats) 157 | >>> me_precisions['pos'] 158 | 0.6456692913385826 159 | >>> me_precisions['neg'] 160 | 0.9663865546218487 161 | >>> me_recalls['pos'] 162 | 0.984 163 | >>> me_recalls['neg'] 164 | 0.46 165 | 166 | >>> sk_precisions, sk_recalls = precision_recall(sk_classifier, test_feats) 167 | >>> sk_precisions['pos'] 168 | 0.9063829787234042 169 | >>> sk_precisions['neg'] 170 | 0.8603773584905661 171 | >>> sk_recalls['pos'] 172 | 0.852 173 | >>> sk_recalls['neg'] 174 | 0.912 175 | 176 | 177 | ================================== 178 | Calculating High Information Words 179 | ================================== 180 | 181 | >>> from featx import high_information_words, bag_of_words_in_set 182 | >>> labels = movie_reviews.categories() 183 | >>> labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] 184 | >>> high_info_words = set(high_information_words(labeled_words)) 185 | >>> feat_det = lambda words: bag_of_words_in_set(words, high_info_words) 186 | >>> lfeats = 
label_feats_from_corpus(movie_reviews, feature_detector=feat_det) 187 | >>> train_feats, test_feats = split_label_feats(lfeats) 188 | 189 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats) 190 | >>> accuracy(nb_classifier, test_feats) 191 | 0.91 192 | >>> nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) 193 | >>> nb_precisions['pos'] 194 | 0.8988326848249028 195 | >>> nb_precisions['neg'] 196 | 0.9218106995884774 197 | >>> nb_recalls['pos'] 198 | 0.924 199 | >>> nb_recalls['neg'] 200 | 0.896 201 | 202 | >>> me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5) 203 | >>> accuracy(me_classifier, test_feats) 204 | 0.912 205 | >>> me_precisions, me_recalls = precision_recall(me_classifier, test_feats) 206 | >>> me_precisions['pos'] 207 | 0.8992248062015504 208 | >>> me_precisions['neg'] 209 | 0.9256198347107438 210 | >>> me_recalls['pos'] 211 | 0.928 212 | >>> me_recalls['neg'] 213 | 0.896 214 | 215 | >>> dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, depth_cutoff=20, support_cutoff=20, entropy_cutoff=0.01) 216 | >>> accuracy(dt_classifier, test_feats) 217 | 0.688 218 | >>> dt_precisions, dt_recalls = precision_recall(dt_classifier, test_feats) 219 | >>> dt_precisions['pos'] 220 | 0.6766917293233082 221 | >>> dt_precisions['neg'] 222 | 0.7008547008547008 223 | >>> dt_recalls['pos'] 224 | 0.72 225 | >>> dt_recalls['neg'] 226 | 0.656 227 | 228 | >>> sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats) 229 | >>> accuracy(sk_classifier, test_feats) 230 | 0.86 231 | >>> sk_precisions, sk_recalls = precision_recall(sk_classifier, test_feats) 232 | >>> sk_precisions['pos'] 233 | 0.871900826446281 234 | >>> sk_precisions['neg'] 235 | 0.8488372093023255 236 | >>> sk_recalls['pos'] 237 | 0.844 238 | >>> sk_recalls['neg'] 239 | 0.876 240 | 241 | 242 | ================================= 243 | Combining Classifiers with Voting 244 | ================================= 245 | 246 | >>> from classification import MaxVoteClassifier 247 | >>> mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, me_classifier, sk_classifier) 248 | >>> mv_classifier.labels() 249 | ['neg', 'pos'] 250 | >>> accuracy(mv_classifier, test_feats) 251 | 0.894 252 | >>> mv_precisions, mv_recalls = precision_recall(mv_classifier, test_feats) 253 | >>> mv_precisions['pos'] 254 | 0.9156118143459916 255 | >>> mv_precisions['neg'] 256 | 0.8745247148288974 257 | >>> mv_recalls['pos'] 258 | 0.868 259 | >>> mv_recalls['neg'] 260 | 0.92 261 | 262 | 263 | ============================================ 264 | Classifying with Multiple Binary Classifiers 265 | ============================================ 266 | 267 | >>> from nltk.corpus import reuters 268 | >>> len(reuters.categories()) 269 | 90 270 | 271 | >>> from featx import reuters_high_info_words, reuters_train_test_feats 272 | >>> rwords = reuters_high_info_words() 273 | >>> featdet = lambda words: bag_of_words_in_set(words, rwords) 274 | >>> multi_train_feats, multi_test_feats = reuters_train_test_feats(featdet) 275 | 276 | >>> from classification import train_binary_classifiers 277 | >>> trainf = lambda train_feats: SklearnClassifier(LogisticRegression()).train(train_feats) 278 | >>> labelset = set(reuters.categories()) 279 | >>> classifiers = train_binary_classifiers(trainf, multi_train_feats, labelset) 280 | >>> len(classifiers) 281 | 90 282 | 283 | >>> from classification import MultiBinaryClassifier, multi_metrics 284 | >>> multi_classifier = 
MultiBinaryClassifier(*classifiers.items()) 285 | 286 | >>> multi_precisions, multi_recalls, avg_md = multi_metrics(multi_classifier, multi_test_feats) 287 | >>> avg_md 288 | 0.23310715863026216 289 | 290 | >>> multi_precisions['soybean'] 291 | 0.7857142857142857 292 | >>> multi_recalls['soybean'] 293 | 0.3333333333333333 294 | >>> len(reuters.fileids(categories=['soybean'])) 295 | 111 296 | 297 | >>> multi_precisions['sunseed'] 298 | 1.0 299 | >>> multi_recalls['sunseed'] 300 | 0.2 301 | >>> len(reuters.fileids(categories=['sunseed'])) 302 | 16 303 | ''' 304 | 305 | if __name__ == '__main__': 306 | import doctest 307 | doctest.testmod() -------------------------------------------------------------------------------- /chapter8.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ================================ 3 | Distributed Tagging with Execnet 4 | ================================ 5 | 6 | >>> import execnet, remote_tag, nltk.tag, nltk.data 7 | >>> from nltk.corpus import treebank 8 | >>> import pickle 9 | >>> tagger = pickle.dumps(nltk.data.load(nltk.tag._POS_TAGGER)) 10 | >>> gw = execnet.makegateway() 11 | >>> channel = gw.remote_exec(remote_tag) 12 | >>> channel.send(tagger) 13 | >>> channel.send(treebank.sents()[0]) 14 | >>> tagged_sentence = channel.receive() 15 | >>> tagged_sentence == treebank.tagged_sents()[0] 16 | True 17 | >>> gw.exit() 18 | 19 | >>> import itertools 20 | >>> gw1 = execnet.makegateway() 21 | >>> gw2 = execnet.makegateway() 22 | >>> ch1 = gw1.remote_exec(remote_tag) 23 | >>> ch1.send(tagger) 24 | >>> ch2 = gw2.remote_exec(remote_tag) 25 | >>> ch2.send(tagger) 26 | >>> mch = execnet.MultiChannel([ch1, ch2]) 27 | >>> queue = mch.make_receive_queue() 28 | >>> channels = itertools.cycle(mch) 29 | >>> for sentence in treebank.sents()[:4]: 30 | ... channel = next(channels) 31 | ... channel.send(sentence) 32 | >>> tagged_sentences = [] 33 | >>> for i in range(4): 34 | ... channel, tagged_sentence = queue.get() 35 | ... 
tagged_sentences.append(tagged_sentence) 36 | >>> len(tagged_sentences) 37 | 4 38 | >>> gw1.exit() 39 | >>> gw2.exit() 40 | 41 | 42 | ================================= 43 | Distributed Chunking with Execnet 44 | ================================= 45 | 46 | >>> import remote_chunk, nltk.chunk 47 | >>> from nltk.corpus import treebank_chunk 48 | >>> chunker = pickle.dumps(nltk.data.load(nltk.chunk._MULTICLASS_NE_CHUNKER)) 49 | >>> gw = execnet.makegateway() 50 | >>> channel = gw.remote_exec(remote_chunk) 51 | >>> channel.send(tagger) 52 | >>> channel.send(chunker) 53 | >>> channel.send(treebank_chunk.sents()[0]) 54 | >>> chunk_tree = pickle.loads(channel.receive()) 55 | >>> chunk_tree 56 | Tree('S', [Tree('PERSON', [('Pierre', 'NNP')]), Tree('ORGANIZATION', [('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]) 57 | >>> gw.exit() 58 | 59 | 60 | ===================================== 61 | Parallel List Processing with Execnet 62 | ===================================== 63 | 64 | >>> import plists, remote_double 65 | >>> plists.map(remote_double, range(10)) 66 | [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] 67 | 68 | >>> plists.map(remote_double, range(10), [('popen', 4)]) 69 | [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] 70 | 71 | 72 | ====================================== 73 | Storing an Ordered Dictionary in Redis 74 | ====================================== 75 | 76 | >>> from redis import Redis 77 | >>> from rediscollections import RedisOrderedDict 78 | >>> r = Redis() 79 | >>> rod = RedisOrderedDict(r, 'scores') 80 | >>> rod['best'] = 10 81 | >>> rod['worst'] = 0.1 82 | >>> rod['middle'] = 5 83 | >>> rod.keys() 84 | [b'best', b'middle', b'worst'] 85 | >>> rod.keys(start=0, end=1) 86 | [b'best', b'middle'] 87 | >>> rod.clear() 88 | 89 | 90 | =============================================== 91 | Distributed Word Scoring with Redis and Execnet 92 | =============================================== 93 | 94 | >>> from dist_featx import score_words 95 | >>> from nltk.corpus import movie_reviews 96 | >>> labels = movie_reviews.categories() 97 | >>> labelled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] 98 | >>> word_scores = score_words(labelled_words) 99 | >>> len(word_scores) 100 | 39767 101 | >>> topn_words = word_scores.keys(end=1000) 102 | >>> topn_words[0:5] 103 | [b'bad', b',', b'and', b'?', b'movie'] 104 | >>> from redis import Redis 105 | >>> r = Redis() 106 | >>> [r.delete(key) for key in ['word_fd', 'label_word_fd:neg', 'label_word_fd:pos', 'word_scores']] 107 | [1, 1, 1, 1] 108 | ''' 109 | 110 | if __name__ == '__main__': 111 | import doctest 112 | doctest.testmod() -------------------------------------------------------------------------------- /chapter9.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | =================================== 4 | Parsing Dates & Times with Dateutil 5 | =================================== 6 | 7 | >>> from dateutil import parser 8 | >>> parser.parse('Thu Sep 25 10:36:28 2010') 9 | datetime.datetime(2010, 9, 25, 10, 36, 28) 10 | >>> parser.parse('Thursday, 25. 
September 2010 10:36AM') 11 | datetime.datetime(2010, 9, 25, 10, 36) 12 | >>> parser.parse('9/25/2010 10:36:28') 13 | datetime.datetime(2010, 9, 25, 10, 36, 28) 14 | >>> parser.parse('9/25/2010') 15 | datetime.datetime(2010, 9, 25, 0, 0) 16 | >>> parser.parse('2010-09-25T10:36:28Z') 17 | datetime.datetime(2010, 9, 25, 10, 36, 28, tzinfo=tzutc()) 18 | 19 | >>> parser.parse('25/9/2010', dayfirst=True) 20 | datetime.datetime(2010, 9, 25, 0, 0) 21 | 22 | >>> parser.parse('10-9-25') 23 | datetime.datetime(2025, 10, 9, 0, 0) 24 | >>> parser.parse('10-9-25', yearfirst=True) 25 | datetime.datetime(2010, 9, 25, 0, 0) 26 | 27 | >>> try: 28 | ... parser.parse('9/25/2010 at about 10:36AM') 29 | ... except ValueError: 30 | ... 'cannot parse' 31 | 'cannot parse' 32 | >>> parser.parse('9/25/2010 at about 10:36AM', fuzzy=True) 33 | datetime.datetime(2010, 9, 25, 10, 36) 34 | 35 | 36 | ============================== 37 | Timezone Lookup and Conversion 38 | ============================== 39 | 40 | >>> from dateutil import tz 41 | >>> tz.tzutc() 42 | tzutc() 43 | >>> import datetime 44 | >>> tz.tzutc().utcoffset(datetime.datetime.utcnow()) 45 | datetime.timedelta(0) 46 | 47 | >>> tz.gettz('US/Pacific') 48 | tzfile('America/Los_Angeles') 49 | >>> tz.gettz('US/Pacific').utcoffset(datetime.datetime.utcnow()) 50 | datetime.timedelta(-1, 61200) 51 | >>> tz.gettz('Europe/Paris') 52 | tzfile('Europe/Paris') 53 | >>> tz.gettz('Europe/Paris').utcoffset(datetime.datetime.utcnow()) 54 | datetime.timedelta(0, 7200) 55 | 56 | >>> pst = tz.gettz('US/Pacific') 57 | >>> dt = datetime.datetime(2010, 9, 25, 10, 36) 58 | >>> dt.tzinfo 59 | >>> dt.astimezone(tz.tzutc()) 60 | Traceback (most recent call last): 61 | File "/usr/lib/python2.6/doctest.py", line 1248, in __run 62 | compileflags, 1) in test.globs 63 | File "", line 1, in 64 | dt.astimezone(tz.tzutc()) 65 | ValueError: astimezone() cannot be applied to a naive datetime 66 | >>> dt.replace(tzinfo=pst) 67 | datetime.datetime(2010, 9, 25, 10, 36, tzinfo=tzfile('America/Los_Angeles')) 68 | >>> dt.replace(tzinfo=pst).astimezone(tz.tzutc()) 69 | datetime.datetime(2010, 9, 25, 17, 36, tzinfo=tzutc()) 70 | 71 | >>> parser.parse('Wednesday, Aug 4, 2010 at 6:30 p.m. (CDT)', fuzzy=True) 72 | datetime.datetime(2010, 8, 4, 18, 30) 73 | >>> tzinfos = {'CDT': tz.gettz('US/Central')} 74 | >>> parser.parse('Wednesday, Aug 4, 2010 at 6:30 p.m. (CDT)', fuzzy=True, tzinfos=tzinfos) 75 | datetime.datetime(2010, 8, 4, 18, 30, tzinfo=tzfile('America/Chicago')) 76 | 77 | >>> tz.tzoffset('custom', 3600) 78 | tzoffset('custom', 3600) 79 | 80 | =================================== 81 | Extracting URLs from HTML with lxml 82 | =================================== 83 | 84 | >>> from lxml import html 85 | >>> doc = html.fromstring('Hello world') 86 | >>> links = list(doc.iterlinks()) 87 | >>> len(links) 88 | 1 89 | >>> (el, attr, link, pos) = links[0] 90 | >>> attr 91 | 'href' 92 | >>> link 93 | '/world' 94 | >>> pos 95 | 0 96 | 97 | >>> doc.make_links_absolute('http://hello') 98 | >>> abslinks = list(doc.iterlinks()) 99 | >>> (el, attr, link, pos) = abslinks[0] 100 | >>> link 101 | 'http://hello/world' 102 | 103 | >>> links = list(html.iterlinks('Hello world')) 104 | >>> links[0][2] 105 | '/world' 106 | 107 | >>> doc.xpath('//a/@href')[0] 108 | 'http://hello/world' 109 | 110 | 111 | =========================== 112 | Cleaning and Stripping HTML 113 | =========================== 114 | 115 | >>> import lxml.html.clean 116 | >>> lxml.html.clean.clean_html('my text') 117 | '
my text
' 118 | 119 | >>> from bs4 import BeautifulSoup 120 | >>> BeautifulSoup('
my text
').get_text() 121 | 'my text' 122 | 123 | 124 | =========================================== 125 | Converting HTML Entities with BeautifulSoup 126 | =========================================== 127 | 128 | >>> from bs4 import BeautifulSoup 129 | >>> BeautifulSoup('<').string 130 | '<' 131 | >>> BeautifulSoup('&').string 132 | '&' 133 | 134 | >>> BeautifulSoup('<').string 135 | 136 | >>> from bs4 import BeautifulSoup 137 | >>> soup = BeautifulSoup('Hello world') 138 | >>> [a['href'] for a in soup.findAll('a')] 139 | ['/world'] 140 | 141 | ============================================ 142 | Detecting and Converting Character Encodings 143 | ============================================ 144 | 145 | >>> import unicodedata 146 | >>> unicodedata.normalize('NFKD', 'abcd\xe9').encode('ascii', 'ignore') 147 | b'abcde' 148 | 149 | >>> from bs4 import UnicodeDammit 150 | >>> UnicodeDammit('abcd\xe9').unicode_markup 151 | 'abcdé' 152 | 153 | ''' 154 | 155 | if __name__ == '__main__': 156 | import doctest 157 | doctest.testmod() 158 | -------------------------------------------------------------------------------- /chunkers.py: -------------------------------------------------------------------------------- 1 | import nltk.tag 2 | from nltk.chunk import ChunkParserI 3 | from nltk.chunk.util import tree2conlltags, conlltags2tree 4 | from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger 5 | from nltk.corpus import names, ieer, gazetteers 6 | from tag_util import backoff_tagger 7 | 8 | def conll_tag_chunks(chunk_sents): 9 | '''Convert each chunked sentence to list of (tag, chunk_tag) tuples, 10 | so the final result is a list of lists of (tag, chunk_tag) tuples. 11 | >>> from nltk.tree import Tree 12 | >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]) 13 | >>> conll_tag_chunks([t]) 14 | [[('DT', 'B-NP'), ('NN', 'I-NP')]] 15 | ''' 16 | tagged_sents = [tree2conlltags(tree) for tree in chunk_sents] 17 | return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents] 18 | 19 | class TagChunker(ChunkParserI): 20 | '''Chunks tagged tokens using Ngram Tagging.''' 21 | def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]): 22 | '''Train Ngram taggers on chunked sentences''' 23 | train_sents = conll_tag_chunks(train_chunks) 24 | self.tagger = backoff_tagger(train_sents, tagger_classes) 25 | 26 | def parse(self, tagged_sent): 27 | '''Parsed tagged tokens into parse Tree of chunks''' 28 | if not tagged_sent: return None 29 | (words, tags) = zip(*tagged_sent) 30 | chunks = self.tagger.tag(tags) 31 | # create conll str for tree parsing 32 | wtc = zip(words, chunks) 33 | return conlltags2tree([(w,t,c) for (w,(t,c)) in wtc]) 34 | 35 | def chunk_trees2train_chunks(chunk_sents): 36 | tag_sents = [tree2conlltags(sent) for sent in chunk_sents] 37 | return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents] 38 | 39 | def prev_next_pos_iob(tokens, index, history): 40 | word, pos = tokens[index] 41 | 42 | if index == 0: 43 | prevword, prevpos, previob = ('',)*3 44 | else: 45 | prevword, prevpos = tokens[index-1] 46 | previob = history[index-1] 47 | 48 | if index == len(tokens) - 1: 49 | nextword, nextpos = ('',)*2 50 | else: 51 | nextword, nextpos = tokens[index+1] 52 | 53 | feats = { 54 | 'word': word, 55 | 'pos': pos, 56 | 'nextword': nextword, 57 | 'nextpos': nextpos, 58 | 'prevword': prevword, 59 | 'prevpos': prevpos, 60 | 'previob': previob 61 | } 62 | 63 | return feats 64 | 65 | class ClassifierChunker(ChunkParserI): 66 | def __init__(self, train_sents, 
feature_detector=prev_next_pos_iob, **kwargs): 67 | if not feature_detector: 68 | feature_detector = self.feature_detector 69 | 70 | train_chunks = chunk_trees2train_chunks(train_sents) 71 | self.tagger = ClassifierBasedTagger(train=train_chunks, 72 | feature_detector=feature_detector, **kwargs) 73 | 74 | def parse(self, tagged_sent): 75 | if not tagged_sent: return None 76 | chunks = self.tagger.tag(tagged_sent) 77 | return conlltags2tree([(w,t,c) for ((w,t),c) in chunks]) 78 | 79 | def sub_leaves(tree, label): 80 | return [t.leaves() for t in tree.subtrees(lambda s: s.label() == label)] 81 | 82 | class PersonChunker(ChunkParserI): 83 | ''' 84 | >>> from nltk.corpus import treebank_chunk 85 | >>> chunker = PersonChunker() 86 | >>> sub_leaves(chunker.parse(treebank_chunk.tagged_sents()[0]), 'PERSON') 87 | [[('Pierre', 'NNP')]] 88 | ''' 89 | def __init__(self): 90 | self.name_set = set(names.words()) 91 | 92 | def parse(self, tagged_sent): 93 | iobs = [] 94 | in_person = False 95 | 96 | for word, tag in tagged_sent: 97 | if word in self.name_set and in_person: 98 | iobs.append((word, tag, 'I-PERSON')) 99 | elif word in self.name_set: 100 | iobs.append((word, tag, 'B-PERSON')) 101 | in_person = True 102 | else: 103 | iobs.append((word, tag, 'O')) 104 | in_person = False 105 | 106 | return conlltags2tree(iobs) 107 | 108 | class LocationChunker(ChunkParserI): 109 | '''Chunks locations based on the gazetteers corpus. 110 | >>> loc = LocationChunker() 111 | >>> t = loc.parse([('San', 'NNP'), ('Francisco', 'NNP'), ('CA', 'NNP'), ('is', 'BE'), ('cold', 'JJ'), ('compared', 'VBD'), ('to', 'TO'), ('San', 'NNP'), ('Jose', 'NNP'), ('CA', 'NNP')]) 112 | >>> sub_leaves(t, 'LOCATION') 113 | [[('San', 'NNP'), ('Francisco', 'NNP'), ('CA', 'NNP')], [('San', 'NNP'), ('Jose', 'NNP'), ('CA', 'NNP')]] 114 | ''' 115 | def __init__(self): 116 | # gazetteers is a WordListCorpusReader of many different location words 117 | self.locations = set(gazetteers.words()) 118 | self.lookahead = 0 119 | # need to know how many words to lookahead in the tagged sentence to find a location 120 | for loc in self.locations: 121 | nwords = loc.count(' ') 122 | 123 | if nwords > self.lookahead: 124 | self.lookahead = nwords 125 | 126 | def iob_locations(self, tagged_sent): 127 | i = 0 128 | l = len(tagged_sent) 129 | inside = False 130 | 131 | while i < l: 132 | word, tag = tagged_sent[i] 133 | j = i + 1 134 | k = j + self.lookahead 135 | nextwords, nexttags = [], [] 136 | loc = False 137 | # lookahead in the sentence to find multi-word locations 138 | while j < k: 139 | if ' '.join([word] + nextwords) in self.locations: 140 | # combine multiple separate locations into single location chunk 141 | if inside: 142 | yield word, tag, 'I-LOCATION' 143 | else: 144 | yield word, tag, 'B-LOCATION' 145 | # every next word is inside the location chunk 146 | for nword, ntag in zip(nextwords, nexttags): 147 | yield nword, ntag, 'I-LOCATION' 148 | # found a location, so we're inside a chunk 149 | loc, inside = True, True 150 | # move forward to the next word since the current words 151 | # are already chunked 152 | i = j 153 | break 154 | 155 | if j < l: 156 | nextword, nexttag = tagged_sent[j] 157 | nextwords.append(nextword) 158 | nexttags.append(nexttag) 159 | j += 1 160 | else: 161 | break 162 | # if no location found, then we're outside the location chunk 163 | if not loc: 164 | inside = False 165 | i += 1 166 | yield word, tag, 'O' 167 | 168 | def parse(self, tagged_sent): 169 | iobs = self.iob_locations(tagged_sent) 170 | return 
conlltags2tree(iobs) 171 | 172 | def ieertree2conlltags(tree, tag=nltk.tag.pos_tag): 173 | # tree.pos() flattens the tree and produces [(word, node)] where node is 174 | # from the word's parent tree node. words in a chunk therefore get the 175 | # chunk tag, while words outside a chunk get the same tag as the tree's 176 | # top node 177 | words, ents = zip(*tree.pos()) 178 | iobs = [] 179 | prev = None 180 | # construct iob tags from entity names 181 | for ent in ents: 182 | # any entity that is the same as the tree's top node is outside a chunk 183 | if ent == tree.label(): 184 | iobs.append('O') 185 | prev = None 186 | # have a previous entity that is equal so this is inside the chunk 187 | elif prev == ent: 188 | iobs.append('I-%s' % ent) 189 | # no previous equal entity in the sequence, so this is the beginning of 190 | # an entity chunk 191 | else: 192 | iobs.append('B-%s' % ent) 193 | prev = ent 194 | # get tags for each word, then construct 3-tuple for conll tags 195 | words, tags = zip(*tag(words)) 196 | return zip(words, tags, iobs) 197 | 198 | def ieer_chunked_sents(tag=nltk.tag.pos_tag): 199 | for doc in ieer.parsed_docs(): 200 | tagged = ieertree2conlltags(doc.text, tag) 201 | yield conlltags2tree(tagged) 202 | 203 | if __name__ == '__main__': 204 | import doctest 205 | doctest.testmod() -------------------------------------------------------------------------------- /classification.py: -------------------------------------------------------------------------------- 1 | import collections, itertools 2 | from nltk import metrics 3 | from nltk.classify import util, ClassifierI, MultiClassifierI 4 | from nltk.probability import FreqDist 5 | 6 | def precision_recall(classifier, testfeats): 7 | refsets = collections.defaultdict(set) 8 | testsets = collections.defaultdict(set) 9 | 10 | for i, (feats, label) in enumerate(testfeats): 11 | refsets[label].add(i) 12 | observed = classifier.classify(feats) 13 | testsets[observed].add(i) 14 | 15 | precisions = {} 16 | recalls = {} 17 | 18 | for label in classifier.labels(): 19 | precisions[label] = metrics.precision(refsets[label], testsets[label]) 20 | recalls[label] = metrics.recall(refsets[label], testsets[label]) 21 | 22 | return precisions, recalls 23 | 24 | class MaxVoteClassifier(ClassifierI): 25 | def __init__(self, *classifiers): 26 | self._classifiers = classifiers 27 | self._labels = sorted(set(itertools.chain(*[c.labels() for c in classifiers]))) 28 | 29 | def labels(self): 30 | return self._labels 31 | 32 | def classify(self, feats): 33 | counts = FreqDist() 34 | 35 | for classifier in self._classifiers: 36 | counts[classifier.classify(feats)] += 1 37 | 38 | return counts.max() 39 | 40 | class MultiBinaryClassifier(MultiClassifierI): 41 | def __init__(self, *label_classifiers): 42 | self._label_classifiers = dict(label_classifiers) 43 | self._labels = sorted(self._label_classifiers.keys()) 44 | 45 | def labels(self): 46 | return self._labels 47 | 48 | def classify(self, feats): 49 | lbls = set() 50 | 51 | for label, classifier in self._label_classifiers.items(): 52 | if classifier.classify(feats) == label: 53 | lbls.add(label) 54 | 55 | return lbls 56 | 57 | def train_binary_classifiers(trainf, labelled_feats, labelset): 58 | pos_feats = collections.defaultdict(list) 59 | neg_feats = collections.defaultdict(list) 60 | classifiers = {} 61 | 62 | for feat, labels in labelled_feats: 63 | for label in labels: 64 | pos_feats[label].append(feat) 65 | 66 | for label in labelset - set(labels): 67 | neg_feats[label].append(feat) 68 | 
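    # train one binary classifier per label below: the label's own feats are the
    # positive examples, and feats from instances not tagged with that label are
    # the negative examples, relabelled as '!<label>'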
69 | for label in labelset: 70 | postrain = [(feat, label) for feat in pos_feats[label]] 71 | negtrain = [(feat, '!%s' % label) for feat in neg_feats[label]] 72 | classifiers[label] = trainf(postrain + negtrain) 73 | 74 | return classifiers 75 | 76 | def multi_metrics(multi_classifier, test_feats): 77 | mds = [] 78 | refsets = collections.defaultdict(set) 79 | testsets = collections.defaultdict(set) 80 | 81 | for i, (feat, labels) in enumerate(test_feats): 82 | for label in labels: 83 | refsets[label].add(i) 84 | 85 | guessed = multi_classifier.classify(feat) 86 | 87 | for label in guessed: 88 | testsets[label].add(i) 89 | 90 | mds.append(metrics.masi_distance(set(labels), guessed)) 91 | 92 | avg_md = sum(mds) / float(len(mds)) 93 | precisions = {} 94 | recalls = {} 95 | 96 | for label in multi_classifier.labels(): 97 | precisions[label] = metrics.precision(refsets[label], testsets[label]) 98 | recalls[label] = metrics.recall(refsets[label], testsets[label]) 99 | 100 | return precisions, recalls, avg_md -------------------------------------------------------------------------------- /conll.iob: -------------------------------------------------------------------------------- 1 | Mr. NNP B-NP 2 | Meador NNP I-NP 3 | had VBD B-VP 4 | been VBN I-VP 5 | executive JJ B-NP 6 | vice NN I-NP 7 | president NN I-NP 8 | of IN B-PP 9 | Balcor NNP B-NP 10 | . . O -------------------------------------------------------------------------------- /corpus.py: -------------------------------------------------------------------------------- 1 | import lockfile, tempfile, shutil 2 | from nltk.corpus.reader import PlaintextCorpusReader 3 | from nltk.corpus.reader.util import StreamBackedCorpusView, read_blankline_block 4 | 5 | class IgnoreHeadingCorpusView(StreamBackedCorpusView): 6 | def __init__(self, *args, **kwargs): 7 | StreamBackedCorpusView.__init__(self, *args, **kwargs) 8 | # open self._stream 9 | self._open() 10 | # skip the heading block 11 | read_blankline_block(self._stream) 12 | # reset the start position to the current position in the stream 13 | self._filepos = [self._stream.tell()] 14 | 15 | class IgnoreHeadingCorpusReader(PlaintextCorpusReader): 16 | CorpusView = IgnoreHeadingCorpusView 17 | 18 | def append_line(fname, line): 19 | # lock for writing, released when fp is closed 20 | with lockfile.FileLock(fname): 21 | fp = open(fname, 'a+') 22 | fp.write(line) 23 | fp.write('\n') 24 | fp.close() 25 | 26 | def remove_line(fname, line): 27 | '''Remove line from file by creating a temporary file containing all lines 28 | from original file except those matching the given line, then copying the 29 | temporary file back into the original file, overwriting its contents. 
30 | ''' 31 | with lockfile.FileLock(fname): 32 | tmp = tempfile.TemporaryFile() 33 | fp = open(fname, 'rw+') 34 | # write all lines from orig file, except if matches given line 35 | for l in fp: 36 | if l.strip() != line: 37 | tmp.write(l) 38 | 39 | # reset file pointers so entire files are copied 40 | fp.seek(0) 41 | tmp.seek(0) 42 | # copy tmp into fp, then truncate to remove trailing line(s) 43 | shutil.copyfileobj(tmp, fp) 44 | fp.truncate() 45 | fp.close() 46 | tmp.close() -------------------------------------------------------------------------------- /dist_featx.py: -------------------------------------------------------------------------------- 1 | import itertools, execnet, remote_word_count 2 | from nltk.metrics import BigramAssocMeasures 3 | from redis import Redis 4 | from redisprob import RedisHashFreqDist, RedisConditionalHashFreqDist 5 | from rediscollections import RedisOrderedDict 6 | 7 | def score_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, host='localhost', specs=[('popen', 2)]): 8 | gateways = [] 9 | channels = [] 10 | 11 | for spec, count in specs: 12 | for i in range(count): 13 | gw = execnet.makegateway(spec) 14 | gateways.append(gw) 15 | channel = gw.remote_exec(remote_word_count) 16 | channel.send((host, 'word_fd', 'label_word_fd')) 17 | channels.append(channel) 18 | 19 | cyc = itertools.cycle(channels) 20 | 21 | for label, words in labelled_words: 22 | channel = next(cyc) 23 | channel.send((label, list(words))) 24 | 25 | for channel in channels: 26 | channel.send('done') 27 | assert 'done' == channel.receive() 28 | channel.waitclose(5) 29 | 30 | for gateway in gateways: 31 | gateway.exit() 32 | 33 | r = Redis(host) 34 | fd = RedisHashFreqDist(r, 'word_fd') 35 | cfd = RedisConditionalHashFreqDist(r, 'label_word_fd') 36 | word_scores = RedisOrderedDict(r, 'word_scores') 37 | n_xx = cfd.N() 38 | 39 | for label in cfd.conditions(): 40 | n_xi = cfd[label].N() 41 | 42 | for word, n_ii in cfd[label].items(): 43 | word = word.decode() # must convert to string from bytes 44 | n_ix = fd[word] 45 | 46 | if n_ii and n_ix and n_xi and n_xx: 47 | score = score_fn(n_ii, (n_ix, n_xi), n_xx) 48 | word_scores[word] = score 49 | 50 | return word_scores -------------------------------------------------------------------------------- /encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import charade 3 | 4 | def detect(s): 5 | ''' 6 | >>> detect('ascii') 7 | {'confidence': 1.0, 'encoding': 'ascii'} 8 | >>> detect('abcdé') 9 | {'confidence': 0.505, 'encoding': 'utf-8'} 10 | >>> detect(bytes('abcdé', 'utf-8')) 11 | {'confidence': 0.505, 'encoding': 'utf-8'} 12 | >>> detect(bytes('\222\222\223\225', 'latin-1')) 13 | {'confidence': 0.5, 'encoding': 'windows-1252'} 14 | ''' 15 | try: 16 | if isinstance(s, str): 17 | return charade.detect(s.encode()) 18 | else: 19 | return charade.detect(s) 20 | except UnicodeDecodeError: 21 | return charade.detect(s.encode('utf-8')) 22 | 23 | def convert(s): 24 | ''' 25 | >>> convert('ascii') 26 | 'ascii' 27 | >>> convert('abcdé') 28 | 'abcdé' 29 | >>> convert(bytes('abcdé', 'utf-8')) 30 | 'abcdé' 31 | >>> convert(bytes('\222\222\223\225', 'latin-1')) 32 | '\u2019\u2019\u201c\u2022' 33 | ''' 34 | if isinstance(s, str): 35 | s = s.encode() 36 | 37 | encoding = detect(s)['encoding'] 38 | 39 | if encoding == 'utf-8': 40 | return s.decode() 41 | else: 42 | return s.decode(encoding) 43 | 44 | if __name__ == '__main__': 45 | import doctest 46 | doctest.testmod() 
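# A minimal usage sketch for the detect()/convert() helpers above; it is not
# part of the original recipe. The file name 'mystery.txt' is hypothetical, so
# the block only runs when the module is executed directly and that file exists.
if __name__ == '__main__':
    import os
    if os.path.exists('mystery.txt'):
        with open('mystery.txt', 'rb') as f:
            raw = f.read()
        print(detect(raw))   # e.g. {'confidence': ..., 'encoding': ...}
        print(convert(raw))  # decoded str, regardless of the source encoding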
-------------------------------------------------------------------------------- /featx.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from nltk.corpus import stopwords, reuters 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | from nltk.probability import FreqDist, ConditionalFreqDist 6 | 7 | def bag_of_words(words): 8 | ''' 9 | >>> bag_of_words(['the', 'quick', 'brown', 'fox']) 10 | {'quick': True, 'brown': True, 'the': True, 'fox': True} 11 | ''' 12 | return dict([(word, True) for word in words]) 13 | 14 | def bag_of_words_not_in_set(words, badwords): 15 | ''' 16 | >>> bag_of_words_not_in_set(['the', 'quick', 'brown', 'fox'], ['the']) 17 | {'quick': True, 'brown': True, 'fox': True} 18 | ''' 19 | return bag_of_words(set(words) - set(badwords)) 20 | 21 | def bag_of_non_stopwords(words, stopfile='english'): 22 | ''' 23 | >>> bag_of_non_stopwords(['the', 'quick', 'brown', 'fox']) 24 | {'quick': True, 'brown': True, 'fox': True} 25 | ''' 26 | badwords = stopwords.words(stopfile) 27 | return bag_of_words_not_in_set(words, badwords) 28 | 29 | def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200): 30 | ''' 31 | >>> bag_of_bigrams_words(['the', 'quick', 'brown', 'fox']) 32 | {'brown': True, ('brown', 'fox'): True, ('the', 'quick'): True, 'quick': True, ('quick', 'brown'): True, 'the': True, 'fox': True} 33 | ''' 34 | bigram_finder = BigramCollocationFinder.from_words(words) 35 | bigrams = bigram_finder.nbest(score_fn, n) 36 | return bag_of_words(words + bigrams) 37 | 38 | def bag_of_words_in_set(words, goodwords): 39 | return bag_of_words(set(words) & set(goodwords)) 40 | 41 | def label_feats_from_corpus(corp, feature_detector=bag_of_words): 42 | label_feats = collections.defaultdict(list) 43 | 44 | for label in corp.categories(): 45 | for fileid in corp.fileids(categories=[label]): 46 | feats = feature_detector(corp.words(fileids=[fileid])) 47 | label_feats[label].append(feats) 48 | 49 | return label_feats 50 | 51 | def split_label_feats(lfeats, split=0.75): 52 | train_feats = [] 53 | test_feats = [] 54 | 55 | for label, feats in lfeats.items(): 56 | cutoff = int(len(feats) * split) 57 | train_feats.extend([(feat, label) for feat in feats[:cutoff]]) 58 | test_feats.extend([(feat, label) for feat in feats[cutoff:]]) 59 | 60 | return train_feats, test_feats 61 | 62 | def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5): 63 | word_fd = FreqDist() 64 | label_word_fd = ConditionalFreqDist() 65 | 66 | for label, words in labelled_words: 67 | for word in words: 68 | word_fd[word] += 1 69 | label_word_fd[label][word] += 1 70 | 71 | n_xx = label_word_fd.N() 72 | high_info_words = set() 73 | 74 | for label in label_word_fd.conditions(): 75 | n_xi = label_word_fd[label].N() 76 | word_scores = collections.defaultdict(int) 77 | 78 | for word, n_ii in label_word_fd[label].items(): 79 | n_ix = word_fd[word] 80 | score = score_fn(n_ii, (n_ix, n_xi), n_xx) 81 | word_scores[word] = score 82 | 83 | bestwords = [word for word, score in word_scores.items() if score >= min_score] 84 | high_info_words |= set(bestwords) 85 | 86 | return high_info_words 87 | 88 | def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq): 89 | labeled_words = [] 90 | 91 | for label in reuters.categories(): 92 | labeled_words.append((label, reuters.words(categories=[label]))) 93 | 94 | return high_information_words(labeled_words, 
score_fn=score_fn) 95 | 96 | def reuters_train_test_feats(feature_detector=bag_of_words): 97 | train_feats = [] 98 | test_feats = [] 99 | 100 | for fileid in reuters.fileids(): 101 | if fileid.startswith('training'): 102 | featlist = train_feats 103 | else: # fileid.startswith('test') 104 | featlist = test_feats 105 | 106 | feats = feature_detector(reuters.words(fileid)) 107 | labels = reuters.categories(fileid) 108 | featlist.append((feats, labels)) 109 | 110 | return train_feats, test_feats 111 | 112 | if __name__ == '__main__': 113 | import doctest 114 | doctest.testmod() -------------------------------------------------------------------------------- /heading_text.txt: -------------------------------------------------------------------------------- 1 | A simple heading 2 | 3 | Here is the actual text for the corpus. 4 | 5 | Paragraphs are split by blanklines. 6 | 7 | This is the 3rd paragraph. -------------------------------------------------------------------------------- /mongoreader.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from nltk.data import LazyLoader 3 | from nltk.tokenize import TreebankWordTokenizer 4 | from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation 5 | 6 | class MongoDBLazySequence(AbstractLazySequence): 7 | def __init__(self, host='localhost', port=27017, db='test', collection='corpus', field='text'): 8 | self.conn = pymongo.MongoClient(host, port) 9 | self.collection = self.conn[db][collection] 10 | self.field = field 11 | 12 | def __len__(self): 13 | return self.collection.count() 14 | 15 | def iterate_from(self, start): 16 | f = lambda d: d.get(self.field, '') 17 | return iter(LazyMap(f, self.collection.find(fields=[self.field], skip=start))) 18 | 19 | class MongoDBCorpusReader(object): 20 | def __init__(self, word_tokenizer=TreebankWordTokenizer(), 21 | sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/english.pickle'), 22 | **kwargs): 23 | self._seq = MongoDBLazySequence(**kwargs) 24 | self._word_tokenize = word_tokenizer.tokenize 25 | self._sent_tokenize = sent_tokenizer.tokenize 26 | 27 | def text(self): 28 | return self._seq 29 | 30 | def words(self): 31 | return LazyConcatenation(LazyMap(self._word_tokenize, self.text())) 32 | 33 | def sents(self): 34 | return LazyConcatenation(LazyMap(self._sent_tokenize, self.text())) -------------------------------------------------------------------------------- /movie_neg.txt: -------------------------------------------------------------------------------- 1 | a big-budget and glossy production can not make up for a lack of spontaneity that permeates their tv show . -------------------------------------------------------------------------------- /movie_pos.txt: -------------------------------------------------------------------------------- 1 | the thin red line is flawed but it provokes . 
-------------------------------------------------------------------------------- /mywords.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | -------------------------------------------------------------------------------- /plists.py: -------------------------------------------------------------------------------- 1 | import itertools, execnet 2 | 3 | def map(mod, args, specs=[('popen', 2)]): 4 | gateways = [] 5 | channels = [] 6 | 7 | for spec, count in specs: 8 | for i in range(count): 9 | gw = execnet.makegateway(spec) 10 | gateways.append(gw) 11 | channels.append(gw.remote_exec(mod)) 12 | 13 | cyc = itertools.cycle(channels) 14 | 15 | for i, arg in enumerate(args): 16 | channel = next(cyc) 17 | channel.send((i, arg)) 18 | 19 | mch = execnet.MultiChannel(channels) 20 | queue = mch.make_receive_queue() 21 | l = len(args) 22 | results = [None] * l 23 | 24 | for j in range(l): 25 | channel, (i, result) = queue.get() 26 | results[i] = result 27 | 28 | for gw in gateways: 29 | gw.exit() 30 | 31 | return results -------------------------------------------------------------------------------- /rediscollections.py: -------------------------------------------------------------------------------- 1 | import collections, re 2 | 3 | white = re.compile('[\s&]+') 4 | 5 | def encode_key(key): 6 | return white.sub('_', key.strip()) 7 | 8 | class RedisHashMap(collections.MutableMapping): 9 | def __init__(self, r, name): 10 | self._r = r 11 | self._name = encode_key(name) 12 | 13 | def __iter__(self): 14 | return self.items() 15 | 16 | def __len__(self): 17 | return self._r.hlen(self._name) 18 | 19 | def __contains__(self, key): 20 | return self._r.hexists(self._name, encode_key(key)) 21 | 22 | def __getitem__(self, key): 23 | return self._r.hget(self._name, encode_key(key)) 24 | 25 | def __setitem__(self, key, val): 26 | self._r.hset(self._name, encode_key(key), val) 27 | 28 | def __delitem__(self, key): 29 | self._r.hdel(self._name, encode_key(key)) 30 | 31 | def keys(self): 32 | return self._r.hkeys(self._name) 33 | 34 | def values(self): 35 | return self._r.hvals(self._name) 36 | 37 | def items(self): 38 | return self._r.hgetall(self._name).items() 39 | 40 | def get(self, key, default=0): 41 | return self[key] or default 42 | 43 | def clear(self): 44 | self._r.delete(self._name) 45 | 46 | class RedisOrderedDict(collections.MutableMapping): 47 | ''' 48 | >>> from redis import Redis 49 | >>> r = Redis() 50 | >>> rod = RedisOrderedDict(r, 'test') 51 | >>> rod.get('bar') 52 | 0 53 | >>> len(rod) 54 | 0 55 | >>> rod['bar'] = 5.2 56 | >>> rod['bar'] 57 | 5.2 58 | >>> len(rod) 59 | 1 60 | >>> rod.items() 61 | [(b'bar', 5.2)] 62 | >>> rod.clear() 63 | ''' 64 | def __init__(self, r, name): 65 | self._r = r 66 | self._name = encode_key(name) 67 | 68 | def __iter__(self): 69 | return iter(self.items()) 70 | 71 | def __len__(self): 72 | return self._r.zcard(self._name) 73 | 74 | def __getitem__(self, key): 75 | return self._r.zscore(self._name, encode_key(key)) 76 | 77 | def __setitem__(self, key, score): 78 | self._r.zadd(self._name, encode_key(key), score) 79 | 80 | def __delitem__(self, key): 81 | self._r.zrem(self._name, encode_key(key)) 82 | 83 | def keys(self, start=0, end=-1): 84 | # we use zrevrange to get keys sorted by high value instead of by lowest 85 | return self._r.zrevrange(self._name, start, end) 86 | 87 | def values(self, start=0, end=-1): 88 | return [v for (k, v) in self.items(start=start, end=end)] 89 | 90 | def items(self, start=0, end=-1): 91 
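        # zrevrange with withscores=True returns (member, score) pairs ordered
        # from highest score to lowest, which is why keys() and items() come
        # back best-first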
| return self._r.zrevrange(self._name, start, end, withscores=True) 92 | 93 | def get(self, key, default=0): 94 | return self[key] or default 95 | 96 | def iteritems(self): 97 | return iter(self) 98 | 99 | def clear(self): 100 | self._r.delete(self._name) 101 | 102 | if __name__ == '__main__': 103 | import doctest 104 | doctest.testmod() -------------------------------------------------------------------------------- /redisprob.py: -------------------------------------------------------------------------------- 1 | from nltk.probability import ConditionalFreqDist 2 | from rediscollections import RedisHashMap, encode_key 3 | 4 | class RedisHashFreqDist(RedisHashMap): 5 | ''' 6 | >>> from redis import Redis 7 | >>> r = Redis() 8 | >>> rhfd = RedisHashFreqDist(r, 'test') 9 | >>> rhfd.items() 10 | [] 11 | >>> rhfd.values() 12 | [] 13 | >>> len(rhfd) 14 | 0 15 | >>> rhfd['foo'] 16 | 0 17 | >>> rhfd['foo'] += 1 18 | >>> rhfd['foo'] 19 | 1 20 | >>> rhfd.items() 21 | [(b'foo', 1)] 22 | >>> rhfd.values() 23 | [1] 24 | >>> len(rhfd) 25 | 1 26 | >>> rhfd.clear() 27 | ''' 28 | def N(self): 29 | return int(sum(self.values())) 30 | 31 | def __missing__(self, key): 32 | return 0 33 | 34 | def __getitem__(self, key): 35 | return int(RedisHashMap.__getitem__(self, key) or 0) 36 | 37 | def values(self): 38 | return [int(v) for v in RedisHashMap.values(self)] 39 | 40 | def items(self): 41 | return [(k, int(v)) for (k, v) in RedisHashMap.items(self)] 42 | 43 | class RedisConditionalHashFreqDist(ConditionalFreqDist): 44 | ''' 45 | >>> from redis import Redis 46 | >>> r = Redis() 47 | >>> rchfd = RedisConditionalHashFreqDist(r, 'condhash') 48 | >>> rchfd.N() 49 | 0 50 | >>> rchfd.conditions() 51 | [] 52 | >>> rchfd['cond1']['foo'] += 1 53 | >>> rchfd.N() 54 | 1 55 | >>> rchfd['cond1']['foo'] 56 | 1 57 | >>> rchfd.conditions() 58 | ['cond1'] 59 | >>> rchfd.clear() 60 | ''' 61 | def __init__(self, r, name, cond_samples=None): 62 | self._r = r 63 | self._name = name 64 | ConditionalFreqDist.__init__(self, cond_samples) 65 | 66 | for key in self._r.keys(encode_key('%s:*' % name)): 67 | condition = key.split(b':')[1].decode() 68 | self[condition] # calls self.__getitem__(condition) 69 | 70 | def __getitem__(self, condition): 71 | if condition not in self: 72 | key = '%s:%s' % (self._name, condition) 73 | val = RedisHashFreqDist(self._r, key) 74 | super(RedisConditionalHashFreqDist, self).__setitem__(condition, val) 75 | 76 | return super(RedisConditionalHashFreqDist, self).__getitem__(condition) 77 | 78 | def clear(self): 79 | for fdist in self.values(): 80 | fdist.clear() 81 | 82 | if __name__ == '__main__': 83 | import doctest 84 | doctest.testmod() -------------------------------------------------------------------------------- /remote_chunk.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | if __name__ == '__channelexec__': 4 | tagger = pickle.loads(channel.receive()) 5 | chunker = pickle.loads(channel.receive()) 6 | 7 | for sent in channel: 8 | tree = chunker.parse(tagger.tag(sent)) 9 | channel.send(pickle.dumps(tree)) -------------------------------------------------------------------------------- /remote_double.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == '__channelexec__': 3 | for (i, arg) in channel: 4 | channel.send((i, arg * 2)) -------------------------------------------------------------------------------- /remote_tag.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | if __name__ == '__channelexec__': 4 | tagger = pickle.loads(channel.receive()) 5 | 6 | for sentence in channel: 7 | channel.send(tagger.tag(sentence)) -------------------------------------------------------------------------------- /remote_word_count.py: -------------------------------------------------------------------------------- 1 | from redis import Redis 2 | from redisprob import RedisHashFreqDist, RedisConditionalHashFreqDist 3 | 4 | if __name__ == '__channelexec__': 5 | host, fd_name, cfd_name = channel.receive() 6 | r = Redis(host) 7 | fd = RedisHashFreqDist(r, fd_name) 8 | cfd = RedisConditionalHashFreqDist(r, cfd_name) 9 | 10 | for data in channel: 11 | if data == 'done': 12 | channel.send('done') 13 | break 14 | 15 | label, words = data 16 | 17 | for word in words: 18 | fd[word] += 1 19 | cfd[label][word] += 1 -------------------------------------------------------------------------------- /replacers.py: -------------------------------------------------------------------------------- 1 | import re, csv, yaml, enchant 2 | from nltk.corpus import wordnet 3 | from nltk.metrics import edit_distance 4 | 5 | ################################################## 6 | ## Replacing Words Matching Regular Expressions ## 7 | ################################################## 8 | 9 | replacement_patterns = [ 10 | (r'won\'t', 'will not'), 11 | (r'can\'t', 'cannot'), 12 | (r'i\'m', 'i am'), 13 | (r'ain\'t', 'is not'), 14 | (r'(\w+)\'ll', '\g<1> will'), 15 | (r'(\w+)n\'t', '\g<1> not'), 16 | (r'(\w+)\'ve', '\g<1> have'), 17 | (r'(\w+)\'s', '\g<1> is'), 18 | (r'(\w+)\'re', '\g<1> are'), 19 | (r'(\w+)\'d', '\g<1> would'), 20 | ] 21 | 22 | class RegexpReplacer(object): 23 | """ Replaces regular expression in a text. 24 | >>> replacer = RegexpReplacer() 25 | >>> replacer.replace("can't is a contraction") 26 | 'cannot is a contraction' 27 | >>> replacer.replace("I should've done that thing I didn't do") 28 | 'I should have done that thing I did not do' 29 | """ 30 | def __init__(self, patterns=replacement_patterns): 31 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns] 32 | 33 | def replace(self, text): 34 | s = text 35 | 36 | for (pattern, repl) in self.patterns: 37 | s = re.sub(pattern, repl, s) 38 | 39 | return s 40 | 41 | #################################### 42 | ## Replacing Repeating Characters ## 43 | #################################### 44 | 45 | class RepeatReplacer(object): 46 | """ Removes repeating characters until a valid word is found. 47 | >>> replacer = RepeatReplacer() 48 | >>> replacer.replace('looooove') 49 | 'love' 50 | >>> replacer.replace('oooooh') 51 | 'ooh' 52 | >>> replacer.replace('goose') 53 | 'goose' 54 | """ 55 | def __init__(self): 56 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 57 | self.repl = r'\1\2\3' 58 | 59 | def replace(self, word): 60 | if wordnet.synsets(word): 61 | return word 62 | 63 | repl_word = self.repeat_regexp.sub(self.repl, word) 64 | 65 | if repl_word != word: 66 | return self.replace(repl_word) 67 | else: 68 | return repl_word 69 | 70 | ###################################### 71 | ## Spelling Correction with Enchant ## 72 | ###################################### 73 | 74 | class SpellingReplacer(object): 75 | """ Replaces misspelled words with a likely suggestion based on shortest 76 | edit distance. 
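    If the best suggestion is more than max_dist edits away from the original
    word, the word is returned unchanged.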
77 | >>> replacer = SpellingReplacer() 78 | >>> replacer.replace('cookbok') 79 | 'cookbook' 80 | """ 81 | def __init__(self, dict_name='en', max_dist=2): 82 | self.spell_dict = enchant.Dict(dict_name) 83 | self.max_dist = max_dist 84 | 85 | def replace(self, word): 86 | if self.spell_dict.check(word): 87 | return word 88 | 89 | suggestions = self.spell_dict.suggest(word) 90 | 91 | if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist: 92 | return suggestions[0] 93 | else: 94 | return word 95 | 96 | class CustomSpellingReplacer(SpellingReplacer): 97 | """ SpellingReplacer that allows passing a custom enchant dictionary, such 98 | as a DictWithPWL. 99 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 100 | >>> replacer = CustomSpellingReplacer(d) 101 | >>> replacer.replace('nltk') 102 | 'nltk' 103 | """ 104 | def __init__(self, spell_dict, max_dist=2): 105 | self.spell_dict = spell_dict 106 | self.max_dist = max_dist 107 | 108 | ######################## 109 | ## Replacing Synonyms ## 110 | ######################## 111 | 112 | class WordReplacer(object): 113 | """ WordReplacer that replaces a given word with a word from the word_map, 114 | or if the word isn't found, returns the word as is. 115 | >>> replacer = WordReplacer({'bday': 'birthday'}) 116 | >>> replacer.replace('bday') 117 | 'birthday' 118 | >>> replacer.replace('happy') 119 | 'happy' 120 | """ 121 | def __init__(self, word_map): 122 | self.word_map = word_map 123 | 124 | def replace(self, word): 125 | return self.word_map.get(word, word) 126 | 127 | class CsvWordReplacer(WordReplacer): 128 | """ WordReplacer that reads word mappings from a csv file. 129 | >>> replacer = CsvWordReplacer('synonyms.csv') 130 | >>> replacer.replace('bday') 131 | 'birthday' 132 | >>> replacer.replace('happy') 133 | 'happy' 134 | """ 135 | def __init__(self, fname): 136 | word_map = {} 137 | 138 | for line in csv.reader(open(fname)): 139 | word, syn = line 140 | word_map[word] = syn 141 | 142 | super(CsvWordReplacer, self).__init__(word_map) 143 | 144 | class YamlWordReplacer(WordReplacer): 145 | """ WordReplacer that reads word mappings from a yaml file. 146 | >>> replacer = YamlWordReplacer('synonyms.yaml') 147 | >>> replacer.replace('bday') 148 | 'birthday' 149 | >>> replacer.replace('happy') 150 | 'happy' 151 | """ 152 | def __init__(self, fname): 153 | word_map = yaml.safe_load(open(fname)) # safe_load is sufficient for a flat word-to-synonym mapping 154 | super(YamlWordReplacer, self).__init__(word_map) 155 | 156 | ####################################### 157 | ## Replacing Negations with Antonyms ## 158 | ####################################### 159 | 160 | class AntonymReplacer(object): 161 | def replace(self, word, pos=None): 162 | """ Returns the antonym of a word, but only if there is no ambiguity. 163 | >>> replacer = AntonymReplacer() 164 | >>> replacer.replace('good') 165 | >>> replacer.replace('uglify') 166 | 'beautify' 167 | >>> replacer.replace('beautify') 168 | 'uglify' 169 | """ 170 | antonyms = set() 171 | 172 | for syn in wordnet.synsets(word, pos=pos): 173 | for lemma in syn.lemmas(): 174 | for antonym in lemma.antonyms(): 175 | antonyms.add(antonym.name()) 176 | 177 | if len(antonyms) == 1: 178 | return antonyms.pop() 179 | else: 180 | return None 181 | 182 | def replace_negations(self, sent): 183 | """ Try to replace negations with antonyms in the tokenized sentence.
184 | >>> replacer = AntonymReplacer() 185 | >>> replacer.replace_negations(['do', 'not', 'uglify', 'our', 'code']) 186 | ['do', 'beautify', 'our', 'code'] 187 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 188 | ['good', 'is', 'not', 'evil'] 189 | """ 190 | i, l = 0, len(sent) 191 | words = [] 192 | 193 | while i < l: 194 | word = sent[i] 195 | 196 | if word == 'not' and i+1 < l: 197 | ant = self.replace(sent[i+1]) 198 | 199 | if ant: 200 | words.append(ant) 201 | i += 2 202 | continue 203 | 204 | words.append(word) 205 | i += 1 206 | 207 | return words 208 | 209 | class AntonymWordReplacer(WordReplacer, AntonymReplacer): 210 | """ AntonymReplacer that uses a custom mapping instead of WordNet. 211 | Order of inheritance is very important; this class would not work if 212 | AntonymReplacer came before WordReplacer. 213 | >>> replacer = AntonymWordReplacer({'evil': 'good'}) 214 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 215 | ['good', 'is', 'good'] 216 | """ 217 | pass 218 | 219 | if __name__ == '__main__': 220 | import doctest 221 | doctest.testmod() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | NLTK>=3.0a4 2 | pyenchant>=1.6.5 3 | lockfile>=0.9.1 4 | numpy>=1.8.0 5 | scipy>=0.13.0 6 | scikit-learn>=0.14.1 7 | execnet>=1.1 8 | pymongo>=2.6.3 9 | redis>=2.8.0 10 | lxml>=3.2.3 11 | beautifulsoup4>=4.3.2 12 | python-dateutil>=2.0 13 | charade>=1.0.3 14 | PyYAML>=3.10 15 | -------------------------------------------------------------------------------- /tag_util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from nltk.tbl import Template 3 | from nltk.tag import brill, brill_trainer 4 | from nltk.probability import FreqDist, ConditionalFreqDist 5 | 6 | def backoff_tagger(train_sents, tagger_classes, backoff=None): 7 | for cls in tagger_classes: 8 | backoff = cls(train_sents, backoff=backoff) 9 | 10 | return backoff 11 | 12 | def word_tag_model(words, tagged_words, limit=200): 13 | fd = FreqDist(words) 14 | cfd = ConditionalFreqDist(tagged_words) 15 | most_freq = (word for word, count in fd.most_common(limit)) 16 | return dict((word, cfd[word].max()) for word in most_freq) 17 | 18 | patterns = [ 19 | (r'^\d+$', 'CD'), 20 | (r'.*ing$', 'VBG'), # gerunds, e.g. wondering 21 | (r'.*ment$', 'NN'), # e.g. wonderment 22 | (r'.*ful$', 'JJ') # e.g.
wonderful 23 | ] 24 | 25 | def train_brill_tagger(initial_tagger, train_sents, **kwargs): 26 | templates = [ 27 | brill.Template(brill.Pos([-1])), 28 | brill.Template(brill.Pos([1])), 29 | brill.Template(brill.Pos([-2])), 30 | brill.Template(brill.Pos([2])), 31 | brill.Template(brill.Pos([-2, -1])), 32 | brill.Template(brill.Pos([1, 2])), 33 | brill.Template(brill.Pos([-3, -2, -1])), 34 | brill.Template(brill.Pos([1, 2, 3])), 35 | brill.Template(brill.Pos([-1]), brill.Pos([1])), 36 | brill.Template(brill.Word([-1])), 37 | brill.Template(brill.Word([1])), 38 | brill.Template(brill.Word([-2])), 39 | brill.Template(brill.Word([2])), 40 | brill.Template(brill.Word([-2, -1])), 41 | brill.Template(brill.Word([1, 2])), 42 | brill.Template(brill.Word([-3, -2, -1])), 43 | brill.Template(brill.Word([1, 2, 3])), 44 | brill.Template(brill.Word([-1]), brill.Word([1])), 45 | ] 46 | 47 | trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True) 48 | return trainer.train(train_sents, **kwargs) 49 | 50 | def unigram_feature_detector(tokens, index, history): 51 | return {'word': tokens[index]} -------------------------------------------------------------------------------- /taggers.py: -------------------------------------------------------------------------------- 1 | from nltk.tag import NgramTagger, SequentialBackoffTagger 2 | from nltk.corpus import wordnet, names 3 | from nltk.probability import FreqDist 4 | 5 | class QuadgramTagger(NgramTagger): 6 | def __init__(self, *args, **kwargs): 7 | NgramTagger.__init__(self, 4, *args, **kwargs) 8 | 9 | class WordNetTagger(SequentialBackoffTagger): 10 | ''' 11 | >>> wt = WordNetTagger() 12 | >>> wt.tag(['food', 'is', 'great']) 13 | [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')] 14 | ''' 15 | def __init__(self, *args, **kwargs): 16 | SequentialBackoffTagger.__init__(self, *args, **kwargs) 17 | 18 | self.wordnet_tag_map = { 19 | 'n': 'NN', 20 | 's': 'JJ', 21 | 'a': 'JJ', 22 | 'r': 'RB', 23 | 'v': 'VB' 24 | } 25 | 26 | def choose_tag(self, tokens, index, history): 27 | word = tokens[index] 28 | fd = FreqDist() 29 | 30 | for synset in wordnet.synsets(word): 31 | fd[synset.pos()] += 1 32 | 33 | if not fd: return None 34 | return self.wordnet_tag_map.get(fd.max()) 35 | 36 | class NamesTagger(SequentialBackoffTagger): 37 | ''' 38 | >>> nt = NamesTagger() 39 | >>> nt.tag(['Jacob']) 40 | [('Jacob', 'NNP')] 41 | ''' 42 | def __init__(self, *args, **kwargs): 43 | SequentialBackoffTagger.__init__(self, *args, **kwargs) 44 | self.name_set = set([n.lower() for n in names.words()]) 45 | 46 | def choose_tag(self, tokens, index, history): 47 | word = tokens[index] 48 | 49 | if word.lower() in self.name_set: 50 | return 'NNP' 51 | else: 52 | return None 53 | 54 | if __name__ == '__main__': 55 | import doctest 56 | doctest.testmod() -------------------------------------------------------------------------------- /transforms.py: -------------------------------------------------------------------------------- 1 | import re, itertools 2 | import nltk.tag 3 | from nltk.tree import Tree 4 | 5 | def filter_insignificant(chunk, tag_suffixes=['DT', 'CC']): 6 | '''Remove insignificant words from the chunk. 
7 | >>> filter_insignificant([('the', 'DT'), ('terrible', 'JJ'), ('movie', 'NN')]) 8 | [('terrible', 'JJ'), ('movie', 'NN')] 9 | ''' 10 | good = [] 11 | 12 | for word, tag in chunk: 13 | ok = True 14 | 15 | for suffix in tag_suffixes: 16 | if tag.endswith(suffix): 17 | ok = False 18 | break 19 | 20 | if ok: 21 | good.append((word, tag)) 22 | 23 | return good 24 | 25 | def tag_startswith(prefix): 26 | def f(wt): 27 | return wt[1].startswith(prefix) 28 | return f 29 | 30 | def tag_equals(tag): 31 | def f(wt): 32 | return wt[1] == tag 33 | return f 34 | 35 | def first_chunk_index(chunk, pred, start=0, step=1): 36 | '''Go through chunk and return the first index where pred(chunk[index]) 37 | returns True. 38 | >>> first_chunk_index([('61', 'CD'), ('years', 'NNS')], tag_equals('CD')) 39 | 0 40 | >>> first_chunk_index([('61', 'CD'), ('years', 'NNS')], tag_equals('NNS')) 41 | 1 42 | >>> first_chunk_index([('61', 'CD'), ('years', 'NNS')], tag_equals('CD'), start=1, step=-1) 43 | 0 44 | >>> first_chunk_index([('61', 'CD'), ('years', 'NNS')], tag_equals('VB')) 45 | ''' 46 | l = len(chunk) 47 | end = l if step > 0 else -1 48 | 49 | for i in range(start, end, step): 50 | if pred(chunk[i]): 51 | return i 52 | 53 | return None 54 | 55 | plural_verb_forms = { 56 | ('is', 'VBZ'): ('are', 'VBP'), 57 | ('was', 'VBD'): ('were', 'VBD') 58 | } 59 | 60 | singular_verb_forms = { 61 | ('are', 'VBP'): ('is', 'VBZ'), 62 | ('were', 'VBD'): ('was', 'VBD') 63 | } 64 | 65 | def correct_verbs(chunk): 66 | '''Correct plural/singular verb mistakes. 67 | >>> correct_verbs([('is', 'VBZ'), ('our', 'PRP$'), ('children', 'NNS'), ('learning', 'VBG')]) 68 | [('are', 'VBP'), ('our', 'PRP$'), ('children', 'NNS'), ('learning', 'VBG')] 69 | >>> correct_verbs([('our', 'PRP$'), ('children', 'NNS'), ('is', 'VBZ'), ('learning', 'VBG')]) 70 | [('our', 'PRP$'), ('children', 'NNS'), ('are', 'VBP'), ('learning', 'VBG')] 71 | >>> correct_verbs([('our', 'PRP$'), ('child', 'NN'), ('were', 'VBD'), ('learning', 'VBG')]) 72 | [('our', 'PRP$'), ('child', 'NN'), ('was', 'VBD'), ('learning', 'VBG')] 73 | >>> correct_verbs([('our', 'PRP$'), ('child', 'NN'), ('is', 'VBZ'), ('learning', 'VBG')]) 74 | [('our', 'PRP$'), ('child', 'NN'), ('is', 'VBZ'), ('learning', 'VBG')] 75 | ''' 76 | vbidx = first_chunk_index(chunk, tag_startswith('VB')) 77 | # if no verb found, do nothing 78 | if vbidx is None: 79 | return chunk 80 | 81 | verb, vbtag = chunk[vbidx] 82 | nnpred = tag_startswith('NN') 83 | # find nearest noun to the right of verb 84 | nnidx = first_chunk_index(chunk, nnpred, start=vbidx+1) 85 | # if no noun found to right, look to the left 86 | if nnidx is None: 87 | nnidx = first_chunk_index(chunk, nnpred, start=vbidx-1, step=-1) 88 | # if no noun found, do nothing 89 | if nnidx is None: 90 | return chunk 91 | 92 | noun, nntag = chunk[nnidx] 93 | # get correct verb form and insert into chunk 94 | if nntag.endswith('S'): 95 | chunk[vbidx] = plural_verb_forms.get((verb, vbtag), (verb, vbtag)) 96 | else: 97 | chunk[vbidx] = singular_verb_forms.get((verb, vbtag), (verb, vbtag)) 98 | 99 | return chunk 100 | 101 | def swap_verb_phrase(chunk): 102 | '''Move modifier phrase after verb to front of chunk and drop the verb. 
103 | >>> swap_verb_phrase([('the', 'DT'), ('book', 'NN'), ('was', 'VBD'), ('great', 'JJ')]) 104 | [('great', 'JJ'), ('the', 'DT'), ('book', 'NN')] 105 | >>> swap_verb_phrase([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')]) 106 | [('fantastic', 'JJ'), ('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN')] 107 | ''' 108 | # find location of verb 109 | def vbpred(wt): 110 | word, tag = wt 111 | return tag != 'VBG' and tag.startswith('VB') and len(tag) > 2 112 | 113 | vbidx = first_chunk_index(chunk, vbpred) 114 | 115 | if vbidx is None: 116 | return chunk 117 | 118 | return chunk[vbidx+1:] + chunk[:vbidx] 119 | 120 | def swap_noun_cardinal(chunk): 121 | '''Move a cardinal that occurs after a noun to immediately before the noun. 122 | >>> swap_noun_cardinal([('Dec.', 'NNP'), ('10', 'CD')]) 123 | [('10', 'CD'), ('Dec.', 'NNP')] 124 | >>> swap_noun_cardinal([('the', 'DT'), ('top', 'NN'), ('10', 'CD')]) 125 | [('the', 'DT'), ('10', 'CD'), ('top', 'NN')] 126 | ''' 127 | cdidx = first_chunk_index(chunk, tag_equals('CD')) 128 | # cdidx must be > 0 and there must be a noun immediately before it 129 | if not cdidx or not chunk[cdidx-1][1].startswith('NN'): 130 | return chunk 131 | 132 | noun, nntag = chunk[cdidx-1] 133 | chunk[cdidx-1] = chunk[cdidx] 134 | chunk[cdidx] = noun, nntag 135 | return chunk 136 | 137 | def swap_infinitive_phrase(chunk): 138 | '''Move subject to before the noun preceding the infinitive. 139 | >>> swap_infinitive_phrase([('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS')]) 140 | [('recipes', 'NNS'), ('book', 'NN')] 141 | >>> swap_infinitive_phrase([('tastes', 'VBZ'), ('like', 'IN'), ('chicken', 'NN')]) 142 | [('tastes', 'VBZ'), ('like', 'IN'), ('chicken', 'NN')] 143 | >>> swap_infinitive_phrase([('delicious', 'JJ'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS')]) 144 | [('delicious', 'JJ'), ('recipes', 'NNS'), ('book', 'NN')] 145 | ''' 146 | def inpred(wt): 147 | word, tag = wt 148 | return tag == 'IN' and word != 'like' 149 | 150 | inidx = first_chunk_index(chunk, inpred) 151 | 152 | if inidx is None: 153 | return chunk 154 | 155 | nnidx = first_chunk_index(chunk, tag_startswith('NN'), start=inidx, step=-1) or 0 156 | return chunk[:nnidx] + chunk[inidx+1:] + chunk[nnidx:inidx] 157 | 158 | def singularize_plural_noun(chunk): 159 | '''If a plural noun is followed by another noun, singularize the plural noun. 160 | >>> singularize_plural_noun([('recipes', 'NNS'), ('book', 'NN')]) 161 | [('recipe', 'NN'), ('book', 'NN')] 162 | ''' 163 | nnsidx = first_chunk_index(chunk, tag_equals('NNS')) 164 | 165 | if nnsidx is not None and nnsidx+1 < len(chunk) and chunk[nnsidx+1][1][:2] == 'NN': 166 | noun, nnstag = chunk[nnsidx] 167 | chunk[nnsidx] = (noun.rstrip('s'), nnstag.rstrip('S')) 168 | 169 | return chunk 170 | 171 | def transform_chunk(chunk, chain=[filter_insignificant, swap_verb_phrase, swap_infinitive_phrase, singularize_plural_noun], trace=0): 172 | ''' 173 | >>> transform_chunk([('the', 'DT'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')]) 174 | [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 175 | ''' 176 | for f in chain: 177 | chunk = f(chunk) 178 | 179 | if trace: 180 | print('%s : %s' % (f.__name__, chunk)) 181 | 182 | return chunk 183 | 184 | punct_re = re.compile(r'\s([,\.;\?])') 185 | 186 | def chunk_tree_to_sent(tree, concat=' '): 187 | '''Convert a parse tree to a sentence, with correct punctuation. 
188 | >>> from nltk.tree import Tree 189 | >>> chunk_tree_to_sent(Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])) 190 | 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.' 191 | ''' 192 | s = concat.join(nltk.tag.untag(tree.leaves())) 193 | return re.sub(punct_re, r'\g<1>', s) 194 | 195 | def flatten_childtrees(trees): 196 | children = [] 197 | 198 | for t in trees: 199 | if t.height() < 3: 200 | children.extend(t.pos()) 201 | elif t.height() == 3: 202 | children.append(Tree(t.label(), t.pos())) 203 | else: 204 | children.extend(flatten_childtrees([c for c in t])) 205 | 206 | return children 207 | 208 | def flatten_deeptree(tree): 209 | ''' 210 | >>> flatten_deeptree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])])) 211 | Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 212 | ''' 213 | return Tree(tree.label(), flatten_childtrees([c for c in tree])) 214 | 215 | def shallow_tree(tree): 216 | ''' 217 | >>> shallow_tree(Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])])) 218 | Tree('S', [Tree('NP-SBJ', [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ',')]), Tree('VP', [('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 219 | ''' 220 | children = [] 221 | 222 | for t in tree: 223 | if t.height() < 3: 224 | children.extend(t.pos()) 225 | else: 226 | children.append(Tree(t.label(), t.pos())) 227 | 228 | return Tree(tree.label(), children) 229 | 230 | def convert_tree_labels(tree, mapping): 231 | ''' 232 | >>> convert_tree_labels(Tree('S', [Tree('NP-SBJ', [('foo', 'NN')])]), {'NP-SBJ': 'NP'}) 233 | Tree('S', [Tree('NP', [('foo', 'NN')])]) 234 | ''' 235 | children = [] 236 | 237 | for t in tree: 238 | if isinstance(t, Tree): 239 | 
children.append(convert_tree_labels(t, mapping)) 240 | else: 241 | children.append(t) 242 | 243 | label = mapping.get(tree.label(), tree.label()) 244 | return Tree(label, children) 245 | 246 | if __name__ == '__main__': 247 | import doctest 248 | doctest.testmod() -------------------------------------------------------------------------------- /treebank.chunk: -------------------------------------------------------------------------------- 1 | [Earlier/JJR staff-reduction/NN moves/NNS] have/VBP trimmed/VBN about/IN [300/CD jobs/NNS] ,/, [the/DT spokesman/NN] said/VBD ./. -------------------------------------------------------------------------------- /wordlist: -------------------------------------------------------------------------------- 1 | nltk 2 | corpus 3 | corpora 4 | wordnet 5 | --------------------------------------------------------------------------------
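The modules above are meant to be imported rather than run directly; their doctests double as usage examples (run them with `python replacers.py` or `python -m doctest replacers.py -v`). As a quick orientation, the sketch below shows one way the pieces in this part of the repository might be combined. It is illustrative only and is not a file from the repository: the sample sentence is made up, and it assumes the module-level dependencies of replacers.py (pyenchant, PyYAML) are installed and the NLTK WordNet corpus has been downloaded.

# Illustrative sketch (not part of the repository): combine the replacers
# and chunk transforms defined above.
from replacers import RegexpReplacer, AntonymReplacer
from transforms import transform_chunk

# Expand contractions, then resolve "not <word>" using WordNet antonyms.
regexp_replacer = RegexpReplacer()
antonym_replacer = AntonymReplacer()
tokens = regexp_replacer.replace("this book isn't unpleasant").split()
print(antonym_replacer.replace_negations(tokens))

# Normalize a tagged chunk with the default transform chain
# (filter_insignificant -> swap_verb_phrase -> swap_infinitive_phrase
#  -> singularize_plural_noun), printing each intermediate result.
chunk = [('the', 'DT'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'),
         ('is', 'VBZ'), ('delicious', 'JJ')]
print(transform_chunk(chunk, trace=1))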