├── README
├── chunk_parser.py
├── chunker_training.txt
├── chunking.txt
├── classifier_training.txt
├── collocations.txt
├── explore_nltk.py
├── movie_reviews_classifier.txt
├── stemming.txt
├── tagger_training.txt
├── tagging.txt
├── tokenization.txt
├── translation.txt
└── wordnet.txt

/README:
--------------------------------------------------------------------------------
Doctestable code examples and NLTK exploration functions from Introduction to NLTK at PyCon 2012.

--------------------------------------------------------------------------------
/chunk_parser.py:
--------------------------------------------------------------------------------
from nltk.chunk.named_entity import NEChunkParser, NEChunkParserTagger
from nltk.classify import NaiveBayesClassifier
from nltk.tag.simplify import simplify_wsj_tag
from nltk.tree import Tree

def simplify_chunk(chunk):
    if isinstance(chunk, Tree):
        return Tree(chunk.node, [simplify_chunk(c) for c in chunk])
    elif isinstance(chunk, tuple):
        word, tag = chunk
        return (word, simplify_wsj_tag(tag))
    else:
        return chunk

# custom classes are required to use a custom classifier; the default is a
# megam-based maxent classifier

class ChunkTagger(NEChunkParserTagger):
    def _classifier_builder(self, train):
        return NaiveBayesClassifier.train(train)

class ChunkParser(NEChunkParser):
    def _train(self, corpus):
        self._tagger = ChunkTagger([self._parse_to_tagged(s) for s in corpus])
--------------------------------------------------------------------------------
/chunker_training.txt:
--------------------------------------------------------------------------------
====================
Treebank Chunk Trees
====================

>>> from nltk.corpus import treebank_chunk
>>> treebank_chunk.chunked_sents()[0]
Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])

>>> from nltk import chunk
>>> import nltk.data
>>> chunker = nltk.data.load(chunk._MULTICLASS_NE_CHUNKER)
>>> score = chunker.evaluate(treebank_chunk.chunked_sents())
>>> print score
ChunkParse score:
    IOB Accuracy:  45.4%
    Precision:      0.0%
    Recall:         0.0%
    F-Measure:      0.0%

# treebank_chunk doesn't have named entities, so this score is not really meaningful

>>> score.accuracy()
0.4536624203821656
>>> score.precision()
0.0
>>> score.recall()
0.0
>>> len(score.correct())
24667
>>> len(score.incorrect())
5659
>>> len(score.missed())
24667

======================
Simplified Chunk Trees
======================

>>> import chunk_parser
>>> simple_chunks = [chunk_parser.simplify_chunk(c) for c in treebank_chunk.chunked_sents()]
>>> simple_chunks[0]
Tree('S', [Tree('NP', [('Pierre', 'NP'), ('Vinken', 'NP')]), (',', ','), Tree('NP', [('61', 'NUM'), ('years', 'N')]), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), Tree('NP', [('the', 'DET'), ('board', 'N')]), ('as', 'P'), Tree('NP', [('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM')]), ('.', '.')])

>>> chunker = chunk_parser.ChunkParser(simple_chunks)

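# before scoring, the new chunker can be spot-checked on a single sentence;
# a minimal sketch (the tagged sentence and the variable names below are made
# up for illustration, tags use the simplified tagset, and the exact tree you
# get back depends on the trained model)

>>> tagged = [('Jacob', 'NP'), ('teaches', 'V'), ('NLTK', 'NP'), ('.', '.')]
>>> spot_check = chunker.parse(tagged)
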
>>> score = chunker.evaluate(simple_chunks)
>>> print score
ChunkParse score:
    IOB Accuracy:  97.6%
    Precision:     93.0%
    Recall:        94.4%
    F-Measure:     93.7%

# the chunker is being evaluated on its own training data, so the score is
# inflated and doesn't reflect real-world usage

>>> import os, os.path, pickle
>>> path = os.path.expanduser('~/nltk_data/chunkers/')
>>> os.makedirs(path)
>>> f = open(os.path.join(path, 'chunk_parser.pickle'), 'wb')
>>> pickle.dump(chunker, f)
>>> f.close()

>>> iob_tagged = chunker._parse_to_tagged(simple_chunks[0])
>>> iob_tagged
[(('Pierre', 'NP'), 'B-NP'), (('Vinken', 'NP'), 'I-NP'), ((',', ','), 'O'), (('61', 'NUM'), 'B-NP'), (('years', 'N'), 'I-NP'), (('old', 'ADJ'), 'O'), ((',', ','), 'O'), (('will', 'MOD'), 'O'), (('join', 'V'), 'O'), (('the', 'DET'), 'B-NP'), (('board', 'N'), 'I-NP'), (('as', 'P'), 'O'), (('a', 'DET'), 'B-NP'), (('nonexecutive', 'ADJ'), 'I-NP'), (('director', 'N'), 'I-NP'), (('Nov.', 'NP'), 'I-NP'), (('29', 'NUM'), 'I-NP'), (('.', '.'), 'O')]
>>> chunker._tagged_to_parse(iob_tagged) == simple_chunks[0]
True
>>> iob_tagged[0]
(('Pierre', 'NP'), 'B-NP')
>>> untagged, tags = zip(*iob_tagged)
>>> chunker._tagger._feature_detector(untagged, 0, [])
{'nextpos': 'np', 'pos+prevtag': 'NP+None', 'nextword': 'vinken', 'word': 'Pierre', 'prefix3': 'pie', 'wordlen': 6, 'prevpos': None, 'pos': 'NP', 'prevtag': None, 'prevword': None, 'shape': 'upcase', 'bias': True, 'en-wordlist': False, 'shape+prevtag': 'None+None', 'suffix3': 'rre', 'word+nextpos': 'pierre+np'}

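# the pickled chunker saved above can later be reloaded without retraining;
# a minimal sketch, assuming ~/nltk_data is on the NLTK data path (it is by
# default) so nltk.data.load() can find the file by its relative path
# (chunker2 is just an illustrative name)

>>> chunker2 = nltk.data.load('chunkers/chunk_parser.pickle')
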
--------------------------------------------------------------------------------
/chunking.txt:
--------------------------------------------------------------------------------
==================
Default NE Chunker
==================

>>> sent = "Today you'll be learning NLTK."
>>> from nltk import chunk, tag, tokenize
>>> words = tokenize.word_tokenize(sent)
>>> tagged_sent = tag.pos_tag(words)
>>> tree = chunk.ne_chunk(tagged_sent)
>>> tree
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('ORGANIZATION', [('NLTK', 'NNP')]), ('.', '.')])
>>> tree.draw()

>>> import nltk.data
>>> chunker = nltk.data.load(chunk._MULTICLASS_NE_CHUNKER)
>>> chunker.parse(tagged_sent)
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('ORGANIZATION', [('NLTK', 'NNP')]), ('.', '.')])

=================
Binary NE Chunker
=================

>>> chunk.ne_chunk(tagged_sent, binary=True)
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('NE', [('NLTK', 'NNP')]), ('.', '.')])

>>> binary_chunker = nltk.data.load(chunk._BINARY_NE_CHUNKER)
>>> binary_chunker.parse(tagged_sent)
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('NE', [('NLTK', 'NNP')]), ('.', '.')])

=================
Phrase Extraction
=================

>>> from nltk.tag import untag
>>> untag(tagged_sent)
['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']

>>> import collections
>>> def extract_phrases(t):
...     d = collections.defaultdict(list)
...     for sub in t.subtrees(lambda s: s.node != 'S'):
...         d[sub.node].append(' '.join(untag(sub.leaves())))
...     return d

>>> extract_phrases(tree)
defaultdict(<type 'list'>, {'ORGANIZATION': ['NLTK']})

--------------------------------------------------------------------------------
/classifier_training.txt:
--------------------------------------------------------------------------------
======================================
Train Naive Bayes Sentiment Classifier
======================================

>>> from nltk.corpus import movie_reviews
>>> from nltk.classify import NaiveBayesClassifier
>>> def bag_of_words(para):
...     return dict([(word, True) for sent in para for word in sent])

>>> bag_of_words([['great', 'movie']])
{'movie': True, 'great': True}
>>> pos_feats = [(bag_of_words(para), 'pos') for para in movie_reviews.paras(categories=['pos'])]
>>> neg_feats = [(bag_of_words(para), 'neg') for para in movie_reviews.paras(categories=['neg'])]
>>> pos_cutoff = len(pos_feats) * 3/4
>>> neg_cutoff = len(neg_feats) * 3/4
>>> train_feats = pos_feats[:pos_cutoff] + neg_feats[:neg_cutoff]
>>> test_feats = pos_feats[pos_cutoff:] + neg_feats[neg_cutoff:]
>>> classifier = NaiveBayesClassifier.train(train_feats)
>>> classifier.classify({'great': True, 'movie': True})
'pos'
>>> classifier.classify({'bad': True, 'movie': True})
'neg'

>>> from nltk.classify.util import accuracy
>>> accuracy(classifier, test_feats)
0.728

# low accuracy is usually not the algorithm's fault, but a symptom of noisy training data

>>> classifier.show_most_informative_features()
Most Informative Features
    magnificent = True pos : neg = 15.0 : 1.0
    outstanding = True pos : neg = 13.6 : 1.0
    insulting = True neg : pos = 13.0 : 1.0
    vulnerable = True pos : neg = 12.3 : 1.0
    ludicrous = True neg : pos = 11.8 : 1.0
    avoids = True pos : neg = 11.7 : 1.0
    uninvolving = True neg : pos = 11.7 : 1.0
    astounding = True pos : neg = 10.3 : 1.0
    fascination = True pos : neg = 10.3 : 1.0
    idiotic = True neg : pos = 9.8 : 1.0

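# the same bag_of_words() helper works for classifying new text; a minimal
# sketch (the sentence is made up, and since the predicted label depends on
# the trained model, the check below only asserts that one of the two known
# labels comes back)

>>> feats = bag_of_words([['one', 'of', 'the', 'worst', 'films', 'ever']])
>>> classifier.classify(feats) in ('pos', 'neg')
True
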
====================
Precision and Recall
====================

>>> import collections
>>> def build_ref_test_sets(classifier, labeled_feats):
...     refsets = collections.defaultdict(set)
...     testsets = collections.defaultdict(set)
...     for i, (feat, label) in enumerate(labeled_feats):
...         refsets[label].add(i)
...         guess = classifier.classify(feat)
...         testsets[guess].add(i)
...     return refsets, testsets

>>> refsets, testsets = build_ref_test_sets(classifier, test_feats)
>>> from nltk import metrics

# precision measures the lack of false positives: what percentage of the featuresets guessed to be pos really were pos
# a false positive is a featureset that was guessed to be pos, but wasn't

>>> metrics.precision(refsets['pos'], testsets['pos'])
0.651595744680851
>>> metrics.precision(refsets['neg'], testsets['neg'])
0.9596774193548387

# recall measures the lack of false negatives: what percentage of the actual pos featuresets were found
# a false negative is a featureset that should have been guessed to be pos, but wasn't

>>> metrics.recall(refsets['pos'], testsets['pos'])
0.98
>>> metrics.recall(refsets['neg'], testsets['neg'])
0.476

# F-measure is the harmonic mean of precision and recall

>>> metrics.f_measure(refsets['pos'], testsets['pos'])
0.7827476038338657
>>> metrics.f_measure(refsets['neg'], testsets['neg'])
0.6363636363636364

--------------------------------------------------------------------------------
/collocations.txt:
--------------------------------------------------------------------------------
=============================
Discovering Word Collocations
=============================

>>> from nltk.corpus import webtext
>>> from nltk.collocations import BigramCollocationFinder
>>> from nltk.metrics import BigramAssocMeasures
>>> words = [w.lower() for w in webtext.words('grail.txt')]
>>> bcf = BigramCollocationFinder.from_words(words)
>>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]

>>> from nltk.corpus import stopwords
>>> stopset = set(stopwords.words('english'))
>>> filter_stops = lambda w: len(w) < 3 or w in stopset
>>> bcf.apply_word_filter(filter_stops)
>>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
[('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')]

>>> from nltk.collocations import TrigramCollocationFinder
>>> from nltk.metrics import TrigramAssocMeasures
>>> words = [w.lower() for w in webtext.words('singles.txt')]
>>> tcf = TrigramCollocationFinder.from_words(words)
>>> tcf.apply_word_filter(filter_stops)
>>> tcf.apply_freq_filter(3)
>>> tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
[('long', 'term', 'relationship')]

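# likelihood_ratio is just one of the provided association measures; pmi and
# chi_sq are others, and each one ranks the candidates differently; a small
# sketch (results aren't shown because they depend on the measure and on the
# filters already applied; pmi_top and chi_top are illustrative names)

>>> bcf.apply_freq_filter(3)
>>> pmi_top = bcf.nbest(BigramAssocMeasures.pmi, 4)
>>> chi_top = bcf.nbest(BigramAssocMeasures.chi_sq, 4)
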
--------------------------------------------------------------------------------
/explore_nltk.py:
--------------------------------------------------------------------------------
import collections, itertools, string
from nltk import collocations, probability, stem
from nltk.corpus import stopwords, wordnet
from nltk.tag.util import untag
from nltk.util import bigrams

stopset = set(stopwords.words('english')) | set(string.punctuation)
stemmer = stem.PorterStemmer()

def count_stems(corpus):
    fd = probability.FreqDist()

    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue
        fd.inc(stemmer.stem(w))

    return fd

def count_hypernyms(corpus):
    fd = probability.FreqDist()

    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue

        for syn in wordnet.synsets(w):
            if syn.pos != 'n': continue

            for path in syn.hypernym_paths():
                for hyp in path:
                    fd.inc(hyp.name)

    return fd

def count_stemmed_bigram_collocations(corpus, min_freq=3):
    stems = (stemmer.stem(w.lower()) for w in corpus.words())
    finder = collocations.BigramCollocationFinder.from_words(stems)
    finder.apply_word_filter(lambda w: w in stopset)
    finder.apply_freq_filter(min_freq)
    return finder

def count_tag_words(corpus, tagger):
    cfd = probability.ConditionalFreqDist()

    for sent in corpus.sents():
        for word, tag in tagger.tag(sent):
            w = word.lower()
            if w in stopset: continue
            cfd[tag].inc(w)

    return cfd

def count_phrases(corpus, tagger, chunker):
    cfd = probability.ConditionalFreqDist()

    for sent in corpus.sents():
        tree = chunker.parse(tagger.tag(sent))

        for sub in tree.subtrees():
            if sub.node == 'S': continue
            words = untag(sub.leaves())
            if len(words) >= 2: cfd[sub.node].inc(' '.join(words))

    return cfd

def classify_paras(paras, classifier):
    d = collections.defaultdict(list)

    for para in paras:
        words = [w.lower() for w in itertools.chain(*para)]
        feats = dict([(w, True) for w in words + bigrams(words)])
        label = classifier.classify(feats)
        d[label].append(' '.join(words))

    return d
--------------------------------------------------------------------------------
/movie_reviews_classifier.txt:
--------------------------------------------------------------------------------
>>> import nltk.data
>>> classifier = nltk.data.load('classifiers/movie_reviews_NaiveBayes.pickle')
>>> from nltk import tokenize
>>> words = tokenize.word_tokenize("that was a terrible movie")
>>> feats = dict([(word, True) for word in words])
>>> feats
{'a': True, 'movie': True, 'was': True, 'terrible': True, 'that': True}
>>> classifier.classify(feats)
'neg'
>>> probs = classifier.prob_classify(feats)
>>> probs.prob('neg')
0.789541654729651
>>> probs.prob('pos')
0.2104583452703487
>>> classifier.show_most_informative_features()
Most Informative Features
    avoids = True pos : neg = 13.0 : 1.0
    astounding = True pos : neg = 12.3 : 1.0
    slip = True pos : neg = 11.7 : 1.0
    outstanding = True pos : neg = 11.5 : 1.0
    ludicrous = True neg : pos = 11.0 : 1.0
    insulting = True neg : pos = 11.0 : 1.0
    fascination = True pos : neg = 11.0 : 1.0
    3000 = True neg : pos = 11.0 : 1.0
    sucks = True neg : pos = 10.6 : 1.0
    hudson = True neg : pos = 10.3 : 1.0

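# the probability distribution also knows which label is most likely; given
# the probabilities shown above, max() has to agree with classify()

>>> probs.max()
'neg'
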
--------------------------------------------------------------------------------
/stemming.txt:
--------------------------------------------------------------------------------
==============
Stemming Words
==============

>>> from nltk import stem
>>> stemmer = stem.PorterStemmer()
>>> stemmer.stem('cooking')
'cook'
>>> stemmer.stem('cookery')
'cookeri'

>>> stemmer = stem.LancasterStemmer()
>>> stemmer.stem('cooking')
'cook'
>>> stemmer.stem('cookery')
'cookery'

# stemming is essentially common suffix removal

>>> stemmer = stem.RegexpStemmer('ing')
>>> stemmer.stem('cooking')
'cook'
>>> stemmer.stem('cookery')
'cookery'
>>> stemmer.stem('ingleside')
'leside'

>>> stem.SnowballStemmer.languages
('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
>>> spanish_stemmer = stem.SnowballStemmer('spanish')
>>> spanish_stemmer.stem('hola')
u'hol'

# SnowballStemmer('english') is an updated & improved Porter stemmer

==============================
Lemmatising Words with WordNet
==============================

>>> lemmatizer = stem.WordNetLemmatizer()
>>> lemmatizer.lemmatize('cooking')
'cooking'
>>> lemmatizer.lemmatize('cooking', pos='v')
'cook'
>>> lemmatizer.lemmatize('cookbooks')
'cookbook'

# lemmatizing preserves meaning and lemmas are real words, while stems may not be

>>> stemmer = stem.PorterStemmer()
>>> stemmer.stem('believes')
'believ'
>>> lemmatizer.lemmatize('believes')
'belief'

>>> stemmer.stem('buses')
'buse'
>>> lemmatizer.lemmatize('buses')
'bus'
>>> stemmer.stem('bus')
'bu'

# both are lossy compression, and can be useful for fuzzy matching

--------------------------------------------------------------------------------
/tagger_training.txt:
--------------------------------------------------------------------------------
=========================
Treebank Tagged Sentences
=========================

>>> from nltk.corpus import treebank
>>> treebank.tagged_sents()[0]
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]

>>> import nltk.data
>>> from nltk import tag
>>> tagger = nltk.data.load(tag._POS_TAGGER)
>>> sent = treebank.sents()[0]
>>> sent
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']

>>> tagger.tag(sent)
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]

>>> tagger.evaluate(treebank.tagged_sents())
0.9956891414041082

# the default tagger was trained on treebank, so this score is inflated and doesn't reflect real-world performance

>>> simple_sents = treebank.tagged_sents(simplify_tags=True)
>>> simple_sents[0]
[('Pierre', 'NP'), ('Vinken', 'NP'), (',', ','), ('61', 'NUM'), ('years', 'N'), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), ('the', 'DET'), ('board', 'N'), ('as', 'P'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM'), ('.', '.')]

===================================
Training Sequential Backoff Taggers
===================================

>>> default = tag.DefaultTagger('N')
>>> default.evaluate(simple_sents)
0.19083992212642537

>>> u = tag.UnigramTagger(simple_sents, backoff=default)
>>> u.tag(sent)
[('Pierre', 'NP'), ('Vinken', 'NP'), (',', ','), ('61', 'NUM'), ('years', 'N'), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), ('the', 'DET'), ('board', 'N'), ('as', 'P'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM'), ('.', '.')]
>>> u.evaluate(simple_sents)
0.9656621240414796

# these taggers are evaluated on their own training data, so the numbers won't reflect real-world performance
# for honest accuracy measurements, separate simple_sents into training & testing lists

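# a minimal sketch of that held-out split (the 3/4 cutoff mirrors the
# classifier training doctest; the held-out accuracy isn't shown because it
# depends on the split, but expect it to be noticeably lower than the
# self-evaluation numbers above; u2 and the *_sents names are illustrative)

>>> cutoff = len(simple_sents) * 3 / 4
>>> train_sents = simple_sents[:cutoff]
>>> test_sents = simple_sents[cutoff:]
>>> u2 = tag.UnigramTagger(train_sents, backoff=default)
>>> held_out_accuracy = u2.evaluate(test_sents)
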
>>> ub = tag.BigramTagger(simple_sents, backoff=u)
>>> ub.evaluate(simple_sents)
0.9861635345067344

>>> ubt = tag.TrigramTagger(simple_sents, backoff=ub)
>>> ubt.evaluate(simple_sents)
0.991914656919226

===============
Saving a Tagger
===============

>>> import os, os.path, pickle
>>> path = os.path.expanduser('~/nltk_data/taggers/')
>>> os.makedirs(path)
>>> f = open(os.path.join(path, 'ubt.pickle'), 'wb')
>>> pickle.dump(ubt, f)
>>> f.close()

==================================
Training a Classifier Based Tagger
==================================

>>> c = tag.ClassifierBasedPOSTagger(train=simple_sents)
>>> c.evaluate(simple_sents)
0.9773530930907068

# a classifier-based tagger is also much slower than sequential backoff taggers

--------------------------------------------------------------------------------
/tagging.txt:
--------------------------------------------------------------------------------
>>> from nltk import tokenize
>>> sent = tokenize.word_tokenize("Today you'll be learning NLTK.")
>>> from nltk import tag
>>> tag.pos_tag(sent)
[('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), ('NLTK', 'NNP'), ('.', '.')]

>>> import nltk.data
>>> tagger = nltk.data.load(tag._POS_TAGGER)
>>> tagger.tag(sent)
[('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), ('NLTK', 'NNP'), ('.', '.')]

--------------------------------------------------------------------------------
/tokenization.txt:
--------------------------------------------------------------------------------
=====================
Sentence Tokenization
=====================

>>> para = "Hello. My name is Jacob. Today you'll be learning NLTK."
>>> from nltk import tokenize
>>> tokenize.sent_tokenize(para)
['Hello.', 'My name is Jacob.', "Today you'll be learning NLTK."]

>>> import nltk.data
>>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
>>> sents = tokenizer.tokenize(para)
>>> sents
['Hello.', 'My name is Jacob.', "Today you'll be learning NLTK."]

=================
Word Tokenization
=================

>>> sent = sents[2]
>>> tokenize.word_tokenize(sent)
['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']

>>> tokenize.wordpunct_tokenize(sent)
['Today', 'you', "'", 'll', 'be', 'learning', 'NLTK', '.']

>>> from nltk.tokenize import WordPunctTokenizer
>>> tokenizer = WordPunctTokenizer()
>>> tokenizer.tokenize(sent)
['Today', 'you', "'", 'll', 'be', 'learning', 'NLTK', '.']

>>> tokenizer = tokenize.PunktWordTokenizer()
>>> tokenizer.tokenize(sent)
['Today', 'you', "'ll", 'be', 'learning', 'NLTK.']

>>> tokenizer = tokenize.SpaceTokenizer()
>>> tokenizer.tokenize(sent)
['Today', "you'll", 'be', 'learning', 'NLTK.']


Choosing a Word Tokenizer
-------------------------
Your choice of word tokenizer depends on further steps down the pipeline.
There's no one right answer; it's context- and pipeline-dependent.
Do you need a normalized/canonical form?
How much does punctuation matter, and in what way?
What does the pos tagger and/or classifier expect?
Are you doing transformations?
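
Whichever word tokenizer you choose, sentence and word tokenization usually
compose into a single pipeline step. A minimal sketch using the default
tokenizers (the nested-list output shown assumes the standard punkt and
treebank tokenizers, and could differ slightly across NLTK versions):

>>> [tokenize.word_tokenize(s) for s in sents]
[['Hello', '.'], ['My', 'name', 'is', 'Jacob', '.'], ['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']]
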

==========================================
Tokenizing Words using Regular Expressions
==========================================

>>> from nltk.tokenize import RegexpTokenizer
>>> tokenizer = RegexpTokenizer("[\w']+")
>>> tokenizer.tokenize("Can't is a contraction.")
["Can't", 'is', 'a', 'contraction']

>>> from nltk.tokenize import regexp_tokenize
>>> regexp_tokenize("Can't is a contraction.", "[\w']+")
["Can't", 'is', 'a', 'contraction']

>>> tokenizer = RegexpTokenizer('\s+', gaps=True)
>>> tokenizer.tokenize("Can't is a contraction.")
["Can't", 'is', 'a', 'contraction.']

--------------------------------------------------------------------------------
/translation.txt:
--------------------------------------------------------------------------------
===============================
Translating Text with Babelfish
===============================

>>> from nltk.misc import babelfish
>>> babelfish.translate('cookbook', 'english', 'spanish')
'libro de cocina'
>>> babelfish.translate('libro de cocina', 'spanish', 'english')
'kitchen book'
>>> babelfish.translate('cookbook', 'english', 'german')
'Kochbuch'
>>> babelfish.translate('kochbuch', 'german', 'english')
'cook book'

>>> for text in babelfish.babelize('cookbook', 'english', 'spanish'):
...     print text
...
cookbook
libro de cocina
kitchen book
libro de la cocina
book of the kitchen

>>> babelfish.available_languages
['Portuguese', 'Chinese', 'German', 'Japanese', 'French', 'Spanish', 'Russian', 'Greek', 'English', 'Korean', 'Italian']

--------------------------------------------------------------------------------
/wordnet.txt:
--------------------------------------------------------------------------------
=========================================
Looking up a Synset for a Word in WordNet
=========================================

>>> from nltk.corpus import wordnet
>>> syn = wordnet.synsets('cookbook')[0]
>>> syn.name
'cookbook.n.01'
>>> syn.definition
'a book of recipes and cooking directions'

>>> wordnet.synset('cookbook.n.01')
Synset('cookbook.n.01')

>>> syn.examples
['cooking can be a great art', 'people are needed who have experience in cookery', 'he left the preparation of meals to his wife']

>>> syn.hypernyms()
[Synset('reference_book.n.01')]
>>> syn.hypernyms()[0].hyponyms()
[Synset('encyclopedia.n.01'), Synset('directory.n.01'), Synset('source_book.n.01'), Synset('handbook.n.01'), Synset('instruction_book.n.01'), Synset('cookbook.n.01'), Synset('annual.n.02'), Synset('atlas.n.02'), Synset('wordbook.n.01')]
>>> syn.root_hypernyms()
[Synset('entity.n.01')]

>>> syn.hypernym_paths()
[[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('creation.n.02'), Synset('product.n.02'), Synset('work.n.02'), Synset('publication.n.01'), Synset('book.n.01'), Synset('reference_book.n.01'), Synset('cookbook.n.01')]]

>>> syn.pos
'n'

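# synsets() can also be restricted to a single part of speech, which helps
# when a word has both noun and verb senses; a minimal sketch (the result
# isn't shown because the number of matching synsets depends on the installed
# WordNet version; cooking_verbs is an illustrative name)

>>> cooking_verbs = wordnet.synsets('cooking', pos=wordnet.VERB)
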
=========================================
Looking up Lemmas and Synonyms in WordNet
=========================================

>>> lemmas = syn.lemmas
>>> len(lemmas)
2
>>> lemmas[0].name
'cookbook'
>>> lemmas[1].name
'cookery_book'
>>> lemmas[0].synset == lemmas[1].synset
True

>>> gn2 = wordnet.synset('good.n.02')
>>> gn2.definition
'moral excellence or admirableness'
>>> evil = gn2.lemmas[0].antonyms()[0]
>>> evil.name
'evil'
>>> evil.synset.definition
'the quality of being morally wrong in principle or practice'
>>> ga1 = wordnet.synset('good.a.01')
>>> ga1.definition
'having desirable or positive qualities especially those suitable for a thing specified'
>>> bad = ga1.lemmas[0].antonyms()[0]
>>> bad.name
'bad'
>>> bad.synset.definition
'having undesirable or negative qualities'

=====================================
Calculating WordNet Synset Similarity
=====================================

>>> ib = wordnet.synset('instruction_book.n.01')
>>> syn.wup_similarity(ib)
0.9166666666666666

>>> ref = syn.hypernyms()[0]
>>> syn.shortest_path_distance(ref)
1
>>> ib.shortest_path_distance(ref)
1
>>> syn.shortest_path_distance(ib)
2

>>> dog = wordnet.synsets('dog')[0]
>>> dog.wup_similarity(syn)
0.38095238095238093

>>> dog.common_hypernyms(syn)
[Synset('object.n.01'), Synset('whole.n.02'), Synset('physical_entity.n.01'), Synset('entity.n.01')]

>>> cook = wordnet.synset('cook.v.01')
>>> bake = wordnet.synset('bake.v.02')
>>> cook.wup_similarity(bake)
0.6666666666666666

>>> syn.path_similarity(ib)
0.3333333333333333
>>> syn.path_similarity(dog)
0.07142857142857142
>>> syn.lch_similarity(ib)
2.538973871058276
>>> syn.lch_similarity(dog)
0.9985288301111273

--------------------------------------------------------------------------------