├── README
├── chunk_parser.py
├── chunker_training.txt
├── chunking.txt
├── classifier_training.txt
├── collocations.txt
├── explore_nltk.py
├── movie_reviews_classifier.txt
├── stemming.txt
├── tagger_training.txt
├── tagging.txt
├── tokenization.txt
├── translation.txt
└── wordnet.txt

/README:
--------------------------------------------------------------------------------
Doctestable code examples and NLTK exploration functions from Introduction to NLTK at PyCon 2012.

--------------------------------------------------------------------------------
/chunk_parser.py:
--------------------------------------------------------------------------------
from nltk.chunk.named_entity import NEChunkParser, NEChunkParserTagger
from nltk.classify import NaiveBayesClassifier
from nltk.tag.simplify import simplify_wsj_tag
from nltk.tree import Tree

def simplify_chunk(chunk):
    if isinstance(chunk, Tree):
        return Tree(chunk.node, [simplify_chunk(c) for c in chunk])
    elif isinstance(chunk, tuple):
        word, tag = chunk
        return (word, simplify_wsj_tag(tag))
    else:
        return chunk

# custom classes are required to use a custom classifier; the default is a
# megam-based maxent classifier

class ChunkTagger(NEChunkParserTagger):
    def _classifier_builder(self, train):
        return NaiveBayesClassifier.train(train)

class ChunkParser(NEChunkParser):
    def _train(self, corpus):
        self._tagger = ChunkTagger([self._parse_to_tagged(s) for s in corpus])
--------------------------------------------------------------------------------
/chunker_training.txt:
--------------------------------------------------------------------------------
====================
Treebank Chunk Trees
====================

>>> from nltk.corpus import treebank_chunk
>>> treebank_chunk.chunked_sents()[0]
Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')])

>>> from nltk import chunk
>>> import nltk.data
>>> chunker = nltk.data.load(chunk._MULTICLASS_NE_CHUNKER)
>>> score = chunker.evaluate(treebank_chunk.chunked_sents())
>>> print score
ChunkParse score:
    IOB Accuracy:  45.4%
    Precision:      0.0%
    Recall:         0.0%
    F-Measure:      0.0%

# treebank_chunk doesn't have named entities, so this score is not really meaningful

>>> score.accuracy()
0.4536624203821656
>>> score.precision()
0.0
>>> score.recall()
0.0
>>> len(score.correct())
24667
>>> len(score.incorrect())
5659
>>> len(score.missed())
24667

======================
Simplified Chunk Trees
======================

>>> import chunk_parser
>>> simple_chunks = [chunk_parser.simplify_chunk(c) for c in treebank_chunk.chunked_sents()]
>>> simple_chunks[0]
Tree('S', [Tree('NP', [('Pierre', 'NP'), ('Vinken', 'NP')]), (',', ','), Tree('NP', [('61', 'NUM'), ('years', 'N')]), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), Tree('NP', [('the', 'DET'), ('board', 'N')]), ('as', 'P'), Tree('NP', [('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM')]), ('.', '.')])

>>> chunker = chunk_parser.ChunkParser(simple_chunks)

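# before scoring, the new chunker can be spot-checked on a single sentence;
# a minimal sketch (the tagged sentence and the variable names below are made
# up for illustration, tags use the simplified tagset, and the exact tree you
# get back depends on the trained model)

>>> tagged = [('Jacob', 'NP'), ('teaches', 'V'), ('NLTK', 'NP'), ('.', '.')]
>>> spot_check = chunker.parse(tagged)
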
>>> score = chunker.evaluate(simple_chunks)
>>> print score
ChunkParse score:
    IOB Accuracy:  97.6%
    Precision:     93.0%
    Recall:        94.4%
    F-Measure:     93.7%

# the chunker is being evaluated on its own training data, so the score is
# inflated and doesn't reflect real-world usage

>>> import os, os.path, pickle
>>> path = os.path.expanduser('~/nltk_data/chunkers/')
>>> os.makedirs(path)
>>> f = open(os.path.join(path, 'chunk_parser.pickle'), 'wb')
>>> pickle.dump(chunker, f)
>>> f.close()

>>> iob_tagged = chunker._parse_to_tagged(simple_chunks[0])
>>> iob_tagged
[(('Pierre', 'NP'), 'B-NP'), (('Vinken', 'NP'), 'I-NP'), ((',', ','), 'O'), (('61', 'NUM'), 'B-NP'), (('years', 'N'), 'I-NP'), (('old', 'ADJ'), 'O'), ((',', ','), 'O'), (('will', 'MOD'), 'O'), (('join', 'V'), 'O'), (('the', 'DET'), 'B-NP'), (('board', 'N'), 'I-NP'), (('as', 'P'), 'O'), (('a', 'DET'), 'B-NP'), (('nonexecutive', 'ADJ'), 'I-NP'), (('director', 'N'), 'I-NP'), (('Nov.', 'NP'), 'I-NP'), (('29', 'NUM'), 'I-NP'), (('.', '.'), 'O')]
>>> chunker._tagged_to_parse(iob_tagged) == simple_chunks[0]
True
>>> iob_tagged[0]
(('Pierre', 'NP'), 'B-NP')
>>> untagged, tags = zip(*iob_tagged)
>>> chunker._tagger._feature_detector(untagged, 0, [])
{'nextpos': 'np', 'pos+prevtag': 'NP+None', 'nextword': 'vinken', 'word': 'Pierre', 'prefix3': 'pie', 'wordlen': 6, 'prevpos': None, 'pos': 'NP', 'prevtag': None, 'prevword': None, 'shape': 'upcase', 'bias': True, 'en-wordlist': False, 'shape+prevtag': 'None+None', 'suffix3': 'rre', 'word+nextpos': 'pierre+np'}

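# the pickled chunker saved above can later be reloaded without retraining;
# a minimal sketch, assuming ~/nltk_data is on the NLTK data path (it is by
# default) so nltk.data.load() can find the file by its relative path
# (chunker2 is just an illustrative name)

>>> chunker2 = nltk.data.load('chunkers/chunk_parser.pickle')
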
--------------------------------------------------------------------------------
/chunking.txt:
--------------------------------------------------------------------------------
==================
Default NE Chunker
==================

>>> sent = "Today you'll be learning NLTK."
>>> from nltk import chunk, tag, tokenize
>>> words = tokenize.word_tokenize(sent)
>>> tagged_sent = tag.pos_tag(words)
>>> tree = chunk.ne_chunk(tagged_sent)
>>> tree
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('ORGANIZATION', [('NLTK', 'NNP')]), ('.', '.')])
>>> tree.draw()

>>> import nltk.data
>>> chunker = nltk.data.load(chunk._MULTICLASS_NE_CHUNKER)
>>> chunker.parse(tagged_sent)
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('ORGANIZATION', [('NLTK', 'NNP')]), ('.', '.')])

=================
Binary NE Chunker
=================

>>> chunk.ne_chunk(tagged_sent, binary=True)
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('NE', [('NLTK', 'NNP')]), ('.', '.')])

>>> binary_chunker = nltk.data.load(chunk._BINARY_NE_CHUNKER)
>>> binary_chunker.parse(tagged_sent)
Tree('S', [('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), Tree('NE', [('NLTK', 'NNP')]), ('.', '.')])

=================
Phrase Extraction
=================

>>> from nltk.tag import untag
>>> untag(tagged_sent)
['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']

>>> import collections
>>> def extract_phrases(t):
...     d = collections.defaultdict(list)
...     for sub in t.subtrees(lambda s: s.node != 'S'):
...         d[sub.node].append(' '.join(untag(sub.leaves())))
...     return d

>>> extract_phrases(tree)
defaultdict(<type 'list'>, {'ORGANIZATION': ['NLTK']})

--------------------------------------------------------------------------------
/classifier_training.txt:
--------------------------------------------------------------------------------
======================================
Train Naive Bayes Sentiment Classifier
======================================

>>> from nltk.corpus import movie_reviews
>>> from nltk.classify import NaiveBayesClassifier
>>> def bag_of_words(para):
...     return dict([(word, True) for sent in para for word in sent])

>>> bag_of_words([['great', 'movie']])
{'movie': True, 'great': True}
>>> pos_feats = [(bag_of_words(para), 'pos') for para in movie_reviews.paras(categories=['pos'])]
>>> neg_feats = [(bag_of_words(para), 'neg') for para in movie_reviews.paras(categories=['neg'])]
>>> pos_cutoff = len(pos_feats) * 3/4
>>> neg_cutoff = len(neg_feats) * 3/4
>>> train_feats = pos_feats[:pos_cutoff] + neg_feats[:neg_cutoff]
>>> test_feats = pos_feats[pos_cutoff:] + neg_feats[neg_cutoff:]
>>> classifier = NaiveBayesClassifier.train(train_feats)
>>> classifier.classify({'great': True, 'movie': True})
'pos'
>>> classifier.classify({'bad': True, 'movie': True})
'neg'

>>> from nltk.classify.util import accuracy
>>> accuracy(classifier, test_feats)
0.728

# low accuracy is usually not the algorithm's fault, but a symptom of noisy training data

>>> classifier.show_most_informative_features()
Most Informative Features
    magnificent = True pos : neg = 15.0 : 1.0
    outstanding = True pos : neg = 13.6 : 1.0
    insulting = True neg : pos = 13.0 : 1.0
    vulnerable = True pos : neg = 12.3 : 1.0
    ludicrous = True neg : pos = 11.8 : 1.0
    avoids = True pos : neg = 11.7 : 1.0
    uninvolving = True neg : pos = 11.7 : 1.0
    astounding = True pos : neg = 10.3 : 1.0
    fascination = True pos : neg = 10.3 : 1.0
    idiotic = True neg : pos = 9.8 : 1.0

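# the same bag_of_words() helper works for classifying new text; a minimal
# sketch (the sentence is made up, and since the predicted label depends on
# the trained model, the check below only asserts that one of the two known
# labels comes back)

>>> feats = bag_of_words([['one', 'of', 'the', 'worst', 'films', 'ever']])
>>> classifier.classify(feats) in ('pos', 'neg')
True
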
====================
Precision and Recall
====================

>>> import collections
>>> def build_ref_test_sets(classifier, labeled_feats):
...     refsets = collections.defaultdict(set)
...     testsets = collections.defaultdict(set)
...     for i, (feat, label) in enumerate(labeled_feats):
...         refsets[label].add(i)
...         guess = classifier.classify(feat)
...         testsets[guess].add(i)
...     return refsets, testsets

>>> refsets, testsets = build_ref_test_sets(classifier, test_feats)
>>> from nltk import metrics

# precision measures the lack of false positives: what percentage of the featuresets guessed to be pos really were pos
# a false positive is a featureset that was guessed to be pos, but wasn't

>>> metrics.precision(refsets['pos'], testsets['pos'])
0.651595744680851
>>> metrics.precision(refsets['neg'], testsets['neg'])
0.9596774193548387

# recall measures the lack of false negatives: what percentage of the actual pos featuresets were found
# a false negative is a featureset that should have been guessed to be pos, but wasn't

>>> metrics.recall(refsets['pos'], testsets['pos'])
0.98
>>> metrics.recall(refsets['neg'], testsets['neg'])
0.476

# F-measure is the harmonic mean of precision and recall

>>> metrics.f_measure(refsets['pos'], testsets['pos'])
0.7827476038338657
>>> metrics.f_measure(refsets['neg'], testsets['neg'])
0.6363636363636364

--------------------------------------------------------------------------------
/collocations.txt:
--------------------------------------------------------------------------------
=============================
Discovering Word Collocations
=============================

>>> from nltk.corpus import webtext
>>> from nltk.collocations import BigramCollocationFinder
>>> from nltk.metrics import BigramAssocMeasures
>>> words = [w.lower() for w in webtext.words('grail.txt')]
>>> bcf = BigramCollocationFinder.from_words(words)
>>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
[("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')]

>>> from nltk.corpus import stopwords
>>> stopset = set(stopwords.words('english'))
>>> filter_stops = lambda w: len(w) < 3 or w in stopset
>>> bcf.apply_word_filter(filter_stops)
>>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
[('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')]

>>> from nltk.collocations import TrigramCollocationFinder
>>> from nltk.metrics import TrigramAssocMeasures
>>> words = [w.lower() for w in webtext.words('singles.txt')]
>>> tcf = TrigramCollocationFinder.from_words(words)
>>> tcf.apply_word_filter(filter_stops)
>>> tcf.apply_freq_filter(3)
>>> tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
[('long', 'term', 'relationship')]

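# likelihood_ratio is just one of the provided association measures; pmi and
# chi_sq are others, and each one ranks the candidates differently; a small
# sketch (results aren't shown because they depend on the measure and on the
# filters already applied; pmi_top and chi_top are illustrative names)

>>> bcf.apply_freq_filter(3)
>>> pmi_top = bcf.nbest(BigramAssocMeasures.pmi, 4)
>>> chi_top = bcf.nbest(BigramAssocMeasures.chi_sq, 4)
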
--------------------------------------------------------------------------------
/explore_nltk.py:
--------------------------------------------------------------------------------
import collections, itertools, string
from nltk import collocations, probability, stem
from nltk.corpus import stopwords, wordnet
from nltk.tag.util import untag
from nltk.util import bigrams

stopset = set(stopwords.words('english')) | set(string.punctuation)
stemmer = stem.PorterStemmer()

def count_stems(corpus):
    fd = probability.FreqDist()

    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue
        fd.inc(stemmer.stem(w))

    return fd

def count_hypernyms(corpus):
    fd = probability.FreqDist()

    for word in corpus.words():
        w = word.lower()
        if w in stopset: continue

        for syn in wordnet.synsets(w):
            if syn.pos != 'n': continue

            for path in syn.hypernym_paths():
                for hyp in path:
                    fd.inc(hyp.name)

    return fd

def count_stemmed_bigram_collocations(corpus, min_freq=3):
    stems = (stemmer.stem(w.lower()) for w in corpus.words())
    finder = collocations.BigramCollocationFinder.from_words(stems)
    finder.apply_word_filter(lambda w: w in stopset)
    finder.apply_freq_filter(min_freq)
    return finder

def count_tag_words(corpus, tagger):
    cfd = probability.ConditionalFreqDist()

    for sent in corpus.sents():
        for word, tag in tagger.tag(sent):
            w = word.lower()
            if w in stopset: continue
            cfd[tag].inc(w)

    return cfd

def count_phrases(corpus, tagger, chunker):
    cfd = probability.ConditionalFreqDist()

    for sent in corpus.sents():
        tree = chunker.parse(tagger.tag(sent))

        for sub in tree.subtrees():
            if sub.node == 'S': continue
            words = untag(sub.leaves())
            if len(words) >= 2: cfd[sub.node].inc(' '.join(words))

    return cfd

def classify_paras(paras, classifier):
    d = collections.defaultdict(list)

    for para in paras:
        words = [w.lower() for w in itertools.chain(*para)]
        feats = dict([(w, True) for w in words + bigrams(words)])
        label = classifier.classify(feats)
        d[label].append(' '.join(words))

    return d
--------------------------------------------------------------------------------
/movie_reviews_classifier.txt:
--------------------------------------------------------------------------------
>>> import nltk.data
>>> classifier = nltk.data.load('classifiers/movie_reviews_NaiveBayes.pickle')
>>> from nltk import tokenize
>>> words = tokenize.word_tokenize("that was a terrible movie")
>>> feats = dict([(word, True) for word in words])
>>> feats
{'a': True, 'movie': True, 'was': True, 'terrible': True, 'that': True}
>>> classifier.classify(feats)
'neg'
>>> probs = classifier.prob_classify(feats)
>>> probs.prob('neg')
0.789541654729651
>>> probs.prob('pos')
0.2104583452703487
>>> classifier.show_most_informative_features()
Most Informative Features
    avoids = True pos : neg = 13.0 : 1.0
    astounding = True pos : neg = 12.3 : 1.0
    slip = True pos : neg = 11.7 : 1.0
    outstanding = True pos : neg = 11.5 : 1.0
    ludicrous = True neg : pos = 11.0 : 1.0
    insulting = True neg : pos = 11.0 : 1.0
    fascination = True pos : neg = 11.0 : 1.0
    3000 = True neg : pos = 11.0 : 1.0
    sucks = True neg : pos = 10.6 : 1.0
    hudson = True neg : pos = 10.3 : 1.0

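# the probability distribution also knows which label is most likely; given
# the probabilities shown above, max() has to agree with classify()

>>> probs.max()
'neg'
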
--------------------------------------------------------------------------------
/stemming.txt:
--------------------------------------------------------------------------------
==============
Stemming Words
==============

>>> from nltk import stem
>>> stemmer = stem.PorterStemmer()
>>> stemmer.stem('cooking')
'cook'
>>> stemmer.stem('cookery')
'cookeri'

>>> stemmer = stem.LancasterStemmer()
>>> stemmer.stem('cooking')
'cook'
>>> stemmer.stem('cookery')
'cookery'

# stemming is essentially common suffix removal

>>> stemmer = stem.RegexpStemmer('ing')
>>> stemmer.stem('cooking')
'cook'
>>> stemmer.stem('cookery')
'cookery'
>>> stemmer.stem('ingleside')
'leside'

>>> stem.SnowballStemmer.languages
('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
>>> spanish_stemmer = stem.SnowballStemmer('spanish')
>>> spanish_stemmer.stem('hola')
u'hol'

# SnowballStemmer('english') is an updated & improved Porter stemmer

==============================
Lemmatising Words with WordNet
==============================

>>> lemmatizer = stem.WordNetLemmatizer()
>>> lemmatizer.lemmatize('cooking')
'cooking'
>>> lemmatizer.lemmatize('cooking', pos='v')
'cook'
>>> lemmatizer.lemmatize('cookbooks')
'cookbook'

# lemmatizing preserves meaning and lemmas are real words, while stems may not be

>>> stemmer = stem.PorterStemmer()
>>> stemmer.stem('believes')
'believ'
>>> lemmatizer.lemmatize('believes')
'belief'

>>> stemmer.stem('buses')
'buse'
>>> lemmatizer.lemmatize('buses')
'bus'
>>> stemmer.stem('bus')
'bu'

# both are lossy compression, and can be useful for fuzzy matching

--------------------------------------------------------------------------------
/tagger_training.txt:
--------------------------------------------------------------------------------
=========================
Treebank Tagged Sentences
=========================

>>> from nltk.corpus import treebank
>>> treebank.tagged_sents()[0]
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]

>>> import nltk.data
>>> from nltk import tag
>>> tagger = nltk.data.load(tag._POS_TAGGER)
>>> sent = treebank.sents()[0]
>>> sent
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']

>>> tagger.tag(sent)
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]

>>> tagger.evaluate(treebank.tagged_sents())
0.9956891414041082

# the default tagger was trained on treebank, so this score is inflated and doesn't reflect real-world performance

>>> simple_sents = treebank.tagged_sents(simplify_tags=True)
>>> simple_sents[0]
[('Pierre', 'NP'), ('Vinken', 'NP'), (',', ','), ('61', 'NUM'), ('years', 'N'), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), ('the', 'DET'), ('board', 'N'), ('as', 'P'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM'), ('.', '.')]

===================================
Training Sequential Backoff Taggers
===================================

>>> default = tag.DefaultTagger('N')
>>> default.evaluate(simple_sents)
0.19083992212642537

>>> u = tag.UnigramTagger(simple_sents, backoff=default)
>>> u.tag(sent)
[('Pierre', 'NP'), ('Vinken', 'NP'), (',', ','), ('61', 'NUM'), ('years', 'N'), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), ('the', 'DET'), ('board', 'N'), ('as', 'P'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM'), ('.', '.')]
>>> u.evaluate(simple_sents)
0.9656621240414796

# these taggers are evaluated on their own training data, so the numbers won't reflect real-world performance
# for honest accuracy measurements, separate simple_sents into training & testing lists

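# a minimal sketch of that held-out split (the 3/4 cutoff mirrors the
# classifier training doctest; the held-out accuracy isn't shown because it
# depends on the split, but expect it to be noticeably lower than the
# self-evaluation numbers above; u2 and the *_sents names are illustrative)

>>> cutoff = len(simple_sents) * 3 / 4
>>> train_sents = simple_sents[:cutoff]
>>> test_sents = simple_sents[cutoff:]
>>> u2 = tag.UnigramTagger(train_sents, backoff=default)
>>> held_out_accuracy = u2.evaluate(test_sents)
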
>>> ub = tag.BigramTagger(simple_sents, backoff=u)
>>> ub.evaluate(simple_sents)
0.9861635345067344

>>> ubt = tag.TrigramTagger(simple_sents, backoff=ub)
>>> ubt.evaluate(simple_sents)
0.991914656919226

===============
Saving a Tagger
===============

>>> import os, os.path, pickle
>>> path = os.path.expanduser('~/nltk_data/taggers/')
>>> os.makedirs(path)
>>> f = open(os.path.join(path, 'ubt.pickle'), 'wb')
>>> pickle.dump(ubt, f)
>>> f.close()

==================================
Training a Classifier Based Tagger
==================================

>>> c = tag.ClassifierBasedPOSTagger(train=simple_sents)
>>> c.evaluate(simple_sents)
0.9773530930907068

# a classifier-based tagger is also much slower than sequential backoff taggers

--------------------------------------------------------------------------------
/tagging.txt:
--------------------------------------------------------------------------------
>>> from nltk import tokenize
>>> sent = tokenize.word_tokenize("Today you'll be learning NLTK.")
>>> from nltk import tag
>>> tag.pos_tag(sent)
[('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), ('NLTK', 'NNP'), ('.', '.')]

>>> import nltk.data
>>> tagger = nltk.data.load(tag._POS_TAGGER)
>>> tagger.tag(sent)
[('Today', 'NN'), ('you', 'PRP'), ("'ll", 'MD'), ('be', 'VB'), ('learning', 'VBG'), ('NLTK', 'NNP'), ('.', '.')]

--------------------------------------------------------------------------------
/tokenization.txt:
--------------------------------------------------------------------------------
=====================
Sentence Tokenization
=====================

>>> para = "Hello. My name is Jacob. Today you'll be learning NLTK."
>>> from nltk import tokenize
>>> tokenize.sent_tokenize(para)
['Hello.', 'My name is Jacob.', "Today you'll be learning NLTK."]

>>> import nltk.data
>>> tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
>>> sents = tokenizer.tokenize(para)
>>> sents
['Hello.', 'My name is Jacob.', "Today you'll be learning NLTK."]

=================
Word Tokenization
=================

>>> sent = sents[2]
>>> tokenize.word_tokenize(sent)
['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']

>>> tokenize.wordpunct_tokenize(sent)
['Today', 'you', "'", 'll', 'be', 'learning', 'NLTK', '.']

>>> from nltk.tokenize import WordPunctTokenizer
>>> tokenizer = WordPunctTokenizer()
>>> tokenizer.tokenize(sent)
['Today', 'you', "'", 'll', 'be', 'learning', 'NLTK', '.']

>>> tokenizer = tokenize.PunktWordTokenizer()
>>> tokenizer.tokenize(sent)
['Today', 'you', "'ll", 'be', 'learning', 'NLTK.']

>>> tokenizer = tokenize.SpaceTokenizer()
>>> tokenizer.tokenize(sent)
['Today', "you'll", 'be', 'learning', 'NLTK.']


Choosing a Word Tokenizer
-------------------------
Your choice of word tokenizer depends on further steps down the pipeline.
There's no one right answer; it's context- and pipeline-dependent.
Do you need a normalized/canonical form?
How much does punctuation matter, and in what way?
What does the pos tagger and/or classifier expect?
Are you doing transformations?
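
Whichever word tokenizer you choose, sentence and word tokenization usually
compose into a single pipeline step. A minimal sketch using the default
tokenizers (the nested-list output shown assumes the standard punkt and
treebank tokenizers, and could differ slightly across NLTK versions):

>>> [tokenize.word_tokenize(s) for s in sents]
[['Hello', '.'], ['My', 'name', 'is', 'Jacob', '.'], ['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']]
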

==========================================
Tokenizing Words using Regular Expressions
==========================================

>>> from nltk.tokenize import RegexpTokenizer
>>> tokenizer = RegexpTokenizer("[\w']+")
>>> tokenizer.tokenize("Can't is a contraction.")
["Can't", 'is', 'a', 'contraction']

>>> from nltk.tokenize import regexp_tokenize
>>> regexp_tokenize("Can't is a contraction.", "[\w']+")
["Can't", 'is', 'a', 'contraction']

>>> tokenizer = RegexpTokenizer('\s+', gaps=True)
>>> tokenizer.tokenize("Can't is a contraction.")
["Can't", 'is', 'a', 'contraction.']

--------------------------------------------------------------------------------
/translation.txt:
--------------------------------------------------------------------------------
===============================
Translating Text with Babelfish
===============================

>>> from nltk.misc import babelfish
>>> babelfish.translate('cookbook', 'english', 'spanish')
'libro de cocina'
>>> babelfish.translate('libro de cocina', 'spanish', 'english')
'kitchen book'
>>> babelfish.translate('cookbook', 'english', 'german')
'Kochbuch'
>>> babelfish.translate('kochbuch', 'german', 'english')
'cook book'

>>> for text in babelfish.babelize('cookbook', 'english', 'spanish'):
...     print text
...
cookbook
libro de cocina
kitchen book
libro de la cocina
book of the kitchen

>>> babelfish.available_languages
['Portuguese', 'Chinese', 'German', 'Japanese', 'French', 'Spanish', 'Russian', 'Greek', 'English', 'Korean', 'Italian']

--------------------------------------------------------------------------------
/wordnet.txt:
--------------------------------------------------------------------------------
=========================================
Looking up a Synset for a Word in WordNet
=========================================

>>> from nltk.corpus import wordnet
>>> syn = wordnet.synsets('cookbook')[0]
>>> syn.name
'cookbook.n.01'
>>> syn.definition
'a book of recipes and cooking directions'

>>> wordnet.synset('cookbook.n.01')
Synset('cookbook.n.01')

>>> syn.examples
['cooking can be a great art', 'people are needed who have experience in cookery', 'he left the preparation of meals to his wife']

>>> syn.hypernyms()
[Synset('reference_book.n.01')]
>>> syn.hypernyms()[0].hyponyms()
[Synset('encyclopedia.n.01'), Synset('directory.n.01'), Synset('source_book.n.01'), Synset('handbook.n.01'), Synset('instruction_book.n.01'), Synset('cookbook.n.01'), Synset('annual.n.02'), Synset('atlas.n.02'), Synset('wordbook.n.01')]
>>> syn.root_hypernyms()
[Synset('entity.n.01')]

>>> syn.hypernym_paths()
[[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('creation.n.02'), Synset('product.n.02'), Synset('work.n.02'), Synset('publication.n.01'), Synset('book.n.01'), Synset('reference_book.n.01'), Synset('cookbook.n.01')]]

>>> syn.pos
'n'

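# synsets() can also be restricted to a single part of speech, which helps
# when a word has both noun and verb senses; a minimal sketch (the result
# isn't shown because the number of matching synsets depends on the installed
# WordNet version; cooking_verbs is an illustrative name)

>>> cooking_verbs = wordnet.synsets('cooking', pos=wordnet.VERB)
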
=========================================
Looking up Lemmas and Synonyms in WordNet
=========================================

>>> lemmas = syn.lemmas
>>> len(lemmas)
2
>>> lemmas[0].name
'cookbook'
>>> lemmas[1].name
'cookery_book'
>>> lemmas[0].synset == lemmas[1].synset
True

>>> gn2 = wordnet.synset('good.n.02')
>>> gn2.definition
'moral excellence or admirableness'
>>> evil = gn2.lemmas[0].antonyms()[0]
>>> evil.name
'evil'
>>> evil.synset.definition
'the quality of being morally wrong in principle or practice'
>>> ga1 = wordnet.synset('good.a.01')
>>> ga1.definition
'having desirable or positive qualities especially those suitable for a thing specified'
>>> bad = ga1.lemmas[0].antonyms()[0]
>>> bad.name
'bad'
>>> bad.synset.definition
'having undesirable or negative qualities'

=====================================
Calculating WordNet Synset Similarity
=====================================

>>> ib = wordnet.synset('instruction_book.n.01')
>>> syn.wup_similarity(ib)
0.9166666666666666

>>> ref = syn.hypernyms()[0]
>>> syn.shortest_path_distance(ref)
1
>>> ib.shortest_path_distance(ref)
1
>>> syn.shortest_path_distance(ib)
2

>>> dog = wordnet.synsets('dog')[0]
>>> dog.wup_similarity(syn)
0.38095238095238093

>>> dog.common_hypernyms(syn)
[Synset('object.n.01'), Synset('whole.n.02'), Synset('physical_entity.n.01'), Synset('entity.n.01')]

>>> cook = wordnet.synset('cook.v.01')
>>> bake = wordnet.synset('bake.v.02')
>>> cook.wup_similarity(bake)
0.6666666666666666

>>> syn.path_similarity(ib)
0.3333333333333333
>>> syn.path_similarity(dog)
0.07142857142857142
>>> syn.lch_similarity(ib)
2.538973871058276
>>> syn.lch_similarity(dog)
0.9985288301111273

--------------------------------------------------------------------------------