├── Readme.md ├── Chapter 6 ├── ch6_1.py ├── ch6_10.py ├── ch6_11.py ├── ch6_8.py ├── ch6_9.py ├── ch6_16.py ├── ch6_15.py ├── ch6_17.py ├── ch6_2.py ├── ch6_5.py ├── ch6_7.py ├── ch6_12.py ├── ch6_3.py ├── ch6_4.py ├── ch6_13.py ├── ch6_6.py ├── ch6_14.py └── ch6_18.py ├── Chapter 4 ├── ch4_2.py ├── ch4_3.py ├── ch4_1.py ├── ch4_4.py ├── ch4_12.py ├── ch4_7.py ├── ch4_11.py ├── ch4_14.py ├── ch4_5.py ├── ch4_15.py ├── ch4_10.py ├── ch4_18.py ├── ch4_8.py ├── ch4_30.py ├── ch4_13.py ├── ch4_16.py ├── ch4_17.py ├── ch4_23.py ├── ch4_22.py ├── ch4_27.py ├── ch4_24.py ├── ch4_25.py ├── ch4_6.py ├── ch4_9.py ├── ch4_19.py ├── ch4_20.py ├── ch4_21.py ├── ch4_28.py ├── ch4_29.py └── ch4_26.py ├── Chapter 1 ├── ch1_25.py ├── ch1_23.py ├── ch1_7.py ├── ch1_30.py ├── ch1_35.py ├── ch1_36.py ├── ch1_4.py ├── ch1_34.py ├── ch1_8.py ├── ch1_5.py ├── ch1_1.py ├── ch1_9.py ├── ch1_11.py ├── ch1_10.py ├── ch1_17.py ├── ch1_13.py ├── ch1_14.py ├── ch1_31.py ├── ch1_6.py ├── ch1_20.py ├── ch1_18.py ├── ch1_37.py ├── ch1_12.py ├── ch1_24.py ├── ch1_29.py ├── ch1_2.py ├── ch1_27.py ├── ch1_21.py ├── ch1_15.py ├── ch1_28.py ├── ch1_19.py ├── ch1_16.py ├── ch1_26.py ├── ch1_33.py ├── ch1_3.py └── ch1_22.py ├── Chapter 5 ├── ch5_2.py ├── ch5_8.py ├── ch5_21.py ├── ch5_22.py ├── ch5_5.py ├── ch5_6.py ├── ch5_3.py ├── ch5_23.py ├── ch5_20.py ├── ch5_4.py ├── ch5_9.py ├── ch5_1.py ├── ch5_13.py ├── ch5_18.py ├── ch5_10.py ├── ch5_12.py ├── ch5_11.py ├── ch5_17.py ├── ch5_16.py ├── ch5_14.py ├── ch5_15.py ├── ch5_19.py └── ch5_7.py ├── SoftwareHardwarelist.pdf ├── Chapter 10 ├── ch10_10.py ├── ch10_5.py ├── ch10_6.py ├── ch10_1.py ├── ch10_8.py ├── ch10_2.py ├── ch10_4.py ├── ch10_9.py ├── ch10_3.py └── ch10_7.py ├── __pycache__ ├── replacers.cpython-34.pyc └── replacers.py ├── Chapter 9 ├── ch9_4.py ├── ch9_1.py ├── ch9_2.py ├── ch9_3.py └── ch9_5.py ├── Chapter 3 ├── ch3_2.py ├── ch3_1.py ├── ch3_3.py ├── ch3_5.py ├── ch3_4.py └── ch3_6.py ├── Chapter 2 ├── ch2_1.py ├── ch2_2.py ├── ch2_6.py ├── ch2_8.py ├── ch2_3.py ├── ch2_7.py ├── ch2_5.py ├── ch2_4.py ├── ch2_9.py └── ch2_10.py ├── Chapter 8 └── ch8_1.py ├── .gitattributes ├── .gitignore └── Chapter 7 └── ch7_1.py /Readme.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter 6/ch6_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.boolean_ops() 3 | -------------------------------------------------------------------------------- /Chapter 6/ch6_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.tag.hmm.demo_pos() 3 | -------------------------------------------------------------------------------- /Chapter 4/ch4_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('NNS')) 3 | -------------------------------------------------------------------------------- /Chapter 4/ch4_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('VB.*')) 3 | -------------------------------------------------------------------------------- /Chapter 1/ch1_25.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | print(stopwords.fileids()) 3 | 
-------------------------------------------------------------------------------- /Chapter 1/ch1_23.py: -------------------------------------------------------------------------------- 1 | text='HARdWork IS KEy to SUCCESS' 2 | print(text.lower()) 3 | print(text.upper()) 4 | -------------------------------------------------------------------------------- /Chapter 1/ch1_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize(" Don't hesitate to ask questions") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | print(treebank.parsed_sents('wsj_0007.mrg')[2]) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | print(gram1) 4 | 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text1=nltk.word_tokenize("It is a pleasant day today") 3 | print(nltk.pos_tag(text1)) 4 | -------------------------------------------------------------------------------- /Chapter 4/ch4_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize("I cannot bear the pain of bear") 3 | print(nltk.pos_tag(text)) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(5, print_times=False, trace=1,sent='John saw a dog', numparses=2) 3 | -------------------------------------------------------------------------------- /Chapter 5/ch5_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(2, print_times=False, trace=1,sent='John saw a dog', numparses=1) 3 | -------------------------------------------------------------------------------- /Chapter 4/ch4_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import untag 3 | print(untag([('beautiful', 'NN'), ('morning', 'NN')])) 4 | 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedtok = ('bear', 'NN') 3 | from nltk.tag.util import tuple2str 4 | print(tuple2str(taggedtok)) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('happy')) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_35.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X=set([10,20,30,40]) 4 | Y=set([20,30,60]) 5 | print(jaccard_distance(X,Y)) 6 | 7 | 
-------------------------------------------------------------------------------- /SoftwareHardwarelist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Natural-Language-Processing-with-Python/HEAD/SoftwareHardwarelist.pdf -------------------------------------------------------------------------------- /Chapter 1/ch1_36.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X = set([10,20,30,40]) 4 | Y= set([30,50,70]) 5 | print(binary_distance(X, Y)) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 10/ch10_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem.lancaster import LancasterStemmer 3 | stri=LancasterStemmer() 4 | print(stri.stem('achievement')) 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import DefaultTagger 3 | tag = DefaultTagger('NN') 4 | print(tag.tag(['Beautiful', 'morning'])) 5 | 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import names 3 | print(len(names.words('male.txt'))) 4 | print(len(names.words('female.txt'))) 5 | 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedword=nltk.tag.str2tuple('bear/NN') 3 | print(taggedword) 4 | print(taggedword[0]) 5 | print(taggedword[1]) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 
29 .") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.probability import FreqDist 3 | from nltk.corpus import treebank 4 | fd = FreqDist() 5 | print(fd.items()) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 5/ch5_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import sinica_treebank 3 | print(sinica_treebank.sents()) 4 | print(sinica_treebank.parsed_sents()[27]) 5 | -------------------------------------------------------------------------------- /Chapter 6/ch6_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import pos_tag, word_tokenize 3 | print(pos_tag(word_tokenize("John and Smith are going to NY and Germany"))) 4 | -------------------------------------------------------------------------------- /Chapter 6/ch6_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import conll2002 3 | for documents in conll2002.chunked_sents('ned.train')[25]: 4 | print(documents) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_34.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | print(edit_distance("relate","relation")) 4 | print(edit_distance("suggestion","calculation")) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_8.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import WordPunctTokenizer 2 | tokenizer=WordPunctTokenizer() 3 | print(tokenizer.tokenize(" Don't hesitate to ask questions")) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1]) 4 | treebank_chunk.chunked_sents()[1].draw() 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import word_tokenize 3 | r=input("Please write a text") 4 | print("The length of text is",len(word_tokenize(r)),"words") 5 | -------------------------------------------------------------------------------- /__pycache__/replacers.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Natural-Language-Processing-with-Python/HEAD/__pycache__/replacers.cpython-34.pyc -------------------------------------------------------------------------------- /Chapter 4/ch4_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import words 3 | print(words.fileids()) 4 | print(len(words.words('en'))) 5 | print(len(words.words('en-basic'))) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 5/ch5_23.py: -------------------------------------------------------------------------------- 1 | 
import nltk 2 | nltk.parse.featurechart.demo(print_times=False,print_grammar=True,parser=nltk.parse.featurechart.FeatureChartParser,sent='I saw a dog') 3 | -------------------------------------------------------------------------------- /Chapter 6/ch6_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence = "I went to Greece to meet John"; 3 | tok=nltk.word_tokenize(sentence) 4 | pos_tag=nltk.pos_tag(tok) 5 | print(nltk.ne_chunk(pos_tag)) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=" Welcome readers. I hope you find it interesting. Please do reply." 3 | from nltk.tokenize import sent_tokenize 4 | print(sent_tokenize(text)) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer("[\w']+") 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Chapter 9/ch9_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr4 = expr_read('([],[(([x],[student(x)])->([y],[book(y),read(x,y)]))])') 4 | print(expr4.fol()) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer('\s+',gaps=True) 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Chapter 10/ch10_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | chunkparser = nltk.RegexpParser("") 3 | print(nltk.chunk.accuracy(chunkparser, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 4 | -------------------------------------------------------------------------------- /Chapter 3/ch3_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import LancasterStemmer 3 | stemmerlan=LancasterStemmer() 4 | print(stemmerlan.stem('working')) 5 | print(stemmerlan.stem('happiness')) 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tag={} 3 | print(tag) 4 | tag['beautiful']='ADJ' 5 | 6 | tag['boy']='N' 7 | tag['read']='V' 8 | tag['generously']='ADV' 9 | print(tag) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 9/ch9_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr1 = expr_read('([x], [John(x), Went(x)])') 4 | print(expr1) 5 | expr1.draw() 6 | print(expr1.fol()) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_10.py: -------------------------------------------------------------------------------- 1 | 
import nltk 2 | from nltk.tokenize import regexp_tokenize 3 | sent="Don't hesitate to ask questions" 4 | print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+')) 5 | 6 | 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 3 | from nltk.tokenize import SpaceTokenizer 4 | print(SpaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Chapter 3/ch3_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | stemmerporter = PorterStemmer() 4 | print(stemmerporter.stem('working')) 5 | print(stemmerporter.stem('happiness')) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import BlanklineTokenizer 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import WhitespaceTokenizer 4 | print(WhitespaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Chapter 2/ch2_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | unigrams=ngrams(alpino.words(),1) 6 | for i in unigrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 2/ch2_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | quadgrams=ngrams(alpino.words(),4) 6 | for i in quadgrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | unitag = UnigramTagger(model={'Vinken': 'NN'}) 5 | print(unitag.tag(treebank.sents()[0])) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_31.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import WordReplacer 3 | replacer=WordReplacer({'congrats':'congratulations'}) 4 | print(replacer.replace('congrats')) 5 | print(replacer.replace('maths')) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import TreebankWordTokenizer 3 | tokenizer = TreebankWordTokenizer() 4 | print(tokenizer.tokenize("Have a nice day. 
I hope you find the book interesting")) 5 | -------------------------------------------------------------------------------- /Chapter 9/ch9_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr2 = expr_read('([x,y], [John(x), Went(x),Sam(y),Meet(x,y)])') 4 | print(expr2) 5 | expr2.draw() 6 | print(expr2.fol()) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize.util import string_span_tokenize 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(list(string_span_tokenize(sent, " "))) 5 | -------------------------------------------------------------------------------- /Chapter 10/ch10_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | grammar = r"NP: {<[CDJNP].*>+}" 3 | cp = nltk.RegexpParser(grammar) 4 | print(nltk.chunk.accuracy(cp, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 5 | -------------------------------------------------------------------------------- /Chapter 6/ch6_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.lch_similarity(cat)) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 4 | print(list(WhitespaceTokenizer().span_tokenize(sent))) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_37.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | trigrams_tokens=ngrams(alpino.words(),3) 6 | for i in trigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 2/ch2_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | bigrams_tokens=ngrams(alpino.words(),2) 6 | for i in bigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.path_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.wup_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | print(input_expr('X | (Y -> Z)')) 4 | print(input_expr('-(X & Y)')) 5 | print(input_expr('X & Y')) 6 | print(input_expr('X <-> -- X')) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | sent=" She secured 90.56 % in class X . 
She is a meritorious student" 4 | capt = RegexpTokenizer('[A-Z]\w+') 5 | print(capt.tokenize(sent)) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | stops=set(stopwords.words('english')) 4 | words=["Don't", 'hesitate','to','ask','questions'] 5 | print([word for word in words if word not in stops]) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('lotttt')) 5 | print(replacer.replace('ohhhhh')) 6 | print(replacer.replace('ooohhhhh')) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tag = nltk.FreqDist(tag for (word, tag) in treebank_tagged) 5 | print(tag.most_common()) 6 | -------------------------------------------------------------------------------- /Chapter 9/ch9_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr3 = expr_read('([x], [John(x), eats(x)])+ ([y],[Sam(y),eats(y)])') 4 | print(expr3) 5 | print(expr3.simplify()) 6 | expr3.draw() 7 | 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') 3 | text=" Hello everyone. Hope all are fine and doing well. 
Hope you find the book interesting" 4 | print(tokenizer.tokenize(text)) 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | noun1=[("financial","NN"),("year","NN"),("account","NN"),("summary","NN")] 3 | gram="NP:{<NN>+}" 4 | find = nltk.RegexpParser(gram) 5 | print(find.parse(noun1)) 6 | x=find.parse(noun1) 7 | x.draw() 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RegexpReplacer 3 | replacer= RegexpReplacer() 4 | replacer.replace("Don't hesitate to ask questions") 5 | print(replacer.replace("She must've gone to the market but she didn't go")) 6 | -------------------------------------------------------------------------------- /Chapter 3/ch3_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import RegexpStemmer 3 | stemmerregexp=RegexpStemmer('ing') 4 | print(stemmerregexp.stem('working')) 5 | print(stemmerregexp.stem('happiness')) 6 | print(stemmerregexp.stem('pairing')) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_21.py: -------------------------------------------------------------------------------- 1 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 2 | from nltk.tokenize import word_tokenize 3 | tokenized_docs=[word_tokenize(doc) for doc in text] 4 | print(tokenized_docs) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent= "She secured 90.56 % in class X. She is a meritorious student" 3 | print(sent.split()) 4 | print(sent.split(' ')) 5 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 6 | print(sent.split('\n')) 7 | -------------------------------------------------------------------------------- /Chapter 4/ch4_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os,os.path 3 | create = os.path.expanduser('~/nltkdoc') 4 | if not os.path.exists(create): 5 | os.mkdir(create) 6 | print(os.path.exists(create)) 7 | import nltk.data 8 | print(create in nltk.data.path) 9 | 10 | -------------------------------------------------------------------------------- /Chapter 4/ch4_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.corpus import treebank 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | print(treebank.sents()[0]) 7 | print(unitagger.tag(treebank.sents()[0])) 8 | -------------------------------------------------------------------------------- /Chapter 5/ch5_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | tokens = "Jack told Bob to bring my cookie".split() 6 | grammar = toy_pcfg2 7 | print(grammar) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | testing = treebank.tagged_sents()[2000:] 7 | print(unitagger.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_23.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | affixtag = AffixTagger(training) 7 | print(affixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | from replacers import RegexpReplacer 4 | replacer=RegexpReplacer() 5 | word_tokenize("Don't hesitate to ask questions") 6 | print(word_tokenize(replacer.replace("Don't hesitate to ask questions"))) 7 | -------------------------------------------------------------------------------- /Chapter 3/ch3_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import WordNetLemmatizer 3 | lemmatizer_output=WordNetLemmatizer() 4 | print(lemmatizer_output.lemmatize('working')) 5 | print(lemmatizer_output.lemmatize('working',pos='v')) 6 | print(lemmatizer_output.lemmatize('works')) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | locations=[('Jaipur', 'IN', 'Rajasthan'),('Ajmer', 'IN', 'Rajasthan'),('Udaipur', 'IN', 'Rajasthan'),('Mumbai', 'IN', 'Maharashtra'),('Ahmedabad', 'IN', 'Gujrat')] 3 | q = [x1 for (x1, 
relation, x2) in locations if x2=='Rajasthan'] 4 | print(q) 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk import NgramTagger 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | quadgramtag = NgramTagger(4, training) 7 | print(quadgramtag.evaluate(testing)) 8 | 9 | -------------------------------------------------------------------------------- /Chapter 4/ch4_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import tnt 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | tnt_tagger=tnt.TnT() 7 | tnt_tagger.train(training) 8 | print(tnt_tagger.evaluate(testing)) 9 | -------------------------------------------------------------------------------- /Chapter 5/ch5_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1].leaves()) 4 | print(treebank_chunk.chunked_sents()[1].pos()) 5 | print(treebank_chunk.chunked_sents()[1].productions()) 6 | print(nltk.corpus.treebank.tagged_words()) 7 | -------------------------------------------------------------------------------- /Chapter 5/ch5_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 3 | sent = nltk.parse.util.extract_test_sentences(sent) 4 | print(len(sent)) 5 | testingsent=sent[25] 6 | print(testingsent[1]) 7 | print(testingsent[0]) 8 | sent=testingsent[0] 9 | -------------------------------------------------------------------------------- /Chapter 10/ch10_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sent=brown.sents(categories='news') 5 | unigram_sent=nltk.UnigramTagger(sentences) 6 | print(unigram_sent.tag(sent[2008])) 7 | print(unigram_sent.evaluate(sentences)) 8 | -------------------------------------------------------------------------------- /Chapter 3/ch3_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import SnowballStemmer 3 | print(SnowballStemmer.languages) 4 | spanishstemmer=SnowballStemmer('spanish') 5 | print(spanishstemmer.stem('comiendo')) 6 | frenchstemmer=SnowballStemmer('french') 7 | print(frenchstemmer.stem('manger')) 8 | 9 | -------------------------------------------------------------------------------- /Chapter 4/ch4_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtag = AffixTagger(training, affix_length=4) 7 | print(prefixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_25.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = 
treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | suffixtag = AffixTagger(training, affix_length=-3) 7 | print(suffixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 5/ch5_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import nltk.corpus 3 | print(str(nltk.corpus.treebank).replace('\\\\','/')) 4 | print(nltk.corpus.treebank.fileids()) 5 | from nltk.corpus import treebank 6 | print(treebank.words('wsj_0007.mrg')) 7 | print(treebank.tagged_words('wsj_0007.mrg')) 8 | 9 | -------------------------------------------------------------------------------- /Chapter 6/ch6_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentences1 = nltk.corpus.treebank.tagged_sents()[17] 3 | print(nltk.ne_chunk(sentences1, binary=True)) 4 | sentences2 = nltk.corpus.treebank.tagged_sents()[7] 5 | print(nltk.ne_chunk(sentences2, binary=True)) 6 | print(nltk.ne_chunk(sentences2)) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | from nltk.tokenize.util import spans_to_relative 4 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 5 | print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))) 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence='''The/DT sacred/VBN Ganga/NNP flows/VBZ in/IN this/DT region/NN ./. This/DT is/VBZ a/DT pilgrimage/NN ./. People/NNP from/IN all/DT over/IN the/DT country/NN visit/NN this/DT place/NN ./. ''' 3 | print([nltk.tag.str2tuple(t) for t in sentence.split()]) 4 | -------------------------------------------------------------------------------- /Chapter 2/ch2_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | sent=" Hello , please read the book thoroughly . If you have any queries , then don't hesitate to ask . There is no shortcut to success ." 
4 | n=5 5 | fivegrams=ngrams(sent.split(),n) 6 | for grams in fivegrams: 7 | print(grams) 8 | -------------------------------------------------------------------------------- /Chapter 9/ch9_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr5 = expr_read('([x,y],[ram(x),food(y),eats(x,y)])') 4 | expr6 = expr_read('([u,z],[PRO(u),coffee(z),drinks(u,z)])') 5 | expr7=expr5+expr6 6 | print(expr7.simplify()) 7 | print(expr7.simplify().resolve_anaphora()) 8 | -------------------------------------------------------------------------------- /Chapter 3/ch3_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | from nltk.stem import WordNetLemmatizer 4 | stemmer_output=PorterStemmer() 5 | print(stemmer_output.stem('happiness')) 6 | lemmatizer_output=WordNetLemmatizer() 7 | print(lemmatizer_output.lemmatize('happiness')) 8 | 9 | 10 | -------------------------------------------------------------------------------- /Chapter 6/ch6_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | from nltk.tag import UnigramTagger 4 | tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700]) 5 | sentence = ['John','and','Smith','went','to','NY','and','Germany'] 6 | for word, tag in tagger.tag(sentence): 7 | print(word,'->',tag) 8 | -------------------------------------------------------------------------------- /Chapter 10/ch10_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences = brown.tagged_sents(categories='news') 4 | sent = brown.sents(categories='news') 5 | pattern = [(r'(January)$','Jan')] 6 | regexpr_tagger = nltk.RegexpTagger(pattern) 7 | print(regexpr_tagger.tag(sent[3])) 8 | print(regexpr_tagger.evaluate(sentences)) 9 | 10 | -------------------------------------------------------------------------------- /Chapter 4/ch4_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tagpairs = nltk.bigrams(treebank_tagged) 5 | preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN'] 6 | freqdist = nltk.FreqDist(preceders_noun) 7 | print([tag for (tag, _) in freqdist.most_common()]) 8 | -------------------------------------------------------------------------------- /Chapter 2/ch2_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import BigramCollocationFinder 3 | from nltk.corpus import webtext 4 | from nltk.metrics import BigramAssocMeasures 5 | tokens=[t.lower() for t in webtext.words('grail.txt')] 6 | words=BigramCollocationFinder.from_words(tokens) 7 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 8 | -------------------------------------------------------------------------------- /Chapter 8/ch8_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def not_stopwords(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | content = [w for w in text if w.lower() not in stopwords] 7 | return len(content) / len(text) 8 | 
print(not_stopwords(nltk.corpus.reuters.words())) 9 | -------------------------------------------------------------------------------- /Chapter 10/ch10_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | print(sz) 6 | training_sents = sentences[:sz] 7 | testing_sents=sentences[sz:] 8 | unigram_tagger=nltk.UnigramTagger(training_sents) 9 | print(unigram_tagger.evaluate(testing_sents)) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 2/ch2_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | import nltk 4 | text="Hello how are you doing ? I hope you find the book interesting" 5 | tokens=nltk.wordpunct_tokenize(text) 6 | fourgrams=nltk.collocations.QuadgramCollocationFinder.from_words(tokens) 7 | for fourgram, freq in fourgrams.ngram_fd.items(): 8 | print(fourgram,freq) 9 | -------------------------------------------------------------------------------- /Chapter 4/ch4_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.tag import DefaultTagger 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tag1=DefaultTagger('NN') 8 | tag2=UnigramTagger(training,backoff=tag1) 9 | print(tag2.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Chapter 6/ch6_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | value = nltk.Valuation([('X', True), ('Y', False), ('Z', True)]) 3 | print(value['Z']) 4 | domain = set() 5 | v = nltk.Assignment(domain) 6 | u = nltk.Model(domain, value) 7 | print(u.evaluate('(X & Y)', v)) 8 | print(u.evaluate('-(X & Y)', v)) 9 | print(u.evaluate('(X & Z)', v)) 10 | print(u.evaluate('(X | Y)', v)) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 1/ch1_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import BlanklineTokenizer 3 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | from nltk.tokenize import LineTokenizer 6 | print(LineTokenizer(blanklines='keep').tokenize(sent)) 7 | print(LineTokenizer(blanklines='discard').tokenize(sent)) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger 3 | from nltk.corpus import treebank 4 | training_1= treebank.tagged_sents()[:7000] 5 | bigramtagger=BigramTagger(training_1) 6 | print(treebank.sents()[0]) 7 | print(bigramtagger.tag(treebank.sents()[0])) 8 | testing_1 = treebank.tagged_sents()[2000:] 9 | print(bigramtagger.evaluate(testing_1)) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 4/ch4_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger, TrigramTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | bigramtag = BigramTagger(training) 7 | print(bigramtag.evaluate(testing)) 8 | trigramtag = TrigramTagger(training) 9 | print(trigramtag.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Chapter 2/ch2_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | text1="Hardwork is the key to success. Never give up!" 4 | word = nltk.wordpunct_tokenize(text1) 5 | finder = BigramCollocationFinder.from_words(word) 6 | bigram_measures = nltk.collocations.BigramAssocMeasures() 7 | value = finder.score_ngrams(bigram_measures.raw_freq) 8 | print(sorted(bigram for bigram, score in value)) 9 | -------------------------------------------------------------------------------- /Chapter 6/ch6_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | expression = input_expr('run(marcus)', type_check=True) 4 | print(expression.argument) 5 | print(expression.argument.type) 6 | print(expression.function) 7 | print(expression.function.type) 8 | sign = {'run': '<e, t>'} 9 | expression = input_expr('run(marcus)', signature=sign) 10 | print(expression.function.type) 11 | -------------------------------------------------------------------------------- /Chapter 10/ch10_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | s0=nltk.DefaultTagger('NNP') 8 | s1=nltk.UnigramTagger(training_sents,backoff=s0) 9 | s2=nltk.BigramTagger(training_sents,backoff=s1) 10 | print(s2.evaluate(testing_sents)) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 4/ch4_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import DefaultTagger 3 | from nltk.tag import tnt 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tnt_tagger=tnt.TnT() 8 | unknown=DefaultTagger('NN') 9 | tagger_tnt=tnt.TnT(unk=unknown,Trained=True) 10 | 
tnt_tagger.train(training) 11 | print(tnt_tagger.evaluate(testing)) 12 | -------------------------------------------------------------------------------- /Chapter 1/ch1_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def para_fraction(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | para = [w for w in text if w.lower() not in stopwords] 7 | return len(para) / len(text) 8 | print(para_fraction(nltk.corpus.reuters.words())) 9 | print(para_fraction(nltk.corpus.inaugural.words())) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 1/ch1_33.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from nltk.metrics import * 3 | training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split() 4 | testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split() 5 | print(accuracy(training,testing)) 6 | trainset=set(training) 7 | testset=set(testing) 8 | precision(trainset,testset) 9 | print(recall(trainset,testset)) 10 | print(f_measure(trainset,testset)) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 4/ch4_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "N"), ("became", "VBD"), ("leader", "NN")] 3 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "NN"), ("became", "VBD"), ("leader", "NN")] 4 | grammar = "NP: {
?*?*}" 5 | find = nltk.RegexpParser(grammar) 6 | res = find.parse(sent) 7 | print(res) 8 | res.draw() 9 | -------------------------------------------------------------------------------- /Chapter 5/ch5_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser4 = nltk.parse.TopDownChartParser(gram1) 8 | chart4 = parser4.chart_parse(sent) 9 | print((chart4.num_edges())) 10 | print((len(list(chart4.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser9 = nltk.parse.EarleyChartParser(gram1) 8 | chart9 = parser9.chart_parse(sent) 9 | print((chart9.num_edges())) 10 | print((len(list(chart9.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser1 = nltk.parse.BottomUpChartParser(gram1) 8 | chart1 = parser1.chart_parse(sent) 9 | print((chart1.num_edges())) 10 | print((len(list(chart1.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 5/ch5_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser3 = nltk.parse.LeftCornerChartParser(gram1) 8 | chart3 = parser3.chart_parse(sent) 9 | print((chart3.num_edges())) 10 | print((len(list(chart3.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser2 = nltk.parse.BottomUpLeftCornerChartParser(gram1) 8 | chart2 = parser2.chart_parse(sent) 9 | print((chart2.num_edges())) 10 | print((len(list(chart2.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = 
nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser8 = nltk.parse.IncrementalTopDownChartParser(gram1) 8 | chart8 = parser8.chart_parse(sent) 9 | print((chart8.num_edges())) 10 | print((len(list(chart8.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 6/ch6_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | wn.synsets('cat') 5 | wn.synsets('cat', pos=wn.VERB) 6 | wn.synset('cat.n.01') 7 | print(wn.synset('cat.n.01').definition()) 8 | print(len(wn.synset('cat.n.01').examples())) 9 | print(wn.synset('cat.n.01').lemmas()) 10 | print([str(lemma.name()) for lemma in wn.synset('cat.n.01').lemmas()]) 11 | print(wn.lemma('cat.n.01.cat').synset()) 12 | 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /Chapter 5/ch5_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser7 = nltk.parse.IncrementalLeftCornerChartParser(gram1) 8 | chart7 = parser7.chart_parse(sent) 9 | print((chart7.num_edges())) 10 | print((len(list(chart7.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser5 = nltk.parse.IncrementalBottomUpChartParser(gram1) 8 | chart5 = parser5.chart_parse(sent) 9 | print((chart5.num_edges())) 10 | print((len(list(chart5.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 10/ch10_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | freqd = nltk.FreqDist(brown.words(categories='news')) 4 | cfreqd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news')) 5 | mostfreq_words = freqd.most_common(100) 6 | likelytags = dict((word, cfreqd[word].max()) for (word, _) in mostfreq_words) 7 | baselinetagger = nltk.UnigramTagger(model=likelytags) 8 | 9 | sent = brown.sents(categories='news')[3] 10 | print(baselinetagger.tag(sent)) 11 | 
-------------------------------------------------------------------------------- /Chapter 5/ch5_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser6 = nltk.parse.IncrementalBottomUpLeftCornerChartParser(gram1) 8 | chart6 = parser6.chart_parse(sent) 9 | print((chart6.num_edges())) 10 | print((len(list(chart6.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 10/ch10_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | bigram_tagger=nltk.UnigramTagger(training_sents) 8 | bigram_tagger=nltk.BigramTagger(training_sents) 9 | print(bigram_tagger.tag(sentences[2008])) 10 | un_sent=sentences[4203] 11 | print(bigram_tagger.tag(un_sent)) 12 | print(bigram_tagger.evaluate(testing_sents)) 13 | -------------------------------------------------------------------------------- /Chapter 6/ch6_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.data.show_cfg('grammars/book_grammars/sql1.fcfg') 3 | 4 | 5 | from nltk import load_parser 6 | test = load_parser('grammars/book_grammars/sql1.fcfg') 7 | q=" What cities are in Greece" 8 | t = list(test.parse(q.split())) 9 | ans = t[0].label()['SEM'] 10 | ans = [s for s in ans if s] 11 | q = ' '.join(ans) 12 | print(q) 13 | from nltk.sem import chat80 14 | r = chat80.sql_query('corpora/city_database/city.db', q) 15 | for p in r: 16 | print(p[0], end=" ") 17 | -------------------------------------------------------------------------------- /Chapter 2/ch2_4.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | from nltk.corpus import webtext 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | set = set(stopwords.words('english')) 6 | stops_filter = lambda w: len(w) < 3 or w in set 7 | tokens=[t.lower() for t in webtext.words('grail.txt')] 8 | words=BigramCollocationFinder.from_words(tokens) 9 | words.apply_word_filter(stops_filter) 10 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | gram2 = PCFG.fromstring(""" 6 | A -> B B [.3] | C B C [.7] 7 | B -> B D [.5] | C [.5] 8 | C -> 'a' [.1] | 'b' [0.9] 9 | D -> 'b' [1.0] 10 | """) 11 | prod1 = gram2.productions()[0] 12 | print(prod1) 13 | prod2 = gram2.productions()[1] 14 | print(prod2) 15 | print(prod2.lhs()) 16 | print(prod2.rhs()) 17 | print((prod2.prob())) 18 | print(gram2.start()) 19 | print(gram2.productions()) 20 | -------------------------------------------------------------------------------- /Chapter 6/ch6_14.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | print(sorted(wn.langs())) 5 | print(wn.synset('cat.n.01').lemma_names('ita')) 6 | print(sorted(wn.synset('cat.n.01').lemmas('dan'))) 7 | print(sorted(wn.synset('cat.n.01').lemmas('por'))) 8 | print(len(wordnet.all_lemma_names(pos='n', lang='jpn'))) 9 | cat = wn.synset('cat.n.01') 10 | print(cat.hypernyms()) 11 | print(cat.hyponyms()) 12 | print(cat.member_holonyms()) 13 | print(cat.root_hypernyms()) 14 | print(wn.synset('cat.n.01').lowest_common_hypernyms(wn.synset('dog.n.01'))) 15 | -------------------------------------------------------------------------------- /Chapter 1/ch1_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | french_tokenizer=nltk.data.load('tokenizers/punkt/french.pickle') 3 | print(french_tokenizer.tokenize('Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège franco-britanniquedeLevallois-Perret. Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage Levallois. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , d’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire')) 4 | -------------------------------------------------------------------------------- /Chapter 10/ch10_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | correct = nltk.chunk.tagstr2tree( 3 | "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]") 4 | print(correct.flatten()) 5 | grammar = r"NP: {<[CDJNP].*>+}" 6 | cp = nltk.RegexpParser(grammar) 7 | 8 | grammar = r"NP: {<[CDJNP].*>+}"  # assumed tag pattern (same as the parser above) 9 | chunk_parser = nltk.RegexpParser(grammar) 10 | tagged_tok = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")] 11 | chunkscore = nltk.chunk.ChunkScore() 12 | guessed = cp.parse(correct.flatten()) 13 | chunkscore.score(correct, guessed) 14 | print(chunkscore) 15 | -------------------------------------------------------------------------------- /Chapter 4/ch4_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtagger=AffixTagger(training,affix_length=4) 7 | prefixtagger3=AffixTagger(training,affix_length=3,backoff=prefixtagger) 8 | print(prefixtagger3.evaluate(testing)) 9 | suffixtagger3=AffixTagger(training,affix_length=-3,backoff=prefixtagger3) 10 | print(suffixtagger3.evaluate(testing)) 11 | suffixtagger4=AffixTagger(training,affix_length=-4,backoff=suffixtagger3) 12 | print(suffixtagger4.evaluate(testing)) 13 | 14 | -------------------------------------------------------------------------------- /Chapter 6/ch6_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | from nltk.corpus import wordnet_ic 5 | brown_ic = wordnet_ic.ic('ic-brown.dat') 6 | semcor_ic = wordnet_ic.ic('ic-semcor.dat') 7 | from nltk.corpus import genesis 8 | genesis_ic = wn.ic(genesis, False, 0.0) 9 | lion = wn.synset('lion.n.01') 10 | cat = 
wn.synset('cat.n.01') 11 | print(lion.res_similarity(cat, brown_ic)) 12 | print(lion.res_similarity(cat, genesis_ic)) 13 | print(lion.jcn_similarity(cat, brown_ic)) 14 | print(lion.jcn_similarity(cat, genesis_ic)) 15 | print(lion.lin_similarity(cat, semcor_ic)) 16 | 17 | 18 | -------------------------------------------------------------------------------- /Chapter 1/ch1_22.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 4 | from nltk.tokenize import word_tokenize 5 | tokenized_docs=[word_tokenize(doc) for doc in text] 6 | x=re.compile('[%s]' % re.escape(string.punctuation)) 7 | tokenized_docs_no_punctuation = [] 8 | for review in tokenized_docs: 9 | new_review = [] 10 | for token in review: 11 | new_token = x.sub(u'', token) 12 | if not new_token == u'': 13 | new_review.append(new_token) 14 | tokenized_docs_no_punctuation.append(new_review) 15 | print(tokenized_docs_no_punctuation) 16 | -------------------------------------------------------------------------------- /Chapter 2/ch2_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] 3 | print(len(cor)) 4 | from nltk.util import unique_list 5 | tag_set = unique_list(tag for sent in cor for (word,tag) in sent) 6 | print(len(tag_set)) 7 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 8 | print(len(symbols)) 9 | print(len(tag_set)) 10 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 11 | print(len(symbols)) 12 | trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) 13 | train_corpus = [] 14 | test_corpus = [] 15 | for i in range(len(cor)): 16 | if i % 10: 17 | train_corpus+=[cor[i]] 18 | else: 19 | test_corpus+=[cor[i]] 20 | print(len(train_corpus)) 21 | print(len(test_corpus)) 22 | 23 | 24 | -------------------------------------------------------------------------------- /Chapter 5/ch5_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import Nonterminal, nonterminals, Production, CFG 3 | nonterminal1 = Nonterminal('NP') 4 | nonterminal2 = Nonterminal('VP') 5 | nonterminal3 = Nonterminal('PP') 6 | print(nonterminal1.symbol()) 7 | print(nonterminal2.symbol()) 8 | print(nonterminal3.symbol()) 9 | print(nonterminal1==nonterminal2) 10 | print(nonterminal2==nonterminal3) 11 | print(nonterminal1==nonterminal3) 12 | S, NP, VP, PP = nonterminals('S, NP, VP, PP') 13 | N, V, P, DT = nonterminals('N, V, P, DT') 14 | production1 = Production(S, [NP, VP]) 15 | production2 = Production(NP, [DT, NP]) 16 | production3 = Production(VP, [V, NP,NP,PP]) 17 | print(production1.lhs()) 18 | print(production1.rhs()) 19 | print(production3.lhs()) 20 | print(production3.rhs()) 21 | print(production3 == Production(VP, [V,NP,NP,PP])) 22 | print(production2 == production3) 23 | 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # 
========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter 7/ch7_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)] 5 | random.shuffle(docs) 6 | all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words()) 7 | token_features = list(all_tokens.keys())[:2000] 8 | print(token_features[:100]) 9 | 10 | def doc_features(docs): 11 | doc_words = set(docs) 12 | features = {} 13 | for word in token_features: 14 | features['contains(%s)' % word] = (word in doc_words) 15 | return features 16 | 17 | print(doc_features(movie_reviews.words('pos/cv957_8737.txt'))) 18 | feature_sets = [(doc_features(d), c) for (d,c) in docs] 19 | train_sets, test_sets = feature_sets[100:], feature_sets[:100] 20 | classifiers = nltk.NaiveBayesClassifier.train(train_sets) 21 | print(nltk.classify.accuracy(classifiers, test_sets)) 22 | classifiers.show_most_informative_features(5) 23 | -------------------------------------------------------------------------------- /Chapter 2/ch2_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | corpus=u" hello how are you doing ? Hope you find the book interesting. 
".split() 3 | sentence=u"how are you doing".split() 4 | vocabulary=set(corpus) 5 | print(len(vocabulary)) 6 | cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus)) 7 | print([cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 8 | print([cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 9 | print([cfd[a].freq(b) for (a,b) in nltk.bigrams(sentence)]) 10 | print([1 + cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 11 | print([len(vocabulary) + cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 12 | print([1.0 * (1+cfd[a][b]) / (len(vocabulary)+cfd[a].N()) for (a,b) in nltk.bigrams(sentence)]) 13 | cpd_mle = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist, bins=len(vocabulary)) 14 | print([cpd_mle[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 15 | cpd_laplace = nltk.ConditionalProbDist(cfd, nltk.LaplaceProbDist, bins=len(vocabulary)) 16 | print([cpd_laplace[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 17 | 18 | -------------------------------------------------------------------------------- /__pycache__/replacers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from nltk.corpus import wordnet 4 | 5 | replacement_patterns = [ 6 | (r'won\'t', 'will not'), 7 | (r'can\'t', 'cannot'), 8 | (r'i\'m', 'i am'), 9 | (r'ain\'t', 'is not'), 10 | (r'(\w+)\'ll', '\g<1> will'), 11 | (r'(\w+)n\'t', '\g<1> not'), 12 | (r'(\w+)\'ve', '\g<1> have'), 13 | (r'(\w+)\'s', '\g<1> is'), 14 | (r'(\w+)\'re', '\g<1> are'), 15 | (r'(\w+)\'d', '\g<1> would') 16 | ] 17 | class RegexpReplacer(object): 18 | def __init__(self, patterns=replacement_patterns): 19 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in 20 | patterns] 21 | def replace(self, text): 22 | s = text 23 | for (pattern, repl) in self.patterns: 24 | (s, count) = re.subn(pattern, repl, s) 25 | return s 26 | 27 | class RepeatReplacer(object): 28 | def __init__(self): 29 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 30 | self.repl = r'\1\2\3' 31 | def replace(self, word): 32 | if wordnet.synsets(word): 33 | return word 34 | repl_word = self.repeat_regexp.sub(self.repl, word) 35 | if repl_word != word: 36 | return self.replace(repl_word) 37 | else: 38 | return repl_word 39 | 40 | class WordReplacer(object): 41 | def __init__(self, word_map): 42 | self.word_map = word_map 43 | def replace(self, word): 44 | return self.word_map.get(word, word) 45 | --------------------------------------------------------------------------------