├── Readme.md ├── Chapter 6 ├── ch6_1.py ├── ch6_10.py ├── ch6_11.py ├── ch6_8.py ├── ch6_9.py ├── ch6_16.py ├── ch6_15.py ├── ch6_17.py ├── ch6_2.py ├── ch6_5.py ├── ch6_7.py ├── ch6_12.py ├── ch6_3.py ├── ch6_4.py ├── ch6_13.py ├── ch6_6.py ├── ch6_14.py └── ch6_18.py ├── Chapter 4 ├── ch4_2.py ├── ch4_3.py ├── ch4_1.py ├── ch4_4.py ├── ch4_12.py ├── ch4_7.py ├── ch4_11.py ├── ch4_14.py ├── ch4_5.py ├── ch4_15.py ├── ch4_10.py ├── ch4_18.py ├── ch4_8.py ├── ch4_30.py ├── ch4_13.py ├── ch4_16.py ├── ch4_17.py ├── ch4_23.py ├── ch4_22.py ├── ch4_27.py ├── ch4_24.py ├── ch4_25.py ├── ch4_6.py ├── ch4_9.py ├── ch4_19.py ├── ch4_20.py ├── ch4_21.py ├── ch4_28.py ├── ch4_29.py └── ch4_26.py ├── Chapter 1 ├── ch1_25.py ├── ch1_23.py ├── ch1_7.py ├── ch1_30.py ├── ch1_35.py ├── ch1_36.py ├── ch1_4.py ├── ch1_34.py ├── ch1_8.py ├── ch1_5.py ├── ch1_1.py ├── ch1_9.py ├── ch1_11.py ├── ch1_10.py ├── ch1_17.py ├── ch1_13.py ├── ch1_14.py ├── ch1_31.py ├── ch1_6.py ├── ch1_20.py ├── ch1_18.py ├── ch1_37.py ├── ch1_12.py ├── ch1_24.py ├── ch1_29.py ├── ch1_2.py ├── ch1_27.py ├── ch1_21.py ├── ch1_15.py ├── ch1_28.py ├── ch1_19.py ├── ch1_16.py ├── ch1_26.py ├── ch1_33.py ├── ch1_3.py └── ch1_22.py ├── Chapter 5 ├── ch5_2.py ├── ch5_8.py ├── ch5_21.py ├── ch5_22.py ├── ch5_5.py ├── ch5_6.py ├── ch5_3.py ├── ch5_23.py ├── ch5_20.py ├── ch5_4.py ├── ch5_9.py ├── ch5_1.py ├── ch5_13.py ├── ch5_18.py ├── ch5_10.py ├── ch5_12.py ├── ch5_11.py ├── ch5_17.py ├── ch5_16.py ├── ch5_14.py ├── ch5_15.py ├── ch5_19.py └── ch5_7.py ├── SoftwareHardwarelist.pdf ├── Chapter 10 ├── ch10_10.py ├── ch10_5.py ├── ch10_6.py ├── ch10_1.py ├── ch10_8.py ├── ch10_2.py ├── ch10_4.py ├── ch10_9.py ├── ch10_3.py └── ch10_7.py ├── __pycache__ ├── replacers.cpython-34.pyc └── replacers.py ├── Chapter 9 ├── ch9_4.py ├── ch9_1.py ├── ch9_2.py ├── ch9_3.py └── ch9_5.py ├── Chapter 3 ├── ch3_2.py ├── ch3_1.py ├── ch3_3.py ├── ch3_5.py ├── ch3_4.py └── ch3_6.py ├── Chapter 2 ├── ch2_1.py ├── ch2_2.py ├── ch2_6.py ├── ch2_8.py ├── ch2_3.py ├── ch2_7.py ├── ch2_5.py ├── ch2_4.py ├── ch2_9.py └── ch2_10.py ├── Chapter 8 └── ch8_1.py ├── .gitattributes ├── .gitignore └── Chapter 7 └── ch7_1.py /Readme.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter 6/ch6_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.boolean_ops() 3 | -------------------------------------------------------------------------------- /Chapter 6/ch6_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.tag.hmm.demo_pos() 3 | -------------------------------------------------------------------------------- /Chapter 4/ch4_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('NNS')) 3 | -------------------------------------------------------------------------------- /Chapter 4/ch4_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('VB.*')) 3 | -------------------------------------------------------------------------------- /Chapter 1/ch1_25.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | print(stopwords.fileids()) 3 | 
-------------------------------------------------------------------------------- /Chapter 1/ch1_23.py: -------------------------------------------------------------------------------- 1 | text='HARdWork IS KEy to SUCCESS' 2 | print(text.lower()) 3 | print(text.upper()) 4 | -------------------------------------------------------------------------------- /Chapter 1/ch1_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize(" Don't hesitate to ask questions") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | print(treebank.parsed_sents('wsj_0007.mrg')[2]) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | print(gram1) 4 | 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text1=nltk.word_tokenize("It is a pleasant day today") 3 | print(nltk.pos_tag(text1)) 4 | -------------------------------------------------------------------------------- /Chapter 4/ch4_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize("I cannot bear the pain of bear") 3 | print(nltk.pos_tag(text)) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(5, print_times=False, trace=1,sent='John saw a dog', numparses=2) 3 | -------------------------------------------------------------------------------- /Chapter 5/ch5_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(2, print_times=False, trace=1,sent='John saw a dog', numparses=1) 3 | -------------------------------------------------------------------------------- /Chapter 4/ch4_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import untag 3 | print(untag([('beautiful', 'NN'), ('morning', 'NN')])) 4 | 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedtok = ('bear', 'NN') 3 | from nltk.tag.util import tuple2str 4 | print(tuple2str(taggedtok)) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('happy')) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_35.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X=set([10,20,30,40]) 4 | Y=set([20,30,60]) 5 | print(jaccard_distance(X,Y)) 6 | 7 | 
-------------------------------------------------------------------------------- /SoftwareHardwarelist.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Natural-Language-Processing-with-Python/HEAD/SoftwareHardwarelist.pdf -------------------------------------------------------------------------------- /Chapter 1/ch1_36.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X = set([10,20,30,40]) 4 | Y= set([30,50,70]) 5 | print(binary_distance(X, Y)) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 10/ch10_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem.lancaster import LancasterStemmer 3 | stri=LancasterStemmer() 4 | print(stri.stem('achievement')) 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import DefaultTagger 3 | tag = DefaultTagger('NN') 4 | print(tag.tag(['Beautiful', 'morning'])) 5 | 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import names 3 | print(len(names.words('male.txt'))) 4 | print(len(names.words('female.txt'))) 5 | 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedword=nltk.tag.str2tuple('bear/NN') 3 | print(taggedword) 4 | print(taggedword[0]) 5 | print(taggedword[1]) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 
29 .") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.probability import FreqDist 3 | from nltk.corpus import treebank 4 | fd = FreqDist() 5 | print(fd.items()) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 5/ch5_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import sinica_treebank 3 | print(sinica_treebank.sents()) 4 | print(sinica_treebank.parsed_sents()[27]) 5 | -------------------------------------------------------------------------------- /Chapter 6/ch6_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import pos_tag, word_tokenize 3 | print(pos_tag(word_tokenize("John and Smith are going to NY and Germany"))) 4 | -------------------------------------------------------------------------------- /Chapter 6/ch6_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import conll2002 3 | for documents in conll2002.chunked_sents('ned.train')[25]: 4 | print(documents) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_34.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | print(edit_distance("relate","relation")) 4 | print(edit_distance("suggestion","calculation")) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_8.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import WordPunctTokenizer 2 | tokenizer=WordPunctTokenizer() 3 | print(tokenizer.tokenize(" Don't hesitate to ask questions")) 4 | -------------------------------------------------------------------------------- /Chapter 5/ch5_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1]) 4 | treebank_chunk.chunked_sents()[1].draw() 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import word_tokenize 3 | r=input("Please write a text") 4 | print("The length of text is",len(word_tokenize(r)),"words") 5 | -------------------------------------------------------------------------------- /__pycache__/replacers.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-Natural-Language-Processing-with-Python/HEAD/__pycache__/replacers.cpython-34.pyc -------------------------------------------------------------------------------- /Chapter 4/ch4_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import words 3 | print(words.fileids()) 4 | print(len(words.words('en'))) 5 | print(len(words.words('en-basic'))) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 5/ch5_23.py: -------------------------------------------------------------------------------- 1 | 
import nltk 2 | nltk.parse.featurechart.demo(print_times=False,print_grammar=True,parser=nltk.parse.featurechart.FeatureChartParser,sent='I saw a dog') 3 | -------------------------------------------------------------------------------- /Chapter 6/ch6_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence = "I went to Greece to meet John"; 3 | tok=nltk.word_tokenize(sentence) 4 | pos_tag=nltk.pos_tag(tok) 5 | print(nltk.ne_chunk(pos_tag)) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=" Welcome readers. I hope you find it interesting. Please do reply." 3 | from nltk.tokenize import sent_tokenize 4 | print(sent_tokenize(text)) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer("[\w']+") 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Chapter 9/ch9_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr4 = expr_read('([],[(([x],[student(x)])->([y],[book(y),read(x,y)]))])') 4 | print(expr4.fol()) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer('\s+',gaps=True) 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Chapter 10/ch10_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | chunkparser = nltk.RegexpParser("") 3 | print(nltk.chunk.accuracy(chunkparser, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 4 | -------------------------------------------------------------------------------- /Chapter 3/ch3_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import LancasterStemmer 3 | stemmerlan=LancasterStemmer() 4 | print(stemmerlan.stem('working')) 5 | print(stemmerlan.stem('happiness')) 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tag={} 3 | print(tag) 4 | tag['beautiful']='ADJ' 5 | 6 | tag['boy']='N' 7 | tag['read']='V' 8 | tag['generously']='ADV' 9 | print(tag) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 9/ch9_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr1 = expr_read('([x], [John(x), Went(x)])') 4 | print(expr1) 5 | expr1.draw() 6 | print(expr1.fol()) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_10.py: -------------------------------------------------------------------------------- 1 | 
import nltk 2 | from nltk.tokenize import regexp_tokenize 3 | sent="Don't hesitate to ask questions" 4 | print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+')) 5 | 6 | 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 3 | from nltk.tokenize import SpaceTokenizer 4 | print(SpaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Chapter 3/ch3_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | stemmerporter = PorterStemmer() 4 | print(stemmerporter.stem('working')) 5 | print(stemmerporter.stem('happiness')) 6 | 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import BlanklineTokenizer 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import WhitespaceTokenizer 4 | print(WhitespaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Chapter 2/ch2_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | unigrams=ngrams(alpino.words(),1) 6 | for i in unigrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 2/ch2_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | quadgrams=ngrams(alpino.words(),4) 6 | for i in quadgrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | unitag = UnigramTagger(model={'Vinken': 'NN'}) 5 | print(unitag.tag(treebank.sents()[0])) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_31.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import WordReplacer 3 | replacer=WordReplacer({'congrats':'congratulations'}) 4 | print(replacer.replace('congrats')) 5 | print(replacer.replace('maths')) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import TreebankWordTokenizer 3 | tokenizer = TreebankWordTokenizer() 4 | print(tokenizer.tokenize("Have a nice day. 
I hope you find the book interesting")) 5 | -------------------------------------------------------------------------------- /Chapter 9/ch9_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr2 = expr_read('([x,y], [John(x), Went(x),Sam(y),Meet(x,y)])') 4 | print(expr2) 5 | expr2.draw() 6 | print(expr2.fol()) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize.util import string_span_tokenize 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(list(string_span_tokenize(sent, " "))) 5 | -------------------------------------------------------------------------------- /Chapter 10/ch10_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | grammar = r"NP: {<[CDJNP].*>+}" 3 | cp = nltk.RegexpParser(grammar) 4 | print(nltk.chunk.accuracy(cp, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 5 | -------------------------------------------------------------------------------- /Chapter 6/ch6_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.lch_similarity(cat)) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 4 | print(list(WhitespaceTokenizer().span_tokenize(sent))) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_37.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | trigrams_tokens=ngrams(alpino.words(),3) 6 | for i in trigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 2/ch2_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | bigrams_tokens=ngrams(alpino.words(),2) 6 | for i in bigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.path_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.wup_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | print(input_expr('X | (Y -> Z)')) 4 | print(input_expr('-(X & Y)')) 5 | print(input_expr('X & Y')) 6 | print(input_expr('X <-> -- X')) 7 | -------------------------------------------------------------------------------- /Chapter 1/ch1_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | sent=" She secured 90.56 % in class X . 
She is a meritorious student" 4 | capt = RegexpTokenizer('[A-Z]\w+') 5 | print(capt.tokenize(sent)) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | stops=set(stopwords.words('english')) 4 | words=["Don't", 'hesitate','to','ask','questions'] 5 | print([word for word in words if word not in stops]) 6 | -------------------------------------------------------------------------------- /Chapter 1/ch1_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('lotttt')) 5 | print(replacer.replace('ohhhhh')) 6 | print(replacer.replace('ooohhhhh')) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tag = nltk.FreqDist(tag for (word, tag) in treebank_tagged) 5 | print(tag.most_common()) 6 | -------------------------------------------------------------------------------- /Chapter 9/ch9_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr3 = expr_read('([x], [John(x), eats(x)])+ ([y],[Sam(y),eats(y)])') 4 | print(expr3) 5 | print(expr3.simplify()) 6 | expr3.draw() 7 | 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') 3 | text=" Hello everyone. Hope all are fine and doing well. 
Hope you find the book interesting" 4 | print(tokenizer.tokenize(text)) 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | noun1=[("financial","NN"),("year","NN"),("account","NN"),("summary","NN")] 3 | gram="NP:{<NN>+}" 4 | find = nltk.RegexpParser(gram) 5 | print(find.parse(noun1)) 6 | x=find.parse(noun1) 7 | x.draw() 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RegexpReplacer 3 | replacer= RegexpReplacer() 4 | replacer.replace("Don't hesitate to ask questions") 5 | print(replacer.replace("She must've gone to the market but she didn't go")) 6 | -------------------------------------------------------------------------------- /Chapter 3/ch3_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import RegexpStemmer 3 | stemmerregexp=RegexpStemmer('ing') 4 | print(stemmerregexp.stem('working')) 5 | print(stemmerregexp.stem('happiness')) 6 | print(stemmerregexp.stem('pairing')) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_21.py: -------------------------------------------------------------------------------- 1 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 2 | from nltk.tokenize import word_tokenize 3 | tokenized_docs=[word_tokenize(doc) for doc in text] 4 | print(tokenized_docs) 5 | -------------------------------------------------------------------------------- /Chapter 1/ch1_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent= "She secured 90.56 % in class X. She is a meritorious student" 3 | print(sent.split()) 4 | print(sent.split(' ')) 5 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 6 | print(sent.split('\n')) 7 | -------------------------------------------------------------------------------- /Chapter 4/ch4_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os,os.path 3 | create = os.path.expanduser('~/nltkdoc') 4 | if not os.path.exists(create): 5 | os.mkdir(create) 6 | print(os.path.exists(create)) 7 | import nltk.data 8 | print(create in nltk.data.path) 9 | 10 | -------------------------------------------------------------------------------- /Chapter 4/ch4_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.corpus import treebank 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | print(treebank.sents()[0]) 7 | print(unitagger.tag(treebank.sents()[0])) 8 | -------------------------------------------------------------------------------- /Chapter 5/ch5_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | tokens = "Jack told Bob to bring my cookie".split() 6 | grammar = toy_pcfg2 7 | print(grammar) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | testing = treebank.tagged_sents()[2000:] 7 | print(unitagger.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_23.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | affixtag = AffixTagger(training) 7 | print(affixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | from replacers import RegexpReplacer 4 | replacer=RegexpReplacer() 5 | word_tokenize("Don't hesitate to ask questions") 6 | print(word_tokenize(replacer.replace("Don't hesitate to ask questions"))) 7 | -------------------------------------------------------------------------------- /Chapter 3/ch3_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import WordNetLemmatizer 3 | lemmatizer_output=WordNetLemmatizer() 4 | print(lemmatizer_output.lemmatize('working')) 5 | print(lemmatizer_output.lemmatize('working',pos='v')) 6 | print(lemmatizer_output.lemmatize('works')) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 6/ch6_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | locations=[('Jaipur', 'IN', 'Rajasthan'),('Ajmer', 'IN', 'Rajasthan'),('Udaipur', 'IN', 'Rajasthan'),('Mumbai', 'IN', 'Maharashtra'),('Ahmedabad', 'IN', 'Gujrat')] 3 | q = [x1 for (x1, 
relation, x2) in locations if x2=='Rajasthan'] 4 | print(q) 5 | -------------------------------------------------------------------------------- /Chapter 4/ch4_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk import NgramTagger 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | quadgramtag = NgramTagger(4, training) 7 | print(quadgramtag.evaluate(testing)) 8 | 9 | -------------------------------------------------------------------------------- /Chapter 4/ch4_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import tnt 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | tnt_tagger=tnt.TnT() 7 | tnt_tagger.train(training) 8 | print(tnt_tagger.evaluate(testing)) 9 | -------------------------------------------------------------------------------- /Chapter 5/ch5_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1].leaves()) 4 | print(treebank_chunk.chunked_sents()[1].pos()) 5 | print(treebank_chunk.chunked_sents()[1].productions()) 6 | print(nltk.corpus.treebank.tagged_words()) 7 | -------------------------------------------------------------------------------- /Chapter 5/ch5_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 3 | sent = nltk.parse.util.extract_test_sentences(sent) 4 | print(len(sent)) 5 | testingsent=sent[25] 6 | print(testingsent[1]) 7 | print(testingsent[0]) 8 | sent=testingsent[0] 9 | -------------------------------------------------------------------------------- /Chapter 10/ch10_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sent=brown.sents(categories='news') 5 | unigram_sent=nltk.UnigramTagger(sentences) 6 | print(unigram_sent.tag(sent[2008])) 7 | print(unigram_sent.evaluate(sentences)) 8 | -------------------------------------------------------------------------------- /Chapter 3/ch3_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import SnowballStemmer 3 | print(SnowballStemmer.languages) 4 | spanishstemmer=SnowballStemmer('spanish') 5 | print(spanishstemmer.stem('comiendo')) 6 | frenchstemmer=SnowballStemmer('french') 7 | print(frenchstemmer.stem('manger')) 8 | 9 | -------------------------------------------------------------------------------- /Chapter 4/ch4_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtag = AffixTagger(training, affix_length=4) 7 | print(prefixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_25.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = 
treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | suffixtag = AffixTagger(training, affix_length=-3) 7 | print(suffixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Chapter 5/ch5_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import nltk.corpus 3 | print(str(nltk.corpus.treebank).replace('\\\\','/')) 4 | print(nltk.corpus.treebank.fileids()) 5 | from nltk.corpus import treebank 6 | print(treebank.words('wsj_0007.mrg')) 7 | print(treebank.tagged_words('wsj_0007.mrg')) 8 | 9 | -------------------------------------------------------------------------------- /Chapter 6/ch6_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentences1 = nltk.corpus.treebank.tagged_sents()[17] 3 | print(nltk.ne_chunk(sentences1, binary=True)) 4 | sentences2 = nltk.corpus.treebank.tagged_sents()[7] 5 | print(nltk.ne_chunk(sentences2, binary=True)) 6 | print(nltk.ne_chunk(sentences2)) 7 | 8 | -------------------------------------------------------------------------------- /Chapter 1/ch1_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | from nltk.tokenize.util import spans_to_relative 4 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 5 | print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))) 6 | -------------------------------------------------------------------------------- /Chapter 4/ch4_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence='''The/DT sacred/VBN Ganga/NNP flows/VBZ in/IN this/DT region/NN ./. This/DT is/VBZ a/DT pilgrimage/NN ./. People/NNP from/IN all/DT over/IN the/DT country/NN visit/NN this/DT place/NN ./. ''' 3 | print([nltk.tag.str2tuple(t) for t in sentence.split()]) 4 | -------------------------------------------------------------------------------- /Chapter 2/ch2_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | sent=" Hello , please read the book thoroughly . If you have any queries , then don't hesitate to ask . There is no shortcut to success ." 
4 | n=5 5 | fivegrams=ngrams(sent.split(),n) 6 | for grams in fivegrams: 7 | print(grams) 8 | -------------------------------------------------------------------------------- /Chapter 9/ch9_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr5 = expr_read('([x,y],[ram(x),food(y),eats(x,y)])') 4 | expr6 = expr_read('([u,z],[PRO(u),coffee(z),drinks(u,z)])') 5 | expr7=expr5+expr6 6 | print(expr7.simplify()) 7 | print(expr7.simplify().resolve_anaphora()) 8 | -------------------------------------------------------------------------------- /Chapter 3/ch3_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | from nltk.stem import WordNetLemmatizer 4 | stemmer_output=PorterStemmer() 5 | print(stemmer_output.stem('happiness')) 6 | lemmatizer_output=WordNetLemmatizer() 7 | print(lemmatizer_output.lemmatize('happiness')) 8 | 9 | 10 | -------------------------------------------------------------------------------- /Chapter 6/ch6_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | from nltk.tag import UnigramTagger 4 | tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700]) 5 | sentence = ['John','and','Smith','went','to','NY','and','Germany'] 6 | for word, tag in tagger.tag(sentence): 7 | print(word,'->',tag) 8 | -------------------------------------------------------------------------------- /Chapter 10/ch10_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences = brown.tagged_sents(categories='news') 4 | sent = brown.sents(categories='news') 5 | pattern = [(r'(January)$','Jan')] 6 | regexpr_tagger = nltk.RegexpTagger(pattern) 7 | print(regexpr_tagger.tag(sent[3])) 8 | print(regexpr_tagger.evaluate(sentences)) 9 | 10 | -------------------------------------------------------------------------------- /Chapter 4/ch4_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tagpairs = nltk.bigrams(treebank_tagged) 5 | preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN'] 6 | freqdist = nltk.FreqDist(preceders_noun) 7 | print([tag for (tag, _) in freqdist.most_common()]) 8 | -------------------------------------------------------------------------------- /Chapter 2/ch2_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import BigramCollocationFinder 3 | from nltk.corpus import webtext 4 | from nltk.metrics import BigramAssocMeasures 5 | tokens=[t.lower() for t in webtext.words('grail.txt')] 6 | words=BigramCollocationFinder.from_words(tokens) 7 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 8 | -------------------------------------------------------------------------------- /Chapter 8/ch8_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def not_stopwords(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | content = [w for w in text if w.lower() not in stopwords] 7 | return len(content) / len(text) 8 | 
print(not_stopwords(nltk.corpus.reuters.words())) 9 | -------------------------------------------------------------------------------- /Chapter 10/ch10_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | print(sz) 6 | training_sents = sentences[:sz] 7 | testing_sents=sentences[sz:] 8 | unigram_tagger=nltk.UnigramTagger(training_sents) 9 | print(unigram_tagger.evaluate(testing_sents)) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 2/ch2_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | import nltk 4 | text="Hello how are you doing ? I hope you find the book interesting" 5 | tokens=nltk.wordpunct_tokenize(text) 6 | fourgrams=nltk.collocations.QuadgramCollocationFinder.from_words(tokens) 7 | for fourgram, freq in fourgrams.ngram_fd.items(): 8 | print(fourgram,freq) 9 | -------------------------------------------------------------------------------- /Chapter 4/ch4_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.tag import DefaultTagger 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tag1=DefaultTagger('NN') 8 | tag2=UnigramTagger(training,backoff=tag1) 9 | print(tag2.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Chapter 6/ch6_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | value = nltk.Valuation([('X', True), ('Y', False), ('Z', True)]) 3 | print(value['Z']) 4 | domain = set() 5 | v = nltk.Assignment(domain) 6 | u = nltk.Model(domain, value) 7 | print(u.evaluate('(X & Y)', v)) 8 | print(u.evaluate('-(X & Y)', v)) 9 | print(u.evaluate('(X & Z)', v)) 10 | print(u.evaluate('(X | Y)', v)) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 1/ch1_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import BlanklineTokenizer 3 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | from nltk.tokenize import LineTokenizer 6 | print(LineTokenizer(blanklines='keep').tokenize(sent)) 7 | print(LineTokenizer(blanklines='discard').tokenize(sent)) 8 | -------------------------------------------------------------------------------- /Chapter 4/ch4_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger 3 | from nltk.corpus import treebank 4 | training_1= treebank.tagged_sents()[:7000] 5 | bigramtagger=BigramTagger(training_1) 6 | print(treebank.sents()[0]) 7 | print(bigramtagger.tag(treebank.sents()[0])) 8 | testing_1 = treebank.tagged_sents()[2000:] 9 | print(bigramtagger.evaluate(testing_1)) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 4/ch4_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger, TrigramTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | bigramtag = BigramTagger(training) 7 | print(bigramtag.evaluate(testing)) 8 | trigramtag = TrigramTagger(training) 9 | print(trigramtag.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Chapter 2/ch2_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | text1="Hardwork is the key to success. Never give up!" 4 | word = nltk.wordpunct_tokenize(text1) 5 | finder = BigramCollocationFinder.from_words(word) 6 | bigram_measures = nltk.collocations.BigramAssocMeasures() 7 | value = finder.score_ngrams(bigram_measures.raw_freq) 8 | print(sorted(bigram for bigram, score in value)) 9 | -------------------------------------------------------------------------------- /Chapter 6/ch6_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | expression = input_expr('run(marcus)', type_check=True) 4 | print(expression.argument) 5 | print(expression.argument.type) 6 | print(expression.function) 7 | print(expression.function.type) 8 | sign = {'run': '<e, t>'} 9 | expression = input_expr('run(marcus)', signature=sign) 10 | print(expression.function.type) 11 | -------------------------------------------------------------------------------- /Chapter 10/ch10_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | s0=nltk.DefaultTagger('NNP') 8 | s1=nltk.UnigramTagger(training_sents,backoff=s0) 9 | s2=nltk.BigramTagger(training_sents,backoff=s1) 10 | print(s2.evaluate(testing_sents)) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 4/ch4_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import DefaultTagger 3 | from nltk.tag import tnt 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tnt_tagger=tnt.TnT() 8 | unknown=DefaultTagger('NN') 9 | tagger_tnt=tnt.TnT(unk=unknown,Trained=True) 10 | 
tnt_tagger.train(training) 11 | print(tnt_tagger.evaluate(testing)) 12 | -------------------------------------------------------------------------------- /Chapter 1/ch1_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def para_fraction(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | para = [w for w in text if w.lower() not in stopwords] 7 | return len(para) / len(text) 8 | print(para_fraction(nltk.corpus.reuters.words())) 9 | print(para_fraction(nltk.corpus.inaugural.words())) 10 | 11 | -------------------------------------------------------------------------------- /Chapter 1/ch1_33.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from nltk.metrics import * 3 | training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split() 4 | testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split() 5 | print(accuracy(training,testing)) 6 | trainset=set(training) 7 | testset=set(testing) 8 | precision(trainset,testset) 9 | print(recall(trainset,testset)) 10 | print(f_measure(trainset,testset)) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 4/ch4_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "N"), ("became", "VBD"), ("leader", "NN")] 3 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "NN"), ("became", "VBD"), ("leader", "NN")] 4 | grammar = "NP: {
?*?*}" 5 | find = nltk.RegexpParser(grammar) 6 | res = find.parse(sent) 7 | print(res) 8 | res.draw() 9 | -------------------------------------------------------------------------------- /Chapter 5/ch5_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser4 = nltk.parse.TopDownChartParser(gram1) 8 | chart4 = parser4.chart_parse(sent) 9 | print((chart4.num_edges())) 10 | print((len(list(chart4.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser9 = nltk.parse.EarleyChartParser(gram1) 8 | chart9 = parser9.chart_parse(sent) 9 | print((chart9.num_edges())) 10 | print((len(list(chart9.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser1 = nltk.parse.BottomUpChartParser(gram1) 8 | chart1 = parser1.chart_parse(sent) 9 | print((chart1.num_edges())) 10 | print((len(list(chart1.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 5/ch5_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser3 = nltk.parse.LeftCornerChartParser(gram1) 8 | chart3 = parser3.chart_parse(sent) 9 | print((chart3.num_edges())) 10 | print((len(list(chart3.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser2 = nltk.parse.BottomUpLeftCornerChartParser(gram1) 8 | chart2 = parser2.chart_parse(sent) 9 | print((chart2.num_edges())) 10 | print((len(list(chart2.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = 
nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser8 = nltk.parse.IncrementalTopDownChartParser(gram1) 8 | chart8 = parser8.chart_parse(sent) 9 | print((chart8.num_edges())) 10 | print((len(list(chart8.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 6/ch6_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | wn.synsets('cat') 5 | wn.synsets('cat', pos=wn.VERB) 6 | wn.synset('cat.n.01') 7 | print(wn.synset('cat.n.01').definition()) 8 | print(len(wn.synset('cat.n.01').examples())) 9 | print(wn.synset('cat.n.01').lemmas()) 10 | print([str(lemma.name()) for lemma in wn.synset('cat.n.01').lemmas()]) 11 | print(wn.lemma('cat.n.01.cat').synset()) 12 | 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /Chapter 5/ch5_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser7 = nltk.parse.IncrementalLeftCornerChartParser(gram1) 8 | chart7 = parser7.chart_parse(sent) 9 | print((chart7.num_edges())) 10 | print((len(list(chart7.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser5 = nltk.parse.IncrementalBottomUpChartParser(gram1) 8 | chart5 = parser5.chart_parse(sent) 9 | print((chart5.num_edges())) 10 | print((len(list(chart5.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 10/ch10_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | freqd = nltk.FreqDist(brown.words(categories='news')) 4 | cfreqd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news')) 5 | mostfreq_words = freqd.most_common(100) 6 | likelytags = dict((word, cfreqd[word].max()) for (word, _) in mostfreq_words) 7 | baselinetagger = nltk.UnigramTagger(model=likelytags) 8 | 9 | sent = brown.sents(categories='news')[3] 10 | print(baselinetagger.tag(sent)) 11 | 
-------------------------------------------------------------------------------- /Chapter 5/ch5_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser6 = nltk.parse.IncrementalBottomUpLeftCornerChartParser(gram1) 8 | chart6 = parser6.chart_parse(sent) 9 | print((chart6.num_edges())) 10 | print((len(list(chart6.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Chapter 10/ch10_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | bigram_tagger=nltk.UnigramTagger(training_sents) 8 | bigram_tagger=nltk.BigramTagger(training_sents) 9 | print(bigram_tagger.tag(sentences[2008])) 10 | un_sent=sentences[4203] 11 | print(bigram_tagger.tag(un_sent)) 12 | print(bigram_tagger.evaluate(testing_sents)) 13 | -------------------------------------------------------------------------------- /Chapter 6/ch6_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.data.show_cfg('grammars/book_grammars/sql1.fcfg') 3 | 4 | 5 | from nltk import load_parser 6 | test = load_parser('grammars/book_grammars/sql1.fcfg') 7 | q=" What cities are in Greece" 8 | t = list(test.parse(q.split())) 9 | ans = t[0].label()['SEM'] 10 | ans = [s for s in ans if s] 11 | q = ' '.join(ans) 12 | print(q) 13 | from nltk.sem import chat80 14 | r = chat80.sql_query('corpora/city_database/city.db', q) 15 | for p in r: 16 | print(p[0], end=" ") 17 | -------------------------------------------------------------------------------- /Chapter 2/ch2_4.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | from nltk.corpus import webtext 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | set = set(stopwords.words('english')) 6 | stops_filter = lambda w: len(w) < 3 or w in set 7 | tokens=[t.lower() for t in webtext.words('grail.txt')] 8 | words=BigramCollocationFinder.from_words(tokens) 9 | words.apply_word_filter(stops_filter) 10 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 11 | -------------------------------------------------------------------------------- /Chapter 5/ch5_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | gram2 = PCFG.fromstring(""" 6 | A -> B B [.3] | C B C [.7] 7 | B -> B D [.5] | C [.5] 8 | C -> 'a' [.1] | 'b' [0.9] 9 | D -> 'b' [1.0] 10 | """) 11 | prod1 = gram2.productions()[0] 12 | print(prod1) 13 | prod2 = gram2.productions()[1] 14 | print(prod2) 15 | print(prod2.lhs()) 16 | print(prod2.rhs()) 17 | print((prod2.prob())) 18 | print(gram2.start()) 19 | print(gram2.productions()) 20 | -------------------------------------------------------------------------------- /Chapter 6/ch6_14.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | print(sorted(wn.langs())) 5 | print(wn.synset('cat.n.01').lemma_names('ita')) 6 | print(sorted(wn.synset('cat.n.01').lemmas('dan'))) 7 | print(sorted(wn.synset('cat.n.01').lemmas('por'))) 8 | print(len(wordnet.all_lemma_names(pos='n', lang='jpn'))) 9 | cat = wn.synset('cat.n.01') 10 | print(cat.hypernyms()) 11 | print(cat.hyponyms()) 12 | print(cat.member_holonyms()) 13 | print(cat.root_hypernyms()) 14 | print(wn.synset('cat.n.01').lowest_common_hypernyms(wn.synset('dog.n.01'))) 15 | -------------------------------------------------------------------------------- /Chapter 1/ch1_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | french_tokenizer=nltk.data.load('tokenizers/punkt/french.pickle') 3 | print(french_tokenizer.tokenize('Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège franco-britanniquedeLevallois-Perret. Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage Levallois. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , d’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire')) 4 | -------------------------------------------------------------------------------- /Chapter 10/ch10_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | correct = nltk.chunk.tagstr2tree( 3 | "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]") 4 | print(correct.flatten()) 5 | grammar = r"NP: {<[CDJNP].*>+}" 6 | cp = nltk.RegexpParser(grammar) 7 | 8 | grammar = r"NP: {<[CDJNP].*>+}"  # assumed tag pattern (same as the parser above) 9 | chunk_parser = nltk.RegexpParser(grammar) 10 | tagged_tok = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")] 11 | chunkscore = nltk.chunk.ChunkScore() 12 | guessed = cp.parse(correct.flatten()) 13 | chunkscore.score(correct, guessed) 14 | print(chunkscore) 15 | -------------------------------------------------------------------------------- /Chapter 4/ch4_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtagger=AffixTagger(training,affix_length=4) 7 | prefixtagger3=AffixTagger(training,affix_length=3,backoff=prefixtagger) 8 | print(prefixtagger3.evaluate(testing)) 9 | suffixtagger3=AffixTagger(training,affix_length=-3,backoff=prefixtagger3) 10 | print(suffixtagger3.evaluate(testing)) 11 | suffixtagger4=AffixTagger(training,affix_length=-4,backoff=suffixtagger3) 12 | print(suffixtagger4.evaluate(testing)) 13 | 14 | -------------------------------------------------------------------------------- /Chapter 6/ch6_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | from nltk.corpus import wordnet_ic 5 | brown_ic = wordnet_ic.ic('ic-brown.dat') 6 | semcor_ic = wordnet_ic.ic('ic-semcor.dat') 7 | from nltk.corpus import genesis 8 | genesis_ic = wn.ic(genesis, False, 0.0) 9 | lion = wn.synset('lion.n.01') 10 | cat = 
wn.synset('cat.n.01') 11 | print(lion.res_similarity(cat, brown_ic)) 12 | print(lion.res_similarity(cat, genesis_ic)) 13 | print(lion.jcn_similarity(cat, brown_ic)) 14 | print(lion.jcn_similarity(cat, genesis_ic)) 15 | print(lion.lin_similarity(cat, semcor_ic)) 16 | 17 | 18 | -------------------------------------------------------------------------------- /Chapter 1/ch1_22.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 4 | from nltk.tokenize import word_tokenize 5 | tokenized_docs=[word_tokenize(doc) for doc in text] 6 | x=re.compile('[%s]' % re.escape(string.punctuation)) 7 | tokenized_docs_no_punctuation = [] 8 | for review in tokenized_docs: 9 | new_review = [] 10 | for token in review: 11 | new_token = x.sub(u'', token) 12 | if not new_token == u'': 13 | new_review.append(new_token) 14 | tokenized_docs_no_punctuation.append(new_review) 15 | print(tokenized_docs_no_punctuation) 16 | -------------------------------------------------------------------------------- /Chapter 2/ch2_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] 3 | print(len(cor)) 4 | from nltk.util import unique_list 5 | tag_set = unique_list(tag for sent in cor for (word,tag) in sent) 6 | print(len(tag_set)) 7 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 8 | print(len(symbols)) 9 | print(len(tag_set)) 10 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 11 | print(len(symbols)) 12 | trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) 13 | train_corpus = [] 14 | test_corpus = [] 15 | for i in range(len(cor)): 16 | if i % 10: 17 | train_corpus+=[cor[i]] 18 | else: 19 | test_corpus+=[cor[i]] 20 | print(len(train_corpus)) 21 | print(len(test_corpus)) 22 | 23 | 24 | -------------------------------------------------------------------------------- /Chapter 5/ch5_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import Nonterminal, nonterminals, Production, CFG 3 | nonterminal1 = Nonterminal('NP') 4 | nonterminal2 = Nonterminal('VP') 5 | nonterminal3 = Nonterminal('PP') 6 | print(nonterminal1.symbol()) 7 | print(nonterminal2.symbol()) 8 | print(nonterminal3.symbol()) 9 | print(nonterminal1==nonterminal2) 10 | print(nonterminal2==nonterminal3) 11 | print(nonterminal1==nonterminal3) 12 | S, NP, VP, PP = nonterminals('S, NP, VP, PP') 13 | N, V, P, DT = nonterminals('N, V, P, DT') 14 | production1 = Production(S, [NP, VP]) 15 | production2 = Production(NP, [DT, NP]) 16 | production3 = Production(VP, [V, NP,NP,PP]) 17 | print(production1.lhs()) 18 | print(production1.rhs()) 19 | print(production3.lhs()) 20 | print(production3.rhs()) 21 | print(production3 == Production(VP, [V,NP,NP,PP])) 22 | print(production2 == production3) 23 | 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # 
========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter 7/ch7_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)] 5 | random.shuffle(docs) 6 | all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words()) 7 | token_features = list(all_tokens.keys())[:2000] 8 | print(token_features[:100]) 9 | 10 | def doc_features(docs): 11 | doc_words = set(docs) 12 | features = {} 13 | for word in token_features: 14 | features['contains(%s)' % word] = (word in doc_words) 15 | return features 16 | 17 | print(doc_features(movie_reviews.words('pos/cv957_8737.txt'))) 18 | feature_sets = [(doc_features(d), c) for (d,c) in docs] 19 | train_sets, test_sets = feature_sets[100:], feature_sets[:100] 20 | classifiers = nltk.NaiveBayesClassifier.train(train_sets) 21 | print(nltk.classify.accuracy(classifiers, test_sets)) 22 | classifiers.show_most_informative_features(5) 23 | -------------------------------------------------------------------------------- /Chapter 2/ch2_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | corpus=u" hello how are you doing ? Hope you find the book interesting. 
".split() 3 | sentence=u"how are you doing".split() 4 | vocabulary=set(corpus) 5 | print(len(vocabulary)) 6 | cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus)) 7 | print([cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 8 | print([cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 9 | print([cfd[a].freq(b) for (a,b) in nltk.bigrams(sentence)]) 10 | print([1 + cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 11 | print([len(vocabulary) + cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 12 | print([1.0 * (1+cfd[a][b]) / (len(vocabulary)+cfd[a].N()) for (a,b) in nltk.bigrams(sentence)]) 13 | cpd_mle = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist, bins=len(vocabulary)) 14 | print([cpd_mle[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 15 | cpd_laplace = nltk.ConditionalProbDist(cfd, nltk.LaplaceProbDist, bins=len(vocabulary)) 16 | print([cpd_laplace[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 17 | 18 | -------------------------------------------------------------------------------- /__pycache__/replacers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from nltk.corpus import wordnet 4 | 5 | replacement_patterns = [ 6 | (r'won\'t', 'will not'), 7 | (r'can\'t', 'cannot'), 8 | (r'i\'m', 'i am'), 9 | (r'ain\'t', 'is not'), 10 | (r'(\w+)\'ll', '\g<1> will'), 11 | (r'(\w+)n\'t', '\g<1> not'), 12 | (r'(\w+)\'ve', '\g<1> have'), 13 | (r'(\w+)\'s', '\g<1> is'), 14 | (r'(\w+)\'re', '\g<1> are'), 15 | (r'(\w+)\'d', '\g<1> would') 16 | ] 17 | class RegexpReplacer(object): 18 | def __init__(self, patterns=replacement_patterns): 19 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in 20 | patterns] 21 | def replace(self, text): 22 | s = text 23 | for (pattern, repl) in self.patterns: 24 | (s, count) = re.subn(pattern, repl, s) 25 | return s 26 | 27 | class RepeatReplacer(object): 28 | def __init__(self): 29 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 30 | self.repl = r'\1\2\3' 31 | def replace(self, word): 32 | if wordnet.synsets(word): 33 | return word 34 | repl_word = self.repeat_regexp.sub(self.repl, word) 35 | if repl_word != word: 36 | return self.replace(repl_word) 37 | else: 38 | return repl_word 39 | 40 | class WordReplacer(object): 41 | def __init__(self, word_map): 42 | self.word_map = word_map 43 | def replace(self, word): 44 | return self.word_map.get(word, word) 45 | --------------------------------------------------------------------------------