├── .gitattributes ├── .gitignore ├── License ├── Module 1 ├── Chapter 1 │ ├── classification.py │ ├── hadoop_cmd.sh │ ├── mapper.py │ ├── nltk_hive.sql │ ├── nltk_scoring.py │ ├── pyspark_classification.py │ ├── reducer.py │ ├── scikit_hive.sql │ └── serialize_model_vect.py ├── Chapter 10 │ ├── classification.py │ ├── hadoop_cmd.sh │ ├── mapper.py │ ├── nltk_hive.sql │ ├── nltk_scoring.py │ ├── pyspark_classification.py │ ├── reducer.py │ ├── scikit_hive.sql │ └── serialize_model_vect.py ├── Chapter 2 │ └── ch2.py ├── Chapter 3 │ └── ch3.py ├── Chapter 4 │ └── ch4.py ├── Chapter 5 │ ├── summarizer.py │ └── summarizer2.py ├── Chapter 6 │ ├── classification.py │ ├── modelbuilding.py │ ├── readdata.py │ ├── textclustering.py │ └── topicmodeling.py ├── Chapter 7 │ ├── item.py │ ├── itempiplines.py │ ├── loginspider.py │ ├── myspider.py │ ├── newsspider_1.py │ ├── newsspider_2.py │ ├── newsspider_3.py │ └── notes.txt ├── Chapter 8 │ ├── intregation.py │ ├── matplotlib_code.py │ ├── numpy_codes.py │ ├── optimize.py │ ├── pandas_code.py │ └── solver.py └── Chapter 9 │ ├── fb_classification.py │ ├── fbdump.py │ ├── influencer_frnd.py │ ├── trendingtopic.py │ ├── tweetdump.py │ └── tweetinfo.py ├── Module 2 ├── Chapter 1 │ └── 7853OS_01_codes │ │ └── chapter1.py ├── Chapter 2 │ └── 7853OS_02_codes │ │ ├── chapter2.py │ │ ├── mywords.txt │ │ └── replacers.py ├── Chapter 3 │ └── 7853OS_03_codes │ │ ├── brown.pos │ │ ├── catchunked.py │ │ ├── chapter3.py │ │ ├── conll.iob │ │ ├── corpus.py │ │ ├── heading_text.txt │ │ ├── mongoreader.py │ │ ├── mywords.txt │ │ ├── synonyms.csv │ │ ├── synonyms.yaml │ │ ├── treebank.chunk │ │ └── wordlist ├── Chapter 4 │ └── 7853OS_04_Codes │ │ ├── chapter4.py │ │ ├── tag_util.py │ │ └── taggers.py ├── Chapter 5 │ └── 7853OS_05_Codes │ │ ├── chapter5.py │ │ └── chunkers.py ├── Chapter 6 │ └── 7853OS_06_codes │ │ ├── chapter6.py │ │ └── transforms.py ├── Chapter 7 │ └── 7853OS_07_Codes │ │ ├── chapter7.py │ │ ├── classification.py │ │ └── featx.py ├── Chapter 8 │ └── 7853OS_08_Codes │ │ ├── chapter8.py │ │ ├── dist_featx.py │ │ ├── plists.py │ │ ├── remote_chunk.py │ │ ├── remote_double.py │ │ ├── remote_tag.py │ │ └── remote_word_count.py └── Chapter 9 │ └── 7853OS_09_Codes │ ├── chapter9.py │ └── encoding.py ├── Module 3 ├── Chapter 1 │ ├── ch1_1.py │ ├── ch1_10.py │ ├── ch1_11.py │ ├── ch1_12.py │ ├── ch1_13.py │ ├── ch1_14.py │ ├── ch1_15.py │ ├── ch1_16.py │ ├── ch1_17.py │ ├── ch1_18.py │ ├── ch1_19.py │ ├── ch1_2.py │ ├── ch1_20.py │ ├── ch1_21.py │ ├── ch1_22.py │ ├── ch1_23.py │ ├── ch1_24.py │ ├── ch1_25.py │ ├── ch1_26.py │ ├── ch1_27.py │ ├── ch1_28.py │ ├── ch1_29.py │ ├── ch1_3.py │ ├── ch1_30.py │ ├── ch1_31.py │ ├── ch1_33.py │ ├── ch1_34.py │ ├── ch1_35.py │ ├── ch1_36.py │ ├── ch1_37.py │ ├── ch1_4.py │ ├── ch1_5.py │ ├── ch1_6.py │ ├── ch1_7.py │ ├── ch1_8.py │ └── ch1_9.py ├── Chapter 10 │ ├── ch10_1.py │ ├── ch10_10.py │ ├── ch10_2.py │ ├── ch10_3.py │ ├── ch10_4.py │ ├── ch10_5.py │ ├── ch10_6.py │ ├── ch10_7.py │ ├── ch10_8.py │ └── ch10_9.py ├── Chapter 2 │ ├── ch2_1.py │ ├── ch2_10.py │ ├── ch2_2.py │ ├── ch2_3.py │ ├── ch2_4.py │ ├── ch2_5.py │ ├── ch2_6.py │ ├── ch2_7.py │ ├── ch2_8.py │ └── ch2_9.py ├── Chapter 3 │ ├── ch3_1.py │ ├── ch3_2.py │ ├── ch3_3.py │ ├── ch3_4.py │ ├── ch3_5.py │ └── ch3_6.py ├── Chapter 4 │ ├── ch4_1.py │ ├── ch4_10.py │ ├── ch4_11.py │ ├── ch4_12.py │ ├── ch4_13.py │ ├── ch4_14.py │ ├── ch4_15.py │ ├── ch4_16.py │ ├── ch4_17.py │ ├── ch4_18.py │ ├── ch4_19.py │ ├── ch4_2.py │ ├── ch4_20.py │ ├── ch4_21.py │ ├── 
ch4_22.py │ ├── ch4_23.py │ ├── ch4_24.py │ ├── ch4_25.py │ ├── ch4_26.py │ ├── ch4_27.py │ ├── ch4_28.py │ ├── ch4_29.py │ ├── ch4_3.py │ ├── ch4_30.py │ ├── ch4_4.py │ ├── ch4_5.py │ ├── ch4_6.py │ ├── ch4_7.py │ ├── ch4_8.py │ └── ch4_9.py ├── Chapter 5 │ ├── ch5_1.py │ ├── ch5_10.py │ ├── ch5_11.py │ ├── ch5_12.py │ ├── ch5_13.py │ ├── ch5_14.py │ ├── ch5_15.py │ ├── ch5_16.py │ ├── ch5_17.py │ ├── ch5_18.py │ ├── ch5_19.py │ ├── ch5_2.py │ ├── ch5_20.py │ ├── ch5_21.py │ ├── ch5_22.py │ ├── ch5_23.py │ ├── ch5_3.py │ ├── ch5_4.py │ ├── ch5_5.py │ ├── ch5_6.py │ ├── ch5_7.py │ ├── ch5_8.py │ └── ch5_9.py ├── Chapter 6 │ ├── ch6_1.py │ ├── ch6_10.py │ ├── ch6_11.py │ ├── ch6_12.py │ ├── ch6_13.py │ ├── ch6_14.py │ ├── ch6_15.py │ ├── ch6_16.py │ ├── ch6_17.py │ ├── ch6_18.py │ ├── ch6_2.py │ ├── ch6_3.py │ ├── ch6_4.py │ ├── ch6_5.py │ ├── ch6_6.py │ ├── ch6_7.py │ ├── ch6_8.py │ └── ch6_9.py ├── Chapter 7 │ └── ch7_1.py ├── Chapter 8 │ └── ch8_1.py ├── Chapter 9 │ ├── ch9_1.py │ ├── ch9_2.py │ ├── ch9_3.py │ ├── ch9_4.py │ └── ch9_5.py └── __pycache__ │ ├── replacers.cpython-34.pyc │ └── replacers.py └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/classification.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import sklearn 4 | from sklearn.externals import joblib 5 | 6 | clf = joblib.load('classifier.pkl') 7 | vectorizer = joblib.load('vectorizer.pkl') 8 | 9 | for line in sys.stdin: 10 | line = line.strip() 11 | id, content= line.split('\t') 12 | X_test = vectorizer.transform([str(content)]) 13 | 14 | prob = clf.predict_proba(X_test) 15 | pred = clf.predict (X_test) 16 | prob_score =prob[:,1] 17 | print '\t'.join([id, content,pred,prob_score]) 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/hadoop_cmd.sh: -------------------------------------------------------------------------------- 1 | hadoop jar /hadoop-streaming.jar \ 2 | -D mapred.reduce.tasks=1 -file /mapper.py \ 3 | -mapper /mapper.py \ 4 | -file /reducer.py \ 5 | -reducer /reducer.py \ 6 | -input /hdfspath/infile \ 7 | -output outfile -------------------------------------------------------------------------------- /Module 1/Chapter 1/mapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content = line.split('\t') 7 | tokens =nltk.word_tokenize(concat_all_text) 8 | print '\t'.join([id,content,tokens]) 9 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/nltk_hive.sql: -------------------------------------------------------------------------------- 1 | hive> 2 | CREATE TABLE $InputTableName 3 | ( 4 | ID String, 5 | Content String 6 | ) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t'; 9 | 10 | hive> 11 | CREATE TABLE $OutTableName 12 | ( 13 | ID String, 14 | Content String, 15 | Tokens String 16 | ) 17 | 18 | hive> 19 | add FILE nltk_scoring.py; 20 | add FILE english.pickle; #Adding file to DistributedCache 21 | INSERT OVERWRITE TABLE $OutTableName 22 | SELECT 23 | TRANSFORM (id, content) 24 | USING 'PYTHONPATH nltk_scoring.py' 25 | AS (id string, content string, tokens string ) 26 | FROM $InputTablename; 27 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/nltk_scoring.py: -------------------------------------------------------------------------------- 1 | >>>import sys 2 | >>>import datetime 3 | >>>import pickle 4 | >>>import nltk 5 | >>>nltk.download('punkt') 6 | >>>for line in sys.stdin: 7 | >>> line = line.strip() 8 | >>> id, content= line.split('\t') 9 | >>> tokens =nltk.word_tokenize(concat_all_text) 10 | >>> print '\t'.join([id,content,tokens]) 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/pyspark_classification.py: -------------------------------------------------------------------------------- 1 | 2 | # here I am assuming that we 
have pyspark configured on your hadoop cluster 3 | >>>from pyspark import SparkContext 4 | >>>sc = SparkContext(appName="comment_classifcation") 5 | #http://spark.apache.org/docs/0.7.3/api/pyspark/pyspark.context.SparkContext-class.html. 6 | #The next thing is reading a tab delimited text file. Reading the file should be on HDFS. This file could be huge (~Tb/Pb): 7 | >>>lines = sc.textFile("testcomments.txt") 8 | #The lines are now a list of all the rows in the corpus: 9 | >>>parts = lines.map(lambda l: l.split("\t")) 10 | >>>corpus = parts.map(lambda row: Row(id=row[0], comment=row[1], class=row[2])) 11 | #The parts is a list of fields as we have each field in the line delimited on “\t”. 12 | #Let's break the corpus that has [ID, comment, class (0,1)] in the different RDD objects: 13 | >>>comment = corpus.map(lambda row: " " + row.comment) 14 | >>>class_var = corpus.map(lambda row:row.class) 15 | #Once we have the comments, we need to do a process very similar to what we did in Chapter 6, Text Classification, where we used scikit to do tokenization, hash vectorizer and calculate TF, IDF, and tf-idf using a vectorizer. 16 | #The following is the snippet of how to create tokenization, term frequency, and inverse document frequency: 17 | >>>from pyspark.mllib.feature import HashingTF 18 | >>>from pyspark.mllib.feature import IDF 19 | # https://spark.apache.org/docs/1.3.0/mllib-feature-extraction.html 20 | >>>comment_tokenized = comment.map(lambda line: line.strip().split(" ")) 21 | >>>hashingTF = HashingTF(1000) # to select only 1000 features 22 | >>>comment_tf = hashingTF.transform(comment_tokenized) 23 | 24 | >>>comment_idf = IDF().fit(comment_tf) 25 | >>>comment_tfidf = comment_idf.transform(comment_tf) 26 | #Will merge the class with the c tfidf RDD like this: 27 | >>>finaldata = class_var.zip(comment_tfidf) 28 | #We will do a typical test and train smapling 29 | >>>train, test = finaldata.randomSplit([0.8, 0.2], seed=0) 30 | #Let's perform the main classification commands, which are quite similar to scikit. We are using a logistic regression, which is widely used classifier. The pyspark.mllib provides you a variety of algorithms. 
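# Note: although the text above mentions logistic regression, the snippet further
# down actually trains a Naive Bayes model. A minimal logistic-regression sketch
# with pyspark.mllib (a hedged illustration, assuming the same train_rdd/test_rdd
# of LabeledPoint objects that are built a few lines below) would look like:
# >>>from pyspark.mllib.classification import LogisticRegressionWithSGD
# >>>lr_model = LogisticRegressionWithSGD.train(train_rdd, iterations=100)
# >>>lr_output = test_rdd.map(lambda point: (lr_model.predict(point.features), point.label))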
31 | #For more information on pyspark.mllib visit https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html 32 | 33 | #The following is an example of logistic regression classifier: 34 | >>>from pyspark.mllib.regression import LabeledPoint 35 | >>>from pyspark.mllib.classification import NaiveBayes 36 | >>>train_rdd = train.map(lambda t: LabeledPoint(t[0], t[1])) 37 | >>>test_rdd = test.map(lambda t: LabeledPoint(t[0], t[1])) 38 | >>>nb = NaiveBayes.train(train_rdd,lambda = 1.0) 39 | >>>nb_output = test_rdd.map(lambda point: (NB.predict(point.features), point.label)) 40 | >>>print nb_output 41 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/reducer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content,topics = line.split('\t') 7 | print '\t'.join([id,content,topics]) 8 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/scikit_hive.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE $InputTableName ( 2 | ID String, 3 | Content String 4 | ) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t';CREATE TABLE $InputTableName ( 7 | ID String, 8 | Content String 9 | ) 10 | ROW FORMAT DELIMITED 11 | 12 | hive>CREATE TABLE $OutTableName ( 13 | ID String, 14 | Content String, 15 | predict String, 16 | predict_score double 17 | ) 18 | hive> 19 | add FILE vectorizer.pkl; 20 | add FILE classifier.pkl; 21 | 22 | hive> 23 | add FILE classification.py; 24 | INSERT OVERWRITE TABLE $OutTableName 25 | SELECT 26 | TRANSFORM (id, content) 27 | USING '/opt/anaconda/python2.7/bin/python2.7 classification.py' 28 | AS (id string, scorestringscore string ) 29 | FROM $Tablename; -------------------------------------------------------------------------------- /Module 1/Chapter 1/serialize_model_vect.py: -------------------------------------------------------------------------------- 1 | 2 | # please refer to code modelbuilding.py in ch 6 and just serialize the vectorizer and mode 3 | # object using joblib.dump 4 | vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=in_min_df, 5 | stop_words='english', ngram_range=(1,2), max_df=in_max_df) 6 | joblib.dump(vectorizer, "vectorizer.pkl", compress=3) 7 | clf = GaussianNB().fit(X_train,y_train) 8 | joblib.dump(clf, "classifier.pkl") -------------------------------------------------------------------------------- /Module 1/Chapter 10/classification.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import sklearn 4 | from sklearn.externals import joblib 5 | 6 | clf = joblib.load('classifier.pkl') 7 | vectorizer = joblib.load('vectorizer.pkl') 8 | 9 | for line in sys.stdin: 10 | line = line.strip() 11 | id, content= line.split('\t') 12 | X_test = vectorizer.transform([str(content)]) 13 | 14 | prob = clf.predict_proba(X_test) 15 | pred = clf.predict (X_test) 16 | prob_score =prob[:,1] 17 | print '\t'.join([id, content,pred,prob_score]) 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/hadoop_cmd.sh: -------------------------------------------------------------------------------- 1 | hadoop jar /hadoop-streaming.jar \ 2 | -D mapred.reduce.tasks=1 -file /mapper.py \ 3 | -mapper /mapper.py \ 4 | -file /reducer.py \ 5 | -reducer /reducer.py \ 
6 | -input /hdfspath/infile \ 7 | -output outfile -------------------------------------------------------------------------------- /Module 1/Chapter 10/mapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content = line.split('\t') 7 | tokens =nltk.word_tokenize(concat_all_text) 8 | print '\t'.join([id,content,tokens]) 9 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/nltk_hive.sql: -------------------------------------------------------------------------------- 1 | hive> 2 | CREATE TABLE $InputTableName 3 | ( 4 | ID String, 5 | Content String 6 | ) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t'; 9 | 10 | hive> 11 | CREATE TABLE $OutTableName 12 | ( 13 | ID String, 14 | Content String, 15 | Tokens String 16 | ) 17 | 18 | hive> 19 | add FILE nltk_scoring.py; 20 | add FILE english.pickle; #Adding file to DistributedCache 21 | INSERT OVERWRITE TABLE $OutTableName 22 | SELECT 23 | TRANSFORM (id, content) 24 | USING 'PYTHONPATH nltk_scoring.py' 25 | AS (id string, content string, tokens string ) 26 | FROM $InputTablename; 27 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/nltk_scoring.py: -------------------------------------------------------------------------------- 1 | >>>import sys 2 | >>>import datetime 3 | >>>import pickle 4 | >>>import nltk 5 | >>>nltk.download('punkt') 6 | >>>for line in sys.stdin: 7 | >>> line = line.strip() 8 | >>> id, content= line.split('\t') 9 | >>> tokens =nltk.word_tokenize(concat_all_text) 10 | >>> print '\t'.join([id,content,tokens]) 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/pyspark_classification.py: -------------------------------------------------------------------------------- 1 | 2 | # here I am assuming that we have pyspark configured on your hadoop cluster 3 | >>>from pyspark import SparkContext 4 | >>>sc = SparkContext(appName="comment_classifcation") 5 | #http://spark.apache.org/docs/0.7.3/api/pyspark/pyspark.context.SparkContext-class.html. 6 | #The next thing is reading a tab delimited text file. Reading the file should be on HDFS. This file could be huge (~Tb/Pb): 7 | >>>lines = sc.textFile("testcomments.txt") 8 | #The lines are now a list of all the rows in the corpus: 9 | >>>parts = lines.map(lambda l: l.split("\t")) 10 | >>>corpus = parts.map(lambda row: Row(id=row[0], comment=row[1], class=row[2])) 11 | #The parts is a list of fields as we have each field in the line delimited on “\t”. 12 | #Let's break the corpus that has [ID, comment, class (0,1)] in the different RDD objects: 13 | >>>comment = corpus.map(lambda row: " " + row.comment) 14 | >>>class_var = corpus.map(lambda row:row.class) 15 | #Once we have the comments, we need to do a process very similar to what we did in Chapter 6, Text Classification, where we used scikit to do tokenization, hash vectorizer and calculate TF, IDF, and tf-idf using a vectorizer. 
16 | #The following is the snippet of how to create tokenization, term frequency, and inverse document frequency: 17 | >>>from pyspark.mllib.feature import HashingTF 18 | >>>from pyspark.mllib.feature import IDF 19 | # https://spark.apache.org/docs/1.3.0/mllib-feature-extraction.html 20 | >>>comment_tokenized = comment.map(lambda line: line.strip().split(" ")) 21 | >>>hashingTF = HashingTF(1000) # to select only 1000 features 22 | >>>comment_tf = hashingTF.transform(comment_tokenized) 23 | 24 | >>>comment_idf = IDF().fit(comment_tf) 25 | >>>comment_tfidf = comment_idf.transform(comment_tf) 26 | #Will merge the class with the c tfidf RDD like this: 27 | >>>finaldata = class_var.zip(comment_tfidf) 28 | #We will do a typical test and train smapling 29 | >>>train, test = finaldata.randomSplit([0.8, 0.2], seed=0) 30 | #Let's perform the main classification commands, which are quite similar to scikit. We are using a logistic regression, which is widely used classifier. The pyspark.mllib provides you a variety of algorithms. 31 | #For more information on pyspark.mllib visit https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html 32 | 33 | #The following is an example of logistic regression classifier: 34 | >>>from pyspark.mllib.regression import LabeledPoint 35 | >>>from pyspark.mllib.classification import NaiveBayes 36 | >>>train_rdd = train.map(lambda t: LabeledPoint(t[0], t[1])) 37 | >>>test_rdd = test.map(lambda t: LabeledPoint(t[0], t[1])) 38 | >>>nb = NaiveBayes.train(train_rdd,lambda = 1.0) 39 | >>>nb_output = test_rdd.map(lambda point: (NB.predict(point.features), point.label)) 40 | >>>print nb_output 41 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/reducer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content,topics = line.split('\t') 7 | print '\t'.join([id,content,topics]) 8 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/scikit_hive.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE $InputTableName ( 2 | ID String, 3 | Content String 4 | ) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t';CREATE TABLE $InputTableName ( 7 | ID String, 8 | Content String 9 | ) 10 | ROW FORMAT DELIMITED 11 | 12 | hive>CREATE TABLE $OutTableName ( 13 | ID String, 14 | Content String, 15 | predict String, 16 | predict_score double 17 | ) 18 | hive> 19 | add FILE vectorizer.pkl; 20 | add FILE classifier.pkl; 21 | 22 | hive> 23 | add FILE classification.py; 24 | INSERT OVERWRITE TABLE $OutTableName 25 | SELECT 26 | TRANSFORM (id, content) 27 | USING '/opt/anaconda/python2.7/bin/python2.7 classification.py' 28 | AS (id string, scorestringscore string ) 29 | FROM $Tablename; -------------------------------------------------------------------------------- /Module 1/Chapter 10/serialize_model_vect.py: -------------------------------------------------------------------------------- 1 | 2 | # please refer to code modelbuilding.py in ch 6 and just serialize the vectorizer and mode 3 | # object using joblib.dump 4 | vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=in_min_df, 5 | stop_words='english', ngram_range=(1,2), max_df=in_max_df) 6 | joblib.dump(vectorizer, "vectorizer.pkl", compress=3) 7 | clf = GaussianNB().fit(X_train,y_train) 8 | joblib.dump(clf, "classifier.pkl") 
-------------------------------------------------------------------------------- /Module 1/Chapter 2/ch2.py: -------------------------------------------------------------------------------- 1 | # csv load 2 | >>>import csv 3 | >>>with open('example.csv','rb') as f: 4 | >>> reader=csv.reader(f,delimiter=',',quotechar='"') 5 | >>> for line in reader : 6 | >>> print line[1] # assuming the second field is the raw sting 7 | 8 | # json load 9 | >>>import json 10 | >>>jsonfile=open('example.json') 11 | >>>data=json.load(jsonfile) 12 | >>>print data['string'] 13 | 14 | # sentence splitter 15 | 16 | >>>inputstring = ' This is an example sent. The sentence splitter will split on sent markers. Ohh really !!' 17 | >>>from nltk.tokenize import sent_tokenize 18 | >>>all_sent=sent_tokenize(inputstring) 19 | >>>print all_sent 20 | >>>[' This is an example sent', 'The sentence splitter will split on markers.','Ohh really !!'] 21 | 22 | >>>import nltk.tokenize.punkt 23 | >>>tokenizer =nltk.tokenize.punkt.PunktSentenceTokenizer() 24 | 25 | # word tokenizer 26 | >>>s ="Hi Everyone ! hola gr8" # simplest tokenizer 27 | >>>print s.split() 28 | 29 | >>>from nltk.tokenize import word_tokenize 30 | >>>word_tokenize(s) 31 | 32 | >>>from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize 33 | >>>regexp_tokenize(s, pattern='\w+') 34 | 35 | >>>regexp_tokenize(s, pattern='\d+') 36 | 37 | >>>wordpunct_tokenize(s) 38 | >>>blankline_tokenize(s) 39 | 40 | #Porter stemmer 41 | >>>from nltk.stem import PorterStemmer # import Porter stemmer 42 | >>>from nltk.stem.lancaster import LancasterStemmer 43 | >>>from nltk.stem.Snowball import SnowballStemmer 44 | >>>pst=PorterStemmer() # create obj of the PorterStemmer 45 | >>>lst = LancasterStemmer() # create obj of LancasterStemmer 46 | >>>lst.stem("eating") 47 | >>>pst.stem("shopping") 48 | 49 | #Lemmatizer 50 | >>>from nltk.stem import WordNetLemmatizer 51 | >>>wlem=WordNetLemmatizer() 52 | >>>wlem.lemmatize("ate") 53 | 54 | # stop word 55 | 56 | >>>from nltk.corpus import stopwords 57 | >>>stoplist=stopwords.words('english') # config the language name 58 | >>>text = "This is just a test" 59 | >>>cleanwordlist=[word for word in text.split() if word not in stoplist] 60 | 61 | 62 | # rare word removal 63 | 64 | >>>freq_dist=nltk.FreqDist(token) 65 | >>>rarewords =freq_dist.keys()[-50:] 66 | >>>after_rare_words= [ word for word in token not in rarewords] 67 | 68 | # spell check 69 | 70 | >>>from nltk.metrics import edit_distance 71 | >>>edit_distance(“rain”,”shine”) 72 | 73 | 74 | -------------------------------------------------------------------------------- /Module 1/Chapter 3/ch3.py: -------------------------------------------------------------------------------- 1 | # POS tagging 2 | >>>import nltk 3 | >>>from nltk import word_tokenize 4 | >>>s="I was watching TV" 5 | >>>print nltk.pos_tag(word_tokenize(s)) 6 | 7 | # all nouns 8 | 9 | >>>tagged=nltk.pos_tag(word_tokenize(s)) 10 | >>>allnoun=[word for word,pos in tagged if pos in ['NN','NNP'] ] 11 | 12 | # Stanford POS tagger 13 | 14 | >>>from nltk.tag.stanford import POSTagger 15 | >>>import nltk 16 | >>>stan_tagger=POSTagger('models/english-bidirectional-distdim.tagger','standford-postagger.jar') 17 | >>>tokens =nltk.word_tokenize(s) 18 | >>>stan_tagger.tag(tokens) 19 | 20 | # POS tags freq distribtuion 21 | >>>from nltk.corpus import brown 22 | >>>import nltk 23 | >>>tags = [tag for (word, tag) in brown.tagged_words(categories='news')] 24 | >>>print nltk.FreqDist(tags) 25 | 26 | # default tagger 27 | 
>>>brown_tagged_sents = brown.tagged_sents(categories='news') 28 | >>>default_tagger = nltk.DefaultTagger('NN') 29 | >>>print default_tagger.evaluate(brown_tagged_sents) 30 | 31 | # N-gram taggers 32 | 33 | >>>from nltk.tag import UnigramTagger 34 | >>>from nltk.tag import DefaultTagger 35 | >>>from nltk.tag import BigramTagger 36 | >>>from nltk.tag import TrigramTagger 37 | # we are dividing the data into a test and train to evaluate our taggers. 38 | >>>train_data= brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] 39 | >>>test_data= brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] 40 | >>>unigram_tagger = UnigramTagger(train_data,backoff=default_tagger) 41 | >>>print unigram_tagger.evaluate(test_data) 42 | >>>bigram_tagger= BigramTagger(train_data, backoff=unigram_tagger) 43 | >>>print bigram_tagger.evaluate(test_data) 44 | >>>trigram_tagger=TrigramTagger(train_data,backoff=bigram_tagger) 45 | >>>print trigram_tagger.evaluate(test_data) 46 | 47 | # Regex tagger 48 | 49 | >>>from nltk.tag.sequential import RegexpTagger 50 | >>>regexp_tagger = RegexpTagger( 51 | [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers 52 | ( r'(The|the|A|a|An|an)$', 'AT'), # articles 53 | ( r'.*able$', 'JJ'), # adjectives 54 | ( r'.*ness$', 'NN'), # nouns formed from adj 55 | ( r'.*ly$', 'RB'), # adverbs 56 | ( r'.*s$', 'NNS'), # plural nouns 57 | ( r'.*ing$', 'VBG'), # gerunds 58 | (r'.*ed$', 'VBD'), # past tense verbs 59 | (r'.*', 'NN') # nouns (default) 60 | ]) 61 | >>>print regexp_tagger.evaluate(test_data) 62 | 63 | 64 | 65 | # NER tagger 66 | >>>import nltk 67 | >>>from nltk import ne_chunk 68 | >>>from nltk import word_tokenize 69 | >>>sent = "Mark is studying at Stanford University in California" 70 | >>>print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False)) 71 | 72 | # NER stanford tagger 73 | 74 | >>>from nltk.tag.stanford import NERTagger 75 | >>>st = NERTagger('/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',... '/stanford-ner/stanford-ner.jar') 76 | # will be the relative path where you downloaded the tagger 77 | #http://nlp.stanford.edu/software/ 78 | -------------------------------------------------------------------------------- /Module 1/Chapter 4/ch4.py: -------------------------------------------------------------------------------- 1 | 2 | # toy CFG 3 | >>> from nltk import CFG 4 | >>>toy_grammar = 5 | nltk.CFG.fromstring( 6 | """ 7 | S -> NP VP # S indicate the entire sentence 8 | VP -> V NP # VP is verb phrase the 9 | V -> "eats" | "drinks" # V is verb we are using only 2 verbs in the example 10 | NP -> Det N # NP is noun phrase (chunk that has noun in it) 11 | Det -> "a" | "an" | "the" # Det is determiner used in the sentences 12 | N -> "president" |"Obama" |"apple"| "coke" # N some example nouns 13 | """) 14 | >>> toy_grammar.productions() 15 | 16 | # similarly a PCFG also can be built 17 | 18 | >>> from nltk import PCFG 19 | >>> toy_pcfg1 = PCFG.fromstring(""" 20 | S -> NP VP [1.0] 21 | NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] 22 | Det -> 'the' [0.8] | 'my' [0.2] 23 | N -> 'man' [0.5] | 'telescope' [0.5] 24 | VP -> VP PP [0.1] | V NP [0.7] | V [0.2] 25 | V -> 'ate' [0.35] | 'saw' [0.65] 26 | PP -> P NP [1.0] 27 | P -> 'with' [0.61] | 'under' [0.39] 28 | """) 29 | # ref :http://www.nltk.org/howto/grammar.html 30 | 31 | 32 | # Regex parser 33 | 34 | >>> chunk_rules=ChunkRule("<.*>+","chunk everything") 35 | >>> import nltk 36 | >>> from nltk.chunk.regexp import * 37 | >>> reg_parser = RegexpParser(''' 38 | NP: {
<DT>? <JJ>* <NN>*} # NP 39 | P: {<IN>} # Preposition 40 | V: {<V.*>} # Verb 41 | PP: {<P> <NP>
} # PP -> P NP 42 | VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* 43 | ''') 44 | >>> test_sent="Mr. Obama played a big role in the Health insurance bill" 45 | >>> test_sent_pos=nltk.pos_tag(nltk.word_tokenize(test_sent)) 46 | >>> parsed_out=reg_parser.parse(test_sent_pos) 47 | 48 | # Stanford Parser [Very useful] 49 | 50 | >>>from nltk.parse.stanford import StanfordParser 51 | >>>english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.4-models.jar') 52 | >>>english_parser.raw_parse_sents(("this is the english parser test",)) 53 | 54 | # Chunking 55 | 56 | >>>from nltk.chunk.regexp import * 57 | >>>test_sent="The prime minister announced he had asked the chief government whip, Philip Ruddock, to call a special party room meeting for 9am on Monday to consider the spill motion." 58 | >>>test_sent_pos=nltk.pos_tag(nltk.word_tokenize(test_sent)) 59 | >>>rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<VB.*>)?', 'Chunk VPs') 60 | >>>parser_vp = RegexpChunkParser([rule_vp],chunk_label='VP') 61 | >>>print parser_vp.parse(test_sent_pos) 62 | >>>rule_np = ChunkRule(r'(

??)?*(<,>)*()+', 'Chunk NPs') 63 | >>>parser_np = RegexpChunkParser([rule_np],chunk_label="NP") 64 | >>>print parser_np.parse(test_sent_pos) 65 | 66 | # NP chunking (NER) 67 | 68 | >>>f=open(# absolute path for the file of text for which we want NER) 69 | >>>text=f.read() 70 | >>>sentences = nltk.sent_tokenize(text) 71 | >>>tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 72 | >>>tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] 73 | >>>for sent in tagged_sentences: 74 | >>>print nltk.ne_chunk(sent) 75 | 76 | # Relation Extraction 77 | 78 | >>>import re 79 | >>>IN = re.compile(r'.*\bin\b(?!\b.+ing)') 80 | >>>for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'): 81 | >>> for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN): 82 | >>>print(nltk.sem.rtuple(rel)) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /Module 1/Chapter 5/summarizer.py: -------------------------------------------------------------------------------- 1 | >>>import nltk 2 | >>>results=[] 3 | >>>for sent_no,sentence in enumerate(nltk.sent_tokenize(news_content)): 4 | >>> no_of_tokens=len(nltk.word_tokenize(sentence)) 5 | >>> # Let's do POS tagging 6 | >>> tagged=nltk.pos_tag(nltk.word_tokenize(sentence)) 7 | >>> # Count the no of Nouns in the sentence 8 | >>> no_of_nouns=len([word for word,pos in tagged if pos in ["NN","NNP"] ]) 9 | >>> #Use NER to tag the named entities. 10 | >>> ners=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary=False) 11 | >>> no_of_ners= len([chunk for chunk in ners if hasattr(chunk, 'node')]) 12 | >>> score=(no_of_ners+no_of_nouns)/float(no_of_toekns) 13 | >>> results.append((sent_no,no_of_tokens,no_of_ners,\ 14 | no_of_nouns,score,sentence)) 15 | 16 | >>>for sent in sorted(results,key=lambda x: x[4],reverse=True): 17 | >>> print sent[5] 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 5/summarizer2.py: -------------------------------------------------------------------------------- 1 | >>>import nltk 2 | >>>from sklearn.feature_extraction.text import TfidfVectorizer 3 | >>>results=[] 4 | >>>sentences=nltk.sent_tokenize(news_content) 5 | >>>vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True) 6 | >>>sklearn_binary=vectorizer.fit_transform(sentences) 7 | >>>print countvectorizer.get_feature_names() 8 | >>>print sklearn_binary.toarray() 9 | >>>for sent_no,i in enumerate(sklearn_binary.toarray()): 10 | >>> results.append(sent_no,i.sum()/float(len(i.nonzero()[0]))) 11 | 12 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/classification.py: -------------------------------------------------------------------------------- 1 | def modelbuilding(smsdata,sms_labels): 2 | ''' 3 | This is an example pipline to building a text classifier 4 | 1. sampling 5 | 2. TfidfVectorizer conversion 6 | 3. building a naive_bayes model 7 | 4. print the accuracy and other metrics 8 | 5. print most relavent features 9 | ''' 10 | 11 | # sampling steps 12 | trainset_size = int(round(len(sms_data)*0.70)) 13 | # i chose this threshold for 70:30 train and test split. 
14 | print 'The training set size for this classifier is ' + str(trainset_size) + '\n' 15 | x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]]) 16 | y_train = np.array([el for el in sms_labels[0:trainset_size]]) 17 | x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]]) 18 | y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]]) 19 | print x_train 20 | print y_train 21 | 22 | # count vectorizer 23 | # not really used just for explanation 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | sms_exp=[ ] 26 | for line in sms_list: 27 | sms_exp.append(preprocessing(line[1])) 28 | vectorizer = CountVectorizer(min_df=1) 29 | X_exp = vectorizer.fit_transform(sms_exp) 30 | print "||".join(vectorizer.get_feature_names()) 31 | print X_exp.toarray() 32 | 33 | # We are building a TFIDF vectorizer here 34 | from sklearn.feature_extraction.text import TfidfVectorizer 35 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_ 36 | words='english', strip_accents='unicode', norm='l2') 37 | X_train = vectorizer.fit_transform(x_train) 38 | X_test = vectorizer.transform(x_test) 39 | 40 | from sklearn.naive_bayes import MultinomialNB 41 | clf = MultinomialNB().fit(X_train, y_train) 42 | y_nb_predicted = clf.predict(X_test) 43 | print y_nb_predicted 44 | print ' \n confusion_matrix \n ' 45 | cm = confusion_matrix(y_test, y_pred) 46 | print cm 47 | print '\n Here is the classification report:' 48 | print classification_report(y_test, y_nb_predicted) 49 | # print the top features 50 | 51 | coefs = clf.coef_ 52 | intercept = clf.intercept_ 53 | coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 54 | n=10 55 | top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 56 | for (coef_1, fn_1), (coef_2, fn_2) in top: 57 | print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2)) 58 | 59 | def preprocessing(text): 60 | text = text.decode("utf8") 61 | # tokenize into words 62 | tokens = [word for sent in nltk.sent_tokenize(text) \ 63 | for word in nltk.word_tokenize(sent)] 64 | 65 | # remove stopwords 66 | stop = stopwords.words('english') 67 | tokens = [token for token in tokens if token not in stop] 68 | 69 | # remove words less than three letters 70 | tokens = [word for word in tokens if len(word) >= 3] 71 | 72 | # lower capitalization 73 | tokens = [word.lower() for word in tokens] 74 | 75 | # lemmatize 76 | lmtzr = WordNetLemmatizer() 77 | tokens = [lmtzr.lemmatize(word) for word in tokens] 78 | preprocessed_text= ' '.join(tokens) 79 | 80 | return preprocessed_text 81 | 82 | def main(): 83 | smsdata = open('SMSSpamCollection') # check the structure of this file! 84 | smsdata_data = [] 85 | sms_labels = [] 86 | csv_reader = csv.reader(sms,delimiter='\t') 87 | for line in csv_reader: 88 | # adding the sms_id 89 | sms_labels.append( line[0]) 90 | # adding the cleaned text We are calling preprocessing method 91 | sms_data.append(preprocessing(line[1])) 92 | 93 | sms.close() 94 | # we are calling the model builing function here 95 | modelbuilding(smsdata,sms_labels) 96 | if __name__ == '__main__': 97 | main() -------------------------------------------------------------------------------- /Module 1/Chapter 6/modelbuilding.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sampling steps 4 | trainset_size = int(round(len(sms_data)*0.70)) 5 | # i chose this threshold for 70:30 train and test split. 
6 | print 'The training set size for this classifier is ' + str(trainset_size) + '\n' 7 | x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]]) 8 | y_train = np.array([el for el in sms_labels[0:trainset_size]]) 9 | x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]]) 10 | y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]]) 11 | print x_train 12 | print y_train 13 | 14 | # count vectorizer 15 | # not really used just for explanation 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | sms_exp=[ ] 18 | for line in sms_list: 19 | sms_exp.append(preprocessing(line[1])) 20 | vectorizer = CountVectorizer(min_df=1) 21 | X_exp = vectorizer.fit_transform(sms_exp) 22 | print "||".join(vectorizer.get_feature_names()) 23 | print X_exp.toarray() 24 | 25 | # We are building a TFIDF vectorizer here 26 | from sklearn.feature_extraction.text import TfidfVectorizer 27 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_ 28 | words='english', strip_accents='unicode', norm='l2') 29 | X_train = vectorizer.fit_transform(x_train) 30 | X_test = vectorizer.transform(x_test) 31 | 32 | from sklearn.naive_bayes import MultinomialNB 33 | clf = MultinomialNB().fit(X_train, y_train) 34 | y_nb_predicted = clf.predict(X_test) 35 | print y_nb_predicted 36 | print ' \n confusion_matrix \n ' 37 | cm = confusion_matrix(y_test, y_pred) 38 | print cm 39 | print '\n Here is the classification report:' 40 | print classification_report(y_test, y_nb_predicted) 41 | # print the top features 42 | 43 | coefs = clf.coef_ 44 | intercept = clf.intercept_ 45 | coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 46 | n=10 47 | top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 48 | for (coef_1, fn_1), (coef_2, fn_2) in top: 49 | print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2)) 50 | 51 | 52 | # SGD mostly used 53 | 54 | from sklearn.linear_model import SGDClassifier 55 | from sklearn.metrics import confusion_matrix 56 | clf=SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train) 57 | y_pred = clf.predict(X_test) 58 | print '\n Here is the classification report:' 59 | print classification_report(y_test, y_pred) 60 | print ' \n confusion_matrix \n ' 61 | cm = confusion_matrix(y_test, y_pred) 62 | print cm 63 | 64 | # SVM 65 | from sklearn.svm import LinearSVC 66 | svm_classifier = LinearSVC().fit(X_train, y_train) 67 | y_svm_predicted = svm_classifier.predict(X_test) 68 | print '\n Here is the classification report:' 69 | print classification_report(y_test, y_svm_predicted) 70 | cm = confusion_matrix(y_test, y_pred) 71 | print cm 72 | 73 | # RandomForestClassifier 74 | 75 | from sklearn.ensemble import RandomForestClassifier 76 | RF_clf = RandomForestClassifier(n_estimators=10) 77 | predicted = RF_clf.predict(X_test) 78 | print '\n Here is the classification report:' 79 | print classification_report(y_test, predicted) 80 | cm = confusion_matrix(y_test, y_pred) 81 | print cm 82 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/readdata.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | from nltk.stem import WordNetLemmatizer 4 | import csv 5 | def preprocessing(text): 6 | text = text.decode("utf8") 7 | # tokenize into words 8 | tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 9 | 10 | # remove stopwords 11 | stop = 
stopwords.words('english') 12 | tokens = [token for token in tokens if token not in stop] 13 | 14 | # remove words less than three letters 15 | tokens = [word for word in tokens if len(word) >= 3] 16 | 17 | # lower capitalization 18 | tokens = [word.lower() for word in tokens] 19 | 20 | # lemmatize 21 | lmtzr = WordNetLemmatizer() 22 | tokens = [lmtzr.lemmatize(word) for word in tokens] 23 | preprocessed_text= ' '.join(tokens) 24 | 25 | return preprocessed_text 26 | smsdata = open('SMSSpamCollection') # check the structure of this file! 27 | smsdata_data = [] 28 | sms_labels = [] 29 | csv_reader = csv.reader(sms,delimiter='\t') 30 | for line in csv_reader: 31 | # adding the sms_id 32 | sms_labels.append( line[0]) 33 | # adding the cleaned text We are calling preprocessing method 34 | sms_data.append(preprocessing(line[1])) 35 | 36 | sms.close() 37 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/textclustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import KMeans, MiniBatchKMeans 2 | true_k=5 3 | km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1) 4 | kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, 5 | init_size=1000, batch_size=1000, verbose=opts.verbose) 6 | # we are using the same test,train data in TFIDF form as we did in text classification 7 | 8 | km_model=km.fit(X_train) 9 | kmini_model=kmini.fit(X_train) 10 | print "For K-mean clustering " 11 | clustering = collections.defaultdict(list) 12 | for idx, label in enumerate(km_model.labels_): 13 | clustering[label].append(idx) 14 | print "For K-mean Mini batch clustering " 15 | clustering = collections.defaultdict(list) 16 | for idx, label in enumerate(kmini_model.labels_): 17 | clustering[label].append(idx) 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/topicmodeling.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora, models, similarities 2 | from itertools import chain 3 | import nltk 4 | from nltk.corpus import stopwords 5 | from operator import itemgetter 6 | import re 7 | documents = [document for document in sms_data] 8 | stoplist = stopwords.words('english') 9 | texts = [[word for word in document.lower().split() if word not in stoplist] \ 10 | for document in documents] 11 | dictionary = corpora.Dictionary(texts) 12 | corpus = [dictionary.doc2bow(text) for text in texts] 13 | tfidf = models.TfidfModel(corpus) 14 | corpus_tfidf = tfidf[corpus] 15 | si = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) 16 | #lsi.print_topics(20) 17 | n_topics = 5 18 | lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics) 19 | for i in range(0, n_topics): 20 | temp = lda.show_topic(i, 10) 21 | terms = [] 22 | for term in temp: 23 | terms.append(term[1]) 24 | print "Top 10 terms for topic #" + str(i) + ": "+ ", ".join(terms) 25 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/item.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | class NewsItem(Item): 3 | title = Field() 4 | link = Field() 5 | desc = Field() 6 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/itempiplines.py: 
-------------------------------------------------------------------------------- 1 | from scrapy.exceptions import Item 2 | class CleanPipeline(object): 3 | def process_item(self, item, spider): 4 | if item['desc']: 5 | item['desc'] = item['desc'].strip().lower().replace('#$','') 6 | return item 7 | #We need to derive the age from DOB. We used Python's date functions to achieve this: 8 | 9 | from scrapy.exceptions import Item 10 | import datetime 11 | import datetime 12 | class AgePipeline(object): 13 | def process_item(self, item, spider): 14 | if item['DOB']: 15 | item['Age'] = (datetime.datetime.strptime(item['DOB'], '%d-%m-%y').date()-datetime.datetime.strptime('currentdate, '%d-%m-%y').date()).days/365 16 | return item 17 | 18 | #We also need to remove the duplicates. Python has the set() data structure that only contains unique values: 19 | from scrapy import signals 20 | from scrapy.exceptions import Item 21 | class DuplicatesPipeline(object): 22 | def __init__(self): 23 | self.ids_seen = set() 24 | def process_item(self, item, spider): 25 | if item['id'] in self.ids_seen: 26 | raise DropItem("Duplicate item found: %s" % item) 27 | else: 28 | self.ids_seen.add(item['id']) 29 | return item 30 | #Let's finally write the item in the JSON file: 31 | import json 32 | class JsonWriterPipeline(object): 33 | def __init__(self): 34 | self.file = open('items.txt', 'wb') 35 | def process_item(self, item, spider): 36 | line = json.dumps(dict(item)) + "\n" 37 | self.file.write(line) 38 | return item -------------------------------------------------------------------------------- /Module 1/Chapter 7/loginspider.py: -------------------------------------------------------------------------------- 1 | class LoginSpider(BaseSpider): 2 | name = 'example.com' 3 | start_URLss = ['http://www.example.com/users/login.php'] 4 | def parse(self, response): 5 | return [FormRequest.from_response(response, formdata={'username': 'john', 'password': 'secret'}, callback=self.after_login)] 6 | defafter_login(self, response): 7 | # check login succeed before going on 8 | if "authentication failed" in response.body: 9 | self.log("Login failed", level=log.ERROR) 10 | return 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/myspider.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import SitemapSpider 2 | class MySpider(SitemapSpider): 3 | sitemap_URLss = ['http://www.example.com/sitemap.xml'] 4 | sitemap_rules = [('/electronics/', 'parse_electronics'), ('/apparel/', 'parse_apparel'),] 5 | def 'parse_electronics'(self, response): 6 | # you need to create an item for electronics, 7 | return 8 | def 'parse_apparel'(self, response): 9 | #you need to create an item for apparel 10 | return -------------------------------------------------------------------------------- /Module 1/Chapter 7/newsspider_1.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import BaseSpider 2 | class NewsSpider(BaseSpider): 3 | name = "news" 4 | allowed_domains = ["nytimes.com"] 5 | start_URLss = [ 6 | 'http://www.nytimes.com/' 7 | ] 8 | def parse(self, response): 9 | filename = response.URLs.split("/")[-2] 10 | open(filename, 'wb').write(response.body) 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/newsspider_2.py: -------------------------------------------------------------------------------- 1 | from 
scrapy.spider import BaseSpider 2 | class NewsSpider(BaseSpider): 3 | name = "news" 4 | allowed_domains = ["nytimes.com"] 5 | start_URLss = [ 6 | 'http://www.nytimes.com/' 7 | ] 8 | def parse(self, response): 9 | sel = Selector(response) 10 | sites = sel.xpath('//ul/li') 11 | for site in sites: 12 | title = site.xpath('a/text()').extract() 13 | link = site.xpath('a/@href').extract() 14 | desc = site.xpath('text()').extract() 15 | print title, link, desc 16 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/newsspider_3.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.selector import Selector 4 | from scrapy.item import NewsItem 5 | 6 | class NewsSpider(CrawlSpider): 7 | name = 'news' 8 | allowed_domains = ['news.google.com'] 9 | start_urls = ['https://news.google.com'] 10 | 11 | rules = ( 12 | # Extract links matching cnn.com 13 | Rule(SgmlLinkExtractor(allow=('cnn.com', ), deny=(http://edition.cnn.com/', ))), 14 | # Extract links matching 'news.google.com' 15 | Rule(SgmlLinkExtractor(allow=('news.google.com', )), callback='parse_news_item'), 16 | ) 17 | def parse_news_item(self, response): 18 | sel = Selector(response) 19 | item = NewsItem() 20 | item['title'] = sel.xpath('//title/text()').extract() 21 | item[topic] = sel.xpath('/div[@class="topic"]').extract() 22 | item['desc'] = sel.xpath('//td//text()').extract() 23 | return item 24 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/notes.txt: -------------------------------------------------------------------------------- 1 | Please download some of the examples from here and work on it 2 | 3 | https://github.com/geekan/scrapy-examples -------------------------------------------------------------------------------- /Module 1/Chapter 8/intregation.py: -------------------------------------------------------------------------------- 1 | >>>from scipy.integrate import quad, dblquad, tplquad 2 | >>>def f(x): 3 | >>> return x 4 | >>> 5 | >>>x_lower = = 0 # the lower limit of x 6 | >>>x_upper = = 1 # the upper limit of x 7 | >>>val, abserr = = quad(f, x_lower, x_upper) 8 | >>>print val,abserr 9 | >>> 0.5 , 5.55111512313e-15 10 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/matplotlib_code.py: -------------------------------------------------------------------------------- 1 | >>>import matplotlib 2 | >>>import matplotlib.pyplot as plt 3 | >>>import numpy 4 | >>>stockCSCO = stockdata_new.query('stock=="CSCO"') 5 | >>>stockCSCO.head() 6 | >>>from matplotlib import figure 7 | >>>plt.figure() 8 | >>>plt.scatter(stockdata_new.index.date,stockdata_new.volume) 9 | >>>plt.xlabel('day') # added the name of the x axis 10 | >>>plt.ylabel('stock close value') # add label to y-axis 11 | >>>plt.title('title') # add the title to your graph 12 | >>>plt.savefig("matplot1.jpg") # savefig in local 13 | 14 | # subplot 15 | >>>plt.subplot(2, 2, 1) 16 | >>>plt.plot(stockAA.index.weekofyear, stockAA.open, 'r--') 17 | >>>plt.subplot(2, 2, 2) 18 | >>>plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'g-*') 19 | >>>plt.subplot(2, 2, 3) 20 | >>>plt.plot(stockAA.index.weekofyear, stockAA.open, 'g--') 21 | >>>plt.subplot(2, 2, 4) 22 | >>>plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'r-*') 23 | >>>plt.subplot(2, 2, 3) 24 | 
>>>plt.plot(x, y, 'g--') 25 | >>>plt.subplot(2, 2, 4) 26 | >>>plt.plot(x, y, 'r-*') 27 | >>>fig.savefig("matplot2.png") 28 | 29 | >>>fig, axes = plt.subplots(nrows=1, ncols=2) 30 | >>>for ax in axes: 31 | >>> ax.plot(x, y, 'r') 32 | >>> ax.set_xlabel('x') 33 | >>> ax.set_ylabel('y') 34 | >>> ax.set_title('title'); 35 | 36 | >>>fig = plt.figure() 37 | >>>axes = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # left, bottom, width, 38 | height (range 0 to 1) 39 | >>>axes.plot(x, y, 'r') 40 | 41 | >>>fig = plt.figure() 42 | >>>ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 43 | >>>ax.plot(stockAA.index.weekofyear,stockAA.open,label="AA") 44 | >>>ax.plot(stockAA.index.weekofyear,stockCSCO.open,label="CSCO") 45 | >>>ax.set_xlabel('weekofyear') 46 | >>>ax.set_ylabel('stock value') 47 | >>>ax.set_title('Weekly change in stock price') 48 | >>>ax.legend(loc=2); # upper left corner 49 | >>>plt.savefig("matplot3.jpg") 50 | 51 | # scatter plot 52 | >>>import matplotlib.pyplot as plt 53 | >>>plt.scatter(stockAA.index.weekofyear,stockAA.open) 54 | >>>plt.savefig("matplot4.jpg") 55 | >>>plt.close() 56 | # bar plot 57 | >>>n = 12 58 | >>>X = np.arange(n) 59 | >>>Y1 = np.random.uniform(0.5, 1.0, n) 60 | >>>Y2 = np.random.uniform(0.5, 1.0, n) 61 | >>>plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white') 62 | >>>plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white') 63 | 64 | # 3d plot 65 | >>>from mpl_toolkits.mplot3d import Axes3D 66 | >>>fig = plt.figure() 67 | >>>ax = Axes3D(fig) 68 | >>>X = np.arange(-4, 4, 0.25) 69 | >>>Y = np.arange(-4, 4, 0.25) 70 | >>>X, Y = np.meshgrid(X, Y) 71 | >>>R = np.sqrt(X**2+ + Y**2) 72 | >>>Z = np.sin(R) 73 | >>>ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='hot') -------------------------------------------------------------------------------- /Module 1/Chapter 8/numpy_codes.py: -------------------------------------------------------------------------------- 1 | >>>x=[1,2,5,7,3,11,14,25] 2 | >>>import numpy as np 3 | >>>np_arr=np.array(x) 4 | >>>np_arr 5 | 6 | >>>arr=[[1,2],[13,4],[33,78]] 7 | >>>np_2darr= np.array(arr) 8 | >>>type(np_2darr) 9 | numpy.ndarray 10 | 11 | # indexing 12 | >>>np_2darr.tolist() 13 | [[1, 2], [13, 4], [33, 78]] 14 | >>>np_2darr[:] 15 | array([[1, 2], [13, 4], [33, 78]]) 16 | >>>np_2darr[:2] 17 | array([[1, 2], [13, 4]]) 18 | >>>np_2darr[:1] 19 | array([[1, 2]]) 20 | >>>np_2darr[2] 21 | array([33, 78]) 22 | >>> np_2darr[2][0] 23 | >>>33 24 | >>> np_2darr[:-1] 25 | array([[1, 2], [13, 4]]) 26 | 27 | # basic operations 28 | >>>>import numpy as np 29 | >>>>np.arange(0.0, 1.0, 0.1) 30 | 31 | >>>np.ones([2, 4]) 32 | >>>np.zeros([3,4]) 33 | 34 | >>>np.linspace(0, 2, 10) 35 | >>>np.logspace(0,1) 36 | 37 | >>>A=np.array([[0, 0, 0], [0, 1, 2], [0, 2, 4], [0, 3, 6]]) 38 | >>>B = np.array([n for n in range n for n in range(4)]) 39 | >>>less_than_3 = B<3 # we are filtering the items that are less than 3. 
40 | >>>B[less_than_3] 41 | >>>np.diag(A) 42 | 43 | 44 | # complex matrix operations 45 | 46 | >>>A = np.array([[1,2],[3,4]]) 47 | >>>A * A 48 | 49 | >>>np.dot(A, A) 50 | >>>A - A 51 | >>>A + A 52 | >>>np.transpose(A) 53 | >>>A.T 54 | 55 | >>>M = np.matrix(A) 56 | >>> np.conjugate(M) 57 | >>> np.invert(M) 58 | 59 | >>>N = np.random.randn(1,10) 60 | >>>N.mean() 61 | >>>N.std() 62 | 63 | #Reshaping 64 | 65 | >>>>A.reshape((1, r * c)) 66 | >>>A.flatten() 67 | >>>np.repeat(A, 2) 68 | >>>np.tile(A, 4) 69 | >>>np.concatenate((A, B), axis=0) 70 | >>>np.vstack((A, B)) 71 | >>>np.concatenate((A, B.T), axis=1) 72 | 73 | 74 | #Random numbers 75 | 76 | >>>from numpy import random 77 | >>>#uniform random number from [0,1] 78 | >>>random.rand(2, 5) 79 | >>>>random.randn(2, 5) 80 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/optimize.py: -------------------------------------------------------------------------------- 1 | >>>from Scipy import optimize 2 | 3 | >>>def f1 def f1(x,y): 4 | >>> return x ** 2+ y ** 2 - 4 5 | >>>optimize.fsolve(f1, 0, 0) 6 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/pandas_code.py: -------------------------------------------------------------------------------- 1 | >>>import pandas as pd 2 | >>># Please provide the absolute path of the input file 3 | >>>data = pd.read_csv("PATH\\iris.data.txt",header=0") 4 | >>>data.head() 5 | 6 | >>>data = pd.read_csv("PATH\\iris.data.txt", names=["sepal length"," sepal\ 7 | width", "petal length", "petal width", "Cat"], header=None) 8 | >>>data.head() 9 | 10 | >>>data.describe() 11 | 12 | >>>sepal_len_cnt=data['sepal length'].value_counts() 13 | >>>sepal_len_cnt 14 | >>>data['Iris-setosa'].value_counts() 15 | >>>data['Iris-setosa'] == 'Iris-setosa' 16 | >>>sntsosa=data[data['Cat'] == 'Iris-setosa'] 17 | >>>sntsosa[:5] 18 | 19 | # series data 20 | 21 | >>>import pandas as pd 22 | >>>stockdata = pd.read_csv("C:\\Users\\a549369\\Documents\\book\\dow_ 23 | jones_index.data",parse_dates=['date'], index_col=['date'], nrows=100) 24 | >>>>stockdata.head() 25 | >>>max(stockdata['volume']) 26 | >>>max(stockdata['percent_change_price']) 27 | >>>stockdata.index 28 | >>>stockdata.index.day 29 | >>>stockdata.index.month 30 | >>>stockdata.index.year 31 | >>>import numpy as np 32 | >>>stockdata.resample('M', how=np.sum) 33 | 34 | #transformation 35 | >>>stockdata.drop(["percent_change_volume_over_last_wk"],axis=1) 36 | 37 | >>>stockdata_new = pd.DataFrame(stockdata, columns=["stock","open","high" 38 | ,"low","close","volume"]) 39 | >>>stockdata_new.head() 40 | >>>stockdata["previous_weeks_volume"] = 0 41 | 42 | # noisy data 43 | >>>import numpy 44 | >>>stockdata_new.open.describe() 45 | >>>stockdata_new.open = stockdata_new.open.str.replace('$', '').convert_ 46 | objects(convert_numeric=True) 47 | >>>stockdata_new.close = stockdata_new.close.str.replace('$', ''). 
48 | .convert_objects(convert_numeric=True) 49 | >>>(stockdata_new.close - stockdata_new.open) \ 50 | .convert_objects(convert_numeric=True) 51 | >>>stockdata_new.open.describe() 52 | >>>stockdata_new['newopen'] = stockdata_new.open.apply(lambda x: 0.8 * x) 53 | >>>stockdata_new.newopen.head(5) 54 | >>>stockAA = stockdata_new.query('stock=="AA"') 55 | >>>stockAA.head() 56 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/solver.py: -------------------------------------------------------------------------------- 1 | >>>import scipy as sp 2 | >>>A = sp.rand(2, 2) 3 | >>>B = sp.rand(2, 2) 4 | >>>from scipy import linalg as LA 5 | >>>X = LA.solve(A, B) 6 | >>>X 7 | >>>sp.dot(A, B) -------------------------------------------------------------------------------- /Module 1/Chapter 9/fb_classification.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') 3 | X_train = vectorizer.fit_transform(x_train) 4 | X_test = vectorizer.transform(x_test) 5 | 6 | from sklearn.linear_model import SGDClassifier 7 | clf = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train) 8 | y_pred = clf.predict(X_test) 9 | -------------------------------------------------------------------------------- /Module 1/Chapter 9/fbdump.py: -------------------------------------------------------------------------------- 1 | import facebook 2 | import json 3 | fo = open("fdump.txt",'w') 4 | ACCESS_TOKEN = 'XXXXXXXXXXX' # https://developers.facebook.com/tools/explorer 5 | 6 | fb = facebook.GraphAPI(ACCESS_TOKEN) 7 | company_page = "326249424068240" 8 | content = fb.get_object(company_page) 9 | fo.write(json.dumps(content)) 10 | fo.close() -------------------------------------------------------------------------------- /Module 1/Chapter 9/influencer_frnd.py: -------------------------------------------------------------------------------- 1 | friends = fb.get_connections("me", "friends")["data"] 2 | print friends 3 | for frd in friends: 4 |     print fb.get_connections(frd["id"],"friends") 5 | -------------------------------------------------------------------------------- /Module 1/Chapter 9/trendingtopic.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import word_tokenize,sent_tokenize 3 | from nltk import FreqDist 4 | tweets_tokens = [] 5 | for tweet in tweet_text: 6 |     tweets_tokens.extend(word_tokenize(tweet)) 7 | Topic_distribution = nltk.FreqDist(tweets_tokens) 8 | Topic_distribution.plot(50, cumulative=False) 9 | 10 | # Better trending topic 11 | 12 | import nltk 13 | Topics = [] 14 | for tweet in tweet_text: 15 |     tagged = nltk.pos_tag(word_tokenize(tweet)) 16 |     Topics_token = [word for word, pos in tagged if pos in ['NN','NNP']] 17 |     print Topics_token -------------------------------------------------------------------------------- /Module 1/Chapter 9/tweetdump.py: -------------------------------------------------------------------------------- 1 | from tweepy.streaming import StreamListener 2 | from tweepy import OAuthHandler 3 | from tweepy import Stream 4 | import sys 5 | consumer_key = 'ABCD012XXXXXXXXx' 6 | consumer_secret = 'xyz123xxxxxxxxxxxxx' 7 | access_token = '000000-ABCDXXXXXXXXXXX' 8 | access_token_secret ='XXXXXXXXXgaw2KYz0VcqCO0F3U4' 9 | 10 | class StdOutListener(StreamListener): 11 |
12 | def on_data(self, data): 13 | with open(sys.argv[1],'a') as tf: 14 | tf.write(data) 15 | return 16 | def on_error(self, status): 17 | print(status) 18 | 19 | if __name__ == '__main__': 20 | l = StdOutListener() 21 | auth = OAuthHandler(consumer_key, consumer_secret) 22 | auth.set_access_token(access_token, access_token_secret) 23 | stream = Stream(auth, l) 24 | stream.filter(track=['Apple watch']) -------------------------------------------------------------------------------- /Module 1/Chapter 9/tweetinfo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | tweets = json.loads(open(sys.argv[1]).read()) 4 | tweet_texts = [ tweet['text']\ 5 | for tweet in tweets ] 6 | 7 | tweet_source = [tweet ['source'] for tweet in tweets] 8 | 9 | tweet_geo = [tweet['geo'] for tweet in tweets] 10 | 11 | tweet_locations = [tweet['place'] for tweet in tweets] 12 | 13 | hashtags = [ hashtag['text'] for tweet in tweets for hashtag in 14 | tweet['entities']['hashtags'] ] 15 | 16 | print tweet_texts 17 | print tweet_locations 18 | print tweet_geo 19 | print hashtags -------------------------------------------------------------------------------- /Module 2/Chapter 1/7853OS_01_codes/chapter1.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================== 3 | Tokenizing Text into Sentences 4 | ============================== 5 | 6 | >>> para = "Hello World. It's good to see you. Thanks for buying this book." 7 | >>> from nltk.tokenize import sent_tokenize 8 | >>> sent_tokenize(para) 9 | ['Hello World.', "It's good to see you.", 'Thanks for buying this book.'] 10 | 11 | >>> import nltk.data 12 | >>> tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle') 13 | >>> tokenizer.tokenize(para) 14 | ['Hello World.', "It's good to see you.", 'Thanks for buying this book.'] 15 | 16 | >>> spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle') 17 | >>> spanish_tokenizer.tokenize('Hola amigo. 
Estoy bien.') 18 | ['Hola amigo.', 'Estoy bien.'] 19 | 20 | 21 | =============================== 22 | Tokenizing Sentences into Words 23 | =============================== 24 | 25 | >>> from nltk.tokenize import word_tokenize 26 | >>> word_tokenize('Hello World.') 27 | ['Hello', 'World', '.'] 28 | 29 | >>> from nltk.tokenize import TreebankWordTokenizer 30 | >>> tokenizer = TreebankWordTokenizer() 31 | >>> tokenizer.tokenize('Hello World.') 32 | ['Hello', 'World', '.'] 33 | 34 | >>> word_tokenize("can't") 35 | ['ca', "n't"] 36 | 37 | >>> from nltk.tokenize import PunktWordTokenizer 38 | >>> tokenizer = PunktWordTokenizer() 39 | >>> tokenizer.tokenize("Can't is a contraction.") 40 | ['Can', "'t", 'is', 'a', 'contraction.'] 41 | 42 | >>> from nltk.tokenize import WordPunctTokenizer 43 | >>> tokenizer = WordPunctTokenizer() 44 | >>> tokenizer.tokenize("Can't is a contraction.") 45 | ['Can', "'", 't', 'is', 'a', 'contraction', '.'] 46 | 47 | 48 | ============================================== 49 | Tokenizing Sentences using Regular Expressions 50 | ============================================== 51 | 52 | >>> from nltk.tokenize import RegexpTokenizer 53 | >>> tokenizer = RegexpTokenizer("[\w']+") 54 | >>> tokenizer.tokenize("Can't is a contraction.") 55 | ["Can't", 'is', 'a', 'contraction'] 56 | 57 | >>> from nltk.tokenize import regexp_tokenize 58 | >>> regexp_tokenize("Can't is a contraction.", "[\w']+") 59 | ["Can't", 'is', 'a', 'contraction'] 60 | 61 | >>> tokenizer = RegexpTokenizer('\s+', gaps=True) 62 | >>> tokenizer.tokenize("Can't is a contraction.") 63 | ["Can't", 'is', 'a', 'contraction.'] 64 | 65 | 66 | ============================= 67 | Training a Sentence Tokenizer 68 | ============================= 69 | 70 | >>> from nltk.tokenize import PunktSentenceTokenizer 71 | >>> from nltk.corpus import webtext 72 | >>> text = webtext.raw('overheard.txt') 73 | >>> sent_tokenizer = PunktSentenceTokenizer(text) 74 | >>> sents1 = sent_tokenizer.tokenize(text) 75 | >>> sents1[0] 76 | 'White guy: So, do you have any plans for this evening?' 77 | >>> from nltk.tokenize import sent_tokenize 78 | >>> sents2 = sent_tokenize(text) 79 | >>> sents2[0] 80 | 'White guy: So, do you have any plans for this evening?' 81 | >>> sents1[678] 82 | 'Girl: But you already have a Big Mac...' 83 | >>> sents2[678] 84 | 'Girl: But you already have a Big Mac...\\nHobo: Oh, this is all theatrical.' 85 | 86 | >>> with open('/usr/share/nltk_data/corpora/webtext/overheard.txt', encoding='ISO-8859-2') as f: 87 | ... text = f.read() 88 | >>> sent_tokenizer = PunktSentenceTokenizer(text) 89 | >>> sents = sent_tokenizer.tokenize(text) 90 | >>> sents[0] 91 | 'White guy: So, do you have any plans for this evening?' 92 | >>> sents[678] 93 | 'Girl: But you already have a Big Mac...' 
94 | 95 | 96 | =========================================== 97 | Filtering Stopwords in a Tokenized Sentence 98 | =========================================== 99 | 100 | >>> from nltk.corpus import stopwords 101 | >>> english_stops = set(stopwords.words('english')) 102 | >>> words = ["Can't", 'is', 'a', 'contraction'] 103 | >>> [word for word in words if word not in english_stops] 104 | ["Can't", 'contraction'] 105 | 106 | >>> stopwords.fileids() 107 | ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish'] 108 | 109 | >>> stopwords.words('dutch') 110 | ['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', 'ze', 'zich', 'bij', 'ook', 'tot', 'je', 'mij', 'uit', 'der', 'daar', 'haar', 'naar', 'heb', 'hoe', 'heeft', 'hebben', 'deze', 'u', 'want', 'nog', 'zal', 'me', 'zij', 'nu', 'ge', 'geen', 'omdat', 'iets', 'worden', 'toch', 'al', 'waren', 'veel', 'meer', 'doen', 'toen', 'moet', 'ben', 'zonder', 'kan', 'hun', 'dus', 'alles', 'onder', 'ja', 'eens', 'hier', 'wie', 'werd', 'altijd', 'doch', 'wordt', 'wezen', 'kunnen', 'ons', 'zelf', 'tegen', 'na', 'reeds', 'wil', 'kon', 'niets', 'uw', 'iemand', 'geweest', 'andere'] 111 | 112 | ========================================= 113 | Looking up a Synset for a Word in WordNet 114 | ========================================= 115 | 116 | >>> from nltk.corpus import wordnet 117 | >>> syn = wordnet.synsets('cookbook')[0] 118 | >>> syn.name() 119 | 'cookbook.n.01' 120 | >>> syn.definition() 121 | 'a book of recipes and cooking directions' 122 | 123 | >>> wordnet.synset('cookbook.n.01') 124 | Synset('cookbook.n.01') 125 | 126 | >>> wordnet.synsets('cooking')[0].examples() 127 | ['cooking can be a great art', 'people are needed who have experience in cookery', 'he left the preparation of meals to his wife'] 128 | 129 | >>> syn.hypernyms() 130 | [Synset('reference_book.n.01')] 131 | >>> syn.hypernyms()[0].hyponyms() 132 | [Synset('annual.n.02'), Synset('atlas.n.02'), Synset('cookbook.n.01'), Synset('directory.n.01'), Synset('encyclopedia.n.01'), Synset('handbook.n.01'), Synset('instruction_book.n.01'), Synset('source_book.n.01'), Synset('wordbook.n.01')] 133 | >>> syn.root_hypernyms() 134 | [Synset('entity.n.01')] 135 | 136 | >>> syn.hypernym_paths() 137 | [[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('creation.n.02'), Synset('product.n.02'), Synset('work.n.02'), Synset('publication.n.01'), Synset('book.n.01'), Synset('reference_book.n.01'), Synset('cookbook.n.01')]] 138 | 139 | >>> syn.pos() 140 | 'n' 141 | 142 | >>> len(wordnet.synsets('great')) 143 | 7 144 | >>> len(wordnet.synsets('great', pos='n')) 145 | 1 146 | >>> len(wordnet.synsets('great', pos='a')) 147 | 6 148 | 149 | 150 | ========================================= 151 | Looking up Lemmas and Synonyms in WordNet 152 | ========================================= 153 | 154 | >>> from nltk.corpus import wordnet 155 | >>> syn = wordnet.synsets('cookbook')[0] 156 | >>> lemmas = syn.lemmas() 157 | >>> len(lemmas) 158 | 2 159 | >>> lemmas[0].name() 160 | 'cookbook' 161 | >>> lemmas[1].name() 162 | 'cookery_book' 163 | >>> lemmas[0].synset() == lemmas[1].synset() 164 | True 165 | 166 | >>> [lemma.name() for lemma in syn.lemmas()] 167 | 
['cookbook', 'cookery_book'] 168 | 169 | >>> synonyms = [] 170 | >>> for syn in wordnet.synsets('book'): 171 | ... for lemma in syn.lemmas(): 172 | ... synonyms.append(lemma.name()) 173 | >>> len(synonyms) 174 | 38 175 | 176 | >>> len(set(synonyms)) 177 | 25 178 | 179 | >>> gn2 = wordnet.synset('good.n.02') 180 | >>> gn2.definition() 181 | 'moral excellence or admirableness' 182 | >>> evil = gn2.lemmas()[0].antonyms()[0] 183 | >>> evil.name() 184 | 'evil' 185 | >>> evil.synset().definition() 186 | 'the quality of being morally wrong in principle or practice' 187 | >>> ga1 = wordnet.synset('good.a.01') 188 | >>> ga1.definition() 189 | 'having desirable or positive qualities especially those suitable for a thing specified' 190 | >>> bad = ga1.lemmas()[0].antonyms()[0] 191 | >>> bad.name() 192 | 'bad' 193 | >>> bad.synset().definition() 194 | 'having undesirable or negative qualities' 195 | 196 | 197 | ===================================== 198 | Calculating WordNet Synset Similarity 199 | ===================================== 200 | 201 | >>> from nltk.corpus import wordnet 202 | >>> cb = wordnet.synset('cookbook.n.01') 203 | >>> ib = wordnet.synset('instruction_book.n.01') 204 | >>> cb.wup_similarity(ib) 205 | 0.9166666666666666 206 | 207 | >>> ref = cb.hypernyms()[0] 208 | >>> cb.shortest_path_distance(ref) 209 | 1 210 | >>> ib.shortest_path_distance(ref) 211 | 1 212 | >>> cb.shortest_path_distance(ib) 213 | 2 214 | 215 | >>> dog = wordnet.synsets('dog')[0] 216 | >>> dog.wup_similarity(cb) 217 | 0.38095238095238093 218 | 219 | >>> sorted(dog.common_hypernyms(cb)) 220 | [Synset('entity.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('whole.n.02')] 221 | 222 | >>> cook = wordnet.synset('cook.v.01') 223 | >>> bake = wordnet.synset('bake.v.02') 224 | >>> cook.wup_similarity(bake) 225 | 0.6666666666666666 226 | 227 | >>> cb.path_similarity(ib) 228 | 0.3333333333333333 229 | >>> cb.path_similarity(dog) 230 | 0.07142857142857142 231 | >>> cb.lch_similarity(ib) 232 | 2.538973871058276 233 | >>> cb.lch_similarity(dog) 234 | 0.9985288301111273 235 | 236 | 237 | ============================= 238 | Discovering Word Collocations 239 | ============================= 240 | 241 | >>> from nltk.corpus import webtext 242 | >>> from nltk.collocations import BigramCollocationFinder 243 | >>> from nltk.metrics import BigramAssocMeasures 244 | >>> words = [w.lower() for w in webtext.words('grail.txt')] 245 | >>> bcf = BigramCollocationFinder.from_words(words) 246 | >>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4) 247 | [("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')] 248 | 249 | >>> from nltk.corpus import stopwords 250 | >>> stopset = set(stopwords.words('english')) 251 | >>> filter_stops = lambda w: len(w) < 3 or w in stopset 252 | >>> bcf.apply_word_filter(filter_stops) 253 | >>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4) 254 | [('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')] 255 | 256 | >>> from nltk.collocations import TrigramCollocationFinder 257 | >>> from nltk.metrics import TrigramAssocMeasures 258 | >>> words = [w.lower() for w in webtext.words('singles.txt')] 259 | >>> tcf = TrigramCollocationFinder.from_words(words) 260 | >>> tcf.apply_word_filter(filter_stops) 261 | >>> tcf.apply_freq_filter(3) 262 | >>> tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4) 263 | [('long', 'term', 'relationship')] 264 | """ 265 | 266 | if __name__ == '__main__': 267 | import doctest 268 | doctest.testmod() 
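The chapter1.py listing ends here. Purely as an illustration (this snippet is not part of the original sources), the recipes it demonstrates can be strung together into one short pipeline; only NLTK calls already shown in the docstring above are used:

# sketch: sentence-tokenize a paragraph, word-tokenize each sentence, drop stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

para = "Hello World. It's good to see you. Thanks for buying this book."
english_stops = set(stopwords.words('english'))

for sent in sent_tokenize(para):
    words = word_tokenize(sent)
    content = [w for w in words if w.lower() not in english_stops]
    print(content)   # e.g. ['Hello', 'World', '.'] for the first sentence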
-------------------------------------------------------------------------------- /Module 2/Chapter 2/7853OS_02_codes/chapter2.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============== 3 | Stemming Words 4 | ============== 5 | 6 | >>> from nltk.stem import PorterStemmer 7 | >>> stemmer = PorterStemmer() 8 | >>> stemmer.stem('cooking') 9 | 'cook' 10 | >>> stemmer.stem('cookery') 11 | 'cookeri' 12 | 13 | >>> from nltk.stem import LancasterStemmer 14 | >>> stemmer = LancasterStemmer() 15 | >>> stemmer.stem('cooking') 16 | 'cook' 17 | >>> stemmer.stem('cookery') 18 | 'cookery' 19 | 20 | >>> from nltk.stem import RegexpStemmer 21 | >>> stemmer = RegexpStemmer('ing') 22 | >>> stemmer.stem('cooking') 23 | 'cook' 24 | >>> stemmer.stem('cookery') 25 | 'cookery' 26 | >>> stemmer.stem('ingleside') 27 | 'leside' 28 | 29 | >>> from nltk.stem import SnowballStemmer 30 | >>> SnowballStemmer.languages 31 | ('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish') 32 | >>> spanish_stemmer = SnowballStemmer('spanish') 33 | >>> spanish_stemmer.stem('hola') 34 | 'hol' 35 | 36 | 37 | ============================== 38 | Lemmatising Words with WordNet 39 | ============================== 40 | 41 | >>> from nltk.stem import WordNetLemmatizer 42 | >>> lemmatizer = WordNetLemmatizer() 43 | >>> lemmatizer.lemmatize('cooking') 44 | 'cooking' 45 | >>> lemmatizer.lemmatize('cooking', pos='v') 46 | 'cook' 47 | >>> lemmatizer.lemmatize('cookbooks') 48 | 'cookbook' 49 | 50 | >>> from nltk.stem import PorterStemmer 51 | >>> stemmer = PorterStemmer() 52 | >>> stemmer.stem('believes') 53 | 'believ' 54 | >>> lemmatizer.lemmatize('believes') 55 | 'belief' 56 | 57 | >>> stemmer.stem('buses') 58 | 'buse' 59 | >>> lemmatizer.lemmatize('buses') 60 | 'bus' 61 | >>> stemmer.stem('bus') 62 | 'bu' 63 | 64 | 65 | ============================================ 66 | Replacing Words Matching Regular Expressions 67 | ============================================ 68 | 69 | >>> from replacers import RegexpReplacer 70 | >>> replacer = RegexpReplacer() 71 | >>> replacer.replace("can't is a contraction") 72 | 'cannot is a contraction' 73 | >>> replacer.replace("I should've done that thing I didn't do") 74 | 'I should have done that thing I did not do' 75 | 76 | >>> from nltk.tokenize import word_tokenize 77 | >>> from replacers import RegexpReplacer 78 | >>> replacer = RegexpReplacer() 79 | >>> word_tokenize("can't is a contraction") 80 | ['ca', "n't", 'is', 'a', 'contraction'] 81 | >>> word_tokenize(replacer.replace("can't is a contraction")) 82 | ['can', 'not', 'is', 'a', 'contraction'] 83 | 84 | 85 | ============================= 86 | Removing Repeating Characters 87 | ============================= 88 | 89 | >>> from replacers import RepeatReplacer 90 | >>> replacer = RepeatReplacer() 91 | >>> replacer.replace('looooove') 92 | 'love' 93 | >>> replacer.replace('oooooh') 94 | 'ooh' 95 | >>> replacer.replace('goose') 96 | 'goose' 97 | 98 | 99 | ================================ 100 | Spelling Correction with Enchant 101 | ================================ 102 | 103 | >>> from replacers import SpellingReplacer 104 | >>> replacer = SpellingReplacer() 105 | >>> replacer.replace('cookbok') 106 | 'cookbook' 107 | 108 | >>> import enchant 109 | >>> d = enchant.Dict('en') 110 | >>> d.suggest('languege') 111 | ['language', 'languages', 'languor', "language's"] 112 | 113 | >>> from nltk.metrics 
import edit_distance 114 | >>> edit_distance('language', 'languege') 115 | 1 116 | >>> edit_distance('language', 'languor') 117 | 3 118 | 119 | >>> enchant.list_languages() 120 | ['en', 'en_CA', 'en_GB', 'en_US'] 121 | 122 | >>> dUS = enchant.Dict('en_US') 123 | >>> dUS.check('theater') 124 | True 125 | >>> dGB = enchant.Dict('en_GB') 126 | >>> dGB.check('theater') 127 | False 128 | >>> us_replacer = SpellingReplacer('en_US') 129 | >>> us_replacer.replace('theater') 130 | 'theater' 131 | >>> gb_replacer = SpellingReplacer('en_GB') 132 | >>> gb_replacer.replace('theater') 133 | 'theatre' 134 | 135 | >>> d = enchant.Dict('en_US') 136 | >>> d.check('nltk') 137 | False 138 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 139 | >>> d.check('nltk') 140 | True 141 | 142 | >>> from replacers import CustomSpellingReplacer 143 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 144 | >>> replacer = CustomSpellingReplacer(d) 145 | >>> replacer.replace('nltk') 146 | 'nltk' 147 | 148 | ================================= 149 | Replacing Negations with Antonyms 150 | ================================= 151 | 152 | >>> from replacers import AntonymReplacer 153 | >>> replacer = AntonymReplacer() 154 | >>> replacer.replace('good') 155 | >>> replacer.replace('uglify') 156 | 'beautify' 157 | >>> sent = ["let's", 'not', 'uglify', 'our', 'code'] 158 | >>> replacer.replace_negations(sent) 159 | ["let's", 'beautify', 'our', 'code'] 160 | 161 | >>> from replacers import AntonymWordReplacer 162 | >>> replacer = AntonymWordReplacer({'evil': 'good'}) 163 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 164 | ['good', 'is', 'good'] 165 | """ 166 | 167 | if __name__ == '__main__': 168 | import doctest 169 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 2/7853OS_02_codes/mywords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-Python-and-NLTK/b34df3ceab78b3de29195a811696dcd06e77063a/Module 2/Chapter 2/7853OS_02_codes/mywords.txt -------------------------------------------------------------------------------- /Module 2/Chapter 2/7853OS_02_codes/replacers.py: -------------------------------------------------------------------------------- 1 | import re, csv, yaml, enchant 2 | from nltk.corpus import wordnet 3 | from nltk.metrics import edit_distance 4 | 5 | ################################################## 6 | ## Replacing Words Matching Regular Expressions ## 7 | ################################################## 8 | 9 | replacement_patterns = [ 10 | (r'won\'t', 'will not'), 11 | (r'can\'t', 'cannot'), 12 | (r'i\'m', 'i am'), 13 | (r'ain\'t', 'is not'), 14 | (r'(\w+)\'ll', '\g<1> will'), 15 | (r'(\w+)n\'t', '\g<1> not'), 16 | (r'(\w+)\'ve', '\g<1> have'), 17 | (r'(\w+)\'s', '\g<1> is'), 18 | (r'(\w+)\'re', '\g<1> are'), 19 | (r'(\w+)\'d', '\g<1> would'), 20 | ] 21 | 22 | class RegexpReplacer(object): 23 | """ Replaces regular expression in a text. 
24 | >>> replacer = RegexpReplacer() 25 | >>> replacer.replace("can't is a contraction") 26 | 'cannot is a contraction' 27 | >>> replacer.replace("I should've done that thing I didn't do") 28 | 'I should have done that thing I did not do' 29 | """ 30 | def __init__(self, patterns=replacement_patterns): 31 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns] 32 | 33 | def replace(self, text): 34 | s = text 35 | 36 | for (pattern, repl) in self.patterns: 37 | s = re.sub(pattern, repl, s) 38 | 39 | return s 40 | 41 | #################################### 42 | ## Replacing Repeating Characters ## 43 | #################################### 44 | 45 | class RepeatReplacer(object): 46 | """ Removes repeating characters until a valid word is found. 47 | >>> replacer = RepeatReplacer() 48 | >>> replacer.replace('looooove') 49 | 'love' 50 | >>> replacer.replace('oooooh') 51 | 'ooh' 52 | >>> replacer.replace('goose') 53 | 'goose' 54 | """ 55 | def __init__(self): 56 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 57 | self.repl = r'\1\2\3' 58 | 59 | def replace(self, word): 60 | if wordnet.synsets(word): 61 | return word 62 | 63 | repl_word = self.repeat_regexp.sub(self.repl, word) 64 | 65 | if repl_word != word: 66 | return self.replace(repl_word) 67 | else: 68 | return repl_word 69 | 70 | ###################################### 71 | ## Spelling Correction with Enchant ## 72 | ###################################### 73 | 74 | class SpellingReplacer(object): 75 | """ Replaces misspelled words with a likely suggestion based on shortest 76 | edit distance. 77 | >>> replacer = SpellingReplacer() 78 | >>> replacer.replace('cookbok') 79 | 'cookbook' 80 | """ 81 | def __init__(self, dict_name='en', max_dist=2): 82 | self.spell_dict = enchant.Dict(dict_name) 83 | self.max_dist = max_dist 84 | 85 | def replace(self, word): 86 | if self.spell_dict.check(word): 87 | return word 88 | 89 | suggestions = self.spell_dict.suggest(word) 90 | 91 | if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist: 92 | return suggestions[0] 93 | else: 94 | return word 95 | 96 | class CustomSpellingReplacer(SpellingReplacer): 97 | """ SpellingReplacer that allows passing a custom enchant dictionary, such 98 | a DictWithPWL. 99 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 100 | >>> replacer = CustomSpellingReplacer(d) 101 | >>> replacer.replace('nltk') 102 | 'nltk' 103 | """ 104 | def __init__(self, spell_dict, max_dist=2): 105 | self.spell_dict = spell_dict 106 | self.max_dist = max_dist 107 | 108 | ######################## 109 | ## Replacing Synonyms ## 110 | ######################## 111 | 112 | class WordReplacer(object): 113 | """ WordReplacer that replaces a given word with a word from the word_map, 114 | or if the word isn't found, returns the word as is. 115 | >>> replacer = WordReplacer({'bday': 'birthday'}) 116 | >>> replacer.replace('bday') 117 | 'birthday' 118 | >>> replacer.replace('happy') 119 | 'happy' 120 | """ 121 | def __init__(self, word_map): 122 | self.word_map = word_map 123 | 124 | def replace(self, word): 125 | return self.word_map.get(word, word) 126 | 127 | class CsvWordReplacer(WordReplacer): 128 | """ WordReplacer that reads word mappings from a csv file. 
129 | >>> replacer = CsvWordReplacer('synonyms.csv') 130 | >>> replacer.replace('bday') 131 | 'birthday' 132 | >>> replacer.replace('happy') 133 | 'happy' 134 | """ 135 | def __init__(self, fname): 136 | word_map = {} 137 | 138 | for line in csv.reader(open(fname)): 139 | word, syn = line 140 | word_map[word] = syn 141 | 142 | super(CsvWordReplacer, self).__init__(word_map) 143 | 144 | class YamlWordReplacer(WordReplacer): 145 | """ WordReplacer that reads word mappings from a yaml file. 146 | >>> replacer = YamlWordReplacer('synonyms.yaml') 147 | >>> replacer.replace('bday') 148 | 'birthday' 149 | >>> replacer.replace('happy') 150 | 'happy' 151 | """ 152 | def __init__(self, fname): 153 | word_map = yaml.load(open(fname)) 154 | super(YamlWordReplacer, self).__init__(word_map) 155 | 156 | ####################################### 157 | ## Replacing Negations with Antonyms ## 158 | ####################################### 159 | 160 | class AntonymReplacer(object): 161 | def replace(self, word, pos=None): 162 | """ Returns the antonym of a word, but only if there is no ambiguity. 163 | >>> replacer = AntonymReplacer() 164 | >>> replacer.replace('good') 165 | >>> replacer.replace('uglify') 166 | 'beautify' 167 | >>> replacer.replace('beautify') 168 | 'uglify' 169 | """ 170 | antonyms = set() 171 | 172 | for syn in wordnet.synsets(word, pos=pos): 173 | for lemma in syn.lemmas(): 174 | for antonym in lemma.antonyms(): 175 | antonyms.add(antonym.name()) 176 | 177 | if len(antonyms) == 1: 178 | return antonyms.pop() 179 | else: 180 | return None 181 | 182 | def replace_negations(self, sent): 183 | """ Try to replace negations with antonyms in the tokenized sentence. 184 | >>> replacer = AntonymReplacer() 185 | >>> replacer.replace_negations(['do', 'not', 'uglify', 'our', 'code']) 186 | ['do', 'beautify', 'our', 'code'] 187 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 188 | ['good', 'is', 'not', 'evil'] 189 | """ 190 | i, l = 0, len(sent) 191 | words = [] 192 | 193 | while i < l: 194 | word = sent[i] 195 | 196 | if word == 'not' and i+1 < l: 197 | ant = self.replace(sent[i+1]) 198 | 199 | if ant: 200 | words.append(ant) 201 | i += 2 202 | continue 203 | 204 | words.append(word) 205 | i += 1 206 | 207 | return words 208 | 209 | class AntonymWordReplacer(WordReplacer, AntonymReplacer): 210 | """ AntonymReplacer that uses a custom mapping instead of WordNet. 211 | Order of inheritance is very important, this class would not work if 212 | AntonymReplacer comes before WordReplacer. 213 | >>> replacer = AntonymWordReplacer({'evil': 'good'}) 214 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 215 | ['good', 'is', 'good'] 216 | """ 217 | pass 218 | 219 | if __name__ == '__main__': 220 | import doctest 221 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/brown.pos: -------------------------------------------------------------------------------- 1 | The/at-tl expense/nn and/cc time/nn involved/vbn are/ber astronomical/jj ./. 
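Following the replacers.py listing (and the brown.pos sample) above, here is a small usage sketch that is not part of the original sources: expand contractions with RegexpReplacer before tokenizing, then spell-check each token with SpellingReplacer. It assumes pyenchant is installed, which replacers.py already requires.

from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer, SpellingReplacer

contraction_replacer = RegexpReplacer()
spelling_replacer = SpellingReplacer()

text = "I can't beleive it"
expanded = contraction_replacer.replace(text)              # "I cannot beleive it"
tokens = word_tokenize(expanded)
corrected = [spelling_replacer.replace(t) for t in tokens]
print(corrected)   # likely ['I', 'cannot', 'believe', 'it']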
-------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/catchunked.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus.reader import CategorizedCorpusReader, ChunkedCorpusReader 2 | from nltk.corpus.reader import ConllCorpusReader, ConllChunkCorpusReader 3 | 4 | class CategorizedChunkedCorpusReader(CategorizedCorpusReader, ChunkedCorpusReader): 5 | """ 6 | A reader for chunked corpora whose documents are divided into categories 7 | based on their file identifiers. 8 | """ 9 | # code adapted from CategorizedTaggedCorpusReader 10 | def __init__(self, *args, **kwargs): 11 | CategorizedCorpusReader.__init__(self, kwargs) 12 | ChunkedCorpusReader.__init__(self, *args, **kwargs) 13 | 14 | def _resolve(self, fileids, categories): 15 | if fileids is not None and categories is not None: 16 | raise ValueError('Specify fileids or categories, not both') 17 | if categories is not None: 18 | return self.fileids(categories) 19 | else: 20 | return fileids 21 | 22 | def raw(self, fileids=None, categories=None): 23 | return ChunkedCorpusReader.raw(self, self._resolve(fileids, categories)) 24 | 25 | def words(self, fileids=None, categories=None): 26 | return ChunkedCorpusReader.words(self, self._resolve(fileids, categories)) 27 | 28 | def sents(self, fileids=None, categories=None): 29 | return ChunkedCorpusReader.sents(self, self._resolve(fileids, categories)) 30 | 31 | def paras(self, fileids=None, categories=None): 32 | return ChunkedCorpusReader.paras(self, self._resolve(fileids, categories)) 33 | 34 | def tagged_words(self, fileids=None, categories=None): 35 | return ChunkedCorpusReader.tagged_words(self, self._resolve(fileids, categories)) 36 | 37 | def tagged_sents(self, fileids=None, categories=None): 38 | return ChunkedCorpusReader.tagged_sents(self, self._resolve(fileids, categories)) 39 | 40 | def tagged_paras(self, fileids=None, categories=None): 41 | return ChunkedCorpusReader.tagged_paras(self, self._resolve(fileids, categories)) 42 | 43 | def chunked_words(self, fileids=None, categories=None): 44 | return ChunkedCorpusReader.chunked_words( 45 | self, self._resolve(fileids, categories)) 46 | 47 | def chunked_sents(self, fileids=None, categories=None): 48 | return ChunkedCorpusReader.chunked_sents( 49 | self, self._resolve(fileids, categories)) 50 | 51 | def chunked_paras(self, fileids=None, categories=None): 52 | return ChunkedCorpusReader.chunked_paras( 53 | self, self._resolve(fileids, categories)) 54 | 55 | class CategorizedConllChunkCorpusReader(CategorizedCorpusReader, ConllChunkCorpusReader): 56 | """ 57 | A reader for conll chunked corpora whose documents are divided into 58 | categories based on their file identifiers. 
59 | """ 60 | def __init__(self, *args, **kwargs): 61 | # NOTE: in addition to cat_pattern, ConllChunkCorpusReader also requires 62 | # chunk_types as third argument, which defaults to ('NP','VP','PP') 63 | CategorizedCorpusReader.__init__(self, kwargs) 64 | ConllChunkCorpusReader.__init__(self, *args, **kwargs) 65 | 66 | def _resolve(self, fileids, categories): 67 | if fileids is not None and categories is not None: 68 | raise ValueError('Specify fileids or categories, not both') 69 | if categories is not None: 70 | return self.fileids(categories) 71 | else: 72 | return fileids 73 | 74 | def raw(self, fileids=None, categories=None): 75 | return ConllCorpusReader.raw(self, self._resolve(fileids, categories)) 76 | 77 | def words(self, fileids=None, categories=None): 78 | return ConllCorpusReader.words(self, self._resolve(fileids, categories)) 79 | 80 | def sents(self, fileids=None, categories=None): 81 | return ConllCorpusReader.sents(self, self._resolve(fileids, categories)) 82 | 83 | def tagged_words(self, fileids=None, categories=None): 84 | return ConllCorpusReader.tagged_words(self, self._resolve(fileids, categories)) 85 | 86 | def tagged_sents(self, fileids=None, categories=None): 87 | return ConllCorpusReader.tagged_sents(self, self._resolve(fileids, categories)) 88 | 89 | def chunked_words(self, fileids=None, categories=None, chunk_types=None): 90 | return ConllCorpusReader.chunked_words( 91 | self, self._resolve(fileids, categories), chunk_types) 92 | 93 | def chunked_sents(self, fileids=None, categories=None, chunk_types=None): 94 | return ConllCorpusReader.chunked_sents( 95 | self, self._resolve(fileids, categories), chunk_types) 96 | 97 | def parsed_sents(self, fileids=None, categories=None, pos_in_tree=None): 98 | return ConllCorpusReader.parsed_sents( 99 | self, self._resolve(fileids, categories), pos_in_tree) 100 | 101 | def srl_spans(self, fileids=None, categories=None): 102 | return ConllCorpusReader.srl_spans(self, self._resolve(fileids, categories)) 103 | 104 | def srl_instances(self, fileids=None, categories=None, pos_in_tree=None, flatten=True): 105 | return ConllCorpusReader.srl_instances( 106 | self, self._resolve(fileids, categories), pos_in_tree, flatten) 107 | 108 | def iob_words(self, fileids=None, categories=None): 109 | return ConllCorpusReader.iob_words(self, self._resolve(fileids, categories)) 110 | 111 | def iob_sents(self, fileids=None, categories=None): 112 | return ConllCorpusReader.iob_sents(self, self._resolve(fileids, categories)) -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/chapter3.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================== 3 | Setting up a Custom Corpus 4 | ========================== 5 | 6 | >>> import os, os.path 7 | >>> path = os.path.expanduser('~/nltk_data') 8 | >>> if not os.path.exists(path): 9 | ... 
os.mkdir(path) 10 | >>> os.path.exists(path) 11 | True 12 | >>> import nltk.data 13 | >>> path in nltk.data.path 14 | True 15 | 16 | >>> nltk.data.load('corpora/cookbook/mywords.txt', format='raw') 17 | b'nltk\\n' 18 | 19 | >>> nltk.data.load('corpora/cookbook/synonyms.yaml') 20 | {'bday': 'birthday'} 21 | 22 | 23 | =========================== 24 | Creating a Word List Corpus 25 | =========================== 26 | 27 | >>> from nltk.corpus.reader import WordListCorpusReader 28 | >>> reader = WordListCorpusReader('.', ['wordlist']) 29 | >>> reader.words() 30 | ['nltk', 'corpus', 'corpora', 'wordnet'] 31 | >>> reader.fileids() 32 | ['wordlist'] 33 | 34 | >>> reader.raw() 35 | 'nltk\\ncorpus\\ncorpora\\nwordnet\\n' 36 | >>> from nltk.tokenize import line_tokenize 37 | >>> line_tokenize(reader.raw()) 38 | ['nltk', 'corpus', 'corpora', 'wordnet'] 39 | 40 | >>> from nltk.corpus import names 41 | >>> names.fileids() 42 | ['female.txt', 'male.txt'] 43 | >>> len(names.words('female.txt')) 44 | 5001 45 | >>> len(names.words('male.txt')) 46 | 2943 47 | 48 | >>> from nltk.corpus import words 49 | >>> words.fileids() 50 | ['en', 'en-basic'] 51 | >>> len(words.words('en-basic')) 52 | 850 53 | >>> len(words.words('en')) 54 | 234936 55 | 56 | 57 | ============================================ 58 | Creating a Part-of-Speech Tagged Word Corpus 59 | ============================================ 60 | 61 | >>> from nltk.corpus.reader import TaggedCorpusReader 62 | >>> reader = TaggedCorpusReader('.', r'.*\.pos') 63 | >>> reader.words() 64 | ['The', 'expense', 'and', 'time', 'involved', 'are', ...] 65 | >>> reader.tagged_words() 66 | [('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...] 67 | >>> reader.sents() 68 | [['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']] 69 | >>> reader.tagged_sents() 70 | [[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]] 71 | >>> reader.paras() 72 | [[['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']]] 73 | >>> reader.tagged_paras() 74 | [[[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]]] 75 | 76 | >>> from nltk.tokenize import SpaceTokenizer 77 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', word_tokenizer=SpaceTokenizer()) 78 | >>> reader.words() 79 | ['The', 'expense', 'and', 'time', 'involved', 'are', ...] 80 | 81 | >>> from nltk.tokenize import LineTokenizer 82 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', sent_tokenizer=LineTokenizer()) 83 | >>> reader.sents() 84 | [['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']] 85 | 86 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', tagset='en-brown') 87 | >>> reader.tagged_words(tagset='universal') 88 | [('The', 'DET'), ('expense', 'NOUN'), ('and', 'CONJ'), ...] 89 | 90 | >>> from nltk.corpus import treebank 91 | >>> treebank.tagged_words() 92 | [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...] 93 | >>> treebank.tagged_words(tagset='universal') 94 | [('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...] 95 | >>> treebank.tagged_words(tagset='brown') 96 | [('Pierre', 'UNK'), ('Vinken', 'UNK'), (',', 'UNK'), ...] 
97 | 98 | 99 | ================================ 100 | Creating a Chunked Phrase Corpus 101 | ================================ 102 | 103 | >>> from nltk.corpus.reader import ChunkedCorpusReader 104 | >>> reader = ChunkedCorpusReader('.', r'.*\.chunk') 105 | >>> reader.chunked_words() 106 | [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ...] 107 | >>> reader.chunked_sents() 108 | [Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])] 109 | >>> reader.chunked_paras() 110 | [[Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])]] 111 | 112 | >>> from nltk.corpus.reader import ConllChunkCorpusReader 113 | >>> conllreader = ConllChunkCorpusReader('.', r'.*\.iob', ('NP', 'VP', 'PP')) 114 | >>> conllreader.chunked_words() 115 | [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), ...] 116 | >>> conllreader.chunked_sents() 117 | [Tree('S', [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), Tree('NP', [('executive', 'JJ'), ('vice', 'NN'), ('president', 'NN')]), Tree('PP', [('of', 'IN')]), Tree('NP', [('Balcor', 'NNP')]), ('.', '.')])] 118 | >>> conllreader.iob_words() 119 | [('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ...] 120 | >>> conllreader.iob_sents() 121 | [[('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ('had', 'VBD', 'B-VP'), ('been', 'VBN', 'I-VP'), ('executive', 'JJ', 'B-NP'), ('vice', 'NN', 'I-NP'), ('president', 'NN', 'I-NP'), ('of', 'IN', 'B-PP'), ('Balcor', 'NNP', 'B-NP'), ('.', '.', 'O')]] 122 | 123 | >>> reader.chunked_words()[0].leaves() 124 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')] 125 | >>> reader.chunked_sents()[0].leaves() 126 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')] 127 | >>> reader.chunked_paras()[0][0].leaves() 128 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')] 129 | 130 | 131 | ================================== 132 | Creating a Categorized Text Corpus 133 | ================================== 134 | 135 | >>> from nltk.corpus import brown 136 | >>> brown.categories() 137 | ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] 138 | 139 | >>> from nltk.corpus.reader import CategorizedPlaintextCorpusReader 140 | >>> reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt') 141 | >>> reader.categories() 142 | ['neg', 'pos'] 143 | >>> reader.fileids(categories=['neg']) 144 | ['movie_neg.txt'] 145 | >>> reader.fileids(categories=['pos']) 146 | ['movie_pos.txt'] 147 | 148 | >>> reader = CategorizedPlaintextCorpusReader('.', 
r'movie_.*\.txt', cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']}) 149 | >>> reader.categories() 150 | ['neg', 'pos'] 151 | 152 | 153 | =================================== 154 | Creating a Categorized Chunk Corpus 155 | =================================== 156 | 157 | >>> import nltk.data 158 | >>> from catchunked import CategorizedChunkedCorpusReader 159 | >>> path = nltk.data.find('corpora/treebank/tagged') 160 | >>> reader = CategorizedChunkedCorpusReader(path, r'wsj_.*\.pos', cat_pattern=r'wsj_(.*)\.pos') 161 | >>> len(reader.categories()) == len(reader.fileids()) 162 | True 163 | >>> len(reader.chunked_sents(categories=['0001'])) 164 | 16 165 | 166 | >>> import nltk.data 167 | >>> from catchunked import CategorizedConllChunkCorpusReader 168 | >>> path = nltk.data.find('corpora/conll2000') 169 | >>> reader = CategorizedConllChunkCorpusReader(path, r'.*\.txt', ('NP','VP','PP'), cat_pattern=r'(.*)\.txt') 170 | >>> reader.categories() 171 | ['test', 'train'] 172 | >>> reader.fileids() 173 | ['test.txt', 'train.txt'] 174 | >>> len(reader.chunked_sents(categories=['test'])) 175 | 2012 176 | 177 | 178 | =================== 179 | Lazy Corpus Loading 180 | =================== 181 | 182 | >>> from nltk.corpus.util import LazyCorpusLoader 183 | >>> from nltk.corpus.reader import WordListCorpusReader 184 | >>> reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist']) 185 | >>> isinstance(reader, LazyCorpusLoader) 186 | True 187 | >>> reader.fileids() 188 | ['wordlist'] 189 | >>> isinstance(reader, LazyCorpusLoader) 190 | False 191 | >>> isinstance(reader, WordListCorpusReader) 192 | True 193 | 194 | 195 | ============================= 196 | Creating a Custom Corpus View 197 | ============================= 198 | 199 | >>> from nltk.corpus.reader import PlaintextCorpusReader 200 | >>> plain = PlaintextCorpusReader('.', ['heading_text.txt']) 201 | >>> len(plain.paras()) 202 | 4 203 | >>> from corpus import IgnoreHeadingCorpusReader 204 | >>> reader = IgnoreHeadingCorpusReader('.', ['heading_text.txt']) 205 | >>> len(reader.paras()) 206 | 3 207 | """ 208 | 209 | if __name__ == '__main__': 210 | import doctest 211 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/conll.iob: -------------------------------------------------------------------------------- 1 | Mr. NNP B-NP 2 | Meador NNP I-NP 3 | had VBD B-VP 4 | been VBN I-VP 5 | executive JJ B-NP 6 | vice NN I-NP 7 | president NN I-NP 8 | of IN B-PP 9 | Balcor NNP B-NP 10 | . . 
O -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/corpus.py: -------------------------------------------------------------------------------- 1 | import lockfile, tempfile, shutil 2 | from nltk.corpus.reader import PlaintextCorpusReader 3 | from nltk.corpus.reader.util import StreamBackedCorpusView, read_blankline_block 4 | 5 | class IgnoreHeadingCorpusView(StreamBackedCorpusView): 6 | def __init__(self, *args, **kwargs): 7 | StreamBackedCorpusView.__init__(self, *args, **kwargs) 8 | # open self._stream 9 | self._open() 10 | # skip the heading block 11 | read_blankline_block(self._stream) 12 | # reset the start position to the current position in the stream 13 | self._filepos = [self._stream.tell()] 14 | 15 | class IgnoreHeadingCorpusReader(PlaintextCorpusReader): 16 | CorpusView = IgnoreHeadingCorpusView 17 | 18 | def append_line(fname, line): 19 | # lock for writing, released when fp is closed 20 | with lockfile.FileLock(fname): 21 | fp = open(fname, 'a+') 22 | fp.write(line) 23 | fp.write('\n') 24 | fp.close() 25 | 26 | def remove_line(fname, line): 27 | '''Remove line from file by creating a temporary file containing all lines 28 | from original file except those matching the given line, then copying the 29 | temporary file back into the original file, overwriting its contents. 30 | ''' 31 | with lockfile.FileLock(fname): 32 | tmp = tempfile.TemporaryFile('w+') 33 | fp = open(fname, 'r+') 34 | # write all lines from orig file, except if matches given line 35 | for l in fp: 36 | if l.strip() != line: 37 | tmp.write(l) 38 | 39 | # reset file pointers so entire files are copied 40 | fp.seek(0) 41 | tmp.seek(0) 42 | # copy tmp into fp, then truncate to remove trailing line(s) 43 | shutil.copyfileobj(tmp, fp) 44 | fp.truncate() 45 | fp.close() 46 | tmp.close() -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/heading_text.txt: -------------------------------------------------------------------------------- 1 | A simple heading 2 | 3 | Here is the actual text for the corpus. 4 | 5 | Paragraphs are split by blanklines. 6 | 7 | This is the 3rd paragraph.
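The module-level helpers in corpus.py above (append_line and remove_line) are not exercised by the chapter3.py doctests, so here is a small, hypothetical usage sketch that is not part of the original sources; the file name is made up, and the lockfile package must be installed, as corpus.py already assumes:

from corpus import append_line, remove_line

wordlist_file = 'mywordlist'           # hypothetical file; created on first append
append_line(wordlist_file, 'nltk')     # appends 'nltk' plus a newline under a file lock
append_line(wordlist_file, 'corpus')
remove_line(wordlist_file, 'nltk')     # rewrites the file without the matching line

with open(wordlist_file) as f:
    print(f.read())                    # expected to show only 'corpus'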
-------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/mongoreader.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from nltk.data import LazyLoader 3 | from nltk.tokenize import TreebankWordTokenizer 4 | from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation 5 | 6 | class MongoDBLazySequence(AbstractLazySequence): 7 | def __init__(self, host='localhost', port=27017, db='test', collection='corpus', field='text'): 8 | self.conn = pymongo.MongoClient(host, port) 9 | self.collection = self.conn[db][collection] 10 | self.field = field 11 | 12 | def __len__(self): 13 | return self.collection.count() 14 | 15 | def iterate_from(self, start): 16 | f = lambda d: d.get(self.field, '') 17 | return iter(LazyMap(f, self.collection.find(fields=[self.field], skip=start))) 18 | 19 | class MongoDBCorpusReader(object): 20 | def __init__(self, word_tokenizer=TreebankWordTokenizer(), 21 | sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/english.pickle'), 22 | **kwargs): 23 | self._seq = MongoDBLazySequence(**kwargs) 24 | self._word_tokenize = word_tokenizer.tokenize 25 | self._sent_tokenize = sent_tokenizer.tokenize 26 | 27 | def text(self): 28 | return self._seq 29 | 30 | def words(self): 31 | return LazyConcatenation(LazyMap(self._word_tokenize, self.text())) 32 | 33 | def sents(self): 34 | return LazyConcatenation(LazyMap(self._sent_tokenize, self.text())) -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/mywords.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/synonyms.csv: -------------------------------------------------------------------------------- 1 | bday,birthday 2 | -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/synonyms.yaml: -------------------------------------------------------------------------------- 1 | bday: birthday 2 | -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/treebank.chunk: -------------------------------------------------------------------------------- 1 | [Earlier/JJR staff-reduction/NN moves/NNS] have/VBP trimmed/VBN about/IN [300/CD jobs/NNS] ,/, [the/DT spokesman/NN] said/VBD ./. 
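MongoDBCorpusReader in mongoreader.py above only makes sense against a live database, so the following is a purely illustrative sketch rather than part of the original sources. It assumes MongoDB is running on localhost with the reader's default 'test' database and 'corpus' collection, whose documents carry a 'text' field, and it inherits mongoreader.py's pymongo 2.x-style API.

from mongoreader import MongoDBCorpusReader

reader = MongoDBCorpusReader(db='test', collection='corpus', field='text')
print(len(reader.text()))     # number of documents in the collection
print(reader.words()[:10])    # words, lazily tokenized from each document's 'text' field
print(reader.sents()[:2])     # sentences, lazily split with the punkt tokenizer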
-------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/wordlist: -------------------------------------------------------------------------------- 1 | nltk 2 | corpus 3 | corpora 4 | wordnet 5 | -------------------------------------------------------------------------------- /Module 2/Chapter 4/7853OS_04_Codes/chapter4.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============== 3 | Default Tagging 4 | =============== 5 | 6 | >>> from nltk.tag import DefaultTagger 7 | >>> tagger = DefaultTagger('NN') 8 | >>> tagger.tag(['Hello', 'World']) 9 | [('Hello', 'NN'), ('World', 'NN')] 10 | 11 | >>> from nltk.corpus import treebank 12 | >>> test_sents = treebank.tagged_sents()[3000:] 13 | >>> tagger.evaluate(test_sents) 14 | 0.14331966328512843 15 | 16 | >>> tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']]) 17 | [[('Hello', 'NN'), ('world', 'NN'), ('.', 'NN')], [('How', 'NN'), ('are', 'NN'), ('you', 'NN'), ('?', 'NN')]] 18 | 19 | >>> from nltk.tag import untag 20 | >>> untag([('Hello', 'NN'), ('World', 'NN')]) 21 | ['Hello', 'World'] 22 | 23 | 24 | ======================================== 25 | Training a Unigram Part-of-Speech Tagger 26 | ======================================== 27 | 28 | >>> from nltk.tag import UnigramTagger 29 | >>> from nltk.corpus import treebank 30 | >>> train_sents = treebank.tagged_sents()[:3000] 31 | >>> tagger = UnigramTagger(train_sents) 32 | >>> treebank.sents()[0] 33 | ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 34 | >>> tagger.tag(treebank.sents()[0]) 35 | [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')] 36 | 37 | >>> tagger.evaluate(test_sents) 38 | 0.8588819339520829 39 | 40 | >>> tagger = UnigramTagger(model={'Pierre': 'NN'}) 41 | >>> tagger.tag(treebank.sents()[0]) 42 | [('Pierre', 'NN'), ('Vinken', None), (',', None), ('61', None), ('years', None), ('old', None), (',', None), ('will', None), ('join', None), ('the', None), ('board', None), ('as', None), ('a', None), ('nonexecutive', None), ('director', None), ('Nov.', None), ('29', None), ('.', None)] 43 | 44 | >>> tagger = UnigramTagger(train_sents, cutoff=3) 45 | >>> tagger.evaluate(test_sents) 46 | 0.7757392618173969 47 | 48 | 49 | ====================================== 50 | Combining Taggers with Backoff Tagging 51 | ====================================== 52 | 53 | >>> tagger1 = DefaultTagger('NN') 54 | >>> tagger2 = UnigramTagger(train_sents, backoff=tagger1) 55 | >>> tagger2.evaluate(test_sents) 56 | 0.8758471832505935 57 | 58 | >>> tagger1._taggers == [tagger1] 59 | True 60 | >>> tagger2._taggers == [tagger2, tagger1] 61 | True 62 | 63 | >>> import pickle 64 | >>> f = open('tagger.pickle', 'wb') 65 | >>> pickle.dump(tagger, f) 66 | >>> f.close() 67 | >>> f = open('tagger.pickle', 'rb') 68 | >>> tagger = pickle.load(f) 69 | 70 | 71 | ==================================== 72 | Training and Combining Ngram Taggers 73 | ==================================== 74 | 75 | >>> from nltk.tag import BigramTagger, TrigramTagger 76 | >>> bitagger = BigramTagger(train_sents) 77 | >>> bitagger.evaluate(test_sents) 78 | 0.11310166199007123 79 | >>> tritagger = 
TrigramTagger(train_sents) 80 | >>> tritagger.evaluate(test_sents) 81 | 0.0688107058061731 82 | 83 | >>> from tag_util import backoff_tagger 84 | >>> backoff = DefaultTagger('NN') 85 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff) 86 | >>> tagger.evaluate(test_sents) 87 | 0.8806820634578028 88 | 89 | >>> tagger._taggers[-1] == backoff 90 | True 91 | >>> isinstance(tagger._taggers[0], TrigramTagger) 92 | True 93 | >>> isinstance(tagger._taggers[1], BigramTagger) 94 | True 95 | 96 | >>> from nltk.tag import NgramTagger 97 | >>> quadtagger = NgramTagger(4, train_sents) 98 | >>> quadtagger.evaluate(test_sents) 99 | 0.058234405352903085 100 | 101 | >>> from taggers import QuadgramTagger 102 | >>> quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger], backoff=backoff) 103 | >>> quadtagger.evaluate(test_sents) 104 | 0.8806388948845241 105 | 106 | 107 | ==================================== 108 | Creating a Model of Likely Word Tags 109 | ==================================== 110 | 111 | >>> from tag_util import word_tag_model 112 | >>> from nltk.corpus import treebank 113 | >>> model = word_tag_model(treebank.words(), treebank.tagged_words()) 114 | >>> tagger = UnigramTagger(model=model) 115 | >>> tagger.evaluate(test_sents) 116 | 0.559680552557738 117 | 118 | >>> default_tagger = DefaultTagger('NN') 119 | >>> likely_tagger = UnigramTagger(model=model, backoff=default_tagger) 120 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=likely_tagger) 121 | >>> tagger.evaluate(test_sents) 122 | 0.8806820634578028 123 | 124 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger) 125 | >>> likely_tagger = UnigramTagger(model=model, backoff=tagger) 126 | >>> likely_tagger.evaluate(test_sents) 127 | 0.8824088063889488 128 | 129 | 130 | ================================ 131 | Tagging with Regular Expressions 132 | ================================ 133 | 134 | >>> from tag_util import patterns 135 | >>> from nltk.tag import RegexpTagger 136 | >>> tagger = RegexpTagger(patterns) 137 | >>> tagger.evaluate(test_sents) 138 | 0.037470321605870924 139 | 140 | 141 | ============= 142 | Affix Tagging 143 | ============= 144 | 145 | >>> from nltk.tag import AffixTagger 146 | >>> tagger = AffixTagger(train_sents) 147 | >>> tagger.evaluate(test_sents) 148 | 0.27558817181092166 149 | 150 | >>> prefix_tagger = AffixTagger(train_sents, affix_length=3) 151 | >>> prefix_tagger.evaluate(test_sents) 152 | 0.23587308439456076 153 | 154 | >>> suffix_tagger = AffixTagger(train_sents, affix_length=-2) 155 | >>> suffix_tagger.evaluate(test_sents) 156 | 0.31940427368875457 157 | 158 | >>> pre3_tagger = AffixTagger(train_sents, affix_length=3) 159 | >>> pre3_tagger.evaluate(test_sents) 160 | 0.23587308439456076 161 | >>> pre2_tagger = AffixTagger(train_sents, affix_length=2, backoff=pre3_tagger) 162 | >>> pre2_tagger.evaluate(test_sents) 163 | 0.29786315562270665 164 | >>> suf2_tagger = AffixTagger(train_sents, affix_length=-2, backoff=pre2_tagger) 165 | >>> suf2_tagger.evaluate(test_sents) 166 | 0.32467083962875026 167 | >>> suf3_tagger = AffixTagger(train_sents, affix_length=-3, backoff=suf2_tagger) 168 | >>> suf3_tagger.evaluate(test_sents) 169 | 0.3590761925318368 170 | 171 | 172 | ======================= 173 | Training a Brill Tagger 174 | ======================= 175 | 176 | >>> default_tagger = DefaultTagger('NN') 177 | >>> 
initial_tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger) 178 | >>> initial_tagger.evaluate(test_sents) 179 | 0.8806820634578028 180 | >>> from tag_util import train_brill_tagger 181 | >>> brill_tagger = train_brill_tagger(initial_tagger, train_sents) 182 | >>> brill_tagger.evaluate(test_sents) 183 | 0.8827541549751781 184 | 185 | 186 | ===================== 187 | Training a TnT Tagger 188 | ===================== 189 | 190 | >>> from nltk.tag import tnt 191 | >>> tnt_tagger = tnt.TnT() 192 | >>> tnt_tagger.train(train_sents) 193 | >>> tnt_tagger.evaluate(test_sents) 194 | 0.8756313403842003 195 | 196 | >>> from nltk.tag import DefaultTagger 197 | >>> unk = DefaultTagger('NN') 198 | >>> tnt_tagger = tnt.TnT(unk=unk, Trained=True) 199 | >>> tnt_tagger.train(train_sents) 200 | >>> tnt_tagger.evaluate(test_sents) 201 | 0.892467083962875 202 | 203 | >>> tnt_tagger = tnt.TnT(N=100) 204 | >>> tnt_tagger.train(train_sents) 205 | >>> tnt_tagger.evaluate(test_sents) 206 | 0.8756313403842003 207 | 208 | 209 | ========================= 210 | Using WordNet for Tagging 211 | ========================= 212 | 213 | >>> from taggers import WordNetTagger 214 | >>> wn_tagger = WordNetTagger() 215 | >>> wn_tagger.evaluate(train_sents) 216 | 0.17914876598160262 217 | 218 | >>> from tag_util import backoff_tagger 219 | >>> from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger 220 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=wn_tagger) 221 | >>> tagger.evaluate(test_sents) 222 | 0.8848262464925534 223 | 224 | 225 | ======================== 226 | Classifier Based Tagging 227 | ======================== 228 | 229 | >>> from nltk.tag.sequential import ClassifierBasedPOSTagger 230 | >>> tagger = ClassifierBasedPOSTagger(train=train_sents) 231 | >>> tagger.evaluate(test_sents) 232 | 0.9309734513274336 233 | 234 | >>> from nltk.classify import MaxentClassifier 235 | >>> me_tagger = ClassifierBasedPOSTagger(train=train_sents, classifier_builder=MaxentClassifier.train) 236 | ==> Training (100 iterations) 237 | 238 | Iteration Log Likelihood Accuracy 239 | --------------------------------------- 240 | 1 -3.82864 0.008 241 | 2 -0.76859 0.957 242 | Final nan 0.984 243 | >>> me_tagger.evaluate(test_sents) 244 | 0.9258363911072739 245 | 246 | >>> from nltk.tag.sequential import ClassifierBasedTagger 247 | >>> from tag_util import unigram_feature_detector 248 | >>> tagger = ClassifierBasedTagger(train=train_sents, feature_detector=unigram_feature_detector) 249 | >>> tagger.evaluate(test_sents) 250 | 0.8733865745737104 251 | 252 | >>> default = DefaultTagger('NN') 253 | >>> tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3) 254 | >>> tagger.evaluate(test_sents) 255 | 0.9311029570472696 256 | """ 257 | 258 | if __name__ == '__main__': 259 | import doctest 260 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 4/7853OS_04_Codes/tag_util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from nltk.tbl import Template 3 | from nltk.tag import brill, brill_trainer 4 | from nltk.probability import FreqDist, ConditionalFreqDist 5 | 6 | def backoff_tagger(train_sents, tagger_classes, backoff=None): 7 | for cls in tagger_classes: 8 | backoff = cls(train_sents, backoff=backoff) 9 | 10 | return backoff 11 | 12 | def word_tag_model(words, tagged_words, limit=200): 
13 | fd = FreqDist(words) 14 | cfd = ConditionalFreqDist(tagged_words) 15 | most_freq = (word for word, count in fd.most_common(limit)) 16 | return dict((word, cfd[word].max()) for word in most_freq) 17 | 18 | patterns = [ 19 | (r'^\d+$', 'CD'), 20 | (r'.*ing$', 'VBG'), # gerunds, i.e. wondering 21 | (r'.*ment$', 'NN'), # i.e. wonderment 22 | (r'.*ful$', 'JJ') # i.e. wonderful 23 | ] 24 | 25 | def train_brill_tagger(initial_tagger, train_sents, **kwargs): 26 | templates = [ 27 | brill.Template(brill.Pos([-1])), 28 | brill.Template(brill.Pos([1])), 29 | brill.Template(brill.Pos([-2])), 30 | brill.Template(brill.Pos([2])), 31 | brill.Template(brill.Pos([-2, -1])), 32 | brill.Template(brill.Pos([1, 2])), 33 | brill.Template(brill.Pos([-3, -2, -1])), 34 | brill.Template(brill.Pos([1, 2, 3])), 35 | brill.Template(brill.Pos([-1]), brill.Pos([1])), 36 | brill.Template(brill.Word([-1])), 37 | brill.Template(brill.Word([1])), 38 | brill.Template(brill.Word([-2])), 39 | brill.Template(brill.Word([2])), 40 | brill.Template(brill.Word([-2, -1])), 41 | brill.Template(brill.Word([1, 2])), 42 | brill.Template(brill.Word([-3, -2, -1])), 43 | brill.Template(brill.Word([1, 2, 3])), 44 | brill.Template(brill.Word([-1]), brill.Word([1])), 45 | ] 46 | 47 | trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True) 48 | return trainer.train(train_sents, **kwargs) 49 | 50 | def unigram_feature_detector(tokens, index, history): 51 | return {'word': tokens[index]} -------------------------------------------------------------------------------- /Module 2/Chapter 4/7853OS_04_Codes/taggers.py: -------------------------------------------------------------------------------- 1 | from nltk.tag import NgramTagger, SequentialBackoffTagger 2 | from nltk.corpus import wordnet, names 3 | from nltk.probability import FreqDist 4 | 5 | class QuadgramTagger(NgramTagger): 6 | def __init__(self, *args, **kwargs): 7 | NgramTagger.__init__(self, 4, *args, **kwargs) 8 | 9 | class WordNetTagger(SequentialBackoffTagger): 10 | ''' 11 | >>> wt = WordNetTagger() 12 | >>> wt.tag(['food', 'is', 'great']) 13 | [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')] 14 | ''' 15 | def __init__(self, *args, **kwargs): 16 | SequentialBackoffTagger.__init__(self, *args, **kwargs) 17 | 18 | self.wordnet_tag_map = { 19 | 'n': 'NN', 20 | 's': 'JJ', 21 | 'a': 'JJ', 22 | 'r': 'RB', 23 | 'v': 'VB' 24 | } 25 | 26 | def choose_tag(self, tokens, index, history): 27 | word = tokens[index] 28 | fd = FreqDist() 29 | 30 | for synset in wordnet.synsets(word): 31 | fd[synset.pos()] += 1 32 | 33 | if not fd: return None 34 | return self.wordnet_tag_map.get(fd.max()) 35 | 36 | class NamesTagger(SequentialBackoffTagger): 37 | ''' 38 | >>> nt = NamesTagger() 39 | >>> nt.tag(['Jacob']) 40 | [('Jacob', 'NNP')] 41 | ''' 42 | def __init__(self, *args, **kwargs): 43 | SequentialBackoffTagger.__init__(self, *args, **kwargs) 44 | self.name_set = set([n.lower() for n in names.words()]) 45 | 46 | def choose_tag(self, tokens, index, history): 47 | word = tokens[index] 48 | 49 | if word.lower() in self.name_set: 50 | return 'NNP' 51 | else: 52 | return None 53 | 54 | if __name__ == '__main__': 55 | import doctest 56 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 5/7853OS_05_Codes/chunkers.py: -------------------------------------------------------------------------------- 1 | import nltk.tag 2 | from nltk.chunk import ChunkParserI 3 | from nltk.chunk.util import tree2conlltags, 
conlltags2tree 4 | from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger 5 | from nltk.corpus import names, ieer, gazetteers 6 | from tag_util import backoff_tagger 7 | 8 | def conll_tag_chunks(chunk_sents): 9 | '''Convert each chunked sentence to list of (tag, chunk_tag) tuples, 10 | so the final result is a list of lists of (tag, chunk_tag) tuples. 11 | >>> from nltk.tree import Tree 12 | >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]) 13 | >>> conll_tag_chunks([t]) 14 | [[('DT', 'B-NP'), ('NN', 'I-NP')]] 15 | ''' 16 | tagged_sents = [tree2conlltags(tree) for tree in chunk_sents] 17 | return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents] 18 | 19 | class TagChunker(ChunkParserI): 20 | '''Chunks tagged tokens using Ngram Tagging.''' 21 | def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]): 22 | '''Train Ngram taggers on chunked sentences''' 23 | train_sents = conll_tag_chunks(train_chunks) 24 | self.tagger = backoff_tagger(train_sents, tagger_classes) 25 | 26 | def parse(self, tagged_sent): 27 | '''Parsed tagged tokens into parse Tree of chunks''' 28 | if not tagged_sent: return None 29 | (words, tags) = zip(*tagged_sent) 30 | chunks = self.tagger.tag(tags) 31 | # create conll str for tree parsing 32 | wtc = zip(words, chunks) 33 | return conlltags2tree([(w,t,c) for (w,(t,c)) in wtc]) 34 | 35 | def chunk_trees2train_chunks(chunk_sents): 36 | tag_sents = [tree2conlltags(sent) for sent in chunk_sents] 37 | return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents] 38 | 39 | def prev_next_pos_iob(tokens, index, history): 40 | word, pos = tokens[index] 41 | 42 | if index == 0: 43 | prevword, prevpos, previob = ('',)*3 44 | else: 45 | prevword, prevpos = tokens[index-1] 46 | previob = history[index-1] 47 | 48 | if index == len(tokens) - 1: 49 | nextword, nextpos = ('',)*2 50 | else: 51 | nextword, nextpos = tokens[index+1] 52 | 53 | feats = { 54 | 'word': word, 55 | 'pos': pos, 56 | 'nextword': nextword, 57 | 'nextpos': nextpos, 58 | 'prevword': prevword, 59 | 'prevpos': prevpos, 60 | 'previob': previob 61 | } 62 | 63 | return feats 64 | 65 | class ClassifierChunker(ChunkParserI): 66 | def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs): 67 | if not feature_detector: 68 | feature_detector = self.feature_detector 69 | 70 | train_chunks = chunk_trees2train_chunks(train_sents) 71 | self.tagger = ClassifierBasedTagger(train=train_chunks, 72 | feature_detector=feature_detector, **kwargs) 73 | 74 | def parse(self, tagged_sent): 75 | if not tagged_sent: return None 76 | chunks = self.tagger.tag(tagged_sent) 77 | return conlltags2tree([(w,t,c) for ((w,t),c) in chunks]) 78 | 79 | def sub_leaves(tree, label): 80 | return [t.leaves() for t in tree.subtrees(lambda s: s.label() == label)] 81 | 82 | class PersonChunker(ChunkParserI): 83 | ''' 84 | >>> from nltk.corpus import treebank_chunk 85 | >>> chunker = PersonChunker() 86 | >>> sub_leaves(chunker.parse(treebank_chunk.tagged_sents()[0]), 'PERSON') 87 | [[('Pierre', 'NNP')]] 88 | ''' 89 | def __init__(self): 90 | self.name_set = set(names.words()) 91 | 92 | def parse(self, tagged_sent): 93 | iobs = [] 94 | in_person = False 95 | 96 | for word, tag in tagged_sent: 97 | if word in self.name_set and in_person: 98 | iobs.append((word, tag, 'I-PERSON')) 99 | elif word in self.name_set: 100 | iobs.append((word, tag, 'B-PERSON')) 101 | in_person = True 102 | else: 103 | iobs.append((word, tag, 'O')) 104 | in_person = False 105 | 106 | return 
conlltags2tree(iobs) 107 | 108 | class LocationChunker(ChunkParserI): 109 | '''Chunks locations based on the gazetteers corpus. 110 | >>> loc = LocationChunker() 111 | >>> t = loc.parse([('San', 'NNP'), ('Francisco', 'NNP'), ('CA', 'NNP'), ('is', 'BE'), ('cold', 'JJ'), ('compared', 'VBD'), ('to', 'TO'), ('San', 'NNP'), ('Jose', 'NNP'), ('CA', 'NNP')]) 112 | >>> sub_leaves(t, 'LOCATION') 113 | [[('San', 'NNP'), ('Francisco', 'NNP'), ('CA', 'NNP')], [('San', 'NNP'), ('Jose', 'NNP'), ('CA', 'NNP')]] 114 | ''' 115 | def __init__(self): 116 | # gazetteers is a WordListCorpusReader of many different location words 117 | self.locations = set(gazetteers.words()) 118 | self.lookahead = 0 119 | # need to know how many words to lookahead in the tagged sentence to find a location 120 | for loc in self.locations: 121 | nwords = loc.count(' ') 122 | 123 | if nwords > self.lookahead: 124 | self.lookahead = nwords 125 | 126 | def iob_locations(self, tagged_sent): 127 | i = 0 128 | l = len(tagged_sent) 129 | inside = False 130 | 131 | while i < l: 132 | word, tag = tagged_sent[i] 133 | j = i + 1 134 | k = j + self.lookahead 135 | nextwords, nexttags = [], [] 136 | loc = False 137 | # lookahead in the sentence to find multi-word locations 138 | while j < k: 139 | if ' '.join([word] + nextwords) in self.locations: 140 | # combine multiple separate locations into single location chunk 141 | if inside: 142 | yield word, tag, 'I-LOCATION' 143 | else: 144 | yield word, tag, 'B-LOCATION' 145 | # every next word is inside the location chunk 146 | for nword, ntag in zip(nextwords, nexttags): 147 | yield nword, ntag, 'I-LOCATION' 148 | # found a location, so we're inside a chunk 149 | loc, inside = True, True 150 | # move forward to the next word since the current words 151 | # are already chunked 152 | i = j 153 | break 154 | 155 | if j < l: 156 | nextword, nexttag = tagged_sent[j] 157 | nextwords.append(nextword) 158 | nexttags.append(nexttag) 159 | j += 1 160 | else: 161 | break 162 | # if no location found, then we're outside the location chunk 163 | if not loc: 164 | inside = False 165 | i += 1 166 | yield word, tag, 'O' 167 | 168 | def parse(self, tagged_sent): 169 | iobs = self.iob_locations(tagged_sent) 170 | return conlltags2tree(iobs) 171 | 172 | def ieertree2conlltags(tree, tag=nltk.tag.pos_tag): 173 | # tree.pos() flattens the tree and produces [(word, node)] where node is 174 | # from the word's parent tree node. 
words in a chunk therefore get the 175 | # chunk tag, while words outside a chunk get the same tag as the tree's 176 | # top node 177 | words, ents = zip(*tree.pos()) 178 | iobs = [] 179 | prev = None 180 | # construct iob tags from entity names 181 | for ent in ents: 182 | # any entity that is the same as the tree's top node is outside a chunk 183 | if ent == tree.label(): 184 | iobs.append('O') 185 | prev = None 186 | # have a previous entity that is equal so this is inside the chunk 187 | elif prev == ent: 188 | iobs.append('I-%s' % ent) 189 | # no previous equal entity in the sequence, so this is the beginning of 190 | # an entity chunk 191 | else: 192 | iobs.append('B-%s' % ent) 193 | prev = ent 194 | # get tags for each word, then construct 3-tuple for conll tags 195 | words, tags = zip(*tag(words)) 196 | return zip(words, tags, iobs) 197 | 198 | def ieer_chunked_sents(tag=nltk.tag.pos_tag): 199 | for doc in ieer.parsed_docs(): 200 | tagged = ieertree2conlltags(doc.text, tag) 201 | yield conlltags2tree(tagged) 202 | 203 | if __name__ == '__main__': 204 | import doctest 205 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 6/7853OS_06_codes/chapter6.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ============================= 3 | Filtering Insignificant Words 4 | ============================= 5 | 6 | >>> from transforms import filter_insignificant 7 | >>> filter_insignificant([('your', 'PRP$'), ('book', 'NN'), ('is', 'VBZ'), ('great', 'JJ')], tag_suffixes=['PRP', 'PRP$']) 8 | [('book', 'NN'), ('is', 'VBZ'), ('great', 'JJ')] 9 | 10 | 11 | ===================== 12 | Swapping Verb Phrases 13 | ===================== 14 | 15 | >>> from transforms import swap_verb_phrase 16 | >>> swap_verb_phrase(filter_insignificant([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')])) 17 | [('fantastic', 'JJ'), ('gripping', 'VBG'), ('book', 'NN')] 18 | >>> filter_insignificant(swap_verb_phrase([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')])) 19 | [('fantastic', 'JJ'), ('gripping', 'VBG'), ('book', 'NN')] 20 | 21 | 22 | ============================== 23 | Chaining Chunk Transformations 24 | ============================== 25 | 26 | >>> from transforms import transform_chunk 27 | >>> transform_chunk([('the', 'DT'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')], trace=1) 28 | filter_insignificant : [('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')] 29 | swap_verb_phrase : [('delicious', 'JJ'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS')] 30 | swap_infinitive_phrase : [('delicious', 'JJ'), ('recipes', 'NNS'), ('book', 'NN')] 31 | singularize_plural_noun : [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 32 | [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 33 | 34 | 35 | =============================== 36 | Converting a Chunk Tree to Text 37 | =============================== 38 | 39 | >>> from nltk.corpus import treebank_chunk 40 | >>> tree = treebank_chunk.chunked_sents()[0] 41 | >>> ' '.join([w for w, t in tree.leaves()]) 42 | 'Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .' 43 | 44 | >>> from transforms import chunk_tree_to_sent 45 | >>> chunk_tree_to_sent(tree) 46 | 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.' 
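
A minimal sketch of the idea behind chunk_tree_to_sent (illustrative only; the
actual implementation lives in transforms.py and may differ): join the leaf
words of the chunk tree, then strip the space that join() leaves in front of
punctuation.

    import re

    def tree_to_sentence(tree):
        # flatten the tree to its (word, tag) leaves and join the words
        text = ' '.join(word for word, tag in tree.leaves())
        # remove the space before punctuation: 'Nov. 29 .' -> 'Nov. 29.'
        return re.sub(r'\s+([,.;:?!])', r'\1', text)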
47 | 48 | 49 | ===================== 50 | Flattening Deep Trees 51 | ===================== 52 | 53 | >>> from nltk.corpus import treebank 54 | >>> from transforms import flatten_deeptree 55 | >>> flatten_deeptree(treebank.parsed_sents()[0]) 56 | Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 57 | 58 | >>> from nltk.tree import Tree 59 | >>> Tree('NNP', ['Pierre']).height() 60 | 2 61 | 62 | >>> Tree('NNP', ['Pierre']).pos() 63 | [('Pierre', 'NNP')] 64 | 65 | >>> Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).height() 66 | 3 67 | 68 | >>> Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).pos() 69 | [('Pierre', 'NNP'), ('Vinken', 'NNP')] 70 | 71 | >>> from nltk.chunk.util import tree2conlltags 72 | >>> tree2conlltags(treebank.parsed_sents()[0]) 73 | Traceback (most recent call last): 74 | File "", line 1, in 75 | File "/usr/local/lib/python2.6/dist-packages/nltk/chunk/util.py", line 417, in tree2conlltags 76 | raise ValueError, "Tree is too deeply nested to be printed in CoNLL format" 77 | ValueError: Tree is too deeply nested to be printed in CoNLL format 78 | 79 | >>> tree2conlltags(flatten_deeptree(treebank.parsed_sents()[0])) 80 | [('Pierre', 'NNP', 'B-NP'), ('Vinken', 'NNP', 'I-NP'), (',', ',', 'O'), ('61', 'CD', 'B-NP'), ('years', 'NNS', 'I-NP'), ('old', 'JJ', 'O'), (',', ',', 'O'), ('will', 'MD', 'O'), ('join', 'VB', 'O'), ('the', 'DT', 'B-NP'), ('board', 'NN', 'I-NP'), ('as', 'IN', 'O'), ('a', 'DT', 'B-NP'), ('nonexecutive', 'JJ', 'I-NP'), ('director', 'NN', 'I-NP'), ('Nov.', 'NNP', 'B-NP-TMP'), ('29', 'CD', 'I-NP-TMP'), ('.', '.', 'O')] 81 | 82 | >>> from nltk.corpus import cess_esp 83 | >>> cess_esp.parsed_sents()[0].height() 84 | 22 85 | >>> flatten_deeptree(cess_esp.parsed_sents()[0]).height() 86 | 3 87 | 88 | 89 | ======================= 90 | Creating a Shallow Tree 91 | ======================= 92 | 93 | >>> from transforms import shallow_tree 94 | >>> shallow_tree(treebank.parsed_sents()[0]) 95 | Tree('S', [Tree('NP-SBJ', [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ',')]), Tree('VP', [('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 96 | 97 | >>> treebank.parsed_sents()[0].height() 98 | 7 99 | >>> shallow_tree(treebank.parsed_sents()[0]).height() 100 | 3 101 | 102 | 103 | ====================== 104 | Converting Tree Labels 105 | ====================== 106 | 107 | >>> from transforms import convert_tree_labels 108 | >>> mapping = {'NP-SBJ': 'NP', 'NP-TMP': 'NP'} 109 | >>> convert_tree_labels(treebank.parsed_sents()[0], mapping) 110 | Tree('S', [Tree('NP', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP', [Tree('NNP', ['Nov.']), Tree('CD', 
['29'])])])]), Tree('.', ['.'])]) 111 | ''' 112 | # TODO: also do a task on converting tree nodes so NP-TMP -> NP 113 | 114 | if __name__ == '__main__': 115 | import doctest 116 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 7/7853OS_07_Codes/chapter7.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ================================= 3 | Training a Naive Bayes Classifier 4 | ================================= 5 | 6 | >>> from nltk.corpus import movie_reviews 7 | >>> from featx import label_feats_from_corpus, split_label_feats 8 | >>> movie_reviews.categories() 9 | ['neg', 'pos'] 10 | >>> lfeats = label_feats_from_corpus(movie_reviews) 11 | >>> lfeats.keys() 12 | dict_keys(['neg', 'pos']) 13 | >>> train_feats, test_feats = split_label_feats(lfeats) 14 | >>> len(train_feats) 15 | 1500 16 | >>> len(test_feats) 17 | 500 18 | 19 | >>> from nltk.classify import NaiveBayesClassifier 20 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats) 21 | >>> nb_classifier.labels() 22 | ['neg', 'pos'] 23 | 24 | >>> from featx import bag_of_words 25 | >>> negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous']) 26 | >>> nb_classifier.classify(negfeat) 27 | 'neg' 28 | >>> posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible']) 29 | >>> nb_classifier.classify(posfeat) 30 | 'pos' 31 | 32 | >>> from nltk.classify.util import accuracy 33 | >>> accuracy(nb_classifier, test_feats) 34 | 0.728 35 | 36 | >>> probs = nb_classifier.prob_classify(test_feats[0][0]) 37 | >>> probs.samples() 38 | dict_keys(['neg', 'pos']) 39 | >>> probs.max() 40 | 'pos' 41 | >>> probs.prob('pos') 42 | 0.9999999646430913 43 | >>> probs.prob('neg') 44 | 3.535688969240647e-08 45 | 46 | >>> nb_classifier.most_informative_features(n=5) 47 | [('magnificent', True), ('outstanding', True), ('insulting', True), ('vulnerable', True), ('ludicrous', True)] 48 | 49 | >>> from nltk.probability import LaplaceProbDist 50 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist) 51 | >>> accuracy(nb_classifier, test_feats) 52 | 0.716 53 | 54 | >>> from nltk.probability import DictionaryProbDist 55 | >>> label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5}) 56 | >>> true_probdist = DictionaryProbDist({True: 1}) 57 | >>> feature_probdist = {('pos', 'yes'): true_probdist, ('neg', 'no'): true_probdist} 58 | >>> classifier = NaiveBayesClassifier(label_probdist, feature_probdist) 59 | >>> classifier.classify({'yes': True}) 60 | 'pos' 61 | >>> classifier.classify({'no': True}) 62 | 'neg' 63 | 64 | 65 | =================================== 66 | Training a Decision Tree Classifier 67 | =================================== 68 | 69 | >>> from nltk.classify import DecisionTreeClassifier 70 | >>> dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30) 71 | >>> accuracy(dt_classifier, test_feats) 72 | 0.688 73 | 74 | >>> from nltk.probability import FreqDist, MLEProbDist, entropy 75 | >>> fd = FreqDist({'pos': 30, 'neg': 10}) 76 | >>> entropy(MLEProbDist(fd)) 77 | 0.8112781244591328 78 | >>> fd['neg'] = 25 79 | >>> entropy(MLEProbDist(fd)) 80 | 0.9940302114769565 81 | >>> fd['neg'] = 30 82 | >>> entropy(MLEProbDist(fd)) 83 | 1.0 84 | >>> fd['neg'] = 1 85 | >>> entropy(MLEProbDist(fd)) 86 | 0.20559250818508304 87 | 88 | 89 | ===================================== 90 | Training a Maximum Entropy Classifier 91 | 
===================================== 92 | 93 | >>> from nltk.classify import MaxentClassifier 94 | >>> me_classifier = MaxentClassifier.train(train_feats, trace=0, max_iter=1, min_lldelta=0.5) 95 | >>> accuracy(me_classifier, test_feats) 96 | 0.5 97 | 98 | >>> me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5) 99 | >>> accuracy(me_classifier, test_feats) 100 | 0.722 101 | 102 | ================================= 103 | Training Scikit-Learn Classifiers 104 | ================================= 105 | 106 | >>> from nltk.classify.scikitlearn import SklearnClassifier 107 | >>> from sklearn.naive_bayes import MultinomialNB 108 | >>> sk_classifier = SklearnClassifier(MultinomialNB()) 109 | >>> sk_classifier.train(train_feats) 110 | 111 | >>> accuracy(sk_classifier, test_feats) 112 | 0.83 113 | 114 | >>> from sklearn.naive_bayes import BernoulliNB 115 | >>> sk_classifier = SklearnClassifier(BernoulliNB()) 116 | >>> sk_classifier.train(train_feats) 117 | 118 | >>> accuracy(sk_classifier, test_feats) 119 | 0.812 120 | 121 | >>> from sklearn.linear_model import LogisticRegression 122 | >>> sk_classifier = SklearnClassifier(LogisticRegression()).train(train_feats) 123 | >>> accuracy(sk_classifier, test_feats) 124 | 0.892 125 | 126 | >>> from sklearn.svm import SVC 127 | >>> sk_classifier = SklearnClassifier(SVC()).train(train_feats) 128 | >>> accuracy(sk_classifier, test_feats) 129 | 0.69 130 | 131 | >>> from sklearn.svm import LinearSVC 132 | >>> sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats) 133 | >>> accuracy(sk_classifier, test_feats) 134 | 0.864 135 | 136 | >>> from sklearn.svm import NuSVC 137 | >>> sk_classifier = SklearnClassifier(NuSVC()).train(train_feats) 138 | >>> accuracy(sk_classifier, test_feats) 139 | 0.882 140 | 141 | ============================================== 142 | Measuring Precision and Recall of a Classifier 143 | ============================================== 144 | 145 | >>> from classification import precision_recall 146 | >>> nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) 147 | >>> nb_precisions['pos'] 148 | 0.6413612565445026 149 | >>> nb_precisions['neg'] 150 | 0.9576271186440678 151 | >>> nb_recalls['pos'] 152 | 0.98 153 | >>> nb_recalls['neg'] 154 | 0.452 155 | 156 | >>> me_precisions, me_recalls = precision_recall(me_classifier, test_feats) 157 | >>> me_precisions['pos'] 158 | 0.6456692913385826 159 | >>> me_precisions['neg'] 160 | 0.9663865546218487 161 | >>> me_recalls['pos'] 162 | 0.984 163 | >>> me_recalls['neg'] 164 | 0.46 165 | 166 | >>> sk_precisions, sk_recalls = precision_recall(sk_classifier, test_feats) 167 | >>> sk_precisions['pos'] 168 | 0.9063829787234042 169 | >>> sk_precisions['neg'] 170 | 0.8603773584905661 171 | >>> sk_recalls['pos'] 172 | 0.852 173 | >>> sk_recalls['neg'] 174 | 0.912 175 | 176 | 177 | ================================== 178 | Calculating High Information Words 179 | ================================== 180 | 181 | >>> from featx import high_information_words, bag_of_words_in_set 182 | >>> labels = movie_reviews.categories() 183 | >>> labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] 184 | >>> high_info_words = set(high_information_words(labeled_words)) 185 | >>> feat_det = lambda words: bag_of_words_in_set(words, high_info_words) 186 | >>> lfeats = label_feats_from_corpus(movie_reviews, feature_detector=feat_det) 187 | >>> train_feats, test_feats = split_label_feats(lfeats) 188 | 189 | >>> nb_classifier = 
NaiveBayesClassifier.train(train_feats) 190 | >>> accuracy(nb_classifier, test_feats) 191 | 0.91 192 | >>> nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) 193 | >>> nb_precisions['pos'] 194 | 0.8988326848249028 195 | >>> nb_precisions['neg'] 196 | 0.9218106995884774 197 | >>> nb_recalls['pos'] 198 | 0.924 199 | >>> nb_recalls['neg'] 200 | 0.896 201 | 202 | >>> me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5) 203 | >>> accuracy(me_classifier, test_feats) 204 | 0.912 205 | >>> me_precisions, me_recalls = precision_recall(me_classifier, test_feats) 206 | >>> me_precisions['pos'] 207 | 0.8992248062015504 208 | >>> me_precisions['neg'] 209 | 0.9256198347107438 210 | >>> me_recalls['pos'] 211 | 0.928 212 | >>> me_recalls['neg'] 213 | 0.896 214 | 215 | >>> dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, depth_cutoff=20, support_cutoff=20, entropy_cutoff=0.01) 216 | >>> accuracy(dt_classifier, test_feats) 217 | 0.688 218 | >>> dt_precisions, dt_recalls = precision_recall(dt_classifier, test_feats) 219 | >>> dt_precisions['pos'] 220 | 0.6766917293233082 221 | >>> dt_precisions['neg'] 222 | 0.7008547008547008 223 | >>> dt_recalls['pos'] 224 | 0.72 225 | >>> dt_recalls['neg'] 226 | 0.656 227 | 228 | >>> sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats) 229 | >>> accuracy(sk_classifier, test_feats) 230 | 0.86 231 | >>> sk_precisions, sk_recalls = precision_recall(sk_classifier, test_feats) 232 | >>> sk_precisions['pos'] 233 | 0.871900826446281 234 | >>> sk_precisions['neg'] 235 | 0.8488372093023255 236 | >>> sk_recalls['pos'] 237 | 0.844 238 | >>> sk_recalls['neg'] 239 | 0.876 240 | 241 | 242 | ================================= 243 | Combining Classifiers with Voting 244 | ================================= 245 | 246 | >>> from classification import MaxVoteClassifier 247 | >>> mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, me_classifier, sk_classifier) 248 | >>> mv_classifier.labels() 249 | ['neg', 'pos'] 250 | >>> accuracy(mv_classifier, test_feats) 251 | 0.894 252 | >>> mv_precisions, mv_recalls = precision_recall(mv_classifier, test_feats) 253 | >>> mv_precisions['pos'] 254 | 0.9156118143459916 255 | >>> mv_precisions['neg'] 256 | 0.8745247148288974 257 | >>> mv_recalls['pos'] 258 | 0.868 259 | >>> mv_recalls['neg'] 260 | 0.92 261 | 262 | 263 | ============================================ 264 | Classifying with Multiple Binary Classifiers 265 | ============================================ 266 | 267 | >>> from nltk.corpus import reuters 268 | >>> len(reuters.categories()) 269 | 90 270 | 271 | >>> from featx import reuters_high_info_words, reuters_train_test_feats 272 | >>> rwords = reuters_high_info_words() 273 | >>> featdet = lambda words: bag_of_words_in_set(words, rwords) 274 | >>> multi_train_feats, multi_test_feats = reuters_train_test_feats(featdet) 275 | 276 | >>> from classification import train_binary_classifiers 277 | >>> trainf = lambda train_feats: SklearnClassifier(LogisticRegression()).train(train_feats) 278 | >>> labelset = set(reuters.categories()) 279 | >>> classifiers = train_binary_classifiers(trainf, multi_train_feats, labelset) 280 | >>> len(classifiers) 281 | 90 282 | 283 | >>> from classification import MultiBinaryClassifier, multi_metrics 284 | >>> multi_classifier = MultiBinaryClassifier(*classifiers.items()) 285 | 286 | >>> multi_precisions, multi_recalls, avg_md = multi_metrics(multi_classifier, multi_test_feats) 287 | >>> avg_md 288 | 
0.23310715863026216 289 | 290 | >>> multi_precisions['soybean'] 291 | 0.7857142857142857 292 | >>> multi_recalls['soybean'] 293 | 0.3333333333333333 294 | >>> len(reuters.fileids(categories=['soybean'])) 295 | 111 296 | 297 | >>> multi_precisions['sunseed'] 298 | 1.0 299 | >>> multi_recalls['sunseed'] 300 | 0.2 301 | >>> len(reuters.fileids(categories=['sunseed'])) 302 | 16 303 | ''' 304 | 305 | if __name__ == '__main__': 306 | import doctest 307 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 7/7853OS_07_Codes/classification.py: -------------------------------------------------------------------------------- 1 | import collections, itertools 2 | from nltk import metrics 3 | from nltk.classify import util, ClassifierI, MultiClassifierI 4 | from nltk.probability import FreqDist 5 | 6 | def precision_recall(classifier, testfeats): 7 | refsets = collections.defaultdict(set) 8 | testsets = collections.defaultdict(set) 9 | 10 | for i, (feats, label) in enumerate(testfeats): 11 | refsets[label].add(i) 12 | observed = classifier.classify(feats) 13 | testsets[observed].add(i) 14 | 15 | precisions = {} 16 | recalls = {} 17 | 18 | for label in classifier.labels(): 19 | precisions[label] = metrics.precision(refsets[label], testsets[label]) 20 | recalls[label] = metrics.recall(refsets[label], testsets[label]) 21 | 22 | return precisions, recalls 23 | 24 | class MaxVoteClassifier(ClassifierI): 25 | def __init__(self, *classifiers): 26 | self._classifiers = classifiers 27 | self._labels = sorted(set(itertools.chain(*[c.labels() for c in classifiers]))) 28 | 29 | def labels(self): 30 | return self._labels 31 | 32 | def classify(self, feats): 33 | counts = FreqDist() 34 | 35 | for classifier in self._classifiers: 36 | counts[classifier.classify(feats)] += 1 37 | 38 | return counts.max() 39 | 40 | class MultiBinaryClassifier(MultiClassifierI): 41 | def __init__(self, *label_classifiers): 42 | self._label_classifiers = dict(label_classifiers) 43 | self._labels = sorted(self._label_classifiers.keys()) 44 | 45 | def labels(self): 46 | return self._labels 47 | 48 | def classify(self, feats): 49 | lbls = set() 50 | 51 | for label, classifier in self._label_classifiers.items(): 52 | if classifier.classify(feats) == label: 53 | lbls.add(label) 54 | 55 | return lbls 56 | 57 | def train_binary_classifiers(trainf, labelled_feats, labelset): 58 | pos_feats = collections.defaultdict(list) 59 | neg_feats = collections.defaultdict(list) 60 | classifiers = {} 61 | 62 | for feat, labels in labelled_feats: 63 | for label in labels: 64 | pos_feats[label].append(feat) 65 | 66 | for label in labelset - set(labels): 67 | neg_feats[label].append(feat) 68 | 69 | for label in labelset: 70 | postrain = [(feat, label) for feat in pos_feats[label]] 71 | negtrain = [(feat, '!%s' % label) for feat in neg_feats[label]] 72 | classifiers[label] = trainf(postrain + negtrain) 73 | 74 | return classifiers 75 | 76 | def multi_metrics(multi_classifier, test_feats): 77 | mds = [] 78 | refsets = collections.defaultdict(set) 79 | testsets = collections.defaultdict(set) 80 | 81 | for i, (feat, labels) in enumerate(test_feats): 82 | for label in labels: 83 | refsets[label].add(i) 84 | 85 | guessed = multi_classifier.classify(feat) 86 | 87 | for label in guessed: 88 | testsets[label].add(i) 89 | 90 | mds.append(metrics.masi_distance(set(labels), guessed)) 91 | 92 | avg_md = sum(mds) / float(len(mds)) 93 | precisions = {} 94 | recalls = {} 95 | 96 | for label in 
multi_classifier.labels(): 97 | precisions[label] = metrics.precision(refsets[label], testsets[label]) 98 | recalls[label] = metrics.recall(refsets[label], testsets[label]) 99 | 100 | return precisions, recalls, avg_md -------------------------------------------------------------------------------- /Module 2/Chapter 7/7853OS_07_Codes/featx.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from nltk.corpus import stopwords, reuters 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | from nltk.probability import FreqDist, ConditionalFreqDist 6 | 7 | def bag_of_words(words): 8 | ''' 9 | >>> bag_of_words(['the', 'quick', 'brown', 'fox']) 10 | {'quick': True, 'brown': True, 'the': True, 'fox': True} 11 | ''' 12 | return dict([(word, True) for word in words]) 13 | 14 | def bag_of_words_not_in_set(words, badwords): 15 | ''' 16 | >>> bag_of_words_not_in_set(['the', 'quick', 'brown', 'fox'], ['the']) 17 | {'quick': True, 'brown': True, 'fox': True} 18 | ''' 19 | return bag_of_words(set(words) - set(badwords)) 20 | 21 | def bag_of_non_stopwords(words, stopfile='english'): 22 | ''' 23 | >>> bag_of_non_stopwords(['the', 'quick', 'brown', 'fox']) 24 | {'quick': True, 'brown': True, 'fox': True} 25 | ''' 26 | badwords = stopwords.words(stopfile) 27 | return bag_of_words_not_in_set(words, badwords) 28 | 29 | def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200): 30 | ''' 31 | >>> bag_of_bigrams_words(['the', 'quick', 'brown', 'fox']) 32 | {'brown': True, ('brown', 'fox'): True, ('the', 'quick'): True, 'quick': True, ('quick', 'brown'): True, 'the': True, 'fox': True} 33 | ''' 34 | bigram_finder = BigramCollocationFinder.from_words(words) 35 | bigrams = bigram_finder.nbest(score_fn, n) 36 | return bag_of_words(words + bigrams) 37 | 38 | def bag_of_words_in_set(words, goodwords): 39 | return bag_of_words(set(words) & set(goodwords)) 40 | 41 | def label_feats_from_corpus(corp, feature_detector=bag_of_words): 42 | label_feats = collections.defaultdict(list) 43 | 44 | for label in corp.categories(): 45 | for fileid in corp.fileids(categories=[label]): 46 | feats = feature_detector(corp.words(fileids=[fileid])) 47 | label_feats[label].append(feats) 48 | 49 | return label_feats 50 | 51 | def split_label_feats(lfeats, split=0.75): 52 | train_feats = [] 53 | test_feats = [] 54 | 55 | for label, feats in lfeats.items(): 56 | cutoff = int(len(feats) * split) 57 | train_feats.extend([(feat, label) for feat in feats[:cutoff]]) 58 | test_feats.extend([(feat, label) for feat in feats[cutoff:]]) 59 | 60 | return train_feats, test_feats 61 | 62 | def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5): 63 | word_fd = FreqDist() 64 | label_word_fd = ConditionalFreqDist() 65 | 66 | for label, words in labelled_words: 67 | for word in words: 68 | word_fd[word] += 1 69 | label_word_fd[label][word] += 1 70 | 71 | n_xx = label_word_fd.N() 72 | high_info_words = set() 73 | 74 | for label in label_word_fd.conditions(): 75 | n_xi = label_word_fd[label].N() 76 | word_scores = collections.defaultdict(int) 77 | 78 | for word, n_ii in label_word_fd[label].items(): 79 | n_ix = word_fd[word] 80 | score = score_fn(n_ii, (n_ix, n_xi), n_xx) 81 | word_scores[word] = score 82 | 83 | bestwords = [word for word, score in word_scores.items() if score >= min_score] 84 | high_info_words |= set(bestwords) 85 | 86 | return high_info_words 87 | 88 | def 
reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq): 89 | labeled_words = [] 90 | 91 | for label in reuters.categories(): 92 | labeled_words.append((label, reuters.words(categories=[label]))) 93 | 94 | return high_information_words(labeled_words, score_fn=score_fn) 95 | 96 | def reuters_train_test_feats(feature_detector=bag_of_words): 97 | train_feats = [] 98 | test_feats = [] 99 | 100 | for fileid in reuters.fileids(): 101 | if fileid.startswith('training'): 102 | featlist = train_feats 103 | else: # fileid.startswith('test') 104 | featlist = test_feats 105 | 106 | feats = feature_detector(reuters.words(fileid)) 107 | labels = reuters.categories(fileid) 108 | featlist.append((feats, labels)) 109 | 110 | return train_feats, test_feats 111 | 112 | if __name__ == '__main__': 113 | import doctest 114 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/chapter8.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ================================ 3 | Distributed Tagging with Execnet 4 | ================================ 5 | 6 | >>> import execnet, remote_tag, nltk.tag, nltk.data 7 | >>> from nltk.corpus import treebank 8 | >>> import pickle 9 | >>> tagger = pickle.dumps(nltk.data.load(nltk.tag._POS_TAGGER)) 10 | >>> gw = execnet.makegateway() 11 | >>> channel = gw.remote_exec(remote_tag) 12 | >>> channel.send(tagger) 13 | >>> channel.send(treebank.sents()[0]) 14 | >>> tagged_sentence = channel.receive() 15 | >>> tagged_sentence == treebank.tagged_sents()[0] 16 | True 17 | >>> gw.exit() 18 | 19 | >>> import itertools 20 | >>> gw1 = execnet.makegateway() 21 | >>> gw2 = execnet.makegateway() 22 | >>> ch1 = gw1.remote_exec(remote_tag) 23 | >>> ch1.send(tagger) 24 | >>> ch2 = gw2.remote_exec(remote_tag) 25 | >>> ch2.send(tagger) 26 | >>> mch = execnet.MultiChannel([ch1, ch2]) 27 | >>> queue = mch.make_receive_queue() 28 | >>> channels = itertools.cycle(mch) 29 | >>> for sentence in treebank.sents()[:4]: 30 | ... channel = next(channels) 31 | ... channel.send(sentence) 32 | >>> tagged_sentences = [] 33 | >>> for i in range(4): 34 | ... channel, tagged_sentence = queue.get() 35 | ... 
tagged_sentences.append(tagged_sentence) 36 | >>> len(tagged_sentences) 37 | 4 38 | >>> gw1.exit() 39 | >>> gw2.exit() 40 | 41 | 42 | ================================= 43 | Distributed Chunking with Execnet 44 | ================================= 45 | 46 | >>> import remote_chunk, nltk.chunk 47 | >>> from nltk.corpus import treebank_chunk 48 | >>> chunker = pickle.dumps(nltk.data.load(nltk.chunk._MULTICLASS_NE_CHUNKER)) 49 | >>> gw = execnet.makegateway() 50 | >>> channel = gw.remote_exec(remote_chunk) 51 | >>> channel.send(tagger) 52 | >>> channel.send(chunker) 53 | >>> channel.send(treebank_chunk.sents()[0]) 54 | >>> chunk_tree = pickle.loads(channel.receive()) 55 | >>> chunk_tree 56 | Tree('S', [Tree('PERSON', [('Pierre', 'NNP')]), Tree('ORGANIZATION', [('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]) 57 | >>> gw.exit() 58 | 59 | 60 | ===================================== 61 | Parallel List Processing with Execnet 62 | ===================================== 63 | 64 | >>> import plists, remote_double 65 | >>> plists.map(remote_double, range(10)) 66 | [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] 67 | 68 | >>> plists.map(remote_double, range(10), [('popen', 4)]) 69 | [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] 70 | 71 | 72 | ====================================== 73 | Storing an Ordered Dictionary in Redis 74 | ====================================== 75 | 76 | >>> from redis import Redis 77 | >>> from rediscollections import RedisOrderedDict 78 | >>> r = Redis() 79 | >>> rod = RedisOrderedDict(r, 'scores') 80 | >>> rod['best'] = 10 81 | >>> rod['worst'] = 0.1 82 | >>> rod['middle'] = 5 83 | >>> rod.keys() 84 | [b'best', b'middle', b'worst'] 85 | >>> rod.keys(start=0, end=1) 86 | [b'best', b'middle'] 87 | >>> rod.clear() 88 | 89 | 90 | =============================================== 91 | Distributed Word Scoring with Redis and Execnet 92 | =============================================== 93 | 94 | >>> from dist_featx import score_words 95 | >>> from nltk.corpus import movie_reviews 96 | >>> labels = movie_reviews.categories() 97 | >>> labelled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] 98 | >>> word_scores = score_words(labelled_words) 99 | >>> len(word_scores) 100 | 39767 101 | >>> topn_words = word_scores.keys(end=1000) 102 | >>> topn_words[0:5] 103 | [b'bad', b',', b'and', b'?', b'movie'] 104 | >>> from redis import Redis 105 | >>> r = Redis() 106 | >>> [r.delete(key) for key in ['word_fd', 'label_word_fd:neg', 'label_word_fd:pos', 'word_scores']] 107 | [1, 1, 1, 1] 108 | ''' 109 | 110 | if __name__ == '__main__': 111 | import doctest 112 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/dist_featx.py: -------------------------------------------------------------------------------- 1 | import itertools, execnet, remote_word_count 2 | from nltk.metrics import BigramAssocMeasures 3 | from redis import Redis 4 | from redisprob import RedisHashFreqDist, RedisConditionalHashFreqDist 5 | from rediscollections import RedisOrderedDict 6 | 7 | def score_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, host='localhost', specs=[('popen', 2)]): 8 | gateways = [] 9 | channels = [] 10 | 11 | for spec, count in specs: 12 | for i in range(count): 13 | gw = execnet.makegateway(spec) 14 | 
gateways.append(gw) 15 | channel = gw.remote_exec(remote_word_count) 16 | channel.send((host, 'word_fd', 'label_word_fd')) 17 | channels.append(channel) 18 | 19 | cyc = itertools.cycle(channels) 20 | 21 | for label, words in labelled_words: 22 | channel = next(cyc) 23 | channel.send((label, list(words))) 24 | 25 | for channel in channels: 26 | channel.send('done') 27 | assert 'done' == channel.receive() 28 | channel.waitclose(5) 29 | 30 | for gateway in gateways: 31 | gateway.exit() 32 | 33 | r = Redis(host) 34 | fd = RedisHashFreqDist(r, 'word_fd') 35 | cfd = RedisConditionalHashFreqDist(r, 'label_word_fd') 36 | word_scores = RedisOrderedDict(r, 'word_scores') 37 | n_xx = cfd.N() 38 | 39 | for label in cfd.conditions(): 40 | n_xi = cfd[label].N() 41 | 42 | for word, n_ii in cfd[label].items(): 43 | word = word.decode() # must convert to string from bytes 44 | n_ix = fd[word] 45 | 46 | if n_ii and n_ix and n_xi and n_xx: 47 | score = score_fn(n_ii, (n_ix, n_xi), n_xx) 48 | word_scores[word] = score 49 | 50 | return word_scores -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/plists.py: -------------------------------------------------------------------------------- 1 | import itertools, execnet 2 | 3 | def map(mod, args, specs=[('popen', 2)]): 4 | gateways = [] 5 | channels = [] 6 | 7 | for spec, count in specs: 8 | for i in range(count): 9 | gw = execnet.makegateway(spec) 10 | gateways.append(gw) 11 | channels.append(gw.remote_exec(mod)) 12 | 13 | cyc = itertools.cycle(channels) 14 | 15 | for i, arg in enumerate(args): 16 | channel = next(cyc) 17 | channel.send((i, arg)) 18 | 19 | mch = execnet.MultiChannel(channels) 20 | queue = mch.make_receive_queue() 21 | l = len(args) 22 | results = [None] * l 23 | 24 | for j in range(l): 25 | channel, (i, result) = queue.get() 26 | results[i] = result 27 | 28 | for gw in gateways: 29 | gw.exit() 30 | 31 | return results -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_chunk.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | if __name__ == '__channelexec__': 4 | tagger = pickle.loads(channel.receive()) 5 | chunker = pickle.loads(channel.receive()) 6 | 7 | for sent in channel: 8 | tree = chunker.parse(tagger.tag(sent)) 9 | channel.send(pickle.dumps(tree)) -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_double.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == '__channelexec__': 3 | for (i, arg) in channel: 4 | channel.send((i, arg * 2)) -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_tag.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | if __name__ == '__channelexec__': 4 | tagger = pickle.loads(channel.receive()) 5 | 6 | for sentence in channel: 7 | channel.send(tagger.tag(sentence)) -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_word_count.py: -------------------------------------------------------------------------------- 1 | from redis import Redis 2 | from redisprob import RedisHashFreqDist, RedisConditionalHashFreqDist 3 | 4 | if __name__ == '__channelexec__': 5 | host, fd_name, cfd_name = 
channel.receive() 6 | r = Redis(host) 7 | fd = RedisHashFreqDist(r, fd_name) 8 | cfd = RedisConditionalHashFreqDist(r, cfd_name) 9 | 10 | for data in channel: 11 | if data == 'done': 12 | channel.send('done') 13 | break 14 | 15 | label, words = data 16 | 17 | for word in words: 18 | fd[word] += 1 19 | cfd[label][word] += 1 -------------------------------------------------------------------------------- /Module 2/Chapter 9/7853OS_09_Codes/chapter9.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | =================================== 4 | Parsing Dates & Times with Dateutil 5 | =================================== 6 | 7 | >>> from dateutil import parser 8 | >>> parser.parse('Thu Sep 25 10:36:28 2010') 9 | datetime.datetime(2010, 9, 25, 10, 36, 28) 10 | >>> parser.parse('Thursday, 25. September 2010 10:36AM') 11 | datetime.datetime(2010, 9, 25, 10, 36) 12 | >>> parser.parse('9/25/2010 10:36:28') 13 | datetime.datetime(2010, 9, 25, 10, 36, 28) 14 | >>> parser.parse('9/25/2010') 15 | datetime.datetime(2010, 9, 25, 0, 0) 16 | >>> parser.parse('2010-09-25T10:36:28Z') 17 | datetime.datetime(2010, 9, 25, 10, 36, 28, tzinfo=tzutc()) 18 | 19 | >>> parser.parse('25/9/2010', dayfirst=True) 20 | datetime.datetime(2010, 9, 25, 0, 0) 21 | 22 | >>> parser.parse('10-9-25') 23 | datetime.datetime(2025, 10, 9, 0, 0) 24 | >>> parser.parse('10-9-25', yearfirst=True) 25 | datetime.datetime(2010, 9, 25, 0, 0) 26 | 27 | >>> try: 28 | ... parser.parse('9/25/2010 at about 10:36AM') 29 | ... except ValueError: 30 | ... 'cannot parse' 31 | 'cannot parse' 32 | >>> parser.parse('9/25/2010 at about 10:36AM', fuzzy=True) 33 | datetime.datetime(2010, 9, 25, 10, 36) 34 | 35 | 36 | ============================== 37 | Timezone Lookup and Conversion 38 | ============================== 39 | 40 | >>> from dateutil import tz 41 | >>> tz.tzutc() 42 | tzutc() 43 | >>> import datetime 44 | >>> tz.tzutc().utcoffset(datetime.datetime.utcnow()) 45 | datetime.timedelta(0) 46 | 47 | >>> tz.gettz('US/Pacific') 48 | tzfile('America/Los_Angeles') 49 | >>> tz.gettz('US/Pacific').utcoffset(datetime.datetime.utcnow()) 50 | datetime.timedelta(-1, 61200) 51 | >>> tz.gettz('Europe/Paris') 52 | tzfile('Europe/Paris') 53 | >>> tz.gettz('Europe/Paris').utcoffset(datetime.datetime.utcnow()) 54 | datetime.timedelta(0, 7200) 55 | 56 | >>> pst = tz.gettz('US/Pacific') 57 | >>> dt = datetime.datetime(2010, 9, 25, 10, 36) 58 | >>> dt.tzinfo 59 | >>> dt.astimezone(tz.tzutc()) 60 | Traceback (most recent call last): 61 | File "/usr/lib/python2.6/doctest.py", line 1248, in __run 62 | compileflags, 1) in test.globs 63 | File "", line 1, in 64 | dt.astimezone(tz.tzutc()) 65 | ValueError: astimezone() cannot be applied to a naive datetime 66 | >>> dt.replace(tzinfo=pst) 67 | datetime.datetime(2010, 9, 25, 10, 36, tzinfo=tzfile('America/Los_Angeles')) 68 | >>> dt.replace(tzinfo=pst).astimezone(tz.tzutc()) 69 | datetime.datetime(2010, 9, 25, 17, 36, tzinfo=tzutc()) 70 | 71 | >>> parser.parse('Wednesday, Aug 4, 2010 at 6:30 p.m. (CDT)', fuzzy=True) 72 | datetime.datetime(2010, 8, 4, 18, 30) 73 | >>> tzinfos = {'CDT': tz.gettz('US/Central')} 74 | >>> parser.parse('Wednesday, Aug 4, 2010 at 6:30 p.m. 
(CDT)', fuzzy=True, tzinfos=tzinfos) 75 | datetime.datetime(2010, 8, 4, 18, 30, tzinfo=tzfile('America/Chicago')) 76 | 77 | >>> tz.tzoffset('custom', 3600) 78 | tzoffset('custom', 3600) 79 | 80 | =================================== 81 | Extracting URLs from HTML with lxml 82 | =================================== 83 | 84 | >>> from lxml import html 85 | >>> doc = html.fromstring('Hello world') 86 | >>> links = list(doc.iterlinks()) 87 | >>> len(links) 88 | 1 89 | >>> (el, attr, link, pos) = links[0] 90 | >>> attr 91 | 'href' 92 | >>> link 93 | '/world' 94 | >>> pos 95 | 0 96 | 97 | >>> doc.make_links_absolute('http://hello') 98 | >>> abslinks = list(doc.iterlinks()) 99 | >>> (el, attr, link, pos) = abslinks[0] 100 | >>> link 101 | 'http://hello/world' 102 | 103 | >>> links = list(html.iterlinks('Hello world')) 104 | >>> links[0][2] 105 | '/world' 106 | 107 | >>> doc.xpath('//a/@href')[0] 108 | 'http://hello/world' 109 | 110 | 111 | =========================== 112 | Cleaning and Stripping HTML 113 | =========================== 114 | 115 | >>> import lxml.html.clean 116 | >>> lxml.html.clean.clean_html('my text') 117 | '
my text' 118 | 119 | >>> from bs4 import BeautifulSoup 120 | >>> BeautifulSoup('my text
').get_text() 121 | 'my text' 122 | 123 | 124 | =========================================== 125 | Converting HTML Entities with BeautifulSoup 126 | =========================================== 127 | 128 | >>> from bs4 import BeautifulSoup 129 | >>> BeautifulSoup('<').string 130 | '<' 131 | >>> BeautifulSoup('&').string 132 | '&' 133 | 134 | >>> BeautifulSoup('<').string 135 | 136 | >>> from bs4 import BeautifulSoup 137 | >>> soup = BeautifulSoup('Hello world') 138 | >>> [a['href'] for a in soup.findAll('a')] 139 | ['/world'] 140 | 141 | ============================================ 142 | Detecting and Converting Character Encodings 143 | ============================================ 144 | 145 | >>> import unicodedata 146 | >>> unicodedata.normalize('NFKD', 'abcd\xe9').encode('ascii', 'ignore') 147 | b'abcde' 148 | 149 | >>> from bs4 import UnicodeDammit 150 | >>> UnicodeDammit('abcd\xe9').unicode_markup 151 | 'abcdé' 152 | 153 | ''' 154 | 155 | if __name__ == '__main__': 156 | import doctest 157 | doctest.testmod() 158 | -------------------------------------------------------------------------------- /Module 2/Chapter 9/7853OS_09_Codes/encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import charade 3 | 4 | def detect(s): 5 | ''' 6 | >>> detect('ascii') 7 | {'confidence': 1.0, 'encoding': 'ascii'} 8 | >>> detect('abcdé') 9 | {'confidence': 0.505, 'encoding': 'utf-8'} 10 | >>> detect(bytes('abcdé', 'utf-8')) 11 | {'confidence': 0.505, 'encoding': 'utf-8'} 12 | >>> detect(bytes('\222\222\223\225', 'latin-1')) 13 | {'confidence': 0.5, 'encoding': 'windows-1252'} 14 | ''' 15 | try: 16 | if isinstance(s, str): 17 | return charade.detect(s.encode()) 18 | else: 19 | return charade.detect(s) 20 | except UnicodeDecodeError: 21 | return charade.detect(s.encode('utf-8')) 22 | 23 | def convert(s): 24 | ''' 25 | >>> convert('ascii') 26 | 'ascii' 27 | >>> convert('abcdé') 28 | 'abcdé' 29 | >>> convert(bytes('abcdé', 'utf-8')) 30 | 'abcdé' 31 | >>> convert(bytes('\222\222\223\225', 'latin-1')) 32 | '\u2019\u2019\u201c\u2022' 33 | ''' 34 | if isinstance(s, str): 35 | s = s.encode() 36 | 37 | encoding = detect(s)['encoding'] 38 | 39 | if encoding == 'utf-8': 40 | return s.decode() 41 | else: 42 | return s.decode(encoding) 43 | 44 | if __name__ == '__main__': 45 | import doctest 46 | doctest.testmod() -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=" Welcome readers. I hope you find it interesting. Please do reply." 
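# sent_tokenize (imported below) uses NLTK's pre-trained Punkt model to split
# the paragraph into sentences, so this example prints a three-item list.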
3 | from nltk.tokenize import sent_tokenize 4 | print(sent_tokenize(text)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import regexp_tokenize 3 | sent="Don't hesitate to ask questions" 4 | print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+')) 5 | 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer('\s+',gaps=True) 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | sent=" She secured 90.56 % in class X . She is a meritorious student" 4 | capt = RegexpTokenizer('[A-Z]\w+') 5 | print(capt.tokenize(sent)) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import BlanklineTokenizer 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import WhitespaceTokenizer 4 | print(WhitespaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent= "She secured 90.56 % in class X. She is a meritorious student" 3 | print(sent.split()) 4 | print(sent.split(' ')) 5 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 6 | print(sent.split('\n')) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import BlanklineTokenizer 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | from nltk.tokenize import LineTokenizer 6 | print(LineTokenizer(blanklines='keep').tokenize(sent)) 7 | print(LineTokenizer(blanklines='discard').tokenize(sent)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 3 | from nltk.tokenize import SpaceTokenizer 4 | print(SpaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(list(WhitespaceTokenizer().span_tokenize(sent))) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | from nltk.tokenize.util import spans_to_relative 4 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 5 | print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') 3 | text=" Hello everyone. Hope all are fine and doing well. Hope you find the book interesting" 4 | print(tokenizer.tokenize(text)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize.util import string_span_tokenize 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(list(string_span_tokenize(sent, " "))) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_21.py: -------------------------------------------------------------------------------- 1 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 2 | from nltk.tokenize import word_tokenize 3 | tokenized_docs=[word_tokenize(doc) for doc in text] 4 | print(tokenized_docs) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_22.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 4 | from nltk.tokenize import word_tokenize 5 | tokenized_docs=[word_tokenize(doc) for doc in text] 6 | x=re.compile('[%s]' % re.escape(string.punctuation)) 7 | tokenized_docs_no_punctuation = [] 8 | for review in tokenized_docs: 9 | new_review = [] 10 | for token in review: 11 | new_token = x.sub(u'', token) 12 | if not new_token == u'': 13 | new_review.append(new_token) 14 | tokenized_docs_no_punctuation.append(new_review) 15 | print(tokenized_docs_no_punctuation) 16 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_23.py: -------------------------------------------------------------------------------- 1 | text='HARdWork IS KEy to SUCCESS' 2 | print(text.lower()) 3 | print(text.upper()) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus 
import stopwords 3 | stops=set(stopwords.words('english')) 4 | words=["Don't", 'hesitate','to','ask','questions'] 5 | print([word for word in words if word not in stops]) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_25.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | print(stopwords.fileids()) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def para_fraction(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | para = [w for w in text if w.lower() not in stopwords] 7 | return len(para) / len(text) 8 | print(para_fraction(nltk.corpus.reuters.words())) 9 | print(para_fraction(nltk.corpus.inaugural.words())) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RegexpReplacer 3 | replacer= RegexpReplacer() 4 | replacer.replace("Don't hesitate to ask questions") 5 | print(replacer.replace("She must've gone to the market but she didn't go")) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | from replacers import RegexpReplacer 4 | replacer=RegexpReplacer() 5 | word_tokenize("Don't hesitate to ask questions") 6 | print(word_tokenize(replacer.replace("Don't hesitate to ask questions"))) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('lotttt')) 5 | print(replacer.replace('ohhhhh')) 6 | print(replacer.replace('ooohhhhh')) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | french_tokenizer=nltk.data.load('tokenizers/punkt/french.pickle') 3 | print(french_tokenizer.tokenize('Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège franco-britanniquedeLevallois-Perret. Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage Levallois. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , d’un professeur d’histoire. 
L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire')) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('happy')) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_31.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import WordReplacer 3 | replacer=WordReplacer({'congrats':'congratulations'}) 4 | print(replacer.replace('congrats')) 5 | print(replacer.replace('maths')) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_33.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from nltk.metrics import * 3 | training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split() 4 | testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split() 5 | print(accuracy(training,testing)) 6 | trainset=set(training) 7 | testset=set(testing) 8 | precision(trainset,testset) 9 | print(recall(trainset,testset)) 10 | print(f_measure(trainset,testset)) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_34.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | print(edit_distance("relate","relation")) 4 | print(edit_distance("suggestion","calculation")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_35.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X=set([10,20,30,40]) 4 | Y=set([20,30,60]) 5 | print(jaccard_distance(X,Y)) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_36.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X = set([10,20,30,40]) 4 | Y= set([30,50,70]) 5 | print(binary_distance(X, Y)) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_37.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | trigrams_tokens=ngrams(alpino.words(),3) 6 | for i in trigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 
29 .") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import word_tokenize 3 | r=input("Please write a text") 4 | print("The length of text is",len(word_tokenize(r)),"words") 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import TreebankWordTokenizer 3 | tokenizer = TreebankWordTokenizer() 4 | print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize(" Don't hesitate to ask questions") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_8.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import WordPunctTokenizer 2 | tokenizer=WordPunctTokenizer() 3 | print(tokenizer.tokenize(" Don't hesitate to ask questions")) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer("[\w']+") 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sent=brown.sents(categories='news') 5 | unigram_sent=nltk.UnigramTagger(sentences) 6 | print(unigram_sent.tag(sent[2008])) 7 | print(unigram_sent.evaluate(sentences)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem.lancaster import LancasterStemmer 3 | stri=LancasterStemmer() 4 | print(stri.stem('achievement')) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | print(sz) 6 | training_sents = sentences[:sz] 7 | testing_sents=sentences[sz:] 8 | unigram_tagger=nltk.UnigramTagger(training_sents) 9 | print(unigram_tagger.evaluate(testing_sents)) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | bigram_tagger=nltk.UnigramTagger(training_sents) 8 |
bigram_tagger=nltk.BigramTagger(training_sents) 9 | print(bigram_tagger.tag(sentences[2008])) 10 | un_sent=sentences[4203] 11 | print(bigram_tagger.tag(un_sent)) 12 | print(bigram_tagger.evaluate(testing_sents)) 13 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | s0=nltk.DefaultTagger('NNP') 8 | s1=nltk.UnigramTagger(training_sents,backoff=s0) 9 | s2=nltk.BigramTagger(training_sents,backoff=s1) 10 | print(s2.evaluate(testing_sents)) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | chunkparser = nltk.RegexpParser("") 3 | print(nltk.chunk.accuracy(chunkparser, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | grammar = r"NP: {<[CDJNP].*>+}" 3 | cp = nltk.RegexpParser(grammar) 4 | print(nltk.chunk.accuracy(cp, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | correct = nltk.chunk.tagstr2tree( 3 | "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]") 4 | print(correct.flatten()) 5 | grammar = r"NP: {<[CDJNP].*>+}" 6 | cp = nltk.RegexpParser(grammar) 7 | 8 | grammar = r"NP: {+}" 9 | chunk_parser = nltk.RegexpParser(grammar) 10 | tagged_tok = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")] 11 | chunkscore = nltk.chunk.ChunkScore() 12 | guessed = cp.parse(correct.flatten()) 13 | chunkscore.score(correct, guessed) 14 | print(chunkscore) 15 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences = brown.tagged_sents(categories='news') 4 | sent = brown.sents(categories='news') 5 | pattern = [(r'(January)$','Jan')] 6 | regexpr_tagger = nltk.RegexpTagger(pattern) 7 | print(regexpr_tagger.tag(sent[3])) 8 | print(regexpr_tagger.evaluate(sentences)) 9 | 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | freqd = nltk.FreqDist(brown.words(categories='news')) 4 | cfreqd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news')) 5 | mostfreq_words = freqd.most_common(100) 6 | likelytags = dict((word, cfreqd[word].max()) for (word, _) in mostfreq_words) 7 | baselinetagger = nltk.UnigramTagger(model=likelytags) 8 | 9 | sent = brown.sents(categories='news')[3] 10 | print(baselinetagger.tag(sent)) 11 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | unigrams=ngrams(alpino.words(),1) 6 | for i in unigrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | corpus=u" hello how are you doing ? Hope you find the book interesting. ".split() 3 | sentence=u"how are you doing".split() 4 | vocabulary=set(corpus) 5 | print(len(vocabulary)) 6 | cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus)) 7 | print([cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 8 | print([cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 9 | print([cfd[a].freq(b) for (a,b) in nltk.bigrams(sentence)]) 10 | print([1 + cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 11 | print([len(vocabulary) + cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 12 | print([1.0 * (1+cfd[a][b]) / (len(vocabulary)+cfd[a].N()) for (a,b) in nltk.bigrams(sentence)]) 13 | cpd_mle = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist, bins=len(vocabulary)) 14 | print([cpd_mle[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 15 | cpd_laplace = nltk.ConditionalProbDist(cfd, nltk.LaplaceProbDist, bins=len(vocabulary)) 16 | print([cpd_laplace[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 17 | 18 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | quadgrams=ngrams(alpino.words(),4) 6 | for i in quadgrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import BigramCollocationFinder 3 | from nltk.corpus import webtext 4 | from nltk.metrics import BigramAssocMeasures 5 | tokens=[t.lower() for t in webtext.words('grail.txt')] 6 | words=BigramCollocationFinder.from_words(tokens) 7 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_4.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | from nltk.corpus import webtext 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | set = set(stopwords.words('english')) 6 | stops_filter = lambda w: len(w) < 3 or w in set 7 | tokens=[t.lower() for t in webtext.words('grail.txt')] 8 | words=BigramCollocationFinder.from_words(tokens) 9 | words.apply_word_filter(stops_filter) 10 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | text1="Hardwork is the key to success. Never give up!" 
4 | word = nltk.wordpunct_tokenize(text1) 5 | finder = BigramCollocationFinder.from_words(word) 6 | bigram_measures = nltk.collocations.BigramAssocMeasures() 7 | value = finder.score_ngrams(bigram_measures.raw_freq) 8 | print(sorted(bigram for bigram, score in value)) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | bigrams_tokens=ngrams(alpino.words(),2) 6 | for i in bigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | import nltk 4 | text="Hello how are you doing ? I hope you find the book interesting" 5 | tokens=nltk.wordpunct_tokenize(text) 6 | fourgrams=nltk.collocations.QuadgramCollocationFinder.from_words(tokens) 7 | for fourgram, freq in fourgrams.ngram_fd.items(): 8 | print(fourgram,freq) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | sent=" Hello , please read the book thoroughly . If you have any queries , then don't hesitate to ask . There is no shortcut to success ." 4 | n=5 5 | fivegrams=ngrams(sent.split(),n) 6 | for grams in fivegrams: 7 | print(grams) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] 3 | print(len(cor)) 4 | from nltk.util import unique_list 5 | tag_set = unique_list(tag for sent in cor for (word,tag) in sent) 6 | print(len(tag_set)) 7 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 8 | print(len(symbols)) 9 | print(len(tag_set)) 10 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 11 | print(len(symbols)) 12 | trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) 13 | train_corpus = [] 14 | test_corpus = [] 15 | for i in range(len(cor)): 16 | if i % 10: 17 | train_corpus+=[cor[i]] 18 | else: 19 | test_corpus+=[cor[i]] 20 | print(len(train_corpus)) 21 | print(len(test_corpus)) 22 | 23 | 24 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | stemmerporter = PorterStemmer() 4 | print(stemmerporter.stem('working')) 5 | print(stemmerporter.stem('happiness')) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import LancasterStemmer 3 | stemmerlan=LancasterStemmer() 4 | print(stemmerlan.stem('working')) 5 | print(stemmerlan.stem('happiness')) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_3.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import RegexpStemmer 3 | stemmerregexp=RegexpStemmer('ing') 4 | print(stemmerregexp.stem('working')) 5 | print(stemmerregexp.stem('happiness')) 6 | print(stemmerregexp.stem('pairing')) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import SnowballStemmer 3 | print(SnowballStemmer.languages) 4 | spanishstemmer=SnowballStemmer('spanish') 5 | print(spanishstemmer.stem('comiendo')) 6 | frenchstemmer=SnowballStemmer('french') 7 | print(frenchstemmer.stem('manger')) 8 | 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import WordNetLemmatizer 3 | lemmatizer_output=WordNetLemmatizer() 4 | print(lemmatizer_output.lemmatize('working')) 5 | print(lemmatizer_output.lemmatize('working',pos='v')) 6 | print(lemmatizer_output.lemmatize('works')) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | from nltk.stem import WordNetLemmatizer 4 | stemmer_output=PorterStemmer() 5 | print(stemmer_output.stem('happiness')) 6 | lemmatizer_output=WordNetLemmatizer() 7 | print(lemmatizer_output.lemmatize('happiness')) 8 | 9 | 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text1=nltk.word_tokenize("It is a pleasant day today") 3 | print(nltk.pos_tag(text1)) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tag={} 3 | print(tag) 4 | tag['beautiful']='ADJ' 5 | 6 | tag['boy']='N' 7 | tag['read']='V' 8 | tag['generously']='ADV' 9 | print(tag) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import DefaultTagger 3 | tag = DefaultTagger('NN') 4 | print(tag.tag(['Beautiful', 'morning'])) 5 | 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import untag 3 | print(untag([('beautiful', 'NN'), ('morning', 'NN')])) 4 | 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os,os.path 3 | create = os.path.expanduser('~/nltkdoc') 4 | if not os.path.exists(create): 5 | os.mkdir(create) 6 | print(os.path.exists(create)) 7 | import nltk.data 8 | print(create in nltk.data.path) 9 | 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_14.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import names 3 | print(len(names.words('male.txt'))) 4 | print(len(names.words('female.txt'))) 5 | 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import words 3 | print(words.fileids()) 4 | print(len(words.words('en'))) 5 | print(len(words.words('en-basic'))) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.corpus import treebank 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | print(treebank.sents()[0]) 7 | print(unitagger.tag(treebank.sents()[0])) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | testing = treebank.tagged_sents()[2000:] 7 | print(unitagger.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | unitag = UnigramTagger(model={'Vinken': 'NN'}) 5 | print(unitag.tag(treebank.sents()[0])) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.tag import DefaultTagger 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tag1=DefaultTagger('NN') 8 | tag2=UnigramTagger(training,backoff=tag1) 9 | print(tag2.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('NNS')) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger 3 | from nltk.corpus import treebank 4 | training_1= treebank.tagged_sents()[:7000] 5 | bigramtagger=BigramTagger(training_1) 6 | print(treebank.sents()[0]) 7 | print(bigramtagger.tag(treebank.sents()[0])) 8 | testing_1 = treebank.tagged_sents()[2000:] 9 | print(bigramtagger.evaluate(testing_1)) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger, TrigramTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= 
treebank.tagged_sents()[:7000] 6 | bigramtag = BigramTagger(training) 7 | print(bigramtag.evaluate(testing)) 8 | trigramtag = TrigramTagger(training) 9 | print(trigramtag.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk import NgramTagger 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | quadgramtag = NgramTagger(4, training) 7 | print(quadgramtag.evaluate(testing)) 8 | 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_23.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | affixtag = AffixTagger(training) 7 | print(affixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtag = AffixTagger(training, affix_length=4) 7 | print(prefixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_25.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | suffixtag = AffixTagger(training, affix_length=-3) 7 | print(suffixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtagger=AffixTagger(training,affix_length=4) 7 | prefixtagger3=AffixTagger(training,affix_length=3,backoff=prefixtagger) 8 | print(prefixtagger3.evaluate(testing)) 9 | suffixtagger3=AffixTagger(training,affix_length=-3,backoff=prefixtagger3) 10 | print(suffixtagger3.evaluate(testing)) 11 | suffixtagger4=AffixTagger(training,affix_length=-4,backoff=suffixtagger3) 12 | print(suffixtagger4.evaluate(testing)) 13 | 14 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import tnt 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | tnt_tagger=tnt.TnT() 7 | tnt_tagger.train(training) 8 | print(tnt_tagger.evaluate(testing)) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import 
DefaultTagger 3 | from nltk.tag import tnt 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tnt_tagger=tnt.TnT() 8 | unknown=DefaultTagger('NN') 9 | tagger_tnt=tnt.TnT(unk=unknown,Trained=True) 10 | tnt_tagger.train(training) 11 | print(tnt_tagger.evaluate(testing)) 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "N"), ("became", "VBD"), ("leader", "NN")] 3 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "NN"), ("became", "VBD"), ("leader", "NN")] 4 | grammar = "NP: {
<DT>?<JJ>*<NN>?<NN>*}" 5 | find = nltk.RegexpParser(grammar) 6 | res = find.parse(sent) 7 | print(res) 8 | res.draw() 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('VB.*')) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | noun1=[("financial","NN"),("year","NN"),("account","NN"),("summary","NN")] 3 | gram="NP:{<NN>+}" 4 | find = nltk.RegexpParser(gram) 5 | print(find.parse(noun1)) 6 | x=find.parse(noun1) 7 | x.draw() 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize("I cannot bear the pain of bear") 3 | print(nltk.pos_tag(text)) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedword=nltk.tag.str2tuple('bear/NN') 3 | print(taggedword) 4 | print(taggedword[0]) 5 | print(taggedword[1]) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence='''The/DT sacred/VBN Ganga/NNP flows/VBZ in/IN this/DT region/NN ./. This/DT is/VBZ a/DT pilgrimage/NN ./. People/NNP from/IN all/DT over/IN the/DT country/NN visit/NN this/DT place/NN ./.
''' 3 | print([nltk.tag.str2tuple(t) for t in sentence.split()]) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedtok = ('bear', 'NN') 3 | from nltk.tag.util import tuple2str 4 | print(tuple2str(taggedtok)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tag = nltk.FreqDist(tag for (word, tag) in treebank_tagged) 5 | print(tag.most_common()) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tagpairs = nltk.bigrams(treebank_tagged) 5 | preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN'] 6 | freqdist = nltk.FreqDist(preceders_noun) 7 | print([tag for (tag, _) in freqdist.most_common()]) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import nltk.corpus 3 | print(str(nltk.corpus.treebank).replace('\\\\','/')) 4 | print(nltk.corpus.treebank.fileids()) 5 | from nltk.corpus import treebank 6 | print(treebank.words('wsj_0007.mrg')) 7 | print(treebank.tagged_words('wsj_0007.mrg')) 8 | 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser1 = nltk.parse.BottomUpChartParser(gram1) 8 | chart1 = parser1.chart_parse(sent) 9 | print((chart1.num_edges())) 10 | print((len(list(chart1.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser2 = nltk.parse.BottomUpLeftCornerChartParser(gram1) 8 | chart2 = parser2.chart_parse(sent) 9 | print((chart2.num_edges())) 10 | print((len(list(chart2.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser3 = 
nltk.parse.LeftCornerChartParser(gram1) 8 | chart3 = parser3.chart_parse(sent) 9 | print((chart3.num_edges())) 10 | print((len(list(chart3.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser4 = nltk.parse.TopDownChartParser(gram1) 8 | chart4 = parser4.chart_parse(sent) 9 | print((chart4.num_edges())) 10 | print((len(list(chart4.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser5 = nltk.parse.IncrementalBottomUpChartParser(gram1) 8 | chart5 = parser5.chart_parse(sent) 9 | print((chart5.num_edges())) 10 | print((len(list(chart5.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser6 = nltk.parse.IncrementalBottomUpLeftCornerChartParser(gram1) 8 | chart6 = parser6.chart_parse(sent) 9 | print((chart6.num_edges())) 10 | print((len(list(chart6.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser7 = nltk.parse.IncrementalLeftCornerChartParser(gram1) 8 | chart7 = parser7.chart_parse(sent) 9 | print((chart7.num_edges())) 10 | print((len(list(chart7.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser8 = nltk.parse.IncrementalTopDownChartParser(gram1) 8 | chart8 = parser8.chart_parse(sent) 9 | print((chart8.num_edges())) 10 | print((len(list(chart8.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_18.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser9 = nltk.parse.EarleyChartParser(gram1) 8 | chart9 = parser9.chart_parse(sent) 9 | print((chart9.num_edges())) 10 | print((len(list(chart9.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | gram2 = PCFG.fromstring(""" 6 | A -> B B [.3] | C B C [.7] 7 | B -> B D [.5] | C [.5] 8 | C -> 'a' [.1] | 'b' [0.9] 9 | D -> 'b' [1.0] 10 | """) 11 | prod1 = gram2.productions()[0] 12 | print(prod1) 13 | prod2 = gram2.productions()[1] 14 | print(prod2) 15 | print(prod2.lhs()) 16 | print(prod2.rhs()) 17 | print((prod2.prob())) 18 | print(gram2.start()) 19 | print(gram2.productions()) 20 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | print(treebank.parsed_sents('wsj_0007.mrg')[2]) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | tokens = "Jack told Bob to bring my cookie".split() 6 | grammar = toy_pcfg2 7 | print(grammar) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(5, print_times=False, trace=1,sent='John saw a dog', numparses=2) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(2, print_times=False, trace=1,sent='John saw a dog', numparses=1) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_23.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.featurechart.demo(print_times=False,print_grammar=True,parser=nltk.parse.featurechart.FeatureChartParser,sent='I saw a dog') 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1]) 4 | treebank_chunk.chunked_sents()[1].draw() 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus 
import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1].leaves()) 4 | print(treebank_chunk.chunked_sents()[1].pos()) 5 | print(treebank_chunk.chunked_sents()[1].productions()) 6 | print(nltk.corpus.treebank.tagged_words()) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.probability import FreqDist 3 | from nltk.corpus import treebank 4 | fd = FreqDist() 5 | print(fd.items()) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import sinica_treebank 3 | print(sinica_treebank.sents()) 4 | print(sinica_treebank.parsed_sents()[27]) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import Nonterminal, nonterminals, Production, CFG 3 | nonterminal1 = Nonterminal('NP') 4 | nonterminal2 = Nonterminal('VP') 5 | nonterminal3 = Nonterminal('PP') 6 | print(nonterminal1.symbol()) 7 | print(nonterminal2.symbol()) 8 | print(nonterminal3.symbol()) 9 | print(nonterminal1==nonterminal2) 10 | print(nonterminal2==nonterminal3) 11 | print(nonterminal1==nonterminal3) 12 | S, NP, VP, PP = nonterminals('S, NP, VP, PP') 13 | N, V, P, DT = nonterminals('N, V, P, DT') 14 | production1 = Production(S, [NP, VP]) 15 | production2 = Production(NP, [DT, NP]) 16 | production3 = Production(VP, [V, NP,NP,PP]) 17 | print(production1.lhs()) 18 | print(production1.rhs()) 19 | print(production3.lhs()) 20 | print(production3.rhs()) 21 | print(production3 == Production(VP, [V,NP,NP,PP])) 22 | print(production2 == production3) 23 | 24 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | print(gram1) 4 | 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 3 | sent = nltk.parse.util.extract_test_sentences(sent) 4 | print(len(sent)) 5 | testingsent=sent[25] 6 | print(testingsent[1]) 7 | print(testingsent[0]) 8 | sent=testingsent[0] 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.boolean_ops() 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.tag.hmm.demo_pos() 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import pos_tag, word_tokenize 3 | print(pos_tag(word_tokenize("John and Smith are going to NY and Germany"))) 4 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | from nltk.tag import UnigramTagger 4 | tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700]) 5 | sentence = ['John','and','Smith','went','to','NY','and','Germany'] 6 | for word, tag in tagger.tag(sentence): 7 | print(word,'->',tag) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | wn.synsets('cat') 5 | wn.synsets('cat', pos=wn.VERB) 6 | wn.synset('cat.n.01') 7 | print(wn.synset('cat.n.01').definition()) 8 | print(len(wn.synset('cat.n.01').examples())) 9 | print(wn.synset('cat.n.01').lemmas()) 10 | print([str(lemma.name()) for lemma in wn.synset('cat.n.01').lemmas()]) 11 | print(wn.lemma('cat.n.01.cat').synset()) 12 | 13 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | print(sorted(wn.langs())) 5 | print(wn.synset('cat.n.01').lemma_names('ita')) 6 | print(sorted(wn.synset('cat.n.01').lemmas('dan'))) 7 | print(sorted(wn.synset('cat.n.01').lemmas('por'))) 8 | print(len(wordnet.all_lemma_names(pos='n', lang='jpn'))) 9 | cat = wn.synset('cat.n.01') 10 | print(cat.hypernyms()) 11 | print(cat.hyponyms()) 12 | print(cat.member_holonyms()) 13 | print(cat.root_hypernyms()) 14 | print(wn.synset('cat.n.01').lowest_common_hypernyms(wn.synset('dog.n.01'))) 15 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.path_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.lch_similarity(cat)) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.wup_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | from nltk.corpus import wordnet_ic 5 | brown_ic = wordnet_ic.ic('ic-brown.dat') 6 | semcor_ic = wordnet_ic.ic('ic-semcor.dat') 7 | from nltk.corpus import 
genesis 8 | genesis_ic = wn.ic(genesis, False, 0.0) 9 | lion = wn.synset('lion.n.01') 10 | cat = wn.synset('cat.n.01') 11 | print(lion.res_similarity(cat, brown_ic)) 12 | print(lion.res_similarity(cat, genesis_ic)) 13 | print(lion.jcn_similarity(cat, brown_ic)) 14 | print(lion.jcn_similarity(cat, genesis_ic)) 15 | print(lion.lin_similarity(cat, semcor_ic)) 16 | 17 | 18 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | print(input_expr('X | (Y -> Z)')) 4 | print(input_expr('-(X & Y)')) 5 | print(input_expr('X & Y')) 6 | print(input_expr('X <-> -- X')) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | value = nltk.Valuation([('X', True), ('Y', False), ('Z', True)]) 3 | print(value['Z']) 4 | domain = set() 5 | v = nltk.Assignment(domain) 6 | u = nltk.Model(domain, value) 7 | print(u.evaluate('(X & Y)', v)) 8 | print(u.evaluate('-(X & Y)', v)) 9 | print(u.evaluate('(X & Z)', v)) 10 | print(u.evaluate('(X | Y)', v)) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | expression = input_expr('run(marcus)', type_check=True) 4 | print(expression.argument) 5 | print(expression.argument.type) 6 | print(expression.function) 7 | print(expression.function.type) 8 | sign = {'run': ''} 9 | expression = input_expr('run(marcus)', signature=sign) 10 | print(expression.function.type) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | locations=[('Jaipur', 'IN', 'Rajasthan'),('Ajmer', 'IN', 'Rajasthan'),('Udaipur', 'IN', 'Rajasthan'),('Mumbai', 'IN', 'Maharashtra'),('Ahmedabad', 'IN', 'Gujrat')] 3 | q = [x1 for (x1, relation, x2) in locations if x2=='Rajasthan'] 4 | print(q) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.data.show_cfg('grammars/book_grammars/sql1.fcfg') 3 | 4 | 5 | from nltk import load_parser 6 | test = load_parser('grammars/book_grammars/sql1.fcfg') 7 | q=" What cities are in Greece" 8 | t = list(test.parse(q.split())) 9 | ans = t[0].label()['SEM'] 10 | ans = [s for s in ans if s] 11 | q = ' '.join(ans) 12 | print(q) 13 | from nltk.sem import chat80 14 | r = chat80.sql_query('corpora/city_database/city.db', q) 15 | for p in r: 16 | print(p[0], end=" ") 17 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentences1 = nltk.corpus.treebank.tagged_sents()[17] 3 | print(nltk.ne_chunk(sentences1, binary=True)) 4 | sentences2 = nltk.corpus.treebank.tagged_sents()[7] 5 | print(nltk.ne_chunk(sentences2, binary=True)) 6 | print(nltk.ne_chunk(sentences2)) 7 | 8 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import conll2002 3 | for documents in conll2002.chunked_sents('ned.train')[25]: 4 | print(documents) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence = "I went to Greece to meet John"; 3 | tok=nltk.word_tokenize(sentence) 4 | pos_tag=nltk.pos_tag(tok) 5 | print(nltk.ne_chunk(pos_tag)) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 7/ch7_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)] 5 | random.shuffle(docs) 6 | all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words()) 7 | token_features = list(all_tokens.keys())[:2000] 8 | print(token_features[:100]) 9 | 10 | def doc_features(docs): 11 | doc_words = set(docs) 12 | features = {} 13 | for word in token_features: 14 | features['contains(%s)' % word] = (word in doc_words) 15 | return features 16 | 17 | print(doc_features(movie_reviews.words('pos/cv957_8737.txt'))) 18 | feature_sets = [(doc_features(d), c) for (d,c) in docs] 19 | train_sets, test_sets = feature_sets[100:], feature_sets[:100] 20 | classifiers = nltk.NaiveBayesClassifier.train(train_sets) 21 | print(nltk.classify.accuracy(classifiers, test_sets)) 22 | classifiers.show_most_informative_features(5) 23 | -------------------------------------------------------------------------------- /Module 3/Chapter 8/ch8_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def not_stopwords(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | content = [w for w in text if w.lower() not in stopwords] 7 | return len(content) / len(text) 8 | print(not_stopwords(nltk.corpus.reuters.words())) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr1 = expr_read('([x], [John(x), Went(x)])') 4 | print(expr1) 5 | expr1.draw() 6 | print(expr1.fol()) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr2 = expr_read('([x,y], [John(x), Went(x),Sam(y),Meet(x,y)])') 4 | print(expr2) 5 | expr2.draw() 6 | print(expr2.fol()) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr3 = expr_read('([x], [John(x), eats(x)])+ ([y],[Sam(y),eats(y)])') 4 | print(expr3) 5 | print(expr3.simplify()) 6 | expr3.draw() 7 | 8 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr4 = expr_read('([],[(([x],[student(x)])->([y],[book(y),read(x,y)]))])') 4 | print(expr4.fol()) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr5 = expr_read('([x,y],[ram(x),food(y),eats(x,y)])') 4 | expr6 = expr_read('([u,z],[PRO(u),coffee(z),drinks(u,z)])') 5 | expr7 = expr5 + expr6 6 | print(expr7.simplify()) 7 | print(expr7.simplify().resolve_anaphora()) 8 | -------------------------------------------------------------------------------- /Module 3/__pycache__/replacers.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-Python-and-NLTK/b34df3ceab78b3de29195a811696dcd06e77063a/Module 3/__pycache__/replacers.cpython-34.pyc -------------------------------------------------------------------------------- /Module 3/__pycache__/replacers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from nltk.corpus import wordnet 4 | 5 | replacement_patterns = [ 6 | (r'won\'t', 'will not'), 7 | (r'can\'t', 'cannot'), 8 | (r'i\'m', 'i am'), 9 | (r'ain\'t', 'is not'), 10 | (r'(\w+)\'ll', r'\g<1> will'), 11 | (r'(\w+)n\'t', r'\g<1> not'), 12 | (r'(\w+)\'ve', r'\g<1> have'), 13 | (r'(\w+)\'s', r'\g<1> is'), 14 | (r'(\w+)\'re', r'\g<1> are'), 15 | (r'(\w+)\'d', r'\g<1> would') 16 | ] 17 | class RegexpReplacer(object): 18 | def __init__(self, patterns=replacement_patterns): 19 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in 20 | patterns] 21 | def replace(self, text): 22 | s = text 23 | for (pattern, repl) in self.patterns: 24 | (s, count) = re.subn(pattern, repl, s) 25 | return s 26 | 27 | class RepeatReplacer(object): 28 | def __init__(self): 29 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 30 | self.repl = r'\1\2\3' 31 | def replace(self, word): 32 | if wordnet.synsets(word): 33 | return word 34 | repl_word = self.repeat_regexp.sub(self.repl, word) 35 | if repl_word != word: 36 | return self.replace(repl_word) 37 | else: 38 | return repl_word 39 | 40 | class WordReplacer(object): 41 | def __init__(self, word_map): 42 | self.word_map = word_map 43 | def replace(self, word): 44 | return self.word_map.get(word, word) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Natural Language Processing Python and NLTK 5 | Code repository for Natural Language Processing Python and NLTK 6 | 7 | ## What You Will Learn: 8 | * Get a glimpse of the complexity of natural languages and how they are processed by machines 9 | * Clean and wrangle text using tokenization and chunking to help you better process data 10 | * Tokenize text into sentences, and sentences into words 11 | * Classify text and perform sentiment analysis 12 | * Implement string matching algorithms and normalization techniques 13 | * Understand and implement the concepts of information retrieval and text summarization 14 | * Find out how to 
implement various NLP tasks in Python 15 | 16 | ### Software and Hardware (Module 1): 17 | | Chapter number | Software required (with version) | Download links to the software | Hardware specifications | OS required | 18 | | -------------- | -------------- | -------------- | -------------- | -------------- | 19 | | 1-5 | Python/Anaconda, NLTK | https://www.python.org/, http://continuum.io/downloads, http://www.nltk.org/ | Common Unix Printing System | any | 20 | | 6 | scikit-learn and gensim | http://scikit-learn.org/stable/, https://radimrehurek.com/gensim/ | Common Unix Printing System | any | 21 | | 7 | Scrapy | http://scrapy.org/ | Common Unix Printing System | any | 22 | | 8 | NumPy, SciPy, pandas, and matplotlib | http://www.numpy.org/, http://www.scipy.org/, http://pandas.pydata.org/, http://matplotlib.org/ | Common Unix Printing System | any | 23 | | 9 | Twitter Python APIs and Facebook Python APIs | https://dev.twitter.com/overview/api/twitter-libraries, https://developers.facebook.com | Common Unix Printing System | any | 24 | 25 | 26 | 27 | ### Software and Hardware (Module 2): 28 | | Chapter number | Software required (with version) | Free/Proprietary | Download links to the software | 29 | | -------------- | -------------- | -------------- | -------------- | 30 | | 1 | NLTK>=3.0a4, NLTK Data | Free | http://www.nltk.org, http://www.nltk.org/data.html | 31 | | 2 | pyenchant>=1.6.5 | Free | http://pythonhosted.org/pyenchant/ | 32 | | 3 | lockfile>=0.9.1, MongoDB >= 2.6, pymongo>=2.6.3 | Free | https://pypi.python.org/pypi/lockfile, http://www.mongodb.org/, https://pypi.python.org/pypi/pymongo/ | 33 | | 4 | NLTK-Trainer >= 0.9 | Free | https://github.com/japerk/nltk-trainer | 34 | | 7 | scikit-learn>=0.14.1 | Free | http://scikit-learn.org/stable/ | 35 | | 8 | Redis >= 2.8, redis>=2.8.0, execnet>=1.1 | Free | http://redis.io/, https://pypi.python.org/pypi/redis/, https://codespeak.net/execnet/ | 36 | | 9 | python-dateutil>=2.0, beautifulsoup4>=4.3.2, lxml>=3.2.3, charade>=1.0.3 | Free | http://labix.org/python-dateutil, http://www.crummy.com/software/BeautifulSoup/, http://lxml.de/, https://pypi.python.org/pypi/charade | 37 | 38 | 39 | 40 | 41 | 42 | 43 | ### Software and Hardware (Module 3): 44 | | Chapter number | Software required (with version) | Hardware Specifications | OS required | 45 | | -------------- | -------------------------------- | ----------------------- | ----------- | 46 | | All chapters | Python 2.7 or 3.2+ | 32-bit or 64-bit machine (NLTK 3.0 installs on either) | Windows or Mac/Unix | 47 | 48 | 49 | 50 | 51 | 52 | ### Note 53 | Modules 1, 2 and 3 have code arranged by chapter (for the chapters that have code). The NLTK corpora and grammars that these scripts expect can be fetched with the sketch at the end of this README. [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 54 | ### Download a free PDF 55 | 56 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
57 | https://packt.link/free-ebook/9781787285101

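### Getting the NLTK data
The tables above list the Python packages for each module, but most of the chapter scripts also read NLTK corpora and grammars (for example `movie_reviews`, `treebank`, `conll2002`, `wordnet_ic`, and the `book_grammars`/`city_database` data used in Module 3). The snippet below is a minimal sketch of how you might fetch them; the package list is an assumption inferred from the corpora referenced in the scripts, so trim or extend it for the chapters you actually run.

```python
import nltk

# Assumed list of NLTK data packages, inferred from the corpora the chapter scripts import.
packages = [
    "punkt", "stopwords", "words", "wordnet", "wordnet_ic",
    "brown", "genesis", "treebank", "conll2002", "movie_reviews", "reuters",
    "book_grammars", "city_database",
    "maxent_ne_chunker", "averaged_perceptron_tagger",
]

for pkg in packages:
    nltk.download(pkg)  # reports "up-to-date" and skips anything already installed
```

Running the loop once per environment is enough; calling `nltk.download()` with no arguments opens the interactive downloader if you prefer to pick packages manually.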
--------------------------------------------------------------------------------