├── .gitattributes ├── .gitignore ├── License ├── Module 1 ├── Chapter 1 │ ├── classification.py │ ├── hadoop_cmd.sh │ ├── mapper.py │ ├── nltk_hive.sql │ ├── nltk_scoring.py │ ├── pyspark_classification.py │ ├── reducer.py │ ├── scikit_hive.sql │ └── serialize_model_vect.py ├── Chapter 10 │ ├── classification.py │ ├── hadoop_cmd.sh │ ├── mapper.py │ ├── nltk_hive.sql │ ├── nltk_scoring.py │ ├── pyspark_classification.py │ ├── reducer.py │ ├── scikit_hive.sql │ └── serialize_model_vect.py ├── Chapter 2 │ └── ch2.py ├── Chapter 3 │ └── ch3.py ├── Chapter 4 │ └── ch4.py ├── Chapter 5 │ ├── summarizer.py │ └── summarizer2.py ├── Chapter 6 │ ├── classification.py │ ├── modelbuilding.py │ ├── readdata.py │ ├── textclustering.py │ └── topicmodeling.py ├── Chapter 7 │ ├── item.py │ ├── itempiplines.py │ ├── loginspider.py │ ├── myspider.py │ ├── newsspider_1.py │ ├── newsspider_2.py │ ├── newsspider_3.py │ └── notes.txt ├── Chapter 8 │ ├── intregation.py │ ├── matplotlib_code.py │ ├── numpy_codes.py │ ├── optimize.py │ ├── pandas_code.py │ └── solver.py └── Chapter 9 │ ├── fb_classification.py │ ├── fbdump.py │ ├── influencer_frnd.py │ ├── trendingtopic.py │ ├── tweetdump.py │ └── tweetinfo.py ├── Module 2 ├── Chapter 1 │ └── 7853OS_01_codes │ │ └── chapter1.py ├── Chapter 2 │ └── 7853OS_02_codes │ │ ├── chapter2.py │ │ ├── mywords.txt │ │ └── replacers.py ├── Chapter 3 │ └── 7853OS_03_codes │ │ ├── brown.pos │ │ ├── catchunked.py │ │ ├── chapter3.py │ │ ├── conll.iob │ │ ├── corpus.py │ │ ├── heading_text.txt │ │ ├── mongoreader.py │ │ ├── mywords.txt │ │ ├── synonyms.csv │ │ ├── synonyms.yaml │ │ ├── treebank.chunk │ │ └── wordlist ├── Chapter 4 │ └── 7853OS_04_Codes │ │ ├── chapter4.py │ │ ├── tag_util.py │ │ └── taggers.py ├── Chapter 5 │ └── 7853OS_05_Codes │ │ ├── chapter5.py │ │ └── chunkers.py ├── Chapter 6 │ └── 7853OS_06_codes │ │ ├── chapter6.py │ │ └── transforms.py ├── Chapter 7 │ └── 7853OS_07_Codes │ │ ├── chapter7.py │ │ ├── classification.py │ │ └── featx.py ├── Chapter 8 │ └── 7853OS_08_Codes │ │ ├── chapter8.py │ │ ├── dist_featx.py │ │ ├── plists.py │ │ ├── remote_chunk.py │ │ ├── remote_double.py │ │ ├── remote_tag.py │ │ └── remote_word_count.py └── Chapter 9 │ └── 7853OS_09_Codes │ ├── chapter9.py │ └── encoding.py ├── Module 3 ├── Chapter 1 │ ├── ch1_1.py │ ├── ch1_10.py │ ├── ch1_11.py │ ├── ch1_12.py │ ├── ch1_13.py │ ├── ch1_14.py │ ├── ch1_15.py │ ├── ch1_16.py │ ├── ch1_17.py │ ├── ch1_18.py │ ├── ch1_19.py │ ├── ch1_2.py │ ├── ch1_20.py │ ├── ch1_21.py │ ├── ch1_22.py │ ├── ch1_23.py │ ├── ch1_24.py │ ├── ch1_25.py │ ├── ch1_26.py │ ├── ch1_27.py │ ├── ch1_28.py │ ├── ch1_29.py │ ├── ch1_3.py │ ├── ch1_30.py │ ├── ch1_31.py │ ├── ch1_33.py │ ├── ch1_34.py │ ├── ch1_35.py │ ├── ch1_36.py │ ├── ch1_37.py │ ├── ch1_4.py │ ├── ch1_5.py │ ├── ch1_6.py │ ├── ch1_7.py │ ├── ch1_8.py │ └── ch1_9.py ├── Chapter 10 │ ├── ch10_1.py │ ├── ch10_10.py │ ├── ch10_2.py │ ├── ch10_3.py │ ├── ch10_4.py │ ├── ch10_5.py │ ├── ch10_6.py │ ├── ch10_7.py │ ├── ch10_8.py │ └── ch10_9.py ├── Chapter 2 │ ├── ch2_1.py │ ├── ch2_10.py │ ├── ch2_2.py │ ├── ch2_3.py │ ├── ch2_4.py │ ├── ch2_5.py │ ├── ch2_6.py │ ├── ch2_7.py │ ├── ch2_8.py │ └── ch2_9.py ├── Chapter 3 │ ├── ch3_1.py │ ├── ch3_2.py │ ├── ch3_3.py │ ├── ch3_4.py │ ├── ch3_5.py │ └── ch3_6.py ├── Chapter 4 │ ├── ch4_1.py │ ├── ch4_10.py │ ├── ch4_11.py │ ├── ch4_12.py │ ├── ch4_13.py │ ├── ch4_14.py │ ├── ch4_15.py │ ├── ch4_16.py │ ├── ch4_17.py │ ├── ch4_18.py │ ├── ch4_19.py │ ├── ch4_2.py │ ├── ch4_20.py │ ├── ch4_21.py │ ├── 
ch4_22.py │ ├── ch4_23.py │ ├── ch4_24.py │ ├── ch4_25.py │ ├── ch4_26.py │ ├── ch4_27.py │ ├── ch4_28.py │ ├── ch4_29.py │ ├── ch4_3.py │ ├── ch4_30.py │ ├── ch4_4.py │ ├── ch4_5.py │ ├── ch4_6.py │ ├── ch4_7.py │ ├── ch4_8.py │ └── ch4_9.py ├── Chapter 5 │ ├── ch5_1.py │ ├── ch5_10.py │ ├── ch5_11.py │ ├── ch5_12.py │ ├── ch5_13.py │ ├── ch5_14.py │ ├── ch5_15.py │ ├── ch5_16.py │ ├── ch5_17.py │ ├── ch5_18.py │ ├── ch5_19.py │ ├── ch5_2.py │ ├── ch5_20.py │ ├── ch5_21.py │ ├── ch5_22.py │ ├── ch5_23.py │ ├── ch5_3.py │ ├── ch5_4.py │ ├── ch5_5.py │ ├── ch5_6.py │ ├── ch5_7.py │ ├── ch5_8.py │ └── ch5_9.py ├── Chapter 6 │ ├── ch6_1.py │ ├── ch6_10.py │ ├── ch6_11.py │ ├── ch6_12.py │ ├── ch6_13.py │ ├── ch6_14.py │ ├── ch6_15.py │ ├── ch6_16.py │ ├── ch6_17.py │ ├── ch6_18.py │ ├── ch6_2.py │ ├── ch6_3.py │ ├── ch6_4.py │ ├── ch6_5.py │ ├── ch6_6.py │ ├── ch6_7.py │ ├── ch6_8.py │ └── ch6_9.py ├── Chapter 7 │ └── ch7_1.py ├── Chapter 8 │ └── ch8_1.py ├── Chapter 9 │ ├── ch9_1.py │ ├── ch9_2.py │ ├── ch9_3.py │ ├── ch9_4.py │ └── ch9_5.py └── __pycache__ │ ├── replacers.cpython-34.pyc │ └── replacers.py └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/classification.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import sklearn 4 | from sklearn.externals import joblib 5 | 6 | clf = joblib.load('classifier.pkl') 7 | vectorizer = joblib.load('vectorizer.pkl') 8 | 9 | for line in sys.stdin: 10 | line = line.strip() 11 | id, content= line.split('\t') 12 | X_test = vectorizer.transform([str(content)]) 13 | 14 | prob = clf.predict_proba(X_test) 15 | pred = clf.predict (X_test) 16 | prob_score =prob[:,1] 17 | print '\t'.join([id, content,pred,prob_score]) 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/hadoop_cmd.sh: -------------------------------------------------------------------------------- 1 | hadoop jar /hadoop-streaming.jar \ 2 | -D mapred.reduce.tasks=1 -file /mapper.py \ 3 | -mapper /mapper.py \ 4 | -file /reducer.py \ 5 | -reducer /reducer.py \ 6 | -input /hdfspath/infile \ 7 | -output outfile -------------------------------------------------------------------------------- /Module 1/Chapter 1/mapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content = line.split('\t') 7 | tokens =nltk.word_tokenize(concat_all_text) 8 | print '\t'.join([id,content,tokens]) 9 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/nltk_hive.sql: -------------------------------------------------------------------------------- 1 | hive> 2 | CREATE TABLE $InputTableName 3 | ( 4 | ID String, 5 | Content String 6 | ) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t'; 9 | 10 | hive> 11 | CREATE TABLE $OutTableName 12 | ( 13 | ID String, 14 | Content String, 15 | Tokens String 16 | ) 17 | 18 | hive> 19 | add FILE nltk_scoring.py; 20 | add FILE english.pickle; #Adding file to DistributedCache 21 | INSERT OVERWRITE TABLE $OutTableName 22 | SELECT 23 | TRANSFORM (id, content) 24 | USING 'PYTHONPATH nltk_scoring.py' 25 | AS (id string, content string, tokens string ) 26 | FROM $InputTablename; 27 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/nltk_scoring.py: -------------------------------------------------------------------------------- 1 | >>>import sys 2 | >>>import datetime 3 | >>>import pickle 4 | >>>import nltk 5 | >>>nltk.download('punkt') 6 | >>>for line in sys.stdin: 7 | >>> line = line.strip() 8 | >>> id, content= line.split('\t') 9 | >>> tokens =nltk.word_tokenize(concat_all_text) 10 | >>> print '\t'.join([id,content,tokens]) 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/pyspark_classification.py: -------------------------------------------------------------------------------- 1 | 2 | # here I am assuming that we 
have pyspark configured on your hadoop cluster 3 | >>>from pyspark import SparkContext 4 | >>>sc = SparkContext(appName="comment_classifcation") 5 | #http://spark.apache.org/docs/0.7.3/api/pyspark/pyspark.context.SparkContext-class.html. 6 | #The next thing is reading a tab delimited text file. Reading the file should be on HDFS. This file could be huge (~Tb/Pb): 7 | >>>lines = sc.textFile("testcomments.txt") 8 | #The lines are now a list of all the rows in the corpus: 9 | >>>parts = lines.map(lambda l: l.split("\t")) 10 | >>>corpus = parts.map(lambda row: Row(id=row[0], comment=row[1], class=row[2])) 11 | #The parts is a list of fields as we have each field in the line delimited on “\t”. 12 | #Let's break the corpus that has [ID, comment, class (0,1)] in the different RDD objects: 13 | >>>comment = corpus.map(lambda row: " " + row.comment) 14 | >>>class_var = corpus.map(lambda row:row.class) 15 | #Once we have the comments, we need to do a process very similar to what we did in Chapter 6, Text Classification, where we used scikit to do tokenization, hash vectorizer and calculate TF, IDF, and tf-idf using a vectorizer. 16 | #The following is the snippet of how to create tokenization, term frequency, and inverse document frequency: 17 | >>>from pyspark.mllib.feature import HashingTF 18 | >>>from pyspark.mllib.feature import IDF 19 | # https://spark.apache.org/docs/1.3.0/mllib-feature-extraction.html 20 | >>>comment_tokenized = comment.map(lambda line: line.strip().split(" ")) 21 | >>>hashingTF = HashingTF(1000) # to select only 1000 features 22 | >>>comment_tf = hashingTF.transform(comment_tokenized) 23 | 24 | >>>comment_idf = IDF().fit(comment_tf) 25 | >>>comment_tfidf = comment_idf.transform(comment_tf) 26 | #Will merge the class with the c tfidf RDD like this: 27 | >>>finaldata = class_var.zip(comment_tfidf) 28 | #We will do a typical test and train smapling 29 | >>>train, test = finaldata.randomSplit([0.8, 0.2], seed=0) 30 | #Let's perform the main classification commands, which are quite similar to scikit. We are using a logistic regression, which is widely used classifier. The pyspark.mllib provides you a variety of algorithms. 
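# Note: although the text above mentions logistic regression, the snippet further
# down actually trains a Naive Bayes model. A minimal logistic-regression sketch
# with pyspark.mllib (a hedged illustration, assuming the same train_rdd/test_rdd
# of LabeledPoint objects that are built a few lines below) would look like:
# >>>from pyspark.mllib.classification import LogisticRegressionWithSGD
# >>>lr_model = LogisticRegressionWithSGD.train(train_rdd, iterations=100)
# >>>lr_output = test_rdd.map(lambda point: (lr_model.predict(point.features), point.label))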
31 | #For more information on pyspark.mllib visit https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html 32 | 33 | #The following is an example of logistic regression classifier: 34 | >>>from pyspark.mllib.regression import LabeledPoint 35 | >>>from pyspark.mllib.classification import NaiveBayes 36 | >>>train_rdd = train.map(lambda t: LabeledPoint(t[0], t[1])) 37 | >>>test_rdd = test.map(lambda t: LabeledPoint(t[0], t[1])) 38 | >>>nb = NaiveBayes.train(train_rdd,lambda = 1.0) 39 | >>>nb_output = test_rdd.map(lambda point: (NB.predict(point.features), point.label)) 40 | >>>print nb_output 41 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/reducer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content,topics = line.split('\t') 7 | print '\t'.join([id,content,topics]) 8 | -------------------------------------------------------------------------------- /Module 1/Chapter 1/scikit_hive.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE $InputTableName ( 2 | ID String, 3 | Content String 4 | ) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t';CREATE TABLE $InputTableName ( 7 | ID String, 8 | Content String 9 | ) 10 | ROW FORMAT DELIMITED 11 | 12 | hive>CREATE TABLE $OutTableName ( 13 | ID String, 14 | Content String, 15 | predict String, 16 | predict_score double 17 | ) 18 | hive> 19 | add FILE vectorizer.pkl; 20 | add FILE classifier.pkl; 21 | 22 | hive> 23 | add FILE classification.py; 24 | INSERT OVERWRITE TABLE $OutTableName 25 | SELECT 26 | TRANSFORM (id, content) 27 | USING '/opt/anaconda/python2.7/bin/python2.7 classification.py' 28 | AS (id string, scorestringscore string ) 29 | FROM $Tablename; -------------------------------------------------------------------------------- /Module 1/Chapter 1/serialize_model_vect.py: -------------------------------------------------------------------------------- 1 | 2 | # please refer to code modelbuilding.py in ch 6 and just serialize the vectorizer and mode 3 | # object using joblib.dump 4 | vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=in_min_df, 5 | stop_words='english', ngram_range=(1,2), max_df=in_max_df) 6 | joblib.dump(vectorizer, "vectorizer.pkl", compress=3) 7 | clf = GaussianNB().fit(X_train,y_train) 8 | joblib.dump(clf, "classifier.pkl") -------------------------------------------------------------------------------- /Module 1/Chapter 10/classification.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import sklearn 4 | from sklearn.externals import joblib 5 | 6 | clf = joblib.load('classifier.pkl') 7 | vectorizer = joblib.load('vectorizer.pkl') 8 | 9 | for line in sys.stdin: 10 | line = line.strip() 11 | id, content= line.split('\t') 12 | X_test = vectorizer.transform([str(content)]) 13 | 14 | prob = clf.predict_proba(X_test) 15 | pred = clf.predict (X_test) 16 | prob_score =prob[:,1] 17 | print '\t'.join([id, content,pred,prob_score]) 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/hadoop_cmd.sh: -------------------------------------------------------------------------------- 1 | hadoop jar /hadoop-streaming.jar \ 2 | -D mapred.reduce.tasks=1 -file /mapper.py \ 3 | -mapper /mapper.py \ 4 | -file /reducer.py \ 5 | -reducer /reducer.py \ 
6 | -input /hdfspath/infile \ 7 | -output outfile -------------------------------------------------------------------------------- /Module 1/Chapter 10/mapper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content = line.split('\t') 7 | tokens =nltk.word_tokenize(concat_all_text) 8 | print '\t'.join([id,content,tokens]) 9 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/nltk_hive.sql: -------------------------------------------------------------------------------- 1 | hive> 2 | CREATE TABLE $InputTableName 3 | ( 4 | ID String, 5 | Content String 6 | ) 7 | ROW FORMAT DELIMITED 8 | FIELDS TERMINATED BY '\t'; 9 | 10 | hive> 11 | CREATE TABLE $OutTableName 12 | ( 13 | ID String, 14 | Content String, 15 | Tokens String 16 | ) 17 | 18 | hive> 19 | add FILE nltk_scoring.py; 20 | add FILE english.pickle; #Adding file to DistributedCache 21 | INSERT OVERWRITE TABLE $OutTableName 22 | SELECT 23 | TRANSFORM (id, content) 24 | USING 'PYTHONPATH nltk_scoring.py' 25 | AS (id string, content string, tokens string ) 26 | FROM $InputTablename; 27 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/nltk_scoring.py: -------------------------------------------------------------------------------- 1 | >>>import sys 2 | >>>import datetime 3 | >>>import pickle 4 | >>>import nltk 5 | >>>nltk.download('punkt') 6 | >>>for line in sys.stdin: 7 | >>> line = line.strip() 8 | >>> id, content= line.split('\t') 9 | >>> tokens =nltk.word_tokenize(concat_all_text) 10 | >>> print '\t'.join([id,content,tokens]) 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/pyspark_classification.py: -------------------------------------------------------------------------------- 1 | 2 | # here I am assuming that we have pyspark configured on your hadoop cluster 3 | >>>from pyspark import SparkContext 4 | >>>sc = SparkContext(appName="comment_classifcation") 5 | #http://spark.apache.org/docs/0.7.3/api/pyspark/pyspark.context.SparkContext-class.html. 6 | #The next thing is reading a tab delimited text file. Reading the file should be on HDFS. This file could be huge (~Tb/Pb): 7 | >>>lines = sc.textFile("testcomments.txt") 8 | #The lines are now a list of all the rows in the corpus: 9 | >>>parts = lines.map(lambda l: l.split("\t")) 10 | >>>corpus = parts.map(lambda row: Row(id=row[0], comment=row[1], class=row[2])) 11 | #The parts is a list of fields as we have each field in the line delimited on “\t”. 12 | #Let's break the corpus that has [ID, comment, class (0,1)] in the different RDD objects: 13 | >>>comment = corpus.map(lambda row: " " + row.comment) 14 | >>>class_var = corpus.map(lambda row:row.class) 15 | #Once we have the comments, we need to do a process very similar to what we did in Chapter 6, Text Classification, where we used scikit to do tokenization, hash vectorizer and calculate TF, IDF, and tf-idf using a vectorizer. 
16 | #The following is the snippet of how to create tokenization, term frequency, and inverse document frequency: 17 | >>>from pyspark.mllib.feature import HashingTF 18 | >>>from pyspark.mllib.feature import IDF 19 | # https://spark.apache.org/docs/1.3.0/mllib-feature-extraction.html 20 | >>>comment_tokenized = comment.map(lambda line: line.strip().split(" ")) 21 | >>>hashingTF = HashingTF(1000) # to select only 1000 features 22 | >>>comment_tf = hashingTF.transform(comment_tokenized) 23 | 24 | >>>comment_idf = IDF().fit(comment_tf) 25 | >>>comment_tfidf = comment_idf.transform(comment_tf) 26 | #Will merge the class with the c tfidf RDD like this: 27 | >>>finaldata = class_var.zip(comment_tfidf) 28 | #We will do a typical test and train smapling 29 | >>>train, test = finaldata.randomSplit([0.8, 0.2], seed=0) 30 | #Let's perform the main classification commands, which are quite similar to scikit. We are using a logistic regression, which is widely used classifier. The pyspark.mllib provides you a variety of algorithms. 31 | #For more information on pyspark.mllib visit https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html 32 | 33 | #The following is an example of logistic regression classifier: 34 | >>>from pyspark.mllib.regression import LabeledPoint 35 | >>>from pyspark.mllib.classification import NaiveBayes 36 | >>>train_rdd = train.map(lambda t: LabeledPoint(t[0], t[1])) 37 | >>>test_rdd = test.map(lambda t: LabeledPoint(t[0], t[1])) 38 | >>>nb = NaiveBayes.train(train_rdd,lambda = 1.0) 39 | >>>nb_output = test_rdd.map(lambda point: (NB.predict(point.features), point.label)) 40 | >>>print nb_output 41 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/reducer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import nltk 4 | for line in sys.stdin: 5 | line = line.strip() 6 | id, content,topics = line.split('\t') 7 | print '\t'.join([id,content,topics]) 8 | -------------------------------------------------------------------------------- /Module 1/Chapter 10/scikit_hive.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE $InputTableName ( 2 | ID String, 3 | Content String 4 | ) 5 | ROW FORMAT DELIMITED 6 | FIELDS TERMINATED BY '\t';CREATE TABLE $InputTableName ( 7 | ID String, 8 | Content String 9 | ) 10 | ROW FORMAT DELIMITED 11 | 12 | hive>CREATE TABLE $OutTableName ( 13 | ID String, 14 | Content String, 15 | predict String, 16 | predict_score double 17 | ) 18 | hive> 19 | add FILE vectorizer.pkl; 20 | add FILE classifier.pkl; 21 | 22 | hive> 23 | add FILE classification.py; 24 | INSERT OVERWRITE TABLE $OutTableName 25 | SELECT 26 | TRANSFORM (id, content) 27 | USING '/opt/anaconda/python2.7/bin/python2.7 classification.py' 28 | AS (id string, scorestringscore string ) 29 | FROM $Tablename; -------------------------------------------------------------------------------- /Module 1/Chapter 10/serialize_model_vect.py: -------------------------------------------------------------------------------- 1 | 2 | # please refer to code modelbuilding.py in ch 6 and just serialize the vectorizer and mode 3 | # object using joblib.dump 4 | vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=in_min_df, 5 | stop_words='english', ngram_range=(1,2), max_df=in_max_df) 6 | joblib.dump(vectorizer, "vectorizer.pkl", compress=3) 7 | clf = GaussianNB().fit(X_train,y_train) 8 | joblib.dump(clf, "classifier.pkl") 
-------------------------------------------------------------------------------- /Module 1/Chapter 2/ch2.py: -------------------------------------------------------------------------------- 1 | # csv load 2 | >>>import csv 3 | >>>with open('example.csv','rb') as f: 4 | >>> reader=csv.reader(f,delimiter=',',quotechar='"') 5 | >>> for line in reader : 6 | >>> print line[1] # assuming the second field is the raw sting 7 | 8 | # json load 9 | >>>import json 10 | >>>jsonfile=open('example.json') 11 | >>>data=json.load(jsonfile) 12 | >>>print data['string'] 13 | 14 | # sentence splitter 15 | 16 | >>>inputstring = ' This is an example sent. The sentence splitter will split on sent markers. Ohh really !!' 17 | >>>from nltk.tokenize import sent_tokenize 18 | >>>all_sent=sent_tokenize(inputstring) 19 | >>>print all_sent 20 | >>>[' This is an example sent', 'The sentence splitter will split on markers.','Ohh really !!'] 21 | 22 | >>>import nltk.tokenize.punkt 23 | >>>tokenizer =nltk.tokenize.punkt.PunktSentenceTokenizer() 24 | 25 | # word tokenizer 26 | >>>s ="Hi Everyone ! hola gr8" # simplest tokenizer 27 | >>>print s.split() 28 | 29 | >>>from nltk.tokenize import word_tokenize 30 | >>>word_tokenize(s) 31 | 32 | >>>from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize 33 | >>>regexp_tokenize(s, pattern='\w+') 34 | 35 | >>>regexp_tokenize(s, pattern='\d+') 36 | 37 | >>>wordpunct_tokenize(s) 38 | >>>blankline_tokenize(s) 39 | 40 | #Porter stemmer 41 | >>>from nltk.stem import PorterStemmer # import Porter stemmer 42 | >>>from nltk.stem.lancaster import LancasterStemmer 43 | >>>from nltk.stem.Snowball import SnowballStemmer 44 | >>>pst=PorterStemmer() # create obj of the PorterStemmer 45 | >>>lst = LancasterStemmer() # create obj of LancasterStemmer 46 | >>>lst.stem("eating") 47 | >>>pst.stem("shopping") 48 | 49 | #Lemmatizer 50 | >>>from nltk.stem import WordNetLemmatizer 51 | >>>wlem=WordNetLemmatizer() 52 | >>>wlem.lemmatize("ate") 53 | 54 | # stop word 55 | 56 | >>>from nltk.corpus import stopwords 57 | >>>stoplist=stopwords.words('english') # config the language name 58 | >>>text = "This is just a test" 59 | >>>cleanwordlist=[word for word in text.split() if word not in stoplist] 60 | 61 | 62 | # rare word removal 63 | 64 | >>>freq_dist=nltk.FreqDist(token) 65 | >>>rarewords =freq_dist.keys()[-50:] 66 | >>>after_rare_words= [ word for word in token not in rarewords] 67 | 68 | # spell check 69 | 70 | >>>from nltk.metrics import edit_distance 71 | >>>edit_distance(“rain”,”shine”) 72 | 73 | 74 | -------------------------------------------------------------------------------- /Module 1/Chapter 3/ch3.py: -------------------------------------------------------------------------------- 1 | # POS tagging 2 | >>>import nltk 3 | >>>from nltk import word_tokenize 4 | >>>s="I was watching TV" 5 | >>>print nltk.pos_tag(word_tokenize(s)) 6 | 7 | # all nouns 8 | 9 | >>>tagged=nltk.pos_tag(word_tokenize(s)) 10 | >>>allnoun=[word for word,pos in tagged if pos in ['NN','NNP'] ] 11 | 12 | # Stanford POS tagger 13 | 14 | >>>from nltk.tag.stanford import POSTagger 15 | >>>import nltk 16 | >>>stan_tagger=POSTagger('models/english-bidirectional-distdim.tagger','standford-postagger.jar') 17 | >>>tokens =nltk.word_tokenize(s) 18 | >>>stan_tagger.tag(tokens) 19 | 20 | # POS tags freq distribtuion 21 | >>>from nltk.corpus import brown 22 | >>>import nltk 23 | >>>tags = [tag for (word, tag) in brown.tagged_words(categories='news')] 24 | >>>print nltk.FreqDist(tags) 25 | 26 | # default tagger 27 | 
>>>brown_tagged_sents = brown.tagged_sents(categories='news') 28 | >>>default_tagger = nltk.DefaultTagger('NN') 29 | >>>print default_tagger.evaluate(brown_tagged_sents) 30 | 31 | # N-gram taggers 32 | 33 | >>>from nltk.tag import UnigramTagger 34 | >>>from nltk.tag import DefaultTagger 35 | >>>from nltk.tag import BigramTagger 36 | >>>from nltk.tag import TrigramTagger 37 | # we are dividing the data into a test and train to evaluate our taggers. 38 | >>>train_data= brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)] 39 | >>>test_data= brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):] 40 | >>>unigram_tagger = UnigramTagger(train_data,backoff=default_tagger) 41 | >>>print unigram_tagger.evaluate(test_data) 42 | >>>bigram_tagger= BigramTagger(train_data, backoff=unigram_tagger) 43 | >>>print bigram_tagger.evaluate(test_data) 44 | >>>trigram_tagger=TrigramTagger(train_data,backoff=bigram_tagger) 45 | >>>print trigram_tagger.evaluate(test_data) 46 | 47 | # Regex tagger 48 | 49 | >>>from nltk.tag.sequential import RegexpTagger 50 | >>>regexp_tagger = RegexpTagger( 51 | [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers 52 | ( r'(The|the|A|a|An|an)$', 'AT'), # articles 53 | ( r'.*able$', 'JJ'), # adjectives 54 | ( r'.*ness$', 'NN'), # nouns formed from adj 55 | ( r'.*ly$', 'RB'), # adverbs 56 | ( r'.*s$', 'NNS'), # plural nouns 57 | ( r'.*ing$', 'VBG'), # gerunds 58 | (r'.*ed$', 'VBD'), # past tense verbs 59 | (r'.*', 'NN') # nouns (default) 60 | ]) 61 | >>>print regexp_tagger.evaluate(test_data) 62 | 63 | 64 | 65 | # NER tagger 66 | >>>import nltk 67 | >>>from nltk import ne_chunk 68 | >>>from nltk import word_tokenize 69 | >>>sent = "Mark is studying at Stanford University in California" 70 | >>>print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False)) 71 | 72 | # NER stanford tagger 73 | 74 | >>>from nltk.tag.stanford import NERTagger 75 | >>>st = NERTagger('/stanford-ner/classifiers/all.3class.distsim.crf.ser.gz',... '/stanford-ner/stanford-ner.jar') 76 | # will be the relative path where you downloaded the tagger 77 | #http://nlp.stanford.edu/software/ 78 | -------------------------------------------------------------------------------- /Module 1/Chapter 4/ch4.py: -------------------------------------------------------------------------------- 1 | 2 | # toy CFG 3 | >>> from nltk import CFG 4 | >>>toy_grammar = 5 | nltk.CFG.fromstring( 6 | """ 7 | S -> NP VP # S indicate the entire sentence 8 | VP -> V NP # VP is verb phrase the 9 | V -> "eats" | "drinks" # V is verb we are using only 2 verbs in the example 10 | NP -> Det N # NP is noun phrase (chunk that has noun in it) 11 | Det -> "a" | "an" | "the" # Det is determiner used in the sentences 12 | N -> "president" |"Obama" |"apple"| "coke" # N some example nouns 13 | """) 14 | >>> toy_grammar.productions() 15 | 16 | # similarly a PCFG also can be built 17 | 18 | >>> from nltk import PCFG 19 | >>> toy_pcfg1 = PCFG.fromstring(""" 20 | S -> NP VP [1.0] 21 | NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] 22 | Det -> 'the' [0.8] | 'my' [0.2] 23 | N -> 'man' [0.5] | 'telescope' [0.5] 24 | VP -> VP PP [0.1] | V NP [0.7] | V [0.2] 25 | V -> 'ate' [0.35] | 'saw' [0.65] 26 | PP -> P NP [1.0] 27 | P -> 'with' [0.61] | 'under' [0.39] 28 | """) 29 | # ref :http://www.nltk.org/howto/grammar.html 30 | 31 | 32 | # Regex parser 33 | 34 | >>> chunk_rules=ChunkRule("<.*>+","chunk everything") 35 | >>> import nltk 36 | >>> from nltk.chunk.regexp import * 37 | >>> reg_parser = RegexpParser(''' 38 | NP: {
<DT>? <JJ>* <NN>*} # NP 39 | P: {<IN>} # Preposition 40 | V: {<V.*>} # Verb 41 | PP: {<P> <NP>
} # PP -> P NP 42 | VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* 43 | ''') 44 | >>> test_sent="Mr. Obama played a big role in the Health insurance bill" 45 | >>> test_sent_pos=nltk.pos_tag(nltk.word_tokenize(test_sent)) 46 | >>> parsed_out=reg_parser.parse(test_sent_pos) 47 | 48 | # Stanford Parser [Very useful] 49 | 50 | >>>from nltk.parse.stanford import StanfordParser 51 | >>>english_parser = StanfordParser('stanford-parser.jar', 'stanford-parser-3.4-models.jar') 52 | >>>english_parser.raw_parse_sents(("this is the english parser test",)) 53 | 54 | # Chunking 55 | 56 | >>>from nltk.chunk.regexp import * 57 | >>>test_sent="The prime minister announced he had asked the chief government whip, Philip Ruddock, to call a special party room meeting for 9am on Monday to consider the spill motion." 58 | >>>test_sent_pos=nltk.pos_tag(nltk.word_tokenize(test_sent)) 59 | >>>rule_vp = ChunkRule(r'(<VB.*>)?(<VB.*>)+(<VB.*>)?', 'Chunk VPs') 60 | >>>parser_vp = RegexpChunkParser([rule_vp],chunk_label='VP') 61 | >>>print parser_vp.parse(test_sent_pos) 62 | >>>rule_np = ChunkRule(r'(

??)?*(<,>)*()+', 'Chunk NPs') 63 | >>>parser_np = RegexpChunkParser([rule_np],chunk_label="NP") 64 | >>>print parser_np.parse(test_sent_pos) 65 | 66 | # NP chunking (NER) 67 | 68 | >>>f=open(# absolute path for the file of text for which we want NER) 69 | >>>text=f.read() 70 | >>>sentences = nltk.sent_tokenize(text) 71 | >>>tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] 72 | >>>tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] 73 | >>>for sent in tagged_sentences: 74 | >>>print nltk.ne_chunk(sent) 75 | 76 | # Relation Extraction 77 | 78 | >>>import re 79 | >>>IN = re.compile(r'.*\bin\b(?!\b.+ing)') 80 | >>>for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'): 81 | >>> for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN): 82 | >>>print(nltk.sem.rtuple(rel)) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /Module 1/Chapter 5/summarizer.py: -------------------------------------------------------------------------------- 1 | >>>import nltk 2 | >>>results=[] 3 | >>>for sent_no,sentence in enumerate(nltk.sent_tokenize(news_content)): 4 | >>> no_of_tokens=len(nltk.word_tokenize(sentence)) 5 | >>> # Let's do POS tagging 6 | >>> tagged=nltk.pos_tag(nltk.word_tokenize(sentence)) 7 | >>> # Count the no of Nouns in the sentence 8 | >>> no_of_nouns=len([word for word,pos in tagged if pos in ["NN","NNP"] ]) 9 | >>> #Use NER to tag the named entities. 10 | >>> ners=nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)), binary=False) 11 | >>> no_of_ners= len([chunk for chunk in ners if hasattr(chunk, 'node')]) 12 | >>> score=(no_of_ners+no_of_nouns)/float(no_of_toekns) 13 | >>> results.append((sent_no,no_of_tokens,no_of_ners,\ 14 | no_of_nouns,score,sentence)) 15 | 16 | >>>for sent in sorted(results,key=lambda x: x[4],reverse=True): 17 | >>> print sent[5] 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 5/summarizer2.py: -------------------------------------------------------------------------------- 1 | >>>import nltk 2 | >>>from sklearn.feature_extraction.text import TfidfVectorizer 3 | >>>results=[] 4 | >>>sentences=nltk.sent_tokenize(news_content) 5 | >>>vectorizer = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True) 6 | >>>sklearn_binary=vectorizer.fit_transform(sentences) 7 | >>>print countvectorizer.get_feature_names() 8 | >>>print sklearn_binary.toarray() 9 | >>>for sent_no,i in enumerate(sklearn_binary.toarray()): 10 | >>> results.append(sent_no,i.sum()/float(len(i.nonzero()[0]))) 11 | 12 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/classification.py: -------------------------------------------------------------------------------- 1 | def modelbuilding(smsdata,sms_labels): 2 | ''' 3 | This is an example pipline to building a text classifier 4 | 1. sampling 5 | 2. TfidfVectorizer conversion 6 | 3. building a naive_bayes model 7 | 4. print the accuracy and other metrics 8 | 5. print most relavent features 9 | ''' 10 | 11 | # sampling steps 12 | trainset_size = int(round(len(sms_data)*0.70)) 13 | # i chose this threshold for 70:30 train and test split. 
14 | print 'The training set size for this classifier is ' + str(trainset_size) + '\n' 15 | x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]]) 16 | y_train = np.array([el for el in sms_labels[0:trainset_size]]) 17 | x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]]) 18 | y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]]) 19 | print x_train 20 | print y_train 21 | 22 | # count vectorizer 23 | # not really used just for explanation 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | sms_exp=[ ] 26 | for line in sms_list: 27 | sms_exp.append(preprocessing(line[1])) 28 | vectorizer = CountVectorizer(min_df=1) 29 | X_exp = vectorizer.fit_transform(sms_exp) 30 | print "||".join(vectorizer.get_feature_names()) 31 | print X_exp.toarray() 32 | 33 | # We are building a TFIDF vectorizer here 34 | from sklearn.feature_extraction.text import TfidfVectorizer 35 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_ 36 | words='english', strip_accents='unicode', norm='l2') 37 | X_train = vectorizer.fit_transform(x_train) 38 | X_test = vectorizer.transform(x_test) 39 | 40 | from sklearn.naive_bayes import MultinomialNB 41 | clf = MultinomialNB().fit(X_train, y_train) 42 | y_nb_predicted = clf.predict(X_test) 43 | print y_nb_predicted 44 | print ' \n confusion_matrix \n ' 45 | cm = confusion_matrix(y_test, y_pred) 46 | print cm 47 | print '\n Here is the classification report:' 48 | print classification_report(y_test, y_nb_predicted) 49 | # print the top features 50 | 51 | coefs = clf.coef_ 52 | intercept = clf.intercept_ 53 | coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 54 | n=10 55 | top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 56 | for (coef_1, fn_1), (coef_2, fn_2) in top: 57 | print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2)) 58 | 59 | def preprocessing(text): 60 | text = text.decode("utf8") 61 | # tokenize into words 62 | tokens = [word for sent in nltk.sent_tokenize(text) \ 63 | for word in nltk.word_tokenize(sent)] 64 | 65 | # remove stopwords 66 | stop = stopwords.words('english') 67 | tokens = [token for token in tokens if token not in stop] 68 | 69 | # remove words less than three letters 70 | tokens = [word for word in tokens if len(word) >= 3] 71 | 72 | # lower capitalization 73 | tokens = [word.lower() for word in tokens] 74 | 75 | # lemmatize 76 | lmtzr = WordNetLemmatizer() 77 | tokens = [lmtzr.lemmatize(word) for word in tokens] 78 | preprocessed_text= ' '.join(tokens) 79 | 80 | return preprocessed_text 81 | 82 | def main(): 83 | smsdata = open('SMSSpamCollection') # check the structure of this file! 84 | smsdata_data = [] 85 | sms_labels = [] 86 | csv_reader = csv.reader(sms,delimiter='\t') 87 | for line in csv_reader: 88 | # adding the sms_id 89 | sms_labels.append( line[0]) 90 | # adding the cleaned text We are calling preprocessing method 91 | sms_data.append(preprocessing(line[1])) 92 | 93 | sms.close() 94 | # we are calling the model builing function here 95 | modelbuilding(smsdata,sms_labels) 96 | if __name__ == '__main__': 97 | main() -------------------------------------------------------------------------------- /Module 1/Chapter 6/modelbuilding.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # sampling steps 4 | trainset_size = int(round(len(sms_data)*0.70)) 5 | # i chose this threshold for 70:30 train and test split. 
6 | print 'The training set size for this classifier is ' + str(trainset_size) + '\n' 7 | x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]]) 8 | y_train = np.array([el for el in sms_labels[0:trainset_size]]) 9 | x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]]) 10 | y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]]) 11 | print x_train 12 | print y_train 13 | 14 | # count vectorizer 15 | # not really used just for explanation 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | sms_exp=[ ] 18 | for line in sms_list: 19 | sms_exp.append(preprocessing(line[1])) 20 | vectorizer = CountVectorizer(min_df=1) 21 | X_exp = vectorizer.fit_transform(sms_exp) 22 | print "||".join(vectorizer.get_feature_names()) 23 | print X_exp.toarray() 24 | 25 | # We are building a TFIDF vectorizer here 26 | from sklearn.feature_extraction.text import TfidfVectorizer 27 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_ 28 | words='english', strip_accents='unicode', norm='l2') 29 | X_train = vectorizer.fit_transform(x_train) 30 | X_test = vectorizer.transform(x_test) 31 | 32 | from sklearn.naive_bayes import MultinomialNB 33 | clf = MultinomialNB().fit(X_train, y_train) 34 | y_nb_predicted = clf.predict(X_test) 35 | print y_nb_predicted 36 | print ' \n confusion_matrix \n ' 37 | cm = confusion_matrix(y_test, y_pred) 38 | print cm 39 | print '\n Here is the classification report:' 40 | print classification_report(y_test, y_nb_predicted) 41 | # print the top features 42 | 43 | coefs = clf.coef_ 44 | intercept = clf.intercept_ 45 | coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) 46 | n=10 47 | top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 48 | for (coef_1, fn_1), (coef_2, fn_2) in top: 49 | print('\t%.4f\t%-15s\t\t%.4f\t%-15s' % (coef_1, fn_1, coef_2, fn_2)) 50 | 51 | 52 | # SGD mostly used 53 | 54 | from sklearn.linear_model import SGDClassifier 55 | from sklearn.metrics import confusion_matrix 56 | clf=SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train) 57 | y_pred = clf.predict(X_test) 58 | print '\n Here is the classification report:' 59 | print classification_report(y_test, y_pred) 60 | print ' \n confusion_matrix \n ' 61 | cm = confusion_matrix(y_test, y_pred) 62 | print cm 63 | 64 | # SVM 65 | from sklearn.svm import LinearSVC 66 | svm_classifier = LinearSVC().fit(X_train, y_train) 67 | y_svm_predicted = svm_classifier.predict(X_test) 68 | print '\n Here is the classification report:' 69 | print classification_report(y_test, y_svm_predicted) 70 | cm = confusion_matrix(y_test, y_pred) 71 | print cm 72 | 73 | # RandomForestClassifier 74 | 75 | from sklearn.ensemble import RandomForestClassifier 76 | RF_clf = RandomForestClassifier(n_estimators=10) 77 | predicted = RF_clf.predict(X_test) 78 | print '\n Here is the classification report:' 79 | print classification_report(y_test, predicted) 80 | cm = confusion_matrix(y_test, y_pred) 81 | print cm 82 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/readdata.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | from nltk.stem import WordNetLemmatizer 4 | import csv 5 | def preprocessing(text): 6 | text = text.decode("utf8") 7 | # tokenize into words 8 | tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 9 | 10 | # remove stopwords 11 | stop = 
stopwords.words('english') 12 | tokens = [token for token in tokens if token not in stop] 13 | 14 | # remove words less than three letters 15 | tokens = [word for word in tokens if len(word) >= 3] 16 | 17 | # lower capitalization 18 | tokens = [word.lower() for word in tokens] 19 | 20 | # lemmatize 21 | lmtzr = WordNetLemmatizer() 22 | tokens = [lmtzr.lemmatize(word) for word in tokens] 23 | preprocessed_text= ' '.join(tokens) 24 | 25 | return preprocessed_text 26 | smsdata = open('SMSSpamCollection') # check the structure of this file! 27 | smsdata_data = [] 28 | sms_labels = [] 29 | csv_reader = csv.reader(sms,delimiter='\t') 30 | for line in csv_reader: 31 | # adding the sms_id 32 | sms_labels.append( line[0]) 33 | # adding the cleaned text We are calling preprocessing method 34 | sms_data.append(preprocessing(line[1])) 35 | 36 | sms.close() 37 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/textclustering.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import KMeans, MiniBatchKMeans 2 | true_k=5 3 | km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1) 4 | kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, 5 | init_size=1000, batch_size=1000, verbose=opts.verbose) 6 | # we are using the same test,train data in TFIDF form as we did in text classification 7 | 8 | km_model=km.fit(X_train) 9 | kmini_model=kmini.fit(X_train) 10 | print "For K-mean clustering " 11 | clustering = collections.defaultdict(list) 12 | for idx, label in enumerate(km_model.labels_): 13 | clustering[label].append(idx) 14 | print "For K-mean Mini batch clustering " 15 | clustering = collections.defaultdict(list) 16 | for idx, label in enumerate(kmini_model.labels_): 17 | clustering[label].append(idx) 18 | -------------------------------------------------------------------------------- /Module 1/Chapter 6/topicmodeling.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora, models, similarities 2 | from itertools import chain 3 | import nltk 4 | from nltk.corpus import stopwords 5 | from operator import itemgetter 6 | import re 7 | documents = [document for document in sms_data] 8 | stoplist = stopwords.words('english') 9 | texts = [[word for word in document.lower().split() if word not in stoplist] \ 10 | for document in documents] 11 | dictionary = corpora.Dictionary(texts) 12 | corpus = [dictionary.doc2bow(text) for text in texts] 13 | tfidf = models.TfidfModel(corpus) 14 | corpus_tfidf = tfidf[corpus] 15 | si = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) 16 | #lsi.print_topics(20) 17 | n_topics = 5 18 | lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics) 19 | for i in range(0, n_topics): 20 | temp = lda.show_topic(i, 10) 21 | terms = [] 22 | for term in temp: 23 | terms.append(term[1]) 24 | print "Top 10 terms for topic #" + str(i) + ": "+ ", ".join(terms) 25 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/item.py: -------------------------------------------------------------------------------- 1 | from scrapy.item import Item, Field 2 | class NewsItem(Item): 3 | title = Field() 4 | link = Field() 5 | desc = Field() 6 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/itempiplines.py: 
-------------------------------------------------------------------------------- 1 | from scrapy.exceptions import Item 2 | class CleanPipeline(object): 3 | def process_item(self, item, spider): 4 | if item['desc']: 5 | item['desc'] = item['desc'].strip().lower().replace('#$','') 6 | return item 7 | #We need to derive the age from DOB. We used Python's date functions to achieve this: 8 | 9 | from scrapy.exceptions import Item 10 | import datetime 11 | import datetime 12 | class AgePipeline(object): 13 | def process_item(self, item, spider): 14 | if item['DOB']: 15 | item['Age'] = (datetime.datetime.strptime(item['DOB'], '%d-%m-%y').date()-datetime.datetime.strptime('currentdate, '%d-%m-%y').date()).days/365 16 | return item 17 | 18 | #We also need to remove the duplicates. Python has the set() data structure that only contains unique values: 19 | from scrapy import signals 20 | from scrapy.exceptions import Item 21 | class DuplicatesPipeline(object): 22 | def __init__(self): 23 | self.ids_seen = set() 24 | def process_item(self, item, spider): 25 | if item['id'] in self.ids_seen: 26 | raise DropItem("Duplicate item found: %s" % item) 27 | else: 28 | self.ids_seen.add(item['id']) 29 | return item 30 | #Let's finally write the item in the JSON file: 31 | import json 32 | class JsonWriterPipeline(object): 33 | def __init__(self): 34 | self.file = open('items.txt', 'wb') 35 | def process_item(self, item, spider): 36 | line = json.dumps(dict(item)) + "\n" 37 | self.file.write(line) 38 | return item -------------------------------------------------------------------------------- /Module 1/Chapter 7/loginspider.py: -------------------------------------------------------------------------------- 1 | class LoginSpider(BaseSpider): 2 | name = 'example.com' 3 | start_URLss = ['http://www.example.com/users/login.php'] 4 | def parse(self, response): 5 | return [FormRequest.from_response(response, formdata={'username': 'john', 'password': 'secret'}, callback=self.after_login)] 6 | defafter_login(self, response): 7 | # check login succeed before going on 8 | if "authentication failed" in response.body: 9 | self.log("Login failed", level=log.ERROR) 10 | return 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/myspider.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import SitemapSpider 2 | class MySpider(SitemapSpider): 3 | sitemap_URLss = ['http://www.example.com/sitemap.xml'] 4 | sitemap_rules = [('/electronics/', 'parse_electronics'), ('/apparel/', 'parse_apparel'),] 5 | def 'parse_electronics'(self, response): 6 | # you need to create an item for electronics, 7 | return 8 | def 'parse_apparel'(self, response): 9 | #you need to create an item for apparel 10 | return -------------------------------------------------------------------------------- /Module 1/Chapter 7/newsspider_1.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import BaseSpider 2 | class NewsSpider(BaseSpider): 3 | name = "news" 4 | allowed_domains = ["nytimes.com"] 5 | start_URLss = [ 6 | 'http://www.nytimes.com/' 7 | ] 8 | def parse(self, response): 9 | filename = response.URLs.split("/")[-2] 10 | open(filename, 'wb').write(response.body) 11 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/newsspider_2.py: -------------------------------------------------------------------------------- 1 | from 
scrapy.spider import BaseSpider 2 | class NewsSpider(BaseSpider): 3 | name = "news" 4 | allowed_domains = ["nytimes.com"] 5 | start_URLss = [ 6 | 'http://www.nytimes.com/' 7 | ] 8 | def parse(self, response): 9 | sel = Selector(response) 10 | sites = sel.xpath('//ul/li') 11 | for site in sites: 12 | title = site.xpath('a/text()').extract() 13 | link = site.xpath('a/@href').extract() 14 | desc = site.xpath('text()').extract() 15 | print title, link, desc 16 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/newsspider_3.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import CrawlSpider, Rule 2 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.selector import Selector 4 | from scrapy.item import NewsItem 5 | 6 | class NewsSpider(CrawlSpider): 7 | name = 'news' 8 | allowed_domains = ['news.google.com'] 9 | start_urls = ['https://news.google.com'] 10 | 11 | rules = ( 12 | # Extract links matching cnn.com 13 | Rule(SgmlLinkExtractor(allow=('cnn.com', ), deny=(http://edition.cnn.com/', ))), 14 | # Extract links matching 'news.google.com' 15 | Rule(SgmlLinkExtractor(allow=('news.google.com', )), callback='parse_news_item'), 16 | ) 17 | def parse_news_item(self, response): 18 | sel = Selector(response) 19 | item = NewsItem() 20 | item['title'] = sel.xpath('//title/text()').extract() 21 | item[topic] = sel.xpath('/div[@class="topic"]').extract() 22 | item['desc'] = sel.xpath('//td//text()').extract() 23 | return item 24 | -------------------------------------------------------------------------------- /Module 1/Chapter 7/notes.txt: -------------------------------------------------------------------------------- 1 | Please download some of the examples from here and work on it 2 | 3 | https://github.com/geekan/scrapy-examples -------------------------------------------------------------------------------- /Module 1/Chapter 8/intregation.py: -------------------------------------------------------------------------------- 1 | >>>from scipy.integrate import quad, dblquad, tplquad 2 | >>>def f(x): 3 | >>> return x 4 | >>> 5 | >>>x_lower = = 0 # the lower limit of x 6 | >>>x_upper = = 1 # the upper limit of x 7 | >>>val, abserr = = quad(f, x_lower, x_upper) 8 | >>>print val,abserr 9 | >>> 0.5 , 5.55111512313e-15 10 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/matplotlib_code.py: -------------------------------------------------------------------------------- 1 | >>>import matplotlib 2 | >>>import matplotlib.pyplot as plt 3 | >>>import numpy 4 | >>>stockCSCO = stockdata_new.query('stock=="CSCO"') 5 | >>>stockCSCO.head() 6 | >>>from matplotlib import figure 7 | >>>plt.figure() 8 | >>>plt.scatter(stockdata_new.index.date,stockdata_new.volume) 9 | >>>plt.xlabel('day') # added the name of the x axis 10 | >>>plt.ylabel('stock close value') # add label to y-axis 11 | >>>plt.title('title') # add the title to your graph 12 | >>>plt.savefig("matplot1.jpg") # savefig in local 13 | 14 | # subplot 15 | >>>plt.subplot(2, 2, 1) 16 | >>>plt.plot(stockAA.index.weekofyear, stockAA.open, 'r--') 17 | >>>plt.subplot(2, 2, 2) 18 | >>>plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'g-*') 19 | >>>plt.subplot(2, 2, 3) 20 | >>>plt.plot(stockAA.index.weekofyear, stockAA.open, 'g--') 21 | >>>plt.subplot(2, 2, 4) 22 | >>>plt.plot(stockCSCO.index.weekofyear, stockCSCO.open, 'r-*') 23 | >>>plt.subplot(2, 2, 3) 24 | 
>>>plt.plot(x, y, 'g--') 25 | >>>plt.subplot(2, 2, 4) 26 | >>>plt.plot(x, y, 'r-*') 27 | >>>fig.savefig("matplot2.png") 28 | 29 | >>>fig, axes = plt.subplots(nrows=1, ncols=2) 30 | >>>for ax in axes: 31 | >>> ax.plot(x, y, 'r') 32 | >>> ax.set_xlabel('x') 33 | >>> ax.set_ylabel('y') 34 | >>> ax.set_title('title'); 35 | 36 | >>>fig = plt.figure() 37 | >>>axes = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # left, bottom, width, 38 | height (range 0 to 1) 39 | >>>axes.plot(x, y, 'r') 40 | 41 | >>>fig = plt.figure() 42 | >>>ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 43 | >>>ax.plot(stockAA.index.weekofyear,stockAA.open,label="AA") 44 | >>>ax.plot(stockAA.index.weekofyear,stockCSCO.open,label="CSCO") 45 | >>>ax.set_xlabel('weekofyear') 46 | >>>ax.set_ylabel('stock value') 47 | >>>ax.set_title('Weekly change in stock price') 48 | >>>ax.legend(loc=2); # upper left corner 49 | >>>plt.savefig("matplot3.jpg") 50 | 51 | # scatter plot 52 | >>>import matplotlib.pyplot as plt 53 | >>>plt.scatter(stockAA.index.weekofyear,stockAA.open) 54 | >>>plt.savefig("matplot4.jpg") 55 | >>>plt.close() 56 | # bar plot 57 | >>>n = 12 58 | >>>X = np.arange(n) 59 | >>>Y1 = np.random.uniform(0.5, 1.0, n) 60 | >>>Y2 = np.random.uniform(0.5, 1.0, n) 61 | >>>plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white') 62 | >>>plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white') 63 | 64 | # 3d plot 65 | >>>from mpl_toolkits.mplot3d import Axes3D 66 | >>>fig = plt.figure() 67 | >>>ax = Axes3D(fig) 68 | >>>X = np.arange(-4, 4, 0.25) 69 | >>>Y = np.arange(-4, 4, 0.25) 70 | >>>X, Y = np.meshgrid(X, Y) 71 | >>>R = np.sqrt(X**2+ + Y**2) 72 | >>>Z = np.sin(R) 73 | >>>ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='hot') -------------------------------------------------------------------------------- /Module 1/Chapter 8/numpy_codes.py: -------------------------------------------------------------------------------- 1 | >>>x=[1,2,5,7,3,11,14,25] 2 | >>>import numpy as np 3 | >>>np_arr=np.array(x) 4 | >>>np_arr 5 | 6 | >>>arr=[[1,2],[13,4],[33,78]] 7 | >>>np_2darr= np.array(arr) 8 | >>>type(np_2darr) 9 | numpy.ndarray 10 | 11 | # indexing 12 | >>>np_2darr.tolist() 13 | [[1, 2], [13, 4], [33, 78]] 14 | >>>np_2darr[:] 15 | array([[1, 2], [13, 4], [33, 78]]) 16 | >>>np_2darr[:2] 17 | array([[1, 2], [13, 4]]) 18 | >>>np_2darr[:1] 19 | array([[1, 2]]) 20 | >>>np_2darr[2] 21 | array([33, 78]) 22 | >>> np_2darr[2][0] 23 | >>>33 24 | >>> np_2darr[:-1] 25 | array([[1, 2], [13, 4]]) 26 | 27 | # basic operations 28 | >>>>import numpy as np 29 | >>>>np.arange(0.0, 1.0, 0.1) 30 | 31 | >>>np.ones([2, 4]) 32 | >>>np.zeros([3,4]) 33 | 34 | >>>np.linspace(0, 2, 10) 35 | >>>np.logspace(0,1) 36 | 37 | >>>A=np.array([[0, 0, 0], [0, 1, 2], [0, 2, 4], [0, 3, 6]]) 38 | >>>B = np.array([n for n in range n for n in range(4)]) 39 | >>>less_than_3 = B<3 # we are filtering the items that are less than 3. 
40 | >>>B[less_than_3] 41 | >>>np.diag(A) 42 | 43 | 44 | # complex matrix operations 45 | 46 | >>>A = np.array([[1,2],[3,4]]) 47 | >>>A * A 48 | 49 | >>>np.dot(A, A) 50 | >>>A - A 51 | >>>A + A 52 | >>>np.transpose(A) 53 | >>>A.T 54 | 55 | >>>M = np.matrix(A) 56 | >>> np.conjugate(M) 57 | >>> np.invert(M) 58 | 59 | >>>N = np.random.randn(1,10) 60 | >>>N.mean() 61 | >>>N.std() 62 | 63 | #Reshaping 64 | 65 | >>>>A.reshape((1, r * c)) 66 | >>>A.flatten() 67 | >>>np.repeat(A, 2) 68 | >>>np.tile(A, 4) 69 | >>>np.concatenate((A, B), axis=0) 70 | >>>np.vstack((A, B)) 71 | >>>np.concatenate((A, B.T), axis=1) 72 | 73 | 74 | #Random numbers 75 | 76 | >>>from numpy import random 77 | >>>#uniform random number from [0,1] 78 | >>>random.rand(2, 5) 79 | >>>>random.randn(2, 5) 80 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/optimize.py: -------------------------------------------------------------------------------- 1 | >>>from Scipy import optimize 2 | 3 | >>>def f1 def f1(x,y): 4 | >>> return x ** 2+ y ** 2 - 4 5 | >>>optimize.fsolve(f1, 0, 0) 6 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/pandas_code.py: -------------------------------------------------------------------------------- 1 | >>>import pandas as pd 2 | >>># Please provide the absolute path of the input file 3 | >>>data = pd.read_csv("PATH\\iris.data.txt",header=0") 4 | >>>data.head() 5 | 6 | >>>data = pd.read_csv("PATH\\iris.data.txt", names=["sepal length"," sepal\ 7 | width", "petal length", "petal width", "Cat"], header=None) 8 | >>>data.head() 9 | 10 | >>>data.describe() 11 | 12 | >>>sepal_len_cnt=data['sepal length'].value_counts() 13 | >>>sepal_len_cnt 14 | >>>data['Iris-setosa'].value_counts() 15 | >>>data['Iris-setosa'] == 'Iris-setosa' 16 | >>>sntsosa=data[data['Cat'] == 'Iris-setosa'] 17 | >>>sntsosa[:5] 18 | 19 | # series data 20 | 21 | >>>import pandas as pd 22 | >>>stockdata = pd.read_csv("C:\\Users\\a549369\\Documents\\book\\dow_ 23 | jones_index.data",parse_dates=['date'], index_col=['date'], nrows=100) 24 | >>>>stockdata.head() 25 | >>>max(stockdata['volume']) 26 | >>>max(stockdata['percent_change_price']) 27 | >>>stockdata.index 28 | >>>stockdata.index.day 29 | >>>stockdata.index.month 30 | >>>stockdata.index.year 31 | >>>import numpy as np 32 | >>>stockdata.resample('M', how=np.sum) 33 | 34 | #transformation 35 | >>>stockdata.drop(["percent_change_volume_over_last_wk"],axis=1) 36 | 37 | >>>stockdata_new = pd.DataFrame(stockdata, columns=["stock","open","high" 38 | ,"low","close","volume"]) 39 | >>>stockdata_new.head() 40 | >>>stockdata["previous_weeks_volume"] = 0 41 | 42 | # noisy data 43 | >>>import numpy 44 | >>>stockdata_new.open.describe() 45 | >>>stockdata_new.open = stockdata_new.open.str.replace('$', '').convert_ 46 | objects(convert_numeric=True) 47 | >>>stockdata_new.close = stockdata_new.close.str.replace('$', ''). 
48 | .convert_objects(convert_numeric=True) 49 | >>>(stockdata_new.close - stockdata_new.open) \ 50 | .convert_objects(convert_numeric=True) 51 | >>>stockdata_new.open.describe() 52 | >>>stockdata_new['newopen'] = stockdata_new.open.apply(lambda x: 0.8 * x) 53 | >>>stockdata_new.newopen.head(5) 54 | >>>stockAA = stockdata_new.query('stock=="AA"') 55 | >>>stockAA.head() 56 | -------------------------------------------------------------------------------- /Module 1/Chapter 8/solver.py: -------------------------------------------------------------------------------- 1 | >>>import scipy as sp 2 | >>>A = sp.rand(2, 2) 3 | >>>B = sp.rand(2, 2) 4 | >>>from scipy import linalg as LA 5 | >>>X = LA.solve(A, B) 6 | >>>X 7 | >>>sp.dot(A, B) -------------------------------------------------------------------------------- /Module 1/Chapter 9/fb_classification.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2') 3 | X_train = vectorizer.fit_transform(x_train) 4 | X_test = vectorizer.transform(x_test) 5 | 6 | from sklearn.linear_model import SGDClassifier 7 | clf = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train) 8 | y_pred = clf.predict(X_test) 9 | -------------------------------------------------------------------------------- /Module 1/Chapter 9/fbdump.py: -------------------------------------------------------------------------------- 1 | import facebook 2 | import json 3 | fo = open("fdump.txt",'w') 4 | ACCESS_TOKEN = 'XXXXXXXXXXX' # https://developers.facebook.com/tools/explorer 5 | 6 | fb = facebook.GraphAPI(ACCESS_TOKEN) 7 | company_page = "326249424068240" 8 | content = fb.get_object(company_page) 9 | fo.write(json.dumps(content)) 10 | fo.close() -------------------------------------------------------------------------------- /Module 1/Chapter 9/influencer_frnd.py: -------------------------------------------------------------------------------- 1 | friends = fb.get_connections("me", "friends")["data"] 2 | print friends 3 | for frd in friends: 4 |     print fb.get_connections(frd["id"],"friends") 5 | -------------------------------------------------------------------------------- /Module 1/Chapter 9/trendingtopic.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import word_tokenize,sent_tokenize 3 | from nltk import FreqDist 4 | tweets_tokens = [] 5 | for tweet in tweet_text: 6 |     tweets_tokens.extend(word_tokenize(tweet)) 7 | Topic_distribution = nltk.FreqDist(tweets_tokens) 8 | Topic_distribution.plot(50, cumulative=False) 9 | 10 | # Better trending topic 11 | 12 | import nltk 13 | Topics = [] 14 | for tweet in tweet_text: 15 |     tagged = nltk.pos_tag(word_tokenize(tweet)) 16 |     Topics_token = [word for word, pos in tagged if pos in ['NN','NNP']] 17 |     print Topics_token -------------------------------------------------------------------------------- /Module 1/Chapter 9/tweetdump.py: -------------------------------------------------------------------------------- 1 | from tweepy.streaming import StreamListener 2 | from tweepy import OAuthHandler 3 | from tweepy import Stream 4 | import sys 5 | consumer_key = 'ABCD012XXXXXXXXx' 6 | consumer_secret = 'xyz123xxxxxxxxxxxxx' 7 | access_token = '000000-ABCDXXXXXXXXXXX' 8 | access_token_secret ='XXXXXXXXXgaw2KYz0VcqCO0F3U4' 9 | 10 | class StdOutListener(StreamListener): 11 |
12 | def on_data(self, data): 13 | with open(sys.argv[1],'a') as tf: 14 | tf.write(data) 15 | return 16 | def on_error(self, status): 17 | print(status) 18 | 19 | if __name__ == '__main__': 20 | l = StdOutListener() 21 | auth = OAuthHandler(consumer_key, consumer_secret) 22 | auth.set_access_token(access_token, access_token_secret) 23 | stream = Stream(auth, l) 24 | stream.filter(track=['Apple watch']) -------------------------------------------------------------------------------- /Module 1/Chapter 9/tweetinfo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | tweets = json.loads(open(sys.argv[1]).read()) 4 | tweet_texts = [ tweet['text']\ 5 | for tweet in tweets ] 6 | 7 | tweet_source = [tweet ['source'] for tweet in tweets] 8 | 9 | tweet_geo = [tweet['geo'] for tweet in tweets] 10 | 11 | tweet_locations = [tweet['place'] for tweet in tweets] 12 | 13 | hashtags = [ hashtag['text'] for tweet in tweets for hashtag in 14 | tweet['entities']['hashtags'] ] 15 | 16 | print tweet_texts 17 | print tweet_locations 18 | print tweet_geo 19 | print hashtags -------------------------------------------------------------------------------- /Module 2/Chapter 1/7853OS_01_codes/chapter1.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================== 3 | Tokenizing Text into Sentences 4 | ============================== 5 | 6 | >>> para = "Hello World. It's good to see you. Thanks for buying this book." 7 | >>> from nltk.tokenize import sent_tokenize 8 | >>> sent_tokenize(para) 9 | ['Hello World.', "It's good to see you.", 'Thanks for buying this book.'] 10 | 11 | >>> import nltk.data 12 | >>> tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle') 13 | >>> tokenizer.tokenize(para) 14 | ['Hello World.', "It's good to see you.", 'Thanks for buying this book.'] 15 | 16 | >>> spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle') 17 | >>> spanish_tokenizer.tokenize('Hola amigo. 
Estoy bien.') 18 | ['Hola amigo.', 'Estoy bien.'] 19 | 20 | 21 | =============================== 22 | Tokenizing Sentences into Words 23 | =============================== 24 | 25 | >>> from nltk.tokenize import word_tokenize 26 | >>> word_tokenize('Hello World.') 27 | ['Hello', 'World', '.'] 28 | 29 | >>> from nltk.tokenize import TreebankWordTokenizer 30 | >>> tokenizer = TreebankWordTokenizer() 31 | >>> tokenizer.tokenize('Hello World.') 32 | ['Hello', 'World', '.'] 33 | 34 | >>> word_tokenize("can't") 35 | ['ca', "n't"] 36 | 37 | >>> from nltk.tokenize import PunktWordTokenizer 38 | >>> tokenizer = PunktWordTokenizer() 39 | >>> tokenizer.tokenize("Can't is a contraction.") 40 | ['Can', "'t", 'is', 'a', 'contraction.'] 41 | 42 | >>> from nltk.tokenize import WordPunctTokenizer 43 | >>> tokenizer = WordPunctTokenizer() 44 | >>> tokenizer.tokenize("Can't is a contraction.") 45 | ['Can', "'", 't', 'is', 'a', 'contraction', '.'] 46 | 47 | 48 | ============================================== 49 | Tokenizing Sentences using Regular Expressions 50 | ============================================== 51 | 52 | >>> from nltk.tokenize import RegexpTokenizer 53 | >>> tokenizer = RegexpTokenizer("[\w']+") 54 | >>> tokenizer.tokenize("Can't is a contraction.") 55 | ["Can't", 'is', 'a', 'contraction'] 56 | 57 | >>> from nltk.tokenize import regexp_tokenize 58 | >>> regexp_tokenize("Can't is a contraction.", "[\w']+") 59 | ["Can't", 'is', 'a', 'contraction'] 60 | 61 | >>> tokenizer = RegexpTokenizer('\s+', gaps=True) 62 | >>> tokenizer.tokenize("Can't is a contraction.") 63 | ["Can't", 'is', 'a', 'contraction.'] 64 | 65 | 66 | ============================= 67 | Training a Sentence Tokenizer 68 | ============================= 69 | 70 | >>> from nltk.tokenize import PunktSentenceTokenizer 71 | >>> from nltk.corpus import webtext 72 | >>> text = webtext.raw('overheard.txt') 73 | >>> sent_tokenizer = PunktSentenceTokenizer(text) 74 | >>> sents1 = sent_tokenizer.tokenize(text) 75 | >>> sents1[0] 76 | 'White guy: So, do you have any plans for this evening?' 77 | >>> from nltk.tokenize import sent_tokenize 78 | >>> sents2 = sent_tokenize(text) 79 | >>> sents2[0] 80 | 'White guy: So, do you have any plans for this evening?' 81 | >>> sents1[678] 82 | 'Girl: But you already have a Big Mac...' 83 | >>> sents2[678] 84 | 'Girl: But you already have a Big Mac...\\nHobo: Oh, this is all theatrical.' 85 | 86 | >>> with open('/usr/share/nltk_data/corpora/webtext/overheard.txt', encoding='ISO-8859-2') as f: 87 | ... text = f.read() 88 | >>> sent_tokenizer = PunktSentenceTokenizer(text) 89 | >>> sents = sent_tokenizer.tokenize(text) 90 | >>> sents[0] 91 | 'White guy: So, do you have any plans for this evening?' 92 | >>> sents[678] 93 | 'Girl: But you already have a Big Mac...' 
94 | 95 | 96 | =========================================== 97 | Filtering Stopwords in a Tokenized Sentence 98 | =========================================== 99 | 100 | >>> from nltk.corpus import stopwords 101 | >>> english_stops = set(stopwords.words('english')) 102 | >>> words = ["Can't", 'is', 'a', 'contraction'] 103 | >>> [word for word in words if word not in english_stops] 104 | ["Can't", 'contraction'] 105 | 106 | >>> stopwords.fileids() 107 | ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish'] 108 | 109 | >>> stopwords.words('dutch') 110 | ['de', 'en', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een', 'hij', 'het', 'niet', 'zijn', 'is', 'was', 'op', 'aan', 'met', 'als', 'voor', 'had', 'er', 'maar', 'om', 'hem', 'dan', 'zou', 'of', 'wat', 'mijn', 'men', 'dit', 'zo', 'door', 'over', 'ze', 'zich', 'bij', 'ook', 'tot', 'je', 'mij', 'uit', 'der', 'daar', 'haar', 'naar', 'heb', 'hoe', 'heeft', 'hebben', 'deze', 'u', 'want', 'nog', 'zal', 'me', 'zij', 'nu', 'ge', 'geen', 'omdat', 'iets', 'worden', 'toch', 'al', 'waren', 'veel', 'meer', 'doen', 'toen', 'moet', 'ben', 'zonder', 'kan', 'hun', 'dus', 'alles', 'onder', 'ja', 'eens', 'hier', 'wie', 'werd', 'altijd', 'doch', 'wordt', 'wezen', 'kunnen', 'ons', 'zelf', 'tegen', 'na', 'reeds', 'wil', 'kon', 'niets', 'uw', 'iemand', 'geweest', 'andere'] 111 | 112 | ========================================= 113 | Looking up a Synset for a Word in WordNet 114 | ========================================= 115 | 116 | >>> from nltk.corpus import wordnet 117 | >>> syn = wordnet.synsets('cookbook')[0] 118 | >>> syn.name() 119 | 'cookbook.n.01' 120 | >>> syn.definition() 121 | 'a book of recipes and cooking directions' 122 | 123 | >>> wordnet.synset('cookbook.n.01') 124 | Synset('cookbook.n.01') 125 | 126 | >>> wordnet.synsets('cooking')[0].examples() 127 | ['cooking can be a great art', 'people are needed who have experience in cookery', 'he left the preparation of meals to his wife'] 128 | 129 | >>> syn.hypernyms() 130 | [Synset('reference_book.n.01')] 131 | >>> syn.hypernyms()[0].hyponyms() 132 | [Synset('annual.n.02'), Synset('atlas.n.02'), Synset('cookbook.n.01'), Synset('directory.n.01'), Synset('encyclopedia.n.01'), Synset('handbook.n.01'), Synset('instruction_book.n.01'), Synset('source_book.n.01'), Synset('wordbook.n.01')] 133 | >>> syn.root_hypernyms() 134 | [Synset('entity.n.01')] 135 | 136 | >>> syn.hypernym_paths() 137 | [[Synset('entity.n.01'), Synset('physical_entity.n.01'), Synset('object.n.01'), Synset('whole.n.02'), Synset('artifact.n.01'), Synset('creation.n.02'), Synset('product.n.02'), Synset('work.n.02'), Synset('publication.n.01'), Synset('book.n.01'), Synset('reference_book.n.01'), Synset('cookbook.n.01')]] 138 | 139 | >>> syn.pos() 140 | 'n' 141 | 142 | >>> len(wordnet.synsets('great')) 143 | 7 144 | >>> len(wordnet.synsets('great', pos='n')) 145 | 1 146 | >>> len(wordnet.synsets('great', pos='a')) 147 | 6 148 | 149 | 150 | ========================================= 151 | Looking up Lemmas and Synonyms in WordNet 152 | ========================================= 153 | 154 | >>> from nltk.corpus import wordnet 155 | >>> syn = wordnet.synsets('cookbook')[0] 156 | >>> lemmas = syn.lemmas() 157 | >>> len(lemmas) 158 | 2 159 | >>> lemmas[0].name() 160 | 'cookbook' 161 | >>> lemmas[1].name() 162 | 'cookery_book' 163 | >>> lemmas[0].synset() == lemmas[1].synset() 164 | True 165 | 166 | >>> [lemma.name() for lemma in syn.lemmas()] 167 | 
['cookbook', 'cookery_book'] 168 | 169 | >>> synonyms = [] 170 | >>> for syn in wordnet.synsets('book'): 171 | ... for lemma in syn.lemmas(): 172 | ... synonyms.append(lemma.name()) 173 | >>> len(synonyms) 174 | 38 175 | 176 | >>> len(set(synonyms)) 177 | 25 178 | 179 | >>> gn2 = wordnet.synset('good.n.02') 180 | >>> gn2.definition() 181 | 'moral excellence or admirableness' 182 | >>> evil = gn2.lemmas()[0].antonyms()[0] 183 | >>> evil.name() 184 | 'evil' 185 | >>> evil.synset().definition() 186 | 'the quality of being morally wrong in principle or practice' 187 | >>> ga1 = wordnet.synset('good.a.01') 188 | >>> ga1.definition() 189 | 'having desirable or positive qualities especially those suitable for a thing specified' 190 | >>> bad = ga1.lemmas()[0].antonyms()[0] 191 | >>> bad.name() 192 | 'bad' 193 | >>> bad.synset().definition() 194 | 'having undesirable or negative qualities' 195 | 196 | 197 | ===================================== 198 | Calculating WordNet Synset Similarity 199 | ===================================== 200 | 201 | >>> from nltk.corpus import wordnet 202 | >>> cb = wordnet.synset('cookbook.n.01') 203 | >>> ib = wordnet.synset('instruction_book.n.01') 204 | >>> cb.wup_similarity(ib) 205 | 0.9166666666666666 206 | 207 | >>> ref = cb.hypernyms()[0] 208 | >>> cb.shortest_path_distance(ref) 209 | 1 210 | >>> ib.shortest_path_distance(ref) 211 | 1 212 | >>> cb.shortest_path_distance(ib) 213 | 2 214 | 215 | >>> dog = wordnet.synsets('dog')[0] 216 | >>> dog.wup_similarity(cb) 217 | 0.38095238095238093 218 | 219 | >>> sorted(dog.common_hypernyms(cb)) 220 | [Synset('entity.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('whole.n.02')] 221 | 222 | >>> cook = wordnet.synset('cook.v.01') 223 | >>> bake = wordnet.synset('bake.v.02') 224 | >>> cook.wup_similarity(bake) 225 | 0.6666666666666666 226 | 227 | >>> cb.path_similarity(ib) 228 | 0.3333333333333333 229 | >>> cb.path_similarity(dog) 230 | 0.07142857142857142 231 | >>> cb.lch_similarity(ib) 232 | 2.538973871058276 233 | >>> cb.lch_similarity(dog) 234 | 0.9985288301111273 235 | 236 | 237 | ============================= 238 | Discovering Word Collocations 239 | ============================= 240 | 241 | >>> from nltk.corpus import webtext 242 | >>> from nltk.collocations import BigramCollocationFinder 243 | >>> from nltk.metrics import BigramAssocMeasures 244 | >>> words = [w.lower() for w in webtext.words('grail.txt')] 245 | >>> bcf = BigramCollocationFinder.from_words(words) 246 | >>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4) 247 | [("'", 's'), ('arthur', ':'), ('#', '1'), ("'", 't')] 248 | 249 | >>> from nltk.corpus import stopwords 250 | >>> stopset = set(stopwords.words('english')) 251 | >>> filter_stops = lambda w: len(w) < 3 or w in stopset 252 | >>> bcf.apply_word_filter(filter_stops) 253 | >>> bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4) 254 | [('black', 'knight'), ('clop', 'clop'), ('head', 'knight'), ('mumble', 'mumble')] 255 | 256 | >>> from nltk.collocations import TrigramCollocationFinder 257 | >>> from nltk.metrics import TrigramAssocMeasures 258 | >>> words = [w.lower() for w in webtext.words('singles.txt')] 259 | >>> tcf = TrigramCollocationFinder.from_words(words) 260 | >>> tcf.apply_word_filter(filter_stops) 261 | >>> tcf.apply_freq_filter(3) 262 | >>> tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4) 263 | [('long', 'term', 'relationship')] 264 | """ 265 | 266 | if __name__ == '__main__': 267 | import doctest 268 | doctest.testmod() 
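The chapter1.py listing ends here. Purely as an illustration (this snippet is not part of the original sources), the recipes it demonstrates can be strung together into one short pipeline; only NLTK calls already shown in the docstring above are used:

# sketch: sentence-tokenize a paragraph, word-tokenize each sentence, drop stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

para = "Hello World. It's good to see you. Thanks for buying this book."
english_stops = set(stopwords.words('english'))

for sent in sent_tokenize(para):
    words = word_tokenize(sent)
    content = [w for w in words if w.lower() not in english_stops]
    print(content)   # e.g. ['Hello', 'World', '.'] for the first sentence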
-------------------------------------------------------------------------------- /Module 2/Chapter 2/7853OS_02_codes/chapter2.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============== 3 | Stemming Words 4 | ============== 5 | 6 | >>> from nltk.stem import PorterStemmer 7 | >>> stemmer = PorterStemmer() 8 | >>> stemmer.stem('cooking') 9 | 'cook' 10 | >>> stemmer.stem('cookery') 11 | 'cookeri' 12 | 13 | >>> from nltk.stem import LancasterStemmer 14 | >>> stemmer = LancasterStemmer() 15 | >>> stemmer.stem('cooking') 16 | 'cook' 17 | >>> stemmer.stem('cookery') 18 | 'cookery' 19 | 20 | >>> from nltk.stem import RegexpStemmer 21 | >>> stemmer = RegexpStemmer('ing') 22 | >>> stemmer.stem('cooking') 23 | 'cook' 24 | >>> stemmer.stem('cookery') 25 | 'cookery' 26 | >>> stemmer.stem('ingleside') 27 | 'leside' 28 | 29 | >>> from nltk.stem import SnowballStemmer 30 | >>> SnowballStemmer.languages 31 | ('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish') 32 | >>> spanish_stemmer = SnowballStemmer('spanish') 33 | >>> spanish_stemmer.stem('hola') 34 | 'hol' 35 | 36 | 37 | ============================== 38 | Lemmatising Words with WordNet 39 | ============================== 40 | 41 | >>> from nltk.stem import WordNetLemmatizer 42 | >>> lemmatizer = WordNetLemmatizer() 43 | >>> lemmatizer.lemmatize('cooking') 44 | 'cooking' 45 | >>> lemmatizer.lemmatize('cooking', pos='v') 46 | 'cook' 47 | >>> lemmatizer.lemmatize('cookbooks') 48 | 'cookbook' 49 | 50 | >>> from nltk.stem import PorterStemmer 51 | >>> stemmer = PorterStemmer() 52 | >>> stemmer.stem('believes') 53 | 'believ' 54 | >>> lemmatizer.lemmatize('believes') 55 | 'belief' 56 | 57 | >>> stemmer.stem('buses') 58 | 'buse' 59 | >>> lemmatizer.lemmatize('buses') 60 | 'bus' 61 | >>> stemmer.stem('bus') 62 | 'bu' 63 | 64 | 65 | ============================================ 66 | Replacing Words Matching Regular Expressions 67 | ============================================ 68 | 69 | >>> from replacers import RegexpReplacer 70 | >>> replacer = RegexpReplacer() 71 | >>> replacer.replace("can't is a contraction") 72 | 'cannot is a contraction' 73 | >>> replacer.replace("I should've done that thing I didn't do") 74 | 'I should have done that thing I did not do' 75 | 76 | >>> from nltk.tokenize import word_tokenize 77 | >>> from replacers import RegexpReplacer 78 | >>> replacer = RegexpReplacer() 79 | >>> word_tokenize("can't is a contraction") 80 | ['ca', "n't", 'is', 'a', 'contraction'] 81 | >>> word_tokenize(replacer.replace("can't is a contraction")) 82 | ['can', 'not', 'is', 'a', 'contraction'] 83 | 84 | 85 | ============================= 86 | Removing Repeating Characters 87 | ============================= 88 | 89 | >>> from replacers import RepeatReplacer 90 | >>> replacer = RepeatReplacer() 91 | >>> replacer.replace('looooove') 92 | 'love' 93 | >>> replacer.replace('oooooh') 94 | 'ooh' 95 | >>> replacer.replace('goose') 96 | 'goose' 97 | 98 | 99 | ================================ 100 | Spelling Correction with Enchant 101 | ================================ 102 | 103 | >>> from replacers import SpellingReplacer 104 | >>> replacer = SpellingReplacer() 105 | >>> replacer.replace('cookbok') 106 | 'cookbook' 107 | 108 | >>> import enchant 109 | >>> d = enchant.Dict('en') 110 | >>> d.suggest('languege') 111 | ['language', 'languages', 'languor', "language's"] 112 | 113 | >>> from nltk.metrics 
import edit_distance 114 | >>> edit_distance('language', 'languege') 115 | 1 116 | >>> edit_distance('language', 'languor') 117 | 3 118 | 119 | >>> enchant.list_languages() 120 | ['en', 'en_CA', 'en_GB', 'en_US'] 121 | 122 | >>> dUS = enchant.Dict('en_US') 123 | >>> dUS.check('theater') 124 | True 125 | >>> dGB = enchant.Dict('en_GB') 126 | >>> dGB.check('theater') 127 | False 128 | >>> us_replacer = SpellingReplacer('en_US') 129 | >>> us_replacer.replace('theater') 130 | 'theater' 131 | >>> gb_replacer = SpellingReplacer('en_GB') 132 | >>> gb_replacer.replace('theater') 133 | 'theatre' 134 | 135 | >>> d = enchant.Dict('en_US') 136 | >>> d.check('nltk') 137 | False 138 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 139 | >>> d.check('nltk') 140 | True 141 | 142 | >>> from replacers import CustomSpellingReplacer 143 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 144 | >>> replacer = CustomSpellingReplacer(d) 145 | >>> replacer.replace('nltk') 146 | 'nltk' 147 | 148 | ================================= 149 | Replacing Negations with Antonyms 150 | ================================= 151 | 152 | >>> from replacers import AntonymReplacer 153 | >>> replacer = AntonymReplacer() 154 | >>> replacer.replace('good') 155 | >>> replacer.replace('uglify') 156 | 'beautify' 157 | >>> sent = ["let's", 'not', 'uglify', 'our', 'code'] 158 | >>> replacer.replace_negations(sent) 159 | ["let's", 'beautify', 'our', 'code'] 160 | 161 | >>> from replacers import AntonymWordReplacer 162 | >>> replacer = AntonymWordReplacer({'evil': 'good'}) 163 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 164 | ['good', 'is', 'good'] 165 | """ 166 | 167 | if __name__ == '__main__': 168 | import doctest 169 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 2/7853OS_02_codes/mywords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-Python-and-NLTK/b34df3ceab78b3de29195a811696dcd06e77063a/Module 2/Chapter 2/7853OS_02_codes/mywords.txt -------------------------------------------------------------------------------- /Module 2/Chapter 2/7853OS_02_codes/replacers.py: -------------------------------------------------------------------------------- 1 | import re, csv, yaml, enchant 2 | from nltk.corpus import wordnet 3 | from nltk.metrics import edit_distance 4 | 5 | ################################################## 6 | ## Replacing Words Matching Regular Expressions ## 7 | ################################################## 8 | 9 | replacement_patterns = [ 10 | (r'won\'t', 'will not'), 11 | (r'can\'t', 'cannot'), 12 | (r'i\'m', 'i am'), 13 | (r'ain\'t', 'is not'), 14 | (r'(\w+)\'ll', '\g<1> will'), 15 | (r'(\w+)n\'t', '\g<1> not'), 16 | (r'(\w+)\'ve', '\g<1> have'), 17 | (r'(\w+)\'s', '\g<1> is'), 18 | (r'(\w+)\'re', '\g<1> are'), 19 | (r'(\w+)\'d', '\g<1> would'), 20 | ] 21 | 22 | class RegexpReplacer(object): 23 | """ Replaces regular expression in a text. 
24 | >>> replacer = RegexpReplacer() 25 | >>> replacer.replace("can't is a contraction") 26 | 'cannot is a contraction' 27 | >>> replacer.replace("I should've done that thing I didn't do") 28 | 'I should have done that thing I did not do' 29 | """ 30 | def __init__(self, patterns=replacement_patterns): 31 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns] 32 | 33 | def replace(self, text): 34 | s = text 35 | 36 | for (pattern, repl) in self.patterns: 37 | s = re.sub(pattern, repl, s) 38 | 39 | return s 40 | 41 | #################################### 42 | ## Replacing Repeating Characters ## 43 | #################################### 44 | 45 | class RepeatReplacer(object): 46 | """ Removes repeating characters until a valid word is found. 47 | >>> replacer = RepeatReplacer() 48 | >>> replacer.replace('looooove') 49 | 'love' 50 | >>> replacer.replace('oooooh') 51 | 'ooh' 52 | >>> replacer.replace('goose') 53 | 'goose' 54 | """ 55 | def __init__(self): 56 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 57 | self.repl = r'\1\2\3' 58 | 59 | def replace(self, word): 60 | if wordnet.synsets(word): 61 | return word 62 | 63 | repl_word = self.repeat_regexp.sub(self.repl, word) 64 | 65 | if repl_word != word: 66 | return self.replace(repl_word) 67 | else: 68 | return repl_word 69 | 70 | ###################################### 71 | ## Spelling Correction with Enchant ## 72 | ###################################### 73 | 74 | class SpellingReplacer(object): 75 | """ Replaces misspelled words with a likely suggestion based on shortest 76 | edit distance. 77 | >>> replacer = SpellingReplacer() 78 | >>> replacer.replace('cookbok') 79 | 'cookbook' 80 | """ 81 | def __init__(self, dict_name='en', max_dist=2): 82 | self.spell_dict = enchant.Dict(dict_name) 83 | self.max_dist = max_dist 84 | 85 | def replace(self, word): 86 | if self.spell_dict.check(word): 87 | return word 88 | 89 | suggestions = self.spell_dict.suggest(word) 90 | 91 | if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist: 92 | return suggestions[0] 93 | else: 94 | return word 95 | 96 | class CustomSpellingReplacer(SpellingReplacer): 97 | """ SpellingReplacer that allows passing a custom enchant dictionary, such 98 | a DictWithPWL. 99 | >>> d = enchant.DictWithPWL('en_US', 'mywords.txt') 100 | >>> replacer = CustomSpellingReplacer(d) 101 | >>> replacer.replace('nltk') 102 | 'nltk' 103 | """ 104 | def __init__(self, spell_dict, max_dist=2): 105 | self.spell_dict = spell_dict 106 | self.max_dist = max_dist 107 | 108 | ######################## 109 | ## Replacing Synonyms ## 110 | ######################## 111 | 112 | class WordReplacer(object): 113 | """ WordReplacer that replaces a given word with a word from the word_map, 114 | or if the word isn't found, returns the word as is. 115 | >>> replacer = WordReplacer({'bday': 'birthday'}) 116 | >>> replacer.replace('bday') 117 | 'birthday' 118 | >>> replacer.replace('happy') 119 | 'happy' 120 | """ 121 | def __init__(self, word_map): 122 | self.word_map = word_map 123 | 124 | def replace(self, word): 125 | return self.word_map.get(word, word) 126 | 127 | class CsvWordReplacer(WordReplacer): 128 | """ WordReplacer that reads word mappings from a csv file. 
129 | >>> replacer = CsvWordReplacer('synonyms.csv') 130 | >>> replacer.replace('bday') 131 | 'birthday' 132 | >>> replacer.replace('happy') 133 | 'happy' 134 | """ 135 | def __init__(self, fname): 136 | word_map = {} 137 | 138 | for line in csv.reader(open(fname)): 139 | word, syn = line 140 | word_map[word] = syn 141 | 142 | super(CsvWordReplacer, self).__init__(word_map) 143 | 144 | class YamlWordReplacer(WordReplacer): 145 | """ WordReplacer that reads word mappings from a yaml file. 146 | >>> replacer = YamlWordReplacer('synonyms.yaml') 147 | >>> replacer.replace('bday') 148 | 'birthday' 149 | >>> replacer.replace('happy') 150 | 'happy' 151 | """ 152 | def __init__(self, fname): 153 | word_map = yaml.load(open(fname)) 154 | super(YamlWordReplacer, self).__init__(word_map) 155 | 156 | ####################################### 157 | ## Replacing Negations with Antonyms ## 158 | ####################################### 159 | 160 | class AntonymReplacer(object): 161 | def replace(self, word, pos=None): 162 | """ Returns the antonym of a word, but only if there is no ambiguity. 163 | >>> replacer = AntonymReplacer() 164 | >>> replacer.replace('good') 165 | >>> replacer.replace('uglify') 166 | 'beautify' 167 | >>> replacer.replace('beautify') 168 | 'uglify' 169 | """ 170 | antonyms = set() 171 | 172 | for syn in wordnet.synsets(word, pos=pos): 173 | for lemma in syn.lemmas(): 174 | for antonym in lemma.antonyms(): 175 | antonyms.add(antonym.name()) 176 | 177 | if len(antonyms) == 1: 178 | return antonyms.pop() 179 | else: 180 | return None 181 | 182 | def replace_negations(self, sent): 183 | """ Try to replace negations with antonyms in the tokenized sentence. 184 | >>> replacer = AntonymReplacer() 185 | >>> replacer.replace_negations(['do', 'not', 'uglify', 'our', 'code']) 186 | ['do', 'beautify', 'our', 'code'] 187 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 188 | ['good', 'is', 'not', 'evil'] 189 | """ 190 | i, l = 0, len(sent) 191 | words = [] 192 | 193 | while i < l: 194 | word = sent[i] 195 | 196 | if word == 'not' and i+1 < l: 197 | ant = self.replace(sent[i+1]) 198 | 199 | if ant: 200 | words.append(ant) 201 | i += 2 202 | continue 203 | 204 | words.append(word) 205 | i += 1 206 | 207 | return words 208 | 209 | class AntonymWordReplacer(WordReplacer, AntonymReplacer): 210 | """ AntonymReplacer that uses a custom mapping instead of WordNet. 211 | Order of inheritance is very important, this class would not work if 212 | AntonymReplacer comes before WordReplacer. 213 | >>> replacer = AntonymWordReplacer({'evil': 'good'}) 214 | >>> replacer.replace_negations(['good', 'is', 'not', 'evil']) 215 | ['good', 'is', 'good'] 216 | """ 217 | pass 218 | 219 | if __name__ == '__main__': 220 | import doctest 221 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/brown.pos: -------------------------------------------------------------------------------- 1 | The/at-tl expense/nn and/cc time/nn involved/vbn are/ber astronomical/jj ./. 
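Following the replacers.py listing (and the brown.pos sample) above, here is a small usage sketch that is not part of the original sources: expand contractions with RegexpReplacer before tokenizing, then spell-check each token with SpellingReplacer. It assumes pyenchant is installed, which replacers.py already requires.

from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer, SpellingReplacer

contraction_replacer = RegexpReplacer()
spelling_replacer = SpellingReplacer()

text = "I can't beleive it"
expanded = contraction_replacer.replace(text)              # "I cannot beleive it"
tokens = word_tokenize(expanded)
corrected = [spelling_replacer.replace(t) for t in tokens]
print(corrected)   # likely ['I', 'cannot', 'believe', 'it']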
-------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/catchunked.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus.reader import CategorizedCorpusReader, ChunkedCorpusReader 2 | from nltk.corpus.reader import ConllCorpusReader, ConllChunkCorpusReader 3 | 4 | class CategorizedChunkedCorpusReader(CategorizedCorpusReader, ChunkedCorpusReader): 5 | """ 6 | A reader for chunked corpora whose documents are divided into categories 7 | based on their file identifiers. 8 | """ 9 | # code adapted from CategorizedTaggedCorpusReader 10 | def __init__(self, *args, **kwargs): 11 | CategorizedCorpusReader.__init__(self, kwargs) 12 | ChunkedCorpusReader.__init__(self, *args, **kwargs) 13 | 14 | def _resolve(self, fileids, categories): 15 | if fileids is not None and categories is not None: 16 | raise ValueError('Specify fileids or categories, not both') 17 | if categories is not None: 18 | return self.fileids(categories) 19 | else: 20 | return fileids 21 | 22 | def raw(self, fileids=None, categories=None): 23 | return ChunkedCorpusReader.raw(self, self._resolve(fileids, categories)) 24 | 25 | def words(self, fileids=None, categories=None): 26 | return ChunkedCorpusReader.words(self, self._resolve(fileids, categories)) 27 | 28 | def sents(self, fileids=None, categories=None): 29 | return ChunkedCorpusReader.sents(self, self._resolve(fileids, categories)) 30 | 31 | def paras(self, fileids=None, categories=None): 32 | return ChunkedCorpusReader.paras(self, self._resolve(fileids, categories)) 33 | 34 | def tagged_words(self, fileids=None, categories=None): 35 | return ChunkedCorpusReader.tagged_words(self, self._resolve(fileids, categories)) 36 | 37 | def tagged_sents(self, fileids=None, categories=None): 38 | return ChunkedCorpusReader.tagged_sents(self, self._resolve(fileids, categories)) 39 | 40 | def tagged_paras(self, fileids=None, categories=None): 41 | return ChunkedCorpusReader.tagged_paras(self, self._resolve(fileids, categories)) 42 | 43 | def chunked_words(self, fileids=None, categories=None): 44 | return ChunkedCorpusReader.chunked_words( 45 | self, self._resolve(fileids, categories)) 46 | 47 | def chunked_sents(self, fileids=None, categories=None): 48 | return ChunkedCorpusReader.chunked_sents( 49 | self, self._resolve(fileids, categories)) 50 | 51 | def chunked_paras(self, fileids=None, categories=None): 52 | return ChunkedCorpusReader.chunked_paras( 53 | self, self._resolve(fileids, categories)) 54 | 55 | class CategorizedConllChunkCorpusReader(CategorizedCorpusReader, ConllChunkCorpusReader): 56 | """ 57 | A reader for conll chunked corpora whose documents are divided into 58 | categories based on their file identifiers. 
59 | """ 60 | def __init__(self, *args, **kwargs): 61 | # NOTE: in addition to cat_pattern, ConllChunkCorpusReader also requires 62 | # chunk_types as third argument, which defaults to ('NP','VP','PP') 63 | CategorizedCorpusReader.__init__(self, kwargs) 64 | ConllChunkCorpusReader.__init__(self, *args, **kwargs) 65 | 66 | def _resolve(self, fileids, categories): 67 | if fileids is not None and categories is not None: 68 | raise ValueError('Specify fileids or categories, not both') 69 | if categories is not None: 70 | return self.fileids(categories) 71 | else: 72 | return fileids 73 | 74 | def raw(self, fileids=None, categories=None): 75 | return ConllCorpusReader.raw(self, self._resolve(fileids, categories)) 76 | 77 | def words(self, fileids=None, categories=None): 78 | return ConllCorpusReader.words(self, self._resolve(fileids, categories)) 79 | 80 | def sents(self, fileids=None, categories=None): 81 | return ConllCorpusReader.sents(self, self._resolve(fileids, categories)) 82 | 83 | def tagged_words(self, fileids=None, categories=None): 84 | return ConllCorpusReader.tagged_words(self, self._resolve(fileids, categories)) 85 | 86 | def tagged_sents(self, fileids=None, categories=None): 87 | return ConllCorpusReader.tagged_sents(self, self._resolve(fileids, categories)) 88 | 89 | def chunked_words(self, fileids=None, categories=None, chunk_types=None): 90 | return ConllCorpusReader.chunked_words( 91 | self, self._resolve(fileids, categories), chunk_types) 92 | 93 | def chunked_sents(self, fileids=None, categories=None, chunk_types=None): 94 | return ConllCorpusReader.chunked_sents( 95 | self, self._resolve(fileids, categories), chunk_types) 96 | 97 | def parsed_sents(self, fileids=None, categories=None, pos_in_tree=None): 98 | return ConllCorpusReader.parsed_sents( 99 | self, self._resolve(fileids, categories), pos_in_tree) 100 | 101 | def srl_spans(self, fileids=None, categories=None): 102 | return ConllCorpusReader.srl_spans(self, self._resolve(fileids, categories)) 103 | 104 | def srl_instances(self, fileids=None, categories=None, pos_in_tree=None, flatten=True): 105 | return ConllCorpusReader.srl_instances( 106 | self, self._resolve(fileids, categories), pos_in_tree, flatten) 107 | 108 | def iob_words(self, fileids=None, categories=None): 109 | return ConllCorpusReader.iob_words(self, self._resolve(fileids, categories)) 110 | 111 | def iob_sents(self, fileids=None, categories=None): 112 | return ConllCorpusReader.iob_sents(self, self._resolve(fileids, categories)) -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/chapter3.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========================== 3 | Setting up a Custom Corpus 4 | ========================== 5 | 6 | >>> import os, os.path 7 | >>> path = os.path.expanduser('~/nltk_data') 8 | >>> if not os.path.exists(path): 9 | ... 
os.mkdir(path) 10 | >>> os.path.exists(path) 11 | True 12 | >>> import nltk.data 13 | >>> path in nltk.data.path 14 | True 15 | 16 | >>> nltk.data.load('corpora/cookbook/mywords.txt', format='raw') 17 | b'nltk\\n' 18 | 19 | >>> nltk.data.load('corpora/cookbook/synonyms.yaml') 20 | {'bday': 'birthday'} 21 | 22 | 23 | =========================== 24 | Creating a Word List Corpus 25 | =========================== 26 | 27 | >>> from nltk.corpus.reader import WordListCorpusReader 28 | >>> reader = WordListCorpusReader('.', ['wordlist']) 29 | >>> reader.words() 30 | ['nltk', 'corpus', 'corpora', 'wordnet'] 31 | >>> reader.fileids() 32 | ['wordlist'] 33 | 34 | >>> reader.raw() 35 | 'nltk\\ncorpus\\ncorpora\\nwordnet\\n' 36 | >>> from nltk.tokenize import line_tokenize 37 | >>> line_tokenize(reader.raw()) 38 | ['nltk', 'corpus', 'corpora', 'wordnet'] 39 | 40 | >>> from nltk.corpus import names 41 | >>> names.fileids() 42 | ['female.txt', 'male.txt'] 43 | >>> len(names.words('female.txt')) 44 | 5001 45 | >>> len(names.words('male.txt')) 46 | 2943 47 | 48 | >>> from nltk.corpus import words 49 | >>> words.fileids() 50 | ['en', 'en-basic'] 51 | >>> len(words.words('en-basic')) 52 | 850 53 | >>> len(words.words('en')) 54 | 234936 55 | 56 | 57 | ============================================ 58 | Creating a Part-of-Speech Tagged Word Corpus 59 | ============================================ 60 | 61 | >>> from nltk.corpus.reader import TaggedCorpusReader 62 | >>> reader = TaggedCorpusReader('.', r'.*\.pos') 63 | >>> reader.words() 64 | ['The', 'expense', 'and', 'time', 'involved', 'are', ...] 65 | >>> reader.tagged_words() 66 | [('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ...] 67 | >>> reader.sents() 68 | [['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']] 69 | >>> reader.tagged_sents() 70 | [[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]] 71 | >>> reader.paras() 72 | [[['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']]] 73 | >>> reader.tagged_paras() 74 | [[[('The', 'AT-TL'), ('expense', 'NN'), ('and', 'CC'), ('time', 'NN'), ('involved', 'VBN'), ('are', 'BER'), ('astronomical', 'JJ'), ('.', '.')]]] 75 | 76 | >>> from nltk.tokenize import SpaceTokenizer 77 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', word_tokenizer=SpaceTokenizer()) 78 | >>> reader.words() 79 | ['The', 'expense', 'and', 'time', 'involved', 'are', ...] 80 | 81 | >>> from nltk.tokenize import LineTokenizer 82 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', sent_tokenizer=LineTokenizer()) 83 | >>> reader.sents() 84 | [['The', 'expense', 'and', 'time', 'involved', 'are', 'astronomical', '.']] 85 | 86 | >>> reader = TaggedCorpusReader('.', r'.*\.pos', tagset='en-brown') 87 | >>> reader.tagged_words(tagset='universal') 88 | [('The', 'DET'), ('expense', 'NOUN'), ('and', 'CONJ'), ...] 89 | 90 | >>> from nltk.corpus import treebank 91 | >>> treebank.tagged_words() 92 | [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...] 93 | >>> treebank.tagged_words(tagset='universal') 94 | [('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...] 95 | >>> treebank.tagged_words(tagset='brown') 96 | [('Pierre', 'UNK'), ('Vinken', 'UNK'), (',', 'UNK'), ...] 
97 | 98 | 99 | ================================ 100 | Creating a Chunked Phrase Corpus 101 | ================================ 102 | 103 | >>> from nltk.corpus.reader import ChunkedCorpusReader 104 | >>> reader = ChunkedCorpusReader('.', r'.*\.chunk') 105 | >>> reader.chunked_words() 106 | [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ...] 107 | >>> reader.chunked_sents() 108 | [Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])] 109 | >>> reader.chunked_paras() 110 | [[Tree('S', [Tree('NP', [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')]), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), Tree('NP', [('300', 'CD'), ('jobs', 'NNS')]), (',', ','), Tree('NP', [('the', 'DT'), ('spokesman', 'NN')]), ('said', 'VBD'), ('.', '.')])]] 111 | 112 | >>> from nltk.corpus.reader import ConllChunkCorpusReader 113 | >>> conllreader = ConllChunkCorpusReader('.', r'.*\.iob', ('NP', 'VP', 'PP')) 114 | >>> conllreader.chunked_words() 115 | [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), ...] 116 | >>> conllreader.chunked_sents() 117 | [Tree('S', [Tree('NP', [('Mr.', 'NNP'), ('Meador', 'NNP')]), Tree('VP', [('had', 'VBD'), ('been', 'VBN')]), Tree('NP', [('executive', 'JJ'), ('vice', 'NN'), ('president', 'NN')]), Tree('PP', [('of', 'IN')]), Tree('NP', [('Balcor', 'NNP')]), ('.', '.')])] 118 | >>> conllreader.iob_words() 119 | [('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ...] 120 | >>> conllreader.iob_sents() 121 | [[('Mr.', 'NNP', 'B-NP'), ('Meador', 'NNP', 'I-NP'), ('had', 'VBD', 'B-VP'), ('been', 'VBN', 'I-VP'), ('executive', 'JJ', 'B-NP'), ('vice', 'NN', 'I-NP'), ('president', 'NN', 'I-NP'), ('of', 'IN', 'B-PP'), ('Balcor', 'NNP', 'B-NP'), ('.', '.', 'O')]] 122 | 123 | >>> reader.chunked_words()[0].leaves() 124 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS')] 125 | >>> reader.chunked_sents()[0].leaves() 126 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')] 127 | >>> reader.chunked_paras()[0][0].leaves() 128 | [('Earlier', 'JJR'), ('staff-reduction', 'NN'), ('moves', 'NNS'), ('have', 'VBP'), ('trimmed', 'VBN'), ('about', 'IN'), ('300', 'CD'), ('jobs', 'NNS'), (',', ','), ('the', 'DT'), ('spokesman', 'NN'), ('said', 'VBD'), ('.', '.')] 129 | 130 | 131 | ================================== 132 | Creating a Categorized Text Corpus 133 | ================================== 134 | 135 | >>> from nltk.corpus import brown 136 | >>> brown.categories() 137 | ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] 138 | 139 | >>> from nltk.corpus.reader import CategorizedPlaintextCorpusReader 140 | >>> reader = CategorizedPlaintextCorpusReader('.', r'movie_.*\.txt', cat_pattern=r'movie_(\w+)\.txt') 141 | >>> reader.categories() 142 | ['neg', 'pos'] 143 | >>> reader.fileids(categories=['neg']) 144 | ['movie_neg.txt'] 145 | >>> reader.fileids(categories=['pos']) 146 | ['movie_pos.txt'] 147 | 148 | >>> reader = CategorizedPlaintextCorpusReader('.', 
r'movie_.*\.txt', cat_map={'movie_pos.txt': ['pos'], 'movie_neg.txt': ['neg']}) 149 | >>> reader.categories() 150 | ['neg', 'pos'] 151 | 152 | 153 | =================================== 154 | Creating a Categorized Chunk Corpus 155 | =================================== 156 | 157 | >>> import nltk.data 158 | >>> from catchunked import CategorizedChunkedCorpusReader 159 | >>> path = nltk.data.find('corpora/treebank/tagged') 160 | >>> reader = CategorizedChunkedCorpusReader(path, r'wsj_.*\.pos', cat_pattern=r'wsj_(.*)\.pos') 161 | >>> len(reader.categories()) == len(reader.fileids()) 162 | True 163 | >>> len(reader.chunked_sents(categories=['0001'])) 164 | 16 165 | 166 | >>> import nltk.data 167 | >>> from catchunked import CategorizedConllChunkCorpusReader 168 | >>> path = nltk.data.find('corpora/conll2000') 169 | >>> reader = CategorizedConllChunkCorpusReader(path, r'.*\.txt', ('NP','VP','PP'), cat_pattern=r'(.*)\.txt') 170 | >>> reader.categories() 171 | ['test', 'train'] 172 | >>> reader.fileids() 173 | ['test.txt', 'train.txt'] 174 | >>> len(reader.chunked_sents(categories=['test'])) 175 | 2012 176 | 177 | 178 | =================== 179 | Lazy Corpus Loading 180 | =================== 181 | 182 | >>> from nltk.corpus.util import LazyCorpusLoader 183 | >>> from nltk.corpus.reader import WordListCorpusReader 184 | >>> reader = LazyCorpusLoader('cookbook', WordListCorpusReader, ['wordlist']) 185 | >>> isinstance(reader, LazyCorpusLoader) 186 | True 187 | >>> reader.fileids() 188 | ['wordlist'] 189 | >>> isinstance(reader, LazyCorpusLoader) 190 | False 191 | >>> isinstance(reader, WordListCorpusReader) 192 | True 193 | 194 | 195 | ============================= 196 | Creating a Custom Corpus View 197 | ============================= 198 | 199 | >>> from nltk.corpus.reader import PlaintextCorpusReader 200 | >>> plain = PlaintextCorpusReader('.', ['heading_text.txt']) 201 | >>> len(plain.paras()) 202 | 4 203 | >>> from corpus import IgnoreHeadingCorpusReader 204 | >>> reader = IgnoreHeadingCorpusReader('.', ['heading_text.txt']) 205 | >>> len(reader.paras()) 206 | 3 207 | """ 208 | 209 | if __name__ == '__main__': 210 | import doctest 211 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/conll.iob: -------------------------------------------------------------------------------- 1 | Mr. NNP B-NP 2 | Meador NNP I-NP 3 | had VBD B-VP 4 | been VBN I-VP 5 | executive JJ B-NP 6 | vice NN I-NP 7 | president NN I-NP 8 | of IN B-PP 9 | Balcor NNP B-NP 10 | . . 
O -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/corpus.py: -------------------------------------------------------------------------------- 1 | import lockfile, tempfile, shutil 2 | from nltk.corpus.reader import PlaintextCorpusReader 3 | from nltk.corpus.reader.util import StreamBackedCorpusView, read_blankline_block 4 | 5 | class IgnoreHeadingCorpusView(StreamBackedCorpusView): 6 | def __init__(self, *args, **kwargs): 7 | StreamBackedCorpusView.__init__(self, *args, **kwargs) 8 | # open self._stream 9 | self._open() 10 | # skip the heading block 11 | read_blankline_block(self._stream) 12 | # reset the start position to the current position in the stream 13 | self._filepos = [self._stream.tell()] 14 | 15 | class IgnoreHeadingCorpusReader(PlaintextCorpusReader): 16 | CorpusView = IgnoreHeadingCorpusView 17 | 18 | def append_line(fname, line): 19 | # lock for writing, released when fp is closed 20 | with lockfile.FileLock(fname): 21 | fp = open(fname, 'a+') 22 | fp.write(line) 23 | fp.write('\n') 24 | fp.close() 25 | 26 | def remove_line(fname, line): 27 | '''Remove line from file by creating a temporary file containing all lines 28 | from original file except those matching the given line, then copying the 29 | temporary file back into the original file, overwriting its contents. 30 | ''' 31 | with lockfile.FileLock(fname): 32 | tmp = tempfile.TemporaryFile('w+') 33 | fp = open(fname, 'r+') 34 | # write all lines from orig file, except if matches given line 35 | for l in fp: 36 | if l.strip() != line: 37 | tmp.write(l) 38 | 39 | # reset file pointers so entire files are copied 40 | fp.seek(0) 41 | tmp.seek(0) 42 | # copy tmp into fp, then truncate to remove trailing line(s) 43 | shutil.copyfileobj(tmp, fp) 44 | fp.truncate() 45 | fp.close() 46 | tmp.close() -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/heading_text.txt: -------------------------------------------------------------------------------- 1 | A simple heading 2 | 3 | Here is the actual text for the corpus. 4 | 5 | Paragraphs are split by blanklines. 6 | 7 | This is the 3rd paragraph.
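The module-level helpers in corpus.py above (append_line and remove_line) are not exercised by the chapter3.py doctests, so here is a small, hypothetical usage sketch that is not part of the original sources; the file name is made up, and the lockfile package must be installed, as corpus.py already assumes:

from corpus import append_line, remove_line

wordlist_file = 'mywordlist'           # hypothetical file; created on first append
append_line(wordlist_file, 'nltk')     # appends 'nltk' plus a newline under a file lock
append_line(wordlist_file, 'corpus')
remove_line(wordlist_file, 'nltk')     # rewrites the file without the matching line

with open(wordlist_file) as f:
    print(f.read())                    # expected to show only 'corpus'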
-------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/mongoreader.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from nltk.data import LazyLoader 3 | from nltk.tokenize import TreebankWordTokenizer 4 | from nltk.util import AbstractLazySequence, LazyMap, LazyConcatenation 5 | 6 | class MongoDBLazySequence(AbstractLazySequence): 7 | def __init__(self, host='localhost', port=27017, db='test', collection='corpus', field='text'): 8 | self.conn = pymongo.MongoClient(host, port) 9 | self.collection = self.conn[db][collection] 10 | self.field = field 11 | 12 | def __len__(self): 13 | return self.collection.count() 14 | 15 | def iterate_from(self, start): 16 | f = lambda d: d.get(self.field, '') 17 | return iter(LazyMap(f, self.collection.find(fields=[self.field], skip=start))) 18 | 19 | class MongoDBCorpusReader(object): 20 | def __init__(self, word_tokenizer=TreebankWordTokenizer(), 21 | sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/english.pickle'), 22 | **kwargs): 23 | self._seq = MongoDBLazySequence(**kwargs) 24 | self._word_tokenize = word_tokenizer.tokenize 25 | self._sent_tokenize = sent_tokenizer.tokenize 26 | 27 | def text(self): 28 | return self._seq 29 | 30 | def words(self): 31 | return LazyConcatenation(LazyMap(self._word_tokenize, self.text())) 32 | 33 | def sents(self): 34 | return LazyConcatenation(LazyMap(self._sent_tokenize, self.text())) -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/mywords.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/synonyms.csv: -------------------------------------------------------------------------------- 1 | bday,birthday 2 | -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/synonyms.yaml: -------------------------------------------------------------------------------- 1 | bday: birthday 2 | -------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/treebank.chunk: -------------------------------------------------------------------------------- 1 | [Earlier/JJR staff-reduction/NN moves/NNS] have/VBP trimmed/VBN about/IN [300/CD jobs/NNS] ,/, [the/DT spokesman/NN] said/VBD ./. 
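MongoDBCorpusReader in mongoreader.py above only makes sense against a live database, so the following is a purely illustrative sketch rather than part of the original sources. It assumes MongoDB is running on localhost with the reader's default 'test' database and 'corpus' collection, whose documents carry a 'text' field, and it inherits mongoreader.py's pymongo 2.x-style API.

from mongoreader import MongoDBCorpusReader

reader = MongoDBCorpusReader(db='test', collection='corpus', field='text')
print(len(reader.text()))     # number of documents in the collection
print(reader.words()[:10])    # words, lazily tokenized from each document's 'text' field
print(reader.sents()[:2])     # sentences, lazily split with the punkt tokenizer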
-------------------------------------------------------------------------------- /Module 2/Chapter 3/7853OS_03_codes/wordlist: -------------------------------------------------------------------------------- 1 | nltk 2 | corpus 3 | corpora 4 | wordnet 5 | -------------------------------------------------------------------------------- /Module 2/Chapter 4/7853OS_04_Codes/chapter4.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============== 3 | Default Tagging 4 | =============== 5 | 6 | >>> from nltk.tag import DefaultTagger 7 | >>> tagger = DefaultTagger('NN') 8 | >>> tagger.tag(['Hello', 'World']) 9 | [('Hello', 'NN'), ('World', 'NN')] 10 | 11 | >>> from nltk.corpus import treebank 12 | >>> test_sents = treebank.tagged_sents()[3000:] 13 | >>> tagger.evaluate(test_sents) 14 | 0.14331966328512843 15 | 16 | >>> tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you', '?']]) 17 | [[('Hello', 'NN'), ('world', 'NN'), ('.', 'NN')], [('How', 'NN'), ('are', 'NN'), ('you', 'NN'), ('?', 'NN')]] 18 | 19 | >>> from nltk.tag import untag 20 | >>> untag([('Hello', 'NN'), ('World', 'NN')]) 21 | ['Hello', 'World'] 22 | 23 | 24 | ======================================== 25 | Training a Unigram Part-of-Speech Tagger 26 | ======================================== 27 | 28 | >>> from nltk.tag import UnigramTagger 29 | >>> from nltk.corpus import treebank 30 | >>> train_sents = treebank.tagged_sents()[:3000] 31 | >>> tagger = UnigramTagger(train_sents) 32 | >>> treebank.sents()[0] 33 | ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 34 | >>> tagger.tag(treebank.sents()[0]) 35 | [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')] 36 | 37 | >>> tagger.evaluate(test_sents) 38 | 0.8588819339520829 39 | 40 | >>> tagger = UnigramTagger(model={'Pierre': 'NN'}) 41 | >>> tagger.tag(treebank.sents()[0]) 42 | [('Pierre', 'NN'), ('Vinken', None), (',', None), ('61', None), ('years', None), ('old', None), (',', None), ('will', None), ('join', None), ('the', None), ('board', None), ('as', None), ('a', None), ('nonexecutive', None), ('director', None), ('Nov.', None), ('29', None), ('.', None)] 43 | 44 | >>> tagger = UnigramTagger(train_sents, cutoff=3) 45 | >>> tagger.evaluate(test_sents) 46 | 0.7757392618173969 47 | 48 | 49 | ====================================== 50 | Combining Taggers with Backoff Tagging 51 | ====================================== 52 | 53 | >>> tagger1 = DefaultTagger('NN') 54 | >>> tagger2 = UnigramTagger(train_sents, backoff=tagger1) 55 | >>> tagger2.evaluate(test_sents) 56 | 0.8758471832505935 57 | 58 | >>> tagger1._taggers == [tagger1] 59 | True 60 | >>> tagger2._taggers == [tagger2, tagger1] 61 | True 62 | 63 | >>> import pickle 64 | >>> f = open('tagger.pickle', 'wb') 65 | >>> pickle.dump(tagger, f) 66 | >>> f.close() 67 | >>> f = open('tagger.pickle', 'rb') 68 | >>> tagger = pickle.load(f) 69 | 70 | 71 | ==================================== 72 | Training and Combining Ngram Taggers 73 | ==================================== 74 | 75 | >>> from nltk.tag import BigramTagger, TrigramTagger 76 | >>> bitagger = BigramTagger(train_sents) 77 | >>> bitagger.evaluate(test_sents) 78 | 0.11310166199007123 79 | >>> tritagger = 
TrigramTagger(train_sents) 80 | >>> tritagger.evaluate(test_sents) 81 | 0.0688107058061731 82 | 83 | >>> from tag_util import backoff_tagger 84 | >>> backoff = DefaultTagger('NN') 85 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff) 86 | >>> tagger.evaluate(test_sents) 87 | 0.8806820634578028 88 | 89 | >>> tagger._taggers[-1] == backoff 90 | True 91 | >>> isinstance(tagger._taggers[0], TrigramTagger) 92 | True 93 | >>> isinstance(tagger._taggers[1], BigramTagger) 94 | True 95 | 96 | >>> from nltk.tag import NgramTagger 97 | >>> quadtagger = NgramTagger(4, train_sents) 98 | >>> quadtagger.evaluate(test_sents) 99 | 0.058234405352903085 100 | 101 | >>> from taggers import QuadgramTagger 102 | >>> quadtagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger, QuadgramTagger], backoff=backoff) 103 | >>> quadtagger.evaluate(test_sents) 104 | 0.8806388948845241 105 | 106 | 107 | ==================================== 108 | Creating a Model of Likely Word Tags 109 | ==================================== 110 | 111 | >>> from tag_util import word_tag_model 112 | >>> from nltk.corpus import treebank 113 | >>> model = word_tag_model(treebank.words(), treebank.tagged_words()) 114 | >>> tagger = UnigramTagger(model=model) 115 | >>> tagger.evaluate(test_sents) 116 | 0.559680552557738 117 | 118 | >>> default_tagger = DefaultTagger('NN') 119 | >>> likely_tagger = UnigramTagger(model=model, backoff=default_tagger) 120 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=likely_tagger) 121 | >>> tagger.evaluate(test_sents) 122 | 0.8806820634578028 123 | 124 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger) 125 | >>> likely_tagger = UnigramTagger(model=model, backoff=tagger) 126 | >>> likely_tagger.evaluate(test_sents) 127 | 0.8824088063889488 128 | 129 | 130 | ================================ 131 | Tagging with Regular Expressions 132 | ================================ 133 | 134 | >>> from tag_util import patterns 135 | >>> from nltk.tag import RegexpTagger 136 | >>> tagger = RegexpTagger(patterns) 137 | >>> tagger.evaluate(test_sents) 138 | 0.037470321605870924 139 | 140 | 141 | ============= 142 | Affix Tagging 143 | ============= 144 | 145 | >>> from nltk.tag import AffixTagger 146 | >>> tagger = AffixTagger(train_sents) 147 | >>> tagger.evaluate(test_sents) 148 | 0.27558817181092166 149 | 150 | >>> prefix_tagger = AffixTagger(train_sents, affix_length=3) 151 | >>> prefix_tagger.evaluate(test_sents) 152 | 0.23587308439456076 153 | 154 | >>> suffix_tagger = AffixTagger(train_sents, affix_length=-2) 155 | >>> suffix_tagger.evaluate(test_sents) 156 | 0.31940427368875457 157 | 158 | >>> pre3_tagger = AffixTagger(train_sents, affix_length=3) 159 | >>> pre3_tagger.evaluate(test_sents) 160 | 0.23587308439456076 161 | >>> pre2_tagger = AffixTagger(train_sents, affix_length=2, backoff=pre3_tagger) 162 | >>> pre2_tagger.evaluate(test_sents) 163 | 0.29786315562270665 164 | >>> suf2_tagger = AffixTagger(train_sents, affix_length=-2, backoff=pre2_tagger) 165 | >>> suf2_tagger.evaluate(test_sents) 166 | 0.32467083962875026 167 | >>> suf3_tagger = AffixTagger(train_sents, affix_length=-3, backoff=suf2_tagger) 168 | >>> suf3_tagger.evaluate(test_sents) 169 | 0.3590761925318368 170 | 171 | 172 | ======================= 173 | Training a Brill Tagger 174 | ======================= 175 | 176 | >>> default_tagger = DefaultTagger('NN') 177 | >>> 
initial_tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=default_tagger) 178 | >>> initial_tagger.evaluate(test_sents) 179 | 0.8806820634578028 180 | >>> from tag_util import train_brill_tagger 181 | >>> brill_tagger = train_brill_tagger(initial_tagger, train_sents) 182 | >>> brill_tagger.evaluate(test_sents) 183 | 0.8827541549751781 184 | 185 | 186 | ===================== 187 | Training a TnT Tagger 188 | ===================== 189 | 190 | >>> from nltk.tag import tnt 191 | >>> tnt_tagger = tnt.TnT() 192 | >>> tnt_tagger.train(train_sents) 193 | >>> tnt_tagger.evaluate(test_sents) 194 | 0.8756313403842003 195 | 196 | >>> from nltk.tag import DefaultTagger 197 | >>> unk = DefaultTagger('NN') 198 | >>> tnt_tagger = tnt.TnT(unk=unk, Trained=True) 199 | >>> tnt_tagger.train(train_sents) 200 | >>> tnt_tagger.evaluate(test_sents) 201 | 0.892467083962875 202 | 203 | >>> tnt_tagger = tnt.TnT(N=100) 204 | >>> tnt_tagger.train(train_sents) 205 | >>> tnt_tagger.evaluate(test_sents) 206 | 0.8756313403842003 207 | 208 | 209 | ========================= 210 | Using WordNet for Tagging 211 | ========================= 212 | 213 | >>> from taggers import WordNetTagger 214 | >>> wn_tagger = WordNetTagger() 215 | >>> wn_tagger.evaluate(train_sents) 216 | 0.17914876598160262 217 | 218 | >>> from tag_util import backoff_tagger 219 | >>> from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger 220 | >>> tagger = backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=wn_tagger) 221 | >>> tagger.evaluate(test_sents) 222 | 0.8848262464925534 223 | 224 | 225 | ======================== 226 | Classifier Based Tagging 227 | ======================== 228 | 229 | >>> from nltk.tag.sequential import ClassifierBasedPOSTagger 230 | >>> tagger = ClassifierBasedPOSTagger(train=train_sents) 231 | >>> tagger.evaluate(test_sents) 232 | 0.9309734513274336 233 | 234 | >>> from nltk.classify import MaxentClassifier 235 | >>> me_tagger = ClassifierBasedPOSTagger(train=train_sents, classifier_builder=MaxentClassifier.train) 236 | ==> Training (100 iterations) 237 | 238 | Iteration Log Likelihood Accuracy 239 | --------------------------------------- 240 | 1 -3.82864 0.008 241 | 2 -0.76859 0.957 242 | Final nan 0.984 243 | >>> me_tagger.evaluate(test_sents) 244 | 0.9258363911072739 245 | 246 | >>> from nltk.tag.sequential import ClassifierBasedTagger 247 | >>> from tag_util import unigram_feature_detector 248 | >>> tagger = ClassifierBasedTagger(train=train_sents, feature_detector=unigram_feature_detector) 249 | >>> tagger.evaluate(test_sents) 250 | 0.8733865745737104 251 | 252 | >>> default = DefaultTagger('NN') 253 | >>> tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3) 254 | >>> tagger.evaluate(test_sents) 255 | 0.9311029570472696 256 | """ 257 | 258 | if __name__ == '__main__': 259 | import doctest 260 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 4/7853OS_04_Codes/tag_util.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from nltk.tbl import Template 3 | from nltk.tag import brill, brill_trainer 4 | from nltk.probability import FreqDist, ConditionalFreqDist 5 | 6 | def backoff_tagger(train_sents, tagger_classes, backoff=None): 7 | for cls in tagger_classes: 8 | backoff = cls(train_sents, backoff=backoff) 9 | 10 | return backoff 11 | 12 | def word_tag_model(words, tagged_words, limit=200): 
13 | fd = FreqDist(words) 14 | cfd = ConditionalFreqDist(tagged_words) 15 | most_freq = (word for word, count in fd.most_common(limit)) 16 | return dict((word, cfd[word].max()) for word in most_freq) 17 | 18 | patterns = [ 19 | (r'^\d+$', 'CD'), 20 | (r'.*ing$', 'VBG'), # gerunds, i.e. wondering 21 | (r'.*ment$', 'NN'), # i.e. wonderment 22 | (r'.*ful$', 'JJ') # i.e. wonderful 23 | ] 24 | 25 | def train_brill_tagger(initial_tagger, train_sents, **kwargs): 26 | templates = [ 27 | brill.Template(brill.Pos([-1])), 28 | brill.Template(brill.Pos([1])), 29 | brill.Template(brill.Pos([-2])), 30 | brill.Template(brill.Pos([2])), 31 | brill.Template(brill.Pos([-2, -1])), 32 | brill.Template(brill.Pos([1, 2])), 33 | brill.Template(brill.Pos([-3, -2, -1])), 34 | brill.Template(brill.Pos([1, 2, 3])), 35 | brill.Template(brill.Pos([-1]), brill.Pos([1])), 36 | brill.Template(brill.Word([-1])), 37 | brill.Template(brill.Word([1])), 38 | brill.Template(brill.Word([-2])), 39 | brill.Template(brill.Word([2])), 40 | brill.Template(brill.Word([-2, -1])), 41 | brill.Template(brill.Word([1, 2])), 42 | brill.Template(brill.Word([-3, -2, -1])), 43 | brill.Template(brill.Word([1, 2, 3])), 44 | brill.Template(brill.Word([-1]), brill.Word([1])), 45 | ] 46 | 47 | trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates, deterministic=True) 48 | return trainer.train(train_sents, **kwargs) 49 | 50 | def unigram_feature_detector(tokens, index, history): 51 | return {'word': tokens[index]} -------------------------------------------------------------------------------- /Module 2/Chapter 4/7853OS_04_Codes/taggers.py: -------------------------------------------------------------------------------- 1 | from nltk.tag import NgramTagger, SequentialBackoffTagger 2 | from nltk.corpus import wordnet, names 3 | from nltk.probability import FreqDist 4 | 5 | class QuadgramTagger(NgramTagger): 6 | def __init__(self, *args, **kwargs): 7 | NgramTagger.__init__(self, 4, *args, **kwargs) 8 | 9 | class WordNetTagger(SequentialBackoffTagger): 10 | ''' 11 | >>> wt = WordNetTagger() 12 | >>> wt.tag(['food', 'is', 'great']) 13 | [('food', 'NN'), ('is', 'VB'), ('great', 'JJ')] 14 | ''' 15 | def __init__(self, *args, **kwargs): 16 | SequentialBackoffTagger.__init__(self, *args, **kwargs) 17 | 18 | self.wordnet_tag_map = { 19 | 'n': 'NN', 20 | 's': 'JJ', 21 | 'a': 'JJ', 22 | 'r': 'RB', 23 | 'v': 'VB' 24 | } 25 | 26 | def choose_tag(self, tokens, index, history): 27 | word = tokens[index] 28 | fd = FreqDist() 29 | 30 | for synset in wordnet.synsets(word): 31 | fd[synset.pos()] += 1 32 | 33 | if not fd: return None 34 | return self.wordnet_tag_map.get(fd.max()) 35 | 36 | class NamesTagger(SequentialBackoffTagger): 37 | ''' 38 | >>> nt = NamesTagger() 39 | >>> nt.tag(['Jacob']) 40 | [('Jacob', 'NNP')] 41 | ''' 42 | def __init__(self, *args, **kwargs): 43 | SequentialBackoffTagger.__init__(self, *args, **kwargs) 44 | self.name_set = set([n.lower() for n in names.words()]) 45 | 46 | def choose_tag(self, tokens, index, history): 47 | word = tokens[index] 48 | 49 | if word.lower() in self.name_set: 50 | return 'NNP' 51 | else: 52 | return None 53 | 54 | if __name__ == '__main__': 55 | import doctest 56 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 5/7853OS_05_Codes/chunkers.py: -------------------------------------------------------------------------------- 1 | import nltk.tag 2 | from nltk.chunk import ChunkParserI 3 | from nltk.chunk.util import tree2conlltags, 
conlltags2tree 4 | from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger 5 | from nltk.corpus import names, ieer, gazetteers 6 | from tag_util import backoff_tagger 7 | 8 | def conll_tag_chunks(chunk_sents): 9 | '''Convert each chunked sentence to list of (tag, chunk_tag) tuples, 10 | so the final result is a list of lists of (tag, chunk_tag) tuples. 11 | >>> from nltk.tree import Tree 12 | >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]) 13 | >>> conll_tag_chunks([t]) 14 | [[('DT', 'B-NP'), ('NN', 'I-NP')]] 15 | ''' 16 | tagged_sents = [tree2conlltags(tree) for tree in chunk_sents] 17 | return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents] 18 | 19 | class TagChunker(ChunkParserI): 20 | '''Chunks tagged tokens using Ngram Tagging.''' 21 | def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]): 22 | '''Train Ngram taggers on chunked sentences''' 23 | train_sents = conll_tag_chunks(train_chunks) 24 | self.tagger = backoff_tagger(train_sents, tagger_classes) 25 | 26 | def parse(self, tagged_sent): 27 | '''Parsed tagged tokens into parse Tree of chunks''' 28 | if not tagged_sent: return None 29 | (words, tags) = zip(*tagged_sent) 30 | chunks = self.tagger.tag(tags) 31 | # create conll str for tree parsing 32 | wtc = zip(words, chunks) 33 | return conlltags2tree([(w,t,c) for (w,(t,c)) in wtc]) 34 | 35 | def chunk_trees2train_chunks(chunk_sents): 36 | tag_sents = [tree2conlltags(sent) for sent in chunk_sents] 37 | return [[((w,t),c) for (w,t,c) in sent] for sent in tag_sents] 38 | 39 | def prev_next_pos_iob(tokens, index, history): 40 | word, pos = tokens[index] 41 | 42 | if index == 0: 43 | prevword, prevpos, previob = ('',)*3 44 | else: 45 | prevword, prevpos = tokens[index-1] 46 | previob = history[index-1] 47 | 48 | if index == len(tokens) - 1: 49 | nextword, nextpos = ('',)*2 50 | else: 51 | nextword, nextpos = tokens[index+1] 52 | 53 | feats = { 54 | 'word': word, 55 | 'pos': pos, 56 | 'nextword': nextword, 57 | 'nextpos': nextpos, 58 | 'prevword': prevword, 59 | 'prevpos': prevpos, 60 | 'previob': previob 61 | } 62 | 63 | return feats 64 | 65 | class ClassifierChunker(ChunkParserI): 66 | def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs): 67 | if not feature_detector: 68 | feature_detector = self.feature_detector 69 | 70 | train_chunks = chunk_trees2train_chunks(train_sents) 71 | self.tagger = ClassifierBasedTagger(train=train_chunks, 72 | feature_detector=feature_detector, **kwargs) 73 | 74 | def parse(self, tagged_sent): 75 | if not tagged_sent: return None 76 | chunks = self.tagger.tag(tagged_sent) 77 | return conlltags2tree([(w,t,c) for ((w,t),c) in chunks]) 78 | 79 | def sub_leaves(tree, label): 80 | return [t.leaves() for t in tree.subtrees(lambda s: s.label() == label)] 81 | 82 | class PersonChunker(ChunkParserI): 83 | ''' 84 | >>> from nltk.corpus import treebank_chunk 85 | >>> chunker = PersonChunker() 86 | >>> sub_leaves(chunker.parse(treebank_chunk.tagged_sents()[0]), 'PERSON') 87 | [[('Pierre', 'NNP')]] 88 | ''' 89 | def __init__(self): 90 | self.name_set = set(names.words()) 91 | 92 | def parse(self, tagged_sent): 93 | iobs = [] 94 | in_person = False 95 | 96 | for word, tag in tagged_sent: 97 | if word in self.name_set and in_person: 98 | iobs.append((word, tag, 'I-PERSON')) 99 | elif word in self.name_set: 100 | iobs.append((word, tag, 'B-PERSON')) 101 | in_person = True 102 | else: 103 | iobs.append((word, tag, 'O')) 104 | in_person = False 105 | 106 | return 
conlltags2tree(iobs) 107 | 108 | class LocationChunker(ChunkParserI): 109 | '''Chunks locations based on the gazetteers corpus. 110 | >>> loc = LocationChunker() 111 | >>> t = loc.parse([('San', 'NNP'), ('Francisco', 'NNP'), ('CA', 'NNP'), ('is', 'BE'), ('cold', 'JJ'), ('compared', 'VBD'), ('to', 'TO'), ('San', 'NNP'), ('Jose', 'NNP'), ('CA', 'NNP')]) 112 | >>> sub_leaves(t, 'LOCATION') 113 | [[('San', 'NNP'), ('Francisco', 'NNP'), ('CA', 'NNP')], [('San', 'NNP'), ('Jose', 'NNP'), ('CA', 'NNP')]] 114 | ''' 115 | def __init__(self): 116 | # gazetteers is a WordListCorpusReader of many different location words 117 | self.locations = set(gazetteers.words()) 118 | self.lookahead = 0 119 | # need to know how many words to lookahead in the tagged sentence to find a location 120 | for loc in self.locations: 121 | nwords = loc.count(' ') 122 | 123 | if nwords > self.lookahead: 124 | self.lookahead = nwords 125 | 126 | def iob_locations(self, tagged_sent): 127 | i = 0 128 | l = len(tagged_sent) 129 | inside = False 130 | 131 | while i < l: 132 | word, tag = tagged_sent[i] 133 | j = i + 1 134 | k = j + self.lookahead 135 | nextwords, nexttags = [], [] 136 | loc = False 137 | # lookahead in the sentence to find multi-word locations 138 | while j < k: 139 | if ' '.join([word] + nextwords) in self.locations: 140 | # combine multiple separate locations into single location chunk 141 | if inside: 142 | yield word, tag, 'I-LOCATION' 143 | else: 144 | yield word, tag, 'B-LOCATION' 145 | # every next word is inside the location chunk 146 | for nword, ntag in zip(nextwords, nexttags): 147 | yield nword, ntag, 'I-LOCATION' 148 | # found a location, so we're inside a chunk 149 | loc, inside = True, True 150 | # move forward to the next word since the current words 151 | # are already chunked 152 | i = j 153 | break 154 | 155 | if j < l: 156 | nextword, nexttag = tagged_sent[j] 157 | nextwords.append(nextword) 158 | nexttags.append(nexttag) 159 | j += 1 160 | else: 161 | break 162 | # if no location found, then we're outside the location chunk 163 | if not loc: 164 | inside = False 165 | i += 1 166 | yield word, tag, 'O' 167 | 168 | def parse(self, tagged_sent): 169 | iobs = self.iob_locations(tagged_sent) 170 | return conlltags2tree(iobs) 171 | 172 | def ieertree2conlltags(tree, tag=nltk.tag.pos_tag): 173 | # tree.pos() flattens the tree and produces [(word, node)] where node is 174 | # from the word's parent tree node. 
words in a chunk therefore get the 175 | # chunk tag, while words outside a chunk get the same tag as the tree's 176 | # top node 177 | words, ents = zip(*tree.pos()) 178 | iobs = [] 179 | prev = None 180 | # construct iob tags from entity names 181 | for ent in ents: 182 | # any entity that is the same as the tree's top node is outside a chunk 183 | if ent == tree.label(): 184 | iobs.append('O') 185 | prev = None 186 | # have a previous entity that is equal so this is inside the chunk 187 | elif prev == ent: 188 | iobs.append('I-%s' % ent) 189 | # no previous equal entity in the sequence, so this is the beginning of 190 | # an entity chunk 191 | else: 192 | iobs.append('B-%s' % ent) 193 | prev = ent 194 | # get tags for each word, then construct 3-tuple for conll tags 195 | words, tags = zip(*tag(words)) 196 | return zip(words, tags, iobs) 197 | 198 | def ieer_chunked_sents(tag=nltk.tag.pos_tag): 199 | for doc in ieer.parsed_docs(): 200 | tagged = ieertree2conlltags(doc.text, tag) 201 | yield conlltags2tree(tagged) 202 | 203 | if __name__ == '__main__': 204 | import doctest 205 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 6/7853OS_06_codes/chapter6.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ============================= 3 | Filtering Insignificant Words 4 | ============================= 5 | 6 | >>> from transforms import filter_insignificant 7 | >>> filter_insignificant([('your', 'PRP$'), ('book', 'NN'), ('is', 'VBZ'), ('great', 'JJ')], tag_suffixes=['PRP', 'PRP$']) 8 | [('book', 'NN'), ('is', 'VBZ'), ('great', 'JJ')] 9 | 10 | 11 | ===================== 12 | Swapping Verb Phrases 13 | ===================== 14 | 15 | >>> from transforms import swap_verb_phrase 16 | >>> swap_verb_phrase(filter_insignificant([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')])) 17 | [('fantastic', 'JJ'), ('gripping', 'VBG'), ('book', 'NN')] 18 | >>> filter_insignificant(swap_verb_phrase([('this', 'DT'), ('gripping', 'VBG'), ('book', 'NN'), ('is', 'VBZ'), ('fantastic', 'JJ')])) 19 | [('fantastic', 'JJ'), ('gripping', 'VBG'), ('book', 'NN')] 20 | 21 | 22 | ============================== 23 | Chaining Chunk Transformations 24 | ============================== 25 | 26 | >>> from transforms import transform_chunk 27 | >>> transform_chunk([('the', 'DT'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')], trace=1) 28 | filter_insignificant : [('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS'), ('is', 'VBZ'), ('delicious', 'JJ')] 29 | swap_verb_phrase : [('delicious', 'JJ'), ('book', 'NN'), ('of', 'IN'), ('recipes', 'NNS')] 30 | swap_infinitive_phrase : [('delicious', 'JJ'), ('recipes', 'NNS'), ('book', 'NN')] 31 | singularize_plural_noun : [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 32 | [('delicious', 'JJ'), ('recipe', 'NN'), ('book', 'NN')] 33 | 34 | 35 | =============================== 36 | Converting a Chunk Tree to Text 37 | =============================== 38 | 39 | >>> from nltk.corpus import treebank_chunk 40 | >>> tree = treebank_chunk.chunked_sents()[0] 41 | >>> ' '.join([w for w, t in tree.leaves()]) 42 | 'Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .' 43 | 44 | >>> from transforms import chunk_tree_to_sent 45 | >>> chunk_tree_to_sent(tree) 46 | 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.' 
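
A minimal sketch of the idea behind chunk_tree_to_sent (illustrative only; the
actual implementation lives in transforms.py and may differ): join the leaf
words of the chunk tree, then strip the space that join() leaves in front of
punctuation.

    import re

    def tree_to_sentence(tree):
        # flatten the tree to its (word, tag) leaves and join the words
        text = ' '.join(word for word, tag in tree.leaves())
        # remove the space before punctuation: 'Nov. 29 .' -> 'Nov. 29.'
        return re.sub(r'\s+([,.;:?!])', r'\1', text)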
47 | 48 | 49 | ===================== 50 | Flattening Deep Trees 51 | ===================== 52 | 53 | >>> from nltk.corpus import treebank 54 | >>> from transforms import flatten_deeptree 55 | >>> flatten_deeptree(treebank.parsed_sents()[0]) 56 | Tree('S', [Tree('NP', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), Tree('NP', [('61', 'CD'), ('years', 'NNS')]), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), Tree('NP', [('the', 'DT'), ('board', 'NN')]), ('as', 'IN'), Tree('NP', [('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN')]), Tree('NP-TMP', [('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 57 | 58 | >>> from nltk.tree import Tree 59 | >>> Tree('NNP', ['Pierre']).height() 60 | 2 61 | 62 | >>> Tree('NNP', ['Pierre']).pos() 63 | [('Pierre', 'NNP')] 64 | 65 | >>> Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).height() 66 | 3 67 | 68 | >>> Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]).pos() 69 | [('Pierre', 'NNP'), ('Vinken', 'NNP')] 70 | 71 | >>> from nltk.chunk.util import tree2conlltags 72 | >>> tree2conlltags(treebank.parsed_sents()[0]) 73 | Traceback (most recent call last): 74 | File "", line 1, in 75 | File "/usr/local/lib/python2.6/dist-packages/nltk/chunk/util.py", line 417, in tree2conlltags 76 | raise ValueError, "Tree is too deeply nested to be printed in CoNLL format" 77 | ValueError: Tree is too deeply nested to be printed in CoNLL format 78 | 79 | >>> tree2conlltags(flatten_deeptree(treebank.parsed_sents()[0])) 80 | [('Pierre', 'NNP', 'B-NP'), ('Vinken', 'NNP', 'I-NP'), (',', ',', 'O'), ('61', 'CD', 'B-NP'), ('years', 'NNS', 'I-NP'), ('old', 'JJ', 'O'), (',', ',', 'O'), ('will', 'MD', 'O'), ('join', 'VB', 'O'), ('the', 'DT', 'B-NP'), ('board', 'NN', 'I-NP'), ('as', 'IN', 'O'), ('a', 'DT', 'B-NP'), ('nonexecutive', 'JJ', 'I-NP'), ('director', 'NN', 'I-NP'), ('Nov.', 'NNP', 'B-NP-TMP'), ('29', 'CD', 'I-NP-TMP'), ('.', '.', 'O')] 81 | 82 | >>> from nltk.corpus import cess_esp 83 | >>> cess_esp.parsed_sents()[0].height() 84 | 22 85 | >>> flatten_deeptree(cess_esp.parsed_sents()[0]).height() 86 | 3 87 | 88 | 89 | ======================= 90 | Creating a Shallow Tree 91 | ======================= 92 | 93 | >>> from transforms import shallow_tree 94 | >>> shallow_tree(treebank.parsed_sents()[0]) 95 | Tree('S', [Tree('NP-SBJ', [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ',')]), Tree('VP', [('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD')]), ('.', '.')]) 96 | 97 | >>> treebank.parsed_sents()[0].height() 98 | 7 99 | >>> shallow_tree(treebank.parsed_sents()[0]).height() 100 | 3 101 | 102 | 103 | ====================== 104 | Converting Tree Labels 105 | ====================== 106 | 107 | >>> from transforms import convert_tree_labels 108 | >>> mapping = {'NP-SBJ': 'NP', 'NP-TMP': 'NP'} 109 | >>> convert_tree_labels(treebank.parsed_sents()[0], mapping) 110 | Tree('S', [Tree('NP', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP', [Tree('NNP', ['Nov.']), Tree('CD', 
['29'])])])]), Tree('.', ['.'])]) 111 | ''' 112 | # TODO: also do a task on converting tree nodes so NP-TMP -> NP 113 | 114 | if __name__ == '__main__': 115 | import doctest 116 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 7/7853OS_07_Codes/chapter7.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ================================= 3 | Training a Naive Bayes Classifier 4 | ================================= 5 | 6 | >>> from nltk.corpus import movie_reviews 7 | >>> from featx import label_feats_from_corpus, split_label_feats 8 | >>> movie_reviews.categories() 9 | ['neg', 'pos'] 10 | >>> lfeats = label_feats_from_corpus(movie_reviews) 11 | >>> lfeats.keys() 12 | dict_keys(['neg', 'pos']) 13 | >>> train_feats, test_feats = split_label_feats(lfeats) 14 | >>> len(train_feats) 15 | 1500 16 | >>> len(test_feats) 17 | 500 18 | 19 | >>> from nltk.classify import NaiveBayesClassifier 20 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats) 21 | >>> nb_classifier.labels() 22 | ['neg', 'pos'] 23 | 24 | >>> from featx import bag_of_words 25 | >>> negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous']) 26 | >>> nb_classifier.classify(negfeat) 27 | 'neg' 28 | >>> posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible']) 29 | >>> nb_classifier.classify(posfeat) 30 | 'pos' 31 | 32 | >>> from nltk.classify.util import accuracy 33 | >>> accuracy(nb_classifier, test_feats) 34 | 0.728 35 | 36 | >>> probs = nb_classifier.prob_classify(test_feats[0][0]) 37 | >>> probs.samples() 38 | dict_keys(['neg', 'pos']) 39 | >>> probs.max() 40 | 'pos' 41 | >>> probs.prob('pos') 42 | 0.9999999646430913 43 | >>> probs.prob('neg') 44 | 3.535688969240647e-08 45 | 46 | >>> nb_classifier.most_informative_features(n=5) 47 | [('magnificent', True), ('outstanding', True), ('insulting', True), ('vulnerable', True), ('ludicrous', True)] 48 | 49 | >>> from nltk.probability import LaplaceProbDist 50 | >>> nb_classifier = NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist) 51 | >>> accuracy(nb_classifier, test_feats) 52 | 0.716 53 | 54 | >>> from nltk.probability import DictionaryProbDist 55 | >>> label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5}) 56 | >>> true_probdist = DictionaryProbDist({True: 1}) 57 | >>> feature_probdist = {('pos', 'yes'): true_probdist, ('neg', 'no'): true_probdist} 58 | >>> classifier = NaiveBayesClassifier(label_probdist, feature_probdist) 59 | >>> classifier.classify({'yes': True}) 60 | 'pos' 61 | >>> classifier.classify({'no': True}) 62 | 'neg' 63 | 64 | 65 | =================================== 66 | Training a Decision Tree Classifier 67 | =================================== 68 | 69 | >>> from nltk.classify import DecisionTreeClassifier 70 | >>> dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30) 71 | >>> accuracy(dt_classifier, test_feats) 72 | 0.688 73 | 74 | >>> from nltk.probability import FreqDist, MLEProbDist, entropy 75 | >>> fd = FreqDist({'pos': 30, 'neg': 10}) 76 | >>> entropy(MLEProbDist(fd)) 77 | 0.8112781244591328 78 | >>> fd['neg'] = 25 79 | >>> entropy(MLEProbDist(fd)) 80 | 0.9940302114769565 81 | >>> fd['neg'] = 30 82 | >>> entropy(MLEProbDist(fd)) 83 | 1.0 84 | >>> fd['neg'] = 1 85 | >>> entropy(MLEProbDist(fd)) 86 | 0.20559250818508304 87 | 88 | 89 | ===================================== 90 | Training a Maximum Entropy Classifier 91 | 
===================================== 92 | 93 | >>> from nltk.classify import MaxentClassifier 94 | >>> me_classifier = MaxentClassifier.train(train_feats, trace=0, max_iter=1, min_lldelta=0.5) 95 | >>> accuracy(me_classifier, test_feats) 96 | 0.5 97 | 98 | >>> me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5) 99 | >>> accuracy(me_classifier, test_feats) 100 | 0.722 101 | 102 | ================================= 103 | Training Scikit-Learn Classifiers 104 | ================================= 105 | 106 | >>> from nltk.classify.scikitlearn import SklearnClassifier 107 | >>> from sklearn.naive_bayes import MultinomialNB 108 | >>> sk_classifier = SklearnClassifier(MultinomialNB()) 109 | >>> sk_classifier.train(train_feats) 110 | 111 | >>> accuracy(sk_classifier, test_feats) 112 | 0.83 113 | 114 | >>> from sklearn.naive_bayes import BernoulliNB 115 | >>> sk_classifier = SklearnClassifier(BernoulliNB()) 116 | >>> sk_classifier.train(train_feats) 117 | 118 | >>> accuracy(sk_classifier, test_feats) 119 | 0.812 120 | 121 | >>> from sklearn.linear_model import LogisticRegression 122 | >>> sk_classifier = SklearnClassifier(LogisticRegression()).train(train_feats) 123 | >>> accuracy(sk_classifier, test_feats) 124 | 0.892 125 | 126 | >>> from sklearn.svm import SVC 127 | >>> sk_classifier = SklearnClassifier(SVC()).train(train_feats) 128 | >>> accuracy(sk_classifier, test_feats) 129 | 0.69 130 | 131 | >>> from sklearn.svm import LinearSVC 132 | >>> sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats) 133 | >>> accuracy(sk_classifier, test_feats) 134 | 0.864 135 | 136 | >>> from sklearn.svm import NuSVC 137 | >>> sk_classifier = SklearnClassifier(NuSVC()).train(train_feats) 138 | >>> accuracy(sk_classifier, test_feats) 139 | 0.882 140 | 141 | ============================================== 142 | Measuring Precision and Recall of a Classifier 143 | ============================================== 144 | 145 | >>> from classification import precision_recall 146 | >>> nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) 147 | >>> nb_precisions['pos'] 148 | 0.6413612565445026 149 | >>> nb_precisions['neg'] 150 | 0.9576271186440678 151 | >>> nb_recalls['pos'] 152 | 0.98 153 | >>> nb_recalls['neg'] 154 | 0.452 155 | 156 | >>> me_precisions, me_recalls = precision_recall(me_classifier, test_feats) 157 | >>> me_precisions['pos'] 158 | 0.6456692913385826 159 | >>> me_precisions['neg'] 160 | 0.9663865546218487 161 | >>> me_recalls['pos'] 162 | 0.984 163 | >>> me_recalls['neg'] 164 | 0.46 165 | 166 | >>> sk_precisions, sk_recalls = precision_recall(sk_classifier, test_feats) 167 | >>> sk_precisions['pos'] 168 | 0.9063829787234042 169 | >>> sk_precisions['neg'] 170 | 0.8603773584905661 171 | >>> sk_recalls['pos'] 172 | 0.852 173 | >>> sk_recalls['neg'] 174 | 0.912 175 | 176 | 177 | ================================== 178 | Calculating High Information Words 179 | ================================== 180 | 181 | >>> from featx import high_information_words, bag_of_words_in_set 182 | >>> labels = movie_reviews.categories() 183 | >>> labeled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] 184 | >>> high_info_words = set(high_information_words(labeled_words)) 185 | >>> feat_det = lambda words: bag_of_words_in_set(words, high_info_words) 186 | >>> lfeats = label_feats_from_corpus(movie_reviews, feature_detector=feat_det) 187 | >>> train_feats, test_feats = split_label_feats(lfeats) 188 | 189 | >>> nb_classifier = 
NaiveBayesClassifier.train(train_feats) 190 | >>> accuracy(nb_classifier, test_feats) 191 | 0.91 192 | >>> nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats) 193 | >>> nb_precisions['pos'] 194 | 0.8988326848249028 195 | >>> nb_precisions['neg'] 196 | 0.9218106995884774 197 | >>> nb_recalls['pos'] 198 | 0.924 199 | >>> nb_recalls['neg'] 200 | 0.896 201 | 202 | >>> me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5) 203 | >>> accuracy(me_classifier, test_feats) 204 | 0.912 205 | >>> me_precisions, me_recalls = precision_recall(me_classifier, test_feats) 206 | >>> me_precisions['pos'] 207 | 0.8992248062015504 208 | >>> me_precisions['neg'] 209 | 0.9256198347107438 210 | >>> me_recalls['pos'] 211 | 0.928 212 | >>> me_recalls['neg'] 213 | 0.896 214 | 215 | >>> dt_classifier = DecisionTreeClassifier.train(train_feats, binary=True, depth_cutoff=20, support_cutoff=20, entropy_cutoff=0.01) 216 | >>> accuracy(dt_classifier, test_feats) 217 | 0.688 218 | >>> dt_precisions, dt_recalls = precision_recall(dt_classifier, test_feats) 219 | >>> dt_precisions['pos'] 220 | 0.6766917293233082 221 | >>> dt_precisions['neg'] 222 | 0.7008547008547008 223 | >>> dt_recalls['pos'] 224 | 0.72 225 | >>> dt_recalls['neg'] 226 | 0.656 227 | 228 | >>> sk_classifier = SklearnClassifier(LinearSVC()).train(train_feats) 229 | >>> accuracy(sk_classifier, test_feats) 230 | 0.86 231 | >>> sk_precisions, sk_recalls = precision_recall(sk_classifier, test_feats) 232 | >>> sk_precisions['pos'] 233 | 0.871900826446281 234 | >>> sk_precisions['neg'] 235 | 0.8488372093023255 236 | >>> sk_recalls['pos'] 237 | 0.844 238 | >>> sk_recalls['neg'] 239 | 0.876 240 | 241 | 242 | ================================= 243 | Combining Classifiers with Voting 244 | ================================= 245 | 246 | >>> from classification import MaxVoteClassifier 247 | >>> mv_classifier = MaxVoteClassifier(nb_classifier, dt_classifier, me_classifier, sk_classifier) 248 | >>> mv_classifier.labels() 249 | ['neg', 'pos'] 250 | >>> accuracy(mv_classifier, test_feats) 251 | 0.894 252 | >>> mv_precisions, mv_recalls = precision_recall(mv_classifier, test_feats) 253 | >>> mv_precisions['pos'] 254 | 0.9156118143459916 255 | >>> mv_precisions['neg'] 256 | 0.8745247148288974 257 | >>> mv_recalls['pos'] 258 | 0.868 259 | >>> mv_recalls['neg'] 260 | 0.92 261 | 262 | 263 | ============================================ 264 | Classifying with Multiple Binary Classifiers 265 | ============================================ 266 | 267 | >>> from nltk.corpus import reuters 268 | >>> len(reuters.categories()) 269 | 90 270 | 271 | >>> from featx import reuters_high_info_words, reuters_train_test_feats 272 | >>> rwords = reuters_high_info_words() 273 | >>> featdet = lambda words: bag_of_words_in_set(words, rwords) 274 | >>> multi_train_feats, multi_test_feats = reuters_train_test_feats(featdet) 275 | 276 | >>> from classification import train_binary_classifiers 277 | >>> trainf = lambda train_feats: SklearnClassifier(LogisticRegression()).train(train_feats) 278 | >>> labelset = set(reuters.categories()) 279 | >>> classifiers = train_binary_classifiers(trainf, multi_train_feats, labelset) 280 | >>> len(classifiers) 281 | 90 282 | 283 | >>> from classification import MultiBinaryClassifier, multi_metrics 284 | >>> multi_classifier = MultiBinaryClassifier(*classifiers.items()) 285 | 286 | >>> multi_precisions, multi_recalls, avg_md = multi_metrics(multi_classifier, multi_test_feats) 287 | >>> avg_md 288 | 
0.23310715863026216 289 | 290 | >>> multi_precisions['soybean'] 291 | 0.7857142857142857 292 | >>> multi_recalls['soybean'] 293 | 0.3333333333333333 294 | >>> len(reuters.fileids(categories=['soybean'])) 295 | 111 296 | 297 | >>> multi_precisions['sunseed'] 298 | 1.0 299 | >>> multi_recalls['sunseed'] 300 | 0.2 301 | >>> len(reuters.fileids(categories=['sunseed'])) 302 | 16 303 | ''' 304 | 305 | if __name__ == '__main__': 306 | import doctest 307 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 7/7853OS_07_Codes/classification.py: -------------------------------------------------------------------------------- 1 | import collections, itertools 2 | from nltk import metrics 3 | from nltk.classify import util, ClassifierI, MultiClassifierI 4 | from nltk.probability import FreqDist 5 | 6 | def precision_recall(classifier, testfeats): 7 | refsets = collections.defaultdict(set) 8 | testsets = collections.defaultdict(set) 9 | 10 | for i, (feats, label) in enumerate(testfeats): 11 | refsets[label].add(i) 12 | observed = classifier.classify(feats) 13 | testsets[observed].add(i) 14 | 15 | precisions = {} 16 | recalls = {} 17 | 18 | for label in classifier.labels(): 19 | precisions[label] = metrics.precision(refsets[label], testsets[label]) 20 | recalls[label] = metrics.recall(refsets[label], testsets[label]) 21 | 22 | return precisions, recalls 23 | 24 | class MaxVoteClassifier(ClassifierI): 25 | def __init__(self, *classifiers): 26 | self._classifiers = classifiers 27 | self._labels = sorted(set(itertools.chain(*[c.labels() for c in classifiers]))) 28 | 29 | def labels(self): 30 | return self._labels 31 | 32 | def classify(self, feats): 33 | counts = FreqDist() 34 | 35 | for classifier in self._classifiers: 36 | counts[classifier.classify(feats)] += 1 37 | 38 | return counts.max() 39 | 40 | class MultiBinaryClassifier(MultiClassifierI): 41 | def __init__(self, *label_classifiers): 42 | self._label_classifiers = dict(label_classifiers) 43 | self._labels = sorted(self._label_classifiers.keys()) 44 | 45 | def labels(self): 46 | return self._labels 47 | 48 | def classify(self, feats): 49 | lbls = set() 50 | 51 | for label, classifier in self._label_classifiers.items(): 52 | if classifier.classify(feats) == label: 53 | lbls.add(label) 54 | 55 | return lbls 56 | 57 | def train_binary_classifiers(trainf, labelled_feats, labelset): 58 | pos_feats = collections.defaultdict(list) 59 | neg_feats = collections.defaultdict(list) 60 | classifiers = {} 61 | 62 | for feat, labels in labelled_feats: 63 | for label in labels: 64 | pos_feats[label].append(feat) 65 | 66 | for label in labelset - set(labels): 67 | neg_feats[label].append(feat) 68 | 69 | for label in labelset: 70 | postrain = [(feat, label) for feat in pos_feats[label]] 71 | negtrain = [(feat, '!%s' % label) for feat in neg_feats[label]] 72 | classifiers[label] = trainf(postrain + negtrain) 73 | 74 | return classifiers 75 | 76 | def multi_metrics(multi_classifier, test_feats): 77 | mds = [] 78 | refsets = collections.defaultdict(set) 79 | testsets = collections.defaultdict(set) 80 | 81 | for i, (feat, labels) in enumerate(test_feats): 82 | for label in labels: 83 | refsets[label].add(i) 84 | 85 | guessed = multi_classifier.classify(feat) 86 | 87 | for label in guessed: 88 | testsets[label].add(i) 89 | 90 | mds.append(metrics.masi_distance(set(labels), guessed)) 91 | 92 | avg_md = sum(mds) / float(len(mds)) 93 | precisions = {} 94 | recalls = {} 95 | 96 | for label in 
multi_classifier.labels(): 97 | precisions[label] = metrics.precision(refsets[label], testsets[label]) 98 | recalls[label] = metrics.recall(refsets[label], testsets[label]) 99 | 100 | return precisions, recalls, avg_md -------------------------------------------------------------------------------- /Module 2/Chapter 7/7853OS_07_Codes/featx.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from nltk.corpus import stopwords, reuters 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | from nltk.probability import FreqDist, ConditionalFreqDist 6 | 7 | def bag_of_words(words): 8 | ''' 9 | >>> bag_of_words(['the', 'quick', 'brown', 'fox']) 10 | {'quick': True, 'brown': True, 'the': True, 'fox': True} 11 | ''' 12 | return dict([(word, True) for word in words]) 13 | 14 | def bag_of_words_not_in_set(words, badwords): 15 | ''' 16 | >>> bag_of_words_not_in_set(['the', 'quick', 'brown', 'fox'], ['the']) 17 | {'quick': True, 'brown': True, 'fox': True} 18 | ''' 19 | return bag_of_words(set(words) - set(badwords)) 20 | 21 | def bag_of_non_stopwords(words, stopfile='english'): 22 | ''' 23 | >>> bag_of_non_stopwords(['the', 'quick', 'brown', 'fox']) 24 | {'quick': True, 'brown': True, 'fox': True} 25 | ''' 26 | badwords = stopwords.words(stopfile) 27 | return bag_of_words_not_in_set(words, badwords) 28 | 29 | def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200): 30 | ''' 31 | >>> bag_of_bigrams_words(['the', 'quick', 'brown', 'fox']) 32 | {'brown': True, ('brown', 'fox'): True, ('the', 'quick'): True, 'quick': True, ('quick', 'brown'): True, 'the': True, 'fox': True} 33 | ''' 34 | bigram_finder = BigramCollocationFinder.from_words(words) 35 | bigrams = bigram_finder.nbest(score_fn, n) 36 | return bag_of_words(words + bigrams) 37 | 38 | def bag_of_words_in_set(words, goodwords): 39 | return bag_of_words(set(words) & set(goodwords)) 40 | 41 | def label_feats_from_corpus(corp, feature_detector=bag_of_words): 42 | label_feats = collections.defaultdict(list) 43 | 44 | for label in corp.categories(): 45 | for fileid in corp.fileids(categories=[label]): 46 | feats = feature_detector(corp.words(fileids=[fileid])) 47 | label_feats[label].append(feats) 48 | 49 | return label_feats 50 | 51 | def split_label_feats(lfeats, split=0.75): 52 | train_feats = [] 53 | test_feats = [] 54 | 55 | for label, feats in lfeats.items(): 56 | cutoff = int(len(feats) * split) 57 | train_feats.extend([(feat, label) for feat in feats[:cutoff]]) 58 | test_feats.extend([(feat, label) for feat in feats[cutoff:]]) 59 | 60 | return train_feats, test_feats 61 | 62 | def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5): 63 | word_fd = FreqDist() 64 | label_word_fd = ConditionalFreqDist() 65 | 66 | for label, words in labelled_words: 67 | for word in words: 68 | word_fd[word] += 1 69 | label_word_fd[label][word] += 1 70 | 71 | n_xx = label_word_fd.N() 72 | high_info_words = set() 73 | 74 | for label in label_word_fd.conditions(): 75 | n_xi = label_word_fd[label].N() 76 | word_scores = collections.defaultdict(int) 77 | 78 | for word, n_ii in label_word_fd[label].items(): 79 | n_ix = word_fd[word] 80 | score = score_fn(n_ii, (n_ix, n_xi), n_xx) 81 | word_scores[word] = score 82 | 83 | bestwords = [word for word, score in word_scores.items() if score >= min_score] 84 | high_info_words |= set(bestwords) 85 | 86 | return high_info_words 87 | 88 | def 
reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq): 89 | labeled_words = [] 90 | 91 | for label in reuters.categories(): 92 | labeled_words.append((label, reuters.words(categories=[label]))) 93 | 94 | return high_information_words(labeled_words, score_fn=score_fn) 95 | 96 | def reuters_train_test_feats(feature_detector=bag_of_words): 97 | train_feats = [] 98 | test_feats = [] 99 | 100 | for fileid in reuters.fileids(): 101 | if fileid.startswith('training'): 102 | featlist = train_feats 103 | else: # fileid.startswith('test') 104 | featlist = test_feats 105 | 106 | feats = feature_detector(reuters.words(fileid)) 107 | labels = reuters.categories(fileid) 108 | featlist.append((feats, labels)) 109 | 110 | return train_feats, test_feats 111 | 112 | if __name__ == '__main__': 113 | import doctest 114 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/chapter8.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ================================ 3 | Distributed Tagging with Execnet 4 | ================================ 5 | 6 | >>> import execnet, remote_tag, nltk.tag, nltk.data 7 | >>> from nltk.corpus import treebank 8 | >>> import pickle 9 | >>> tagger = pickle.dumps(nltk.data.load(nltk.tag._POS_TAGGER)) 10 | >>> gw = execnet.makegateway() 11 | >>> channel = gw.remote_exec(remote_tag) 12 | >>> channel.send(tagger) 13 | >>> channel.send(treebank.sents()[0]) 14 | >>> tagged_sentence = channel.receive() 15 | >>> tagged_sentence == treebank.tagged_sents()[0] 16 | True 17 | >>> gw.exit() 18 | 19 | >>> import itertools 20 | >>> gw1 = execnet.makegateway() 21 | >>> gw2 = execnet.makegateway() 22 | >>> ch1 = gw1.remote_exec(remote_tag) 23 | >>> ch1.send(tagger) 24 | >>> ch2 = gw2.remote_exec(remote_tag) 25 | >>> ch2.send(tagger) 26 | >>> mch = execnet.MultiChannel([ch1, ch2]) 27 | >>> queue = mch.make_receive_queue() 28 | >>> channels = itertools.cycle(mch) 29 | >>> for sentence in treebank.sents()[:4]: 30 | ... channel = next(channels) 31 | ... channel.send(sentence) 32 | >>> tagged_sentences = [] 33 | >>> for i in range(4): 34 | ... channel, tagged_sentence = queue.get() 35 | ... 
tagged_sentences.append(tagged_sentence) 36 | >>> len(tagged_sentences) 37 | 4 38 | >>> gw1.exit() 39 | >>> gw2.exit() 40 | 41 | 42 | ================================= 43 | Distributed Chunking with Execnet 44 | ================================= 45 | 46 | >>> import remote_chunk, nltk.chunk 47 | >>> from nltk.corpus import treebank_chunk 48 | >>> chunker = pickle.dumps(nltk.data.load(nltk.chunk._MULTICLASS_NE_CHUNKER)) 49 | >>> gw = execnet.makegateway() 50 | >>> channel = gw.remote_exec(remote_chunk) 51 | >>> channel.send(tagger) 52 | >>> channel.send(chunker) 53 | >>> channel.send(treebank_chunk.sents()[0]) 54 | >>> chunk_tree = pickle.loads(channel.receive()) 55 | >>> chunk_tree 56 | Tree('S', [Tree('PERSON', [('Pierre', 'NNP')]), Tree('ORGANIZATION', [('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]) 57 | >>> gw.exit() 58 | 59 | 60 | ===================================== 61 | Parallel List Processing with Execnet 62 | ===================================== 63 | 64 | >>> import plists, remote_double 65 | >>> plists.map(remote_double, range(10)) 66 | [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] 67 | 68 | >>> plists.map(remote_double, range(10), [('popen', 4)]) 69 | [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] 70 | 71 | 72 | ====================================== 73 | Storing an Ordered Dictionary in Redis 74 | ====================================== 75 | 76 | >>> from redis import Redis 77 | >>> from rediscollections import RedisOrderedDict 78 | >>> r = Redis() 79 | >>> rod = RedisOrderedDict(r, 'scores') 80 | >>> rod['best'] = 10 81 | >>> rod['worst'] = 0.1 82 | >>> rod['middle'] = 5 83 | >>> rod.keys() 84 | [b'best', b'middle', b'worst'] 85 | >>> rod.keys(start=0, end=1) 86 | [b'best', b'middle'] 87 | >>> rod.clear() 88 | 89 | 90 | =============================================== 91 | Distributed Word Scoring with Redis and Execnet 92 | =============================================== 93 | 94 | >>> from dist_featx import score_words 95 | >>> from nltk.corpus import movie_reviews 96 | >>> labels = movie_reviews.categories() 97 | >>> labelled_words = [(l, movie_reviews.words(categories=[l])) for l in labels] 98 | >>> word_scores = score_words(labelled_words) 99 | >>> len(word_scores) 100 | 39767 101 | >>> topn_words = word_scores.keys(end=1000) 102 | >>> topn_words[0:5] 103 | [b'bad', b',', b'and', b'?', b'movie'] 104 | >>> from redis import Redis 105 | >>> r = Redis() 106 | >>> [r.delete(key) for key in ['word_fd', 'label_word_fd:neg', 'label_word_fd:pos', 'word_scores']] 107 | [1, 1, 1, 1] 108 | ''' 109 | 110 | if __name__ == '__main__': 111 | import doctest 112 | doctest.testmod() -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/dist_featx.py: -------------------------------------------------------------------------------- 1 | import itertools, execnet, remote_word_count 2 | from nltk.metrics import BigramAssocMeasures 3 | from redis import Redis 4 | from redisprob import RedisHashFreqDist, RedisConditionalHashFreqDist 5 | from rediscollections import RedisOrderedDict 6 | 7 | def score_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, host='localhost', specs=[('popen', 2)]): 8 | gateways = [] 9 | channels = [] 10 | 11 | for spec, count in specs: 12 | for i in range(count): 13 | gw = execnet.makegateway(spec) 14 | 
gateways.append(gw) 15 | channel = gw.remote_exec(remote_word_count) 16 | channel.send((host, 'word_fd', 'label_word_fd')) 17 | channels.append(channel) 18 | 19 | cyc = itertools.cycle(channels) 20 | 21 | for label, words in labelled_words: 22 | channel = next(cyc) 23 | channel.send((label, list(words))) 24 | 25 | for channel in channels: 26 | channel.send('done') 27 | assert 'done' == channel.receive() 28 | channel.waitclose(5) 29 | 30 | for gateway in gateways: 31 | gateway.exit() 32 | 33 | r = Redis(host) 34 | fd = RedisHashFreqDist(r, 'word_fd') 35 | cfd = RedisConditionalHashFreqDist(r, 'label_word_fd') 36 | word_scores = RedisOrderedDict(r, 'word_scores') 37 | n_xx = cfd.N() 38 | 39 | for label in cfd.conditions(): 40 | n_xi = cfd[label].N() 41 | 42 | for word, n_ii in cfd[label].items(): 43 | word = word.decode() # must convert to string from bytes 44 | n_ix = fd[word] 45 | 46 | if n_ii and n_ix and n_xi and n_xx: 47 | score = score_fn(n_ii, (n_ix, n_xi), n_xx) 48 | word_scores[word] = score 49 | 50 | return word_scores -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/plists.py: -------------------------------------------------------------------------------- 1 | import itertools, execnet 2 | 3 | def map(mod, args, specs=[('popen', 2)]): 4 | gateways = [] 5 | channels = [] 6 | 7 | for spec, count in specs: 8 | for i in range(count): 9 | gw = execnet.makegateway(spec) 10 | gateways.append(gw) 11 | channels.append(gw.remote_exec(mod)) 12 | 13 | cyc = itertools.cycle(channels) 14 | 15 | for i, arg in enumerate(args): 16 | channel = next(cyc) 17 | channel.send((i, arg)) 18 | 19 | mch = execnet.MultiChannel(channels) 20 | queue = mch.make_receive_queue() 21 | l = len(args) 22 | results = [None] * l 23 | 24 | for j in range(l): 25 | channel, (i, result) = queue.get() 26 | results[i] = result 27 | 28 | for gw in gateways: 29 | gw.exit() 30 | 31 | return results -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_chunk.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | if __name__ == '__channelexec__': 4 | tagger = pickle.loads(channel.receive()) 5 | chunker = pickle.loads(channel.receive()) 6 | 7 | for sent in channel: 8 | tree = chunker.parse(tagger.tag(sent)) 9 | channel.send(pickle.dumps(tree)) -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_double.py: -------------------------------------------------------------------------------- 1 | 2 | if __name__ == '__channelexec__': 3 | for (i, arg) in channel: 4 | channel.send((i, arg * 2)) -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_tag.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | if __name__ == '__channelexec__': 4 | tagger = pickle.loads(channel.receive()) 5 | 6 | for sentence in channel: 7 | channel.send(tagger.tag(sentence)) -------------------------------------------------------------------------------- /Module 2/Chapter 8/7853OS_08_Codes/remote_word_count.py: -------------------------------------------------------------------------------- 1 | from redis import Redis 2 | from redisprob import RedisHashFreqDist, RedisConditionalHashFreqDist 3 | 4 | if __name__ == '__channelexec__': 5 | host, fd_name, cfd_name = 
channel.receive() 6 | r = Redis(host) 7 | fd = RedisHashFreqDist(r, fd_name) 8 | cfd = RedisConditionalHashFreqDist(r, cfd_name) 9 | 10 | for data in channel: 11 | if data == 'done': 12 | channel.send('done') 13 | break 14 | 15 | label, words = data 16 | 17 | for word in words: 18 | fd[word] += 1 19 | cfd[label][word] += 1 -------------------------------------------------------------------------------- /Module 2/Chapter 9/7853OS_09_Codes/chapter9.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | =================================== 4 | Parsing Dates & Times with Dateutil 5 | =================================== 6 | 7 | >>> from dateutil import parser 8 | >>> parser.parse('Thu Sep 25 10:36:28 2010') 9 | datetime.datetime(2010, 9, 25, 10, 36, 28) 10 | >>> parser.parse('Thursday, 25. September 2010 10:36AM') 11 | datetime.datetime(2010, 9, 25, 10, 36) 12 | >>> parser.parse('9/25/2010 10:36:28') 13 | datetime.datetime(2010, 9, 25, 10, 36, 28) 14 | >>> parser.parse('9/25/2010') 15 | datetime.datetime(2010, 9, 25, 0, 0) 16 | >>> parser.parse('2010-09-25T10:36:28Z') 17 | datetime.datetime(2010, 9, 25, 10, 36, 28, tzinfo=tzutc()) 18 | 19 | >>> parser.parse('25/9/2010', dayfirst=True) 20 | datetime.datetime(2010, 9, 25, 0, 0) 21 | 22 | >>> parser.parse('10-9-25') 23 | datetime.datetime(2025, 10, 9, 0, 0) 24 | >>> parser.parse('10-9-25', yearfirst=True) 25 | datetime.datetime(2010, 9, 25, 0, 0) 26 | 27 | >>> try: 28 | ... parser.parse('9/25/2010 at about 10:36AM') 29 | ... except ValueError: 30 | ... 'cannot parse' 31 | 'cannot parse' 32 | >>> parser.parse('9/25/2010 at about 10:36AM', fuzzy=True) 33 | datetime.datetime(2010, 9, 25, 10, 36) 34 | 35 | 36 | ============================== 37 | Timezone Lookup and Conversion 38 | ============================== 39 | 40 | >>> from dateutil import tz 41 | >>> tz.tzutc() 42 | tzutc() 43 | >>> import datetime 44 | >>> tz.tzutc().utcoffset(datetime.datetime.utcnow()) 45 | datetime.timedelta(0) 46 | 47 | >>> tz.gettz('US/Pacific') 48 | tzfile('America/Los_Angeles') 49 | >>> tz.gettz('US/Pacific').utcoffset(datetime.datetime.utcnow()) 50 | datetime.timedelta(-1, 61200) 51 | >>> tz.gettz('Europe/Paris') 52 | tzfile('Europe/Paris') 53 | >>> tz.gettz('Europe/Paris').utcoffset(datetime.datetime.utcnow()) 54 | datetime.timedelta(0, 7200) 55 | 56 | >>> pst = tz.gettz('US/Pacific') 57 | >>> dt = datetime.datetime(2010, 9, 25, 10, 36) 58 | >>> dt.tzinfo 59 | >>> dt.astimezone(tz.tzutc()) 60 | Traceback (most recent call last): 61 | File "/usr/lib/python2.6/doctest.py", line 1248, in __run 62 | compileflags, 1) in test.globs 63 | File "", line 1, in 64 | dt.astimezone(tz.tzutc()) 65 | ValueError: astimezone() cannot be applied to a naive datetime 66 | >>> dt.replace(tzinfo=pst) 67 | datetime.datetime(2010, 9, 25, 10, 36, tzinfo=tzfile('America/Los_Angeles')) 68 | >>> dt.replace(tzinfo=pst).astimezone(tz.tzutc()) 69 | datetime.datetime(2010, 9, 25, 17, 36, tzinfo=tzutc()) 70 | 71 | >>> parser.parse('Wednesday, Aug 4, 2010 at 6:30 p.m. (CDT)', fuzzy=True) 72 | datetime.datetime(2010, 8, 4, 18, 30) 73 | >>> tzinfos = {'CDT': tz.gettz('US/Central')} 74 | >>> parser.parse('Wednesday, Aug 4, 2010 at 6:30 p.m. 
(CDT)', fuzzy=True, tzinfos=tzinfos) 75 | datetime.datetime(2010, 8, 4, 18, 30, tzinfo=tzfile('America/Chicago')) 76 | 77 | >>> tz.tzoffset('custom', 3600) 78 | tzoffset('custom', 3600) 79 | 80 | =================================== 81 | Extracting URLs from HTML with lxml 82 | =================================== 83 | 84 | >>> from lxml import html 85 | >>> doc = html.fromstring('Hello world') 86 | >>> links = list(doc.iterlinks()) 87 | >>> len(links) 88 | 1 89 | >>> (el, attr, link, pos) = links[0] 90 | >>> attr 91 | 'href' 92 | >>> link 93 | '/world' 94 | >>> pos 95 | 0 96 | 97 | >>> doc.make_links_absolute('http://hello') 98 | >>> abslinks = list(doc.iterlinks()) 99 | >>> (el, attr, link, pos) = abslinks[0] 100 | >>> link 101 | 'http://hello/world' 102 | 103 | >>> links = list(html.iterlinks('Hello world')) 104 | >>> links[0][2] 105 | '/world' 106 | 107 | >>> doc.xpath('//a/@href')[0] 108 | 'http://hello/world' 109 | 110 | 111 | =========================== 112 | Cleaning and Stripping HTML 113 | =========================== 114 | 115 | >>> import lxml.html.clean 116 | >>> lxml.html.clean.clean_html('my text') 117 | '
my text' 118 | 119 | >>> from bs4 import BeautifulSoup 120 | >>> BeautifulSoup('my text
').get_text() 121 | 'my text' 122 | 123 | 124 | =========================================== 125 | Converting HTML Entities with BeautifulSoup 126 | =========================================== 127 | 128 | >>> from bs4 import BeautifulSoup 129 | >>> BeautifulSoup('<').string 130 | '<' 131 | >>> BeautifulSoup('&').string 132 | '&' 133 | 134 | >>> BeautifulSoup('<').string 135 | 136 | >>> from bs4 import BeautifulSoup 137 | >>> soup = BeautifulSoup('Hello world') 138 | >>> [a['href'] for a in soup.findAll('a')] 139 | ['/world'] 140 | 141 | ============================================ 142 | Detecting and Converting Character Encodings 143 | ============================================ 144 | 145 | >>> import unicodedata 146 | >>> unicodedata.normalize('NFKD', 'abcd\xe9').encode('ascii', 'ignore') 147 | b'abcde' 148 | 149 | >>> from bs4 import UnicodeDammit 150 | >>> UnicodeDammit('abcd\xe9').unicode_markup 151 | 'abcdé' 152 | 153 | ''' 154 | 155 | if __name__ == '__main__': 156 | import doctest 157 | doctest.testmod() 158 | -------------------------------------------------------------------------------- /Module 2/Chapter 9/7853OS_09_Codes/encoding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import charade 3 | 4 | def detect(s): 5 | ''' 6 | >>> detect('ascii') 7 | {'confidence': 1.0, 'encoding': 'ascii'} 8 | >>> detect('abcdé') 9 | {'confidence': 0.505, 'encoding': 'utf-8'} 10 | >>> detect(bytes('abcdé', 'utf-8')) 11 | {'confidence': 0.505, 'encoding': 'utf-8'} 12 | >>> detect(bytes('\222\222\223\225', 'latin-1')) 13 | {'confidence': 0.5, 'encoding': 'windows-1252'} 14 | ''' 15 | try: 16 | if isinstance(s, str): 17 | return charade.detect(s.encode()) 18 | else: 19 | return charade.detect(s) 20 | except UnicodeDecodeError: 21 | return charade.detect(s.encode('utf-8')) 22 | 23 | def convert(s): 24 | ''' 25 | >>> convert('ascii') 26 | 'ascii' 27 | >>> convert('abcdé') 28 | 'abcdé' 29 | >>> convert(bytes('abcdé', 'utf-8')) 30 | 'abcdé' 31 | >>> convert(bytes('\222\222\223\225', 'latin-1')) 32 | '\u2019\u2019\u201c\u2022' 33 | ''' 34 | if isinstance(s, str): 35 | s = s.encode() 36 | 37 | encoding = detect(s)['encoding'] 38 | 39 | if encoding == 'utf-8': 40 | return s.decode() 41 | else: 42 | return s.decode(encoding) 43 | 44 | if __name__ == '__main__': 45 | import doctest 46 | doctest.testmod() -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=" Welcome readers. I hope you find it interesting. Please do reply." 
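# sent_tokenize (imported below) uses NLTK's pre-trained Punkt model to split
# the paragraph into sentences, so this example prints a three-item list.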
3 | from nltk.tokenize import sent_tokenize 4 | print(sent_tokenize(text)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import regexp_tokenize 3 | sent="Don't hesitate to ask questions" 4 | print(regexp_tokenize(sent, pattern='\w+|\$[\d\.]+|\S+')) 5 | 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer('\s+',gaps=True) 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | sent=" She secured 90.56 % in class X . She is a meritorious student" 4 | capt = RegexpTokenizer('[A-Z]\w+') 5 | print(capt.tokenize(sent)) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import BlanklineTokenizer 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X . She is a meritorious student" 3 | from nltk.tokenize import WhitespaceTokenizer 4 | print(WhitespaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent= "She secured 90.56 % in class X. She is a meritorious student" 3 | print(sent.split()) 4 | print(sent.split(' ')) 5 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 6 | print(sent.split('\n')) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import BlanklineTokenizer 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(BlanklineTokenizer().tokenize(sent)) 5 | from nltk.tokenize import LineTokenizer 6 | print(LineTokenizer(blanklines='keep').tokenize(sent)) 7 | print(LineTokenizer(blanklines='discard').tokenize(sent)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=" She secured 90.56 % in class X \n. 
She is a meritorious student\n" 3 | from nltk.tokenize import SpaceTokenizer 4 | print(SpaceTokenizer().tokenize(sent)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(list(WhitespaceTokenizer().span_tokenize(sent))) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import WhitespaceTokenizer 3 | from nltk.tokenize.util import spans_to_relative 4 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 5 | print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent)))) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tokenizer=nltk.data.load('tokenizers/punkt/english.pickle') 3 | text=" Hello everyone. Hope all are fine and doing well. Hope you find the book interesting" 4 | print(tokenizer.tokenize(text)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize.util import string_span_tokenize 3 | sent=" She secured 90.56 % in class X \n. She is a meritorious student\n" 4 | print(list(string_span_tokenize(sent, " "))) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_21.py: -------------------------------------------------------------------------------- 1 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 2 | from nltk.tokenize import word_tokenize 3 | tokenized_docs=[word_tokenize(doc) for doc in text] 4 | print(tokenized_docs) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_22.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."] 4 | from nltk.tokenize import word_tokenize 5 | tokenized_docs=[word_tokenize(doc) for doc in text] 6 | x=re.compile('[%s]' % re.escape(string.punctuation)) 7 | tokenized_docs_no_punctuation = [] 8 | for review in tokenized_docs: 9 | new_review = [] 10 | for token in review: 11 | new_token = x.sub(u'', token) 12 | if not new_token == u'': 13 | new_review.append(new_token) 14 | tokenized_docs_no_punctuation.append(new_review) 15 | print(tokenized_docs_no_punctuation) 16 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_23.py: -------------------------------------------------------------------------------- 1 | text='HARdWork IS KEy to SUCCESS' 2 | print(text.lower()) 3 | print(text.upper()) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus 
import stopwords 3 | stops=set(stopwords.words('english')) 4 | words=["Don't", 'hesitate','to','ask','questions'] 5 | print([word for word in words if word not in stops]) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_25.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | print(stopwords.fileids()) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def para_fraction(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | para = [w for w in text if w.lower() not in stopwords] 7 | return len(para) / len(text) 8 | print(para_fraction(nltk.corpus.reuters.words())) 9 | print(para_fraction(nltk.corpus.inaugural.words())) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RegexpReplacer 3 | replacer= RegexpReplacer() 4 | replacer.replace("Don't hesitate to ask questions") 5 | print(replacer.replace("She must've gone to the market but she didn't go")) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import word_tokenize 3 | from replacers import RegexpReplacer 4 | replacer=RegexpReplacer() 5 | word_tokenize("Don't hesitate to ask questions") 6 | print(word_tokenize(replacer.replace("Don't hesitate to ask questions"))) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('lotttt')) 5 | print(replacer.replace('ohhhhh')) 6 | print(replacer.replace('ooohhhhh')) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | french_tokenizer=nltk.data.load('tokenizers/punkt/french.pickle') 3 | print(french_tokenizer.tokenize('Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage collège franco-britanniquedeLevallois-Perret. Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage Levallois. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, janvier , d’un professeur d’histoire. 
L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi , d’un professeur d’histoire')) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import RepeatReplacer 3 | replacer=RepeatReplacer() 4 | print(replacer.replace('happy')) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_31.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from replacers import WordReplacer 3 | replacer=WordReplacer({'congrats':'congratulations'}) 4 | print(replacer.replace('congrats')) 5 | print(replacer.replace('maths')) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_33.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from nltk.metrics import * 3 | training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split() 4 | testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split() 5 | print(accuracy(training,testing)) 6 | trainset=set(training) 7 | testset=set(testing) 8 | precision(trainset,testset) 9 | print(recall(trainset,testset)) 10 | print(f_measure(trainset,testset)) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_34.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | print(edit_distance("relate","relation")) 4 | print(edit_distance("suggestion","calculation")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_35.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X=set([10,20,30,40]) 4 | Y=set([20,30,60]) 5 | print(jaccard_distance(X,Y)) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_36.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.metrics import * 3 | X = set([10,20,30,40]) 4 | Y= set([30,50,70]) 5 | print(binary_distance(X, Y)) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_37.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | trigrams_tokens=ngrams(alpino.words(),3) 6 | for i in trigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text = nltk.word_tokenize("PierreVinken , 59 years old , will join as a nonexecutive director on Nov. 
29 .") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import word_tokenize 3 | r=input("Please write a text") 4 | print("The length of text is",len(word_tokenize(r)),"words") 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import TreebankWordTokenizer 3 | tokenizer = TreebankWordTokenizer() 4 | print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize(" Don't hesitate to ask questions") 3 | print(text) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_8.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import WordPunctTokenizer 2 | tokenizer=WordPunctTokenizer() 3 | print(tokenizer.tokenize(" Don't hesitate to ask questions")) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 1/ch1_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import RegexpTokenizer 3 | tokenizer=RegexpTokenizer("[\w']+") 4 | print(tokenizer.tokenize("Don't hesitate to ask questions")) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sent=brown.sents(categories='news') 5 | unigram_sent=nltk.UnigramTagger(sentences) 6 | print(unigram_sent.tag(sent[2008])) 7 | print(unigram_sent.evaluate(sentences)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem.lancaster import LancasterStemmer 3 | stri=LancasterStemmer() 4 | print(stri.stem('achievement')) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | print(sz) 6 | training_sents = sentences[:sz] 7 | testing_sents=sentences[sz:] 8 | unigram_tagger=nltk.UnigramTagger(training_sents) 9 | print(unigram_tagger.evaluate(testing_sents)) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | bigram_tagger=nltk.UnigramTagger(training_sents) 8 |
bigram_tagger=nltk.BigramTagger(training_sents) 9 | print(bigram_tagger.tag(sentences[2008])) 10 | un_sent=sentences[4203] 11 | print(bigram_tagger.tag(un_sent)) 12 | print(bigram_tagger.evaluate(testing_sents)) 13 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences=brown.tagged_sents(categories='news') 4 | sz=int(len(sentences)*0.8) 5 | training_sents = sentences[:sz] 6 | testing_sents=sentences[sz:] 7 | s0=nltk.DefaultTagger('NNP') 8 | s1=nltk.UnigramTagger(training_sents,backoff=s0) 9 | s2=nltk.BigramTagger(training_sents,backoff=s1) 10 | print(s2.evaluate(testing_sents)) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | chunkparser = nltk.RegexpParser("") 3 | print(nltk.chunk.accuracy(chunkparser, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | grammar = r"NP: {<[CDJNP].*>+}" 3 | cp = nltk.RegexpParser(grammar) 4 | print(nltk.chunk.accuracy(cp, nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',)))) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | correct = nltk.chunk.tagstr2tree( 3 | "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]") 4 | print(correct.flatten()) 5 | grammar = r"NP: {<[CDJNP].*>+}" 6 | cp = nltk.RegexpParser(grammar) 7 | 8 | grammar = r"NP: {+}" 9 | chunk_parser = nltk.RegexpParser(grammar) 10 | tagged_tok = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")] 11 | chunkscore = nltk.chunk.ChunkScore() 12 | guessed = cp.parse(correct.flatten()) 13 | chunkscore.score(correct, guessed) 14 | print(chunkscore) 15 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | sentences = brown.tagged_sents(categories='news') 4 | sent = brown.sents(categories='news') 5 | pattern = [(r'(January)$','Jan')] 6 | regexpr_tagger = nltk.RegexpTagger(pattern) 7 | print(regexpr_tagger.tag(sent[3])) 8 | print(regexpr_tagger.evaluate(sentences)) 9 | 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 10/ch10_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | freqd = nltk.FreqDist(brown.words(categories='news')) 4 | cfreqd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news')) 5 | mostfreq_words = freqd.most_common(100) 6 | likelytags = dict((word, cfreqd[word].max()) for (word, _) in mostfreq_words) 7 | baselinetagger = nltk.UnigramTagger(model=likelytags) 8 | 9 | sent = brown.sents(categories='news')[3] 10 | print(baselinetagger.tag(sent)) 11 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | unigrams=ngrams(alpino.words(),1) 6 | for i in unigrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | corpus=u" hello how are you doing ? Hope you find the book interesting. ".split() 3 | sentence=u"how are you doing".split() 4 | vocabulary=set(corpus) 5 | print(len(vocabulary)) 6 | cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus)) 7 | print([cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 8 | print([cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 9 | print([cfd[a].freq(b) for (a,b) in nltk.bigrams(sentence)]) 10 | print([1 + cfd[a][b] for (a,b) in nltk.bigrams(sentence)]) 11 | print([len(vocabulary) + cfd[a].N() for (a,b) in nltk.bigrams(sentence)]) 12 | print([1.0 * (1+cfd[a][b]) / (len(vocabulary)+cfd[a].N()) for (a,b) in nltk.bigrams(sentence)]) 13 | cpd_mle = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist, bins=len(vocabulary)) 14 | print([cpd_mle[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 15 | cpd_laplace = nltk.ConditionalProbDist(cfd, nltk.LaplaceProbDist, bins=len(vocabulary)) 16 | print([cpd_laplace[a].prob(b) for (a,b) in nltk.bigrams(sentence)]) 17 | 18 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | quadgrams=ngrams(alpino.words(),4) 6 | for i in quadgrams: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import BigramCollocationFinder 3 | from nltk.corpus import webtext 4 | from nltk.metrics import BigramAssocMeasures 5 | tokens=[t.lower() for t in webtext.words('grail.txt')] 6 | words=BigramCollocationFinder.from_words(tokens) 7 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_4.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | from nltk.corpus import webtext 3 | from nltk.collocations import BigramCollocationFinder 4 | from nltk.metrics import BigramAssocMeasures 5 | set = set(stopwords.words('english')) 6 | stops_filter = lambda w: len(w) < 3 or w in set 7 | tokens=[t.lower() for t in webtext.words('grail.txt')] 8 | words=BigramCollocationFinder.from_words(tokens) 9 | words.apply_word_filter(stops_filter) 10 | print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10)) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | text1="Hardwork is the key to success. Never give up!" 
4 | word = nltk.wordpunct_tokenize(text1) 5 | finder = BigramCollocationFinder.from_words(word) 6 | bigram_measures = nltk.collocations.BigramAssocMeasures() 7 | value = finder.score_ngrams(bigram_measures.raw_freq) 8 | print(sorted(bigram for bigram, score in value)) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | from nltk.corpus import alpino 4 | print(alpino.words()) 5 | bigrams_tokens=ngrams(alpino.words(),2) 6 | for i in bigrams_tokens: 7 | print(i) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.collocations import * 3 | import nltk 4 | text="Hello how are you doing ? I hope you find the book interesting" 5 | tokens=nltk.wordpunct_tokenize(text) 6 | fourgrams=nltk.collocations.QuadgramCollocationFinder.from_words(tokens) 7 | for fourgram, freq in fourgrams.ngram_fd.items(): 8 | print(fourgram,freq) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.util import ngrams 3 | sent=" Hello , please read the book thoroughly . If you have any queries , then don't hesitate to ask . There is no shortcut to success ." 4 | n=5 5 | fivegrams=ngrams(sent.split(),n) 6 | for grams in fivegrams: 7 | print(grams) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 2/ch2_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] 3 | print(len(cor)) 4 | from nltk.util import unique_list 5 | tag_set = unique_list(tag for sent in cor for (word,tag) in sent) 6 | print(len(tag_set)) 7 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 8 | print(len(symbols)) 9 | print(len(tag_set)) 10 | symbols = unique_list(word for sent in cor for (word,tag) in sent) 11 | print(len(symbols)) 12 | trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) 13 | train_corpus = [] 14 | test_corpus = [] 15 | for i in range(len(cor)): 16 | if i % 10: 17 | train_corpus+=[cor[i]] 18 | else: 19 | test_corpus+=[cor[i]] 20 | print(len(train_corpus)) 21 | print(len(test_corpus)) 22 | 23 | 24 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | stemmerporter = PorterStemmer() 4 | print(stemmerporter.stem('working')) 5 | print(stemmerporter.stem('happiness')) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import LancasterStemmer 3 | stemmerlan=LancasterStemmer() 4 | print(stemmerlan.stem('working')) 5 | print(stemmerlan.stem('happiness')) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_3.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import RegexpStemmer 3 | stemmerregexp=RegexpStemmer('ing') 4 | print(stemmerregexp.stem('working')) 5 | print(stemmerregexp.stem('happiness')) 6 | print(stemmerregexp.stem('pairing')) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import SnowballStemmer 3 | print(SnowballStemmer.languages) 4 | spanishstemmer=SnowballStemmer('spanish') 5 | print(spanishstemmer.stem('comiendo')) 6 | frenchstemmer=SnowballStemmer('french') 7 | print(frenchstemmer.stem('manger')) 8 | 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import WordNetLemmatizer 3 | lemmatizer_output=WordNetLemmatizer() 4 | print(lemmatizer_output.lemmatize('working')) 5 | print(lemmatizer_output.lemmatize('working',pos='v')) 6 | print(lemmatizer_output.lemmatize('works')) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 3/ch3_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem import PorterStemmer 3 | from nltk.stem import WordNetLemmatizer 4 | stemmer_output=PorterStemmer() 5 | print(stemmer_output.stem('happiness')) 6 | lemmatizer_output=WordNetLemmatizer() 7 | print(lemmatizer_output.lemmatize('happiness')) 8 | 9 | 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text1=nltk.word_tokenize("It is a pleasant day today") 3 | print(nltk.pos_tag(text1)) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | tag={} 3 | print(tag) 4 | tag['beautiful']='ADJ' 5 | 6 | tag['boy']='N' 7 | tag['read']='V' 8 | tag['generously']='ADV' 9 | print(tag) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import DefaultTagger 3 | tag = DefaultTagger('NN') 4 | print(tag.tag(['Beautiful', 'morning'])) 5 | 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import untag 3 | print(untag([('beautiful', 'NN'), ('morning', 'NN')])) 4 | 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import os,os.path 3 | create = os.path.expanduser('~/nltkdoc') 4 | if not os.path.exists(create): 5 | os.mkdir(create) 6 | print(os.path.exists(create)) 7 | import nltk.data 8 | print(create in nltk.data.path) 9 | 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_14.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import names 3 | print(len(names.words('male.txt'))) 4 | print(len(names.words('female.txt'))) 5 | 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import words 3 | print(words.fileids()) 4 | print(len(words.words('en'))) 5 | print(len(words.words('en-basic'))) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.corpus import treebank 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | print(treebank.sents()[0]) 7 | print(unitagger.tag(treebank.sents()[0])) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | training= treebank.tagged_sents()[:7000] 5 | unitagger=UnigramTagger(training) 6 | testing = treebank.tagged_sents()[2000:] 7 | print(unitagger.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk.tag import UnigramTagger 4 | unitag = UnigramTagger(model={'Vinken': 'NN'}) 5 | print(unitag.tag(treebank.sents()[0])) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import UnigramTagger 3 | from nltk.tag import DefaultTagger 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tag1=DefaultTagger('NN') 8 | tag2=UnigramTagger(training,backoff=tag1) 9 | print(tag2.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('NNS')) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger 3 | from nltk.corpus import treebank 4 | training_1= treebank.tagged_sents()[:7000] 5 | bigramtagger=BigramTagger(training_1) 6 | print(treebank.sents()[0]) 7 | print(bigramtagger.tag(treebank.sents()[0])) 8 | testing_1 = treebank.tagged_sents()[2000:] 9 | print(bigramtagger.evaluate(testing_1)) 10 | 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import BigramTagger, TrigramTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= 
treebank.tagged_sents()[:7000] 6 | bigramtag = BigramTagger(training) 7 | print(bigramtag.evaluate(testing)) 8 | trigramtag = TrigramTagger(training) 9 | print(trigramtag.evaluate(testing)) 10 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from nltk import NgramTagger 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | quadgramtag = NgramTagger(4, training) 7 | print(quadgramtag.evaluate(testing)) 8 | 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_23.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | affixtag = AffixTagger(training) 7 | print(affixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_24.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtag = AffixTagger(training, affix_length=4) 7 | print(prefixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_25.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | suffixtag = AffixTagger(training, affix_length=-3) 7 | print(suffixtag.evaluate(testing)) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_26.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import AffixTagger 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | prefixtagger=AffixTagger(training,affix_length=4) 7 | prefixtagger3=AffixTagger(training,affix_length=3,backoff=prefixtagger) 8 | print(prefixtagger3.evaluate(testing)) 9 | suffixtagger3=AffixTagger(training,affix_length=-3,backoff=prefixtagger3) 10 | print(suffixtagger3.evaluate(testing)) 11 | suffixtagger4=AffixTagger(training,affix_length=-4,backoff=suffixtagger3) 12 | print(suffixtagger4.evaluate(testing)) 13 | 14 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_27.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import tnt 3 | from nltk.corpus import treebank 4 | testing = treebank.tagged_sents()[2000:] 5 | training= treebank.tagged_sents()[:7000] 6 | tnt_tagger=tnt.TnT() 7 | tnt_tagger.train(training) 8 | print(tnt_tagger.evaluate(testing)) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_28.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tag import 
DefaultTagger 3 | from nltk.tag import tnt 4 | from nltk.corpus import treebank 5 | testing = treebank.tagged_sents()[2000:] 6 | training= treebank.tagged_sents()[:7000] 7 | tnt_tagger=tnt.TnT() 8 | unknown=DefaultTagger('NN') 9 | tagger_tnt=tnt.TnT(unk=unknown,Trained=True) 10 | tnt_tagger.train(training) 11 | print(tnt_tagger.evaluate(testing)) 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_29.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "N"), ("became", "VBD"), ("leader", "NN")] 3 | sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "NN"), ("became", "VBD"), ("leader", "NN")] 4 | grammar = "NP: {
<DT>?<JJ>*<NN>?<NN>*}" 5 | find = nltk.RegexpParser(grammar) 6 | res = find.parse(sent) 7 | print(res) 8 | res.draw() 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | print(nltk.help.upenn_tagset('VB.*')) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_30.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | noun1=[("financial","NN"),("year","NN"),("account","NN"),("summary","NN")] 3 | gram="NP:{<NN>+}" 4 | find = nltk.RegexpParser(gram) 5 | print(find.parse(noun1)) 6 | x=find.parse(noun1) 7 | x.draw() 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | text=nltk.word_tokenize("I cannot bear the pain of bear") 3 | print(nltk.pos_tag(text)) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedword=nltk.tag.str2tuple('bear/NN') 3 | print(taggedword) 4 | print(taggedword[0]) 5 | print(taggedword[1]) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence='''The/DT sacred/VBN Ganga/NNP flows/VBZ in/IN this/DT region/NN ./. This/DT is/VBZ a/DT pilgrimage/NN ./. People/NNP from/IN all/DT over/IN the/DT country/NN visit/NN this/DT place/NN ./.
''' 3 | print([nltk.tag.str2tuple(t) for t in sentence.split()]) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | taggedtok = ('bear', 'NN') 3 | from nltk.tag.util import tuple2str 4 | print(tuple2str(taggedtok)) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tag = nltk.FreqDist(tag for (word, tag) in treebank_tagged) 5 | print(tag.most_common()) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 4/ch4_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | treebank_tagged = treebank.tagged_words(tagset='universal') 4 | tagpairs = nltk.bigrams(treebank_tagged) 5 | preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN'] 6 | freqdist = nltk.FreqDist(preceders_noun) 7 | print([tag for (tag, _) in freqdist.most_common()]) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import nltk.corpus 3 | print(str(nltk.corpus.treebank).replace('\\\\','/')) 4 | print(nltk.corpus.treebank.fileids()) 5 | from nltk.corpus import treebank 6 | print(treebank.words('wsj_0007.mrg')) 7 | print(treebank.tagged_words('wsj_0007.mrg')) 8 | 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser1 = nltk.parse.BottomUpChartParser(gram1) 8 | chart1 = parser1.chart_parse(sent) 9 | print((chart1.num_edges())) 10 | print((len(list(chart1.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser2 = nltk.parse.BottomUpLeftCornerChartParser(gram1) 8 | chart2 = parser2.chart_parse(sent) 9 | print((chart2.num_edges())) 10 | print((len(list(chart2.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser3 = 
nltk.parse.LeftCornerChartParser(gram1) 8 | chart3 = parser3.chart_parse(sent) 9 | print((chart3.num_edges())) 10 | print((len(list(chart3.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser4 = nltk.parse.TopDownChartParser(gram1) 8 | chart4 = parser4.chart_parse(sent) 9 | print((chart4.num_edges())) 10 | print((len(list(chart4.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser5 = nltk.parse.IncrementalBottomUpChartParser(gram1) 8 | chart5 = parser5.chart_parse(sent) 9 | print((chart5.num_edges())) 10 | print((len(list(chart5.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser6 = nltk.parse.IncrementalBottomUpLeftCornerChartParser(gram1) 8 | chart6 = parser6.chart_parse(sent) 9 | print((chart6.num_edges())) 10 | print((len(list(chart6.parses(gram1.start()))))) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser7 = nltk.parse.IncrementalLeftCornerChartParser(gram1) 8 | chart7 = parser7.chart_parse(sent) 9 | print((chart7.num_edges())) 10 | print((len(list(chart7.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser8 = nltk.parse.IncrementalTopDownChartParser(gram1) 8 | chart8 = parser8.chart_parse(sent) 9 | print((chart8.num_edges())) 10 | print((len(list(chart8.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_18.py: 
-------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 4 | sent = nltk.parse.util.extract_test_sentences(sent) 5 | testingsent=sent[25] 6 | sent=testingsent[0] 7 | parser9 = nltk.parse.EarleyChartParser(gram1) 8 | chart9 = parser9.chart_parse(sent) 9 | print((chart9.num_edges())) 10 | print((len(list(chart9.parses(gram1.start()))))) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_19.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | gram2 = PCFG.fromstring(""" 6 | A -> B B [.3] | C B C [.7] 7 | B -> B D [.5] | C [.5] 8 | C -> 'a' [.1] | 'b' [0.9] 9 | D -> 'b' [1.0] 10 | """) 11 | prod1 = gram2.productions()[0] 12 | print(prod1) 13 | prod2 = gram2.productions()[1] 14 | print(prod2) 15 | print(prod2.lhs()) 16 | print(prod2.rhs()) 17 | print((prod2.prob())) 18 | print(gram2.start()) 19 | print(gram2.productions()) 20 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | print(treebank.parsed_sents('wsj_0007.mrg')[2]) 4 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_20.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank 3 | from itertools import islice 4 | from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 5 | tokens = "Jack told Bob to bring my cookie".split() 6 | grammar = toy_pcfg2 7 | print(grammar) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_21.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(5, print_times=False, trace=1,sent='John saw a dog', numparses=2) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_22.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.chart.demo(2, print_times=False, trace=1,sent='John saw a dog', numparses=1) 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_23.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.parse.featurechart.demo(print_times=False,print_grammar=True,parser=nltk.parse.featurechart.FeatureChartParser,sent='I saw a dog') 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1]) 4 | treebank_chunk.chunked_sents()[1].draw() 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus 
import treebank_chunk 3 | print(treebank_chunk.chunked_sents()[1].leaves()) 4 | print(treebank_chunk.chunked_sents()[1].pos()) 5 | print(treebank_chunk.chunked_sents()[1].productions()) 6 | print(nltk.corpus.treebank.tagged_words()) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.probability import FreqDist 3 | from nltk.corpus import treebank 4 | fd = FreqDist() 5 | print(fd.items()) 6 | 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import sinica_treebank 3 | print(sinica_treebank.sents()) 4 | print(sinica_treebank.parsed_sents()[27]) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import Nonterminal, nonterminals, Production, CFG 3 | nonterminal1 = Nonterminal('NP') 4 | nonterminal2 = Nonterminal('VP') 5 | nonterminal3 = Nonterminal('PP') 6 | print(nonterminal1.symbol()) 7 | print(nonterminal2.symbol()) 8 | print(nonterminal3.symbol()) 9 | print(nonterminal1==nonterminal2) 10 | print(nonterminal2==nonterminal3) 11 | print(nonterminal1==nonterminal3) 12 | S, NP, VP, PP = nonterminals('S, NP, VP, PP') 13 | N, V, P, DT = nonterminals('N, V, P, DT') 14 | production1 = Production(S, [NP, VP]) 15 | production2 = Production(NP, [DT, NP]) 16 | production3 = Production(VP, [V, NP,NP,PP]) 17 | print(production1.lhs()) 18 | print(production1.rhs()) 19 | print(production3.lhs()) 20 | print(production3.rhs()) 21 | print(production3 == Production(VP, [V,NP,NP,PP])) 22 | print(production2 == production3) 23 | 24 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | gram1 = nltk.data.load('grammars/large_grammars/atis.cfg') 3 | print(gram1) 4 | 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 5/ch5_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt') 3 | sent = nltk.parse.util.extract_test_sentences(sent) 4 | print(len(sent)) 5 | testingsent=sent[25] 6 | print(testingsent[1]) 7 | print(testingsent[0]) 8 | sent=testingsent[0] 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.boolean_ops() 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_10.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.tag.hmm.demo_pos() 3 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk import pos_tag, word_tokenize 3 | print(pos_tag(word_tokenize("John and Smith are going to NY and Germany"))) 4 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import brown 3 | from nltk.tag import UnigramTagger 4 | tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700]) 5 | sentence = ['John','and','Smith','went','to','NY','and','Germany'] 6 | for word, tag in tagger.tag(sentence): 7 | print(word,'->',tag) 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_13.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | wn.synsets('cat') 5 | wn.synsets('cat', pos=wn.VERB) 6 | wn.synset('cat.n.01') 7 | print(wn.synset('cat.n.01').definition()) 8 | print(len(wn.synset('cat.n.01').examples())) 9 | print(wn.synset('cat.n.01').lemmas()) 10 | print([str(lemma.name()) for lemma in wn.synset('cat.n.01').lemmas()]) 11 | print(wn.lemma('cat.n.01.cat').synset()) 12 | 13 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | print(sorted(wn.langs())) 5 | print(wn.synset('cat.n.01').lemma_names('ita')) 6 | print(sorted(wn.synset('cat.n.01').lemmas('dan'))) 7 | print(sorted(wn.synset('cat.n.01').lemmas('por'))) 8 | print(len(wordnet.all_lemma_names(pos='n', lang='jpn'))) 9 | cat = wn.synset('cat.n.01') 10 | print(cat.hypernyms()) 11 | print(cat.hyponyms()) 12 | print(cat.member_holonyms()) 13 | print(cat.root_hypernyms()) 14 | print(wn.synset('cat.n.01').lowest_common_hypernyms(wn.synset('dog.n.01'))) 15 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.path_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.lch_similarity(cat)) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | lion = wn.synset('lion.n.01') 5 | cat = wn.synset('cat.n.01') 6 | print(lion.wup_similarity(cat)) 7 | 8 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import wordnet 3 | from nltk.corpus import wordnet as wn 4 | from nltk.corpus import wordnet_ic 5 | brown_ic = wordnet_ic.ic('ic-brown.dat') 6 | semcor_ic = wordnet_ic.ic('ic-semcor.dat') 7 | from nltk.corpus import 
genesis 8 | genesis_ic = wn.ic(genesis, False, 0.0) 9 | lion = wn.synset('lion.n.01') 10 | cat = wn.synset('cat.n.01') 11 | print(lion.res_similarity(cat, brown_ic)) 12 | print(lion.res_similarity(cat, genesis_ic)) 13 | print(lion.jcn_similarity(cat, brown_ic)) 14 | print(lion.jcn_similarity(cat, genesis_ic)) 15 | print(lion.lin_similarity(cat, semcor_ic)) 16 | 17 | 18 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | print(input_expr('X | (Y -> Z)')) 4 | print(input_expr('-(X & Y)')) 5 | print(input_expr('X & Y')) 6 | print(input_expr('X <-> -- X')) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | value = nltk.Valuation([('X', True), ('Y', False), ('Z', True)]) 3 | print(value['Z']) 4 | domain = set() 5 | v = nltk.Assignment(domain) 6 | u = nltk.Model(domain, value) 7 | print(u.evaluate('(X & Y)', v)) 8 | print(u.evaluate('-(X & Y)', v)) 9 | print(u.evaluate('(X & Z)', v)) 10 | print(u.evaluate('(X | Y)', v)) 11 | 12 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | input_expr = nltk.sem.Expression.fromstring 3 | expression = input_expr('run(marcus)', type_check=True) 4 | print(expression.argument) 5 | print(expression.argument.type) 6 | print(expression.function) 7 | print(expression.function.type) 8 | sign = {'run': ''} 9 | expression = input_expr('run(marcus)', signature=sign) 10 | print(expression.function.type) 11 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | locations=[('Jaipur', 'IN', 'Rajasthan'),('Ajmer', 'IN', 'Rajasthan'),('Udaipur', 'IN', 'Rajasthan'),('Mumbai', 'IN', 'Maharashtra'),('Ahmedabad', 'IN', 'Gujrat')] 3 | q = [x1 for (x1, relation, x2) in locations if x2=='Rajasthan'] 4 | print(q) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.data.show_cfg('grammars/book_grammars/sql1.fcfg') 3 | 4 | 5 | from nltk import load_parser 6 | test = load_parser('grammars/book_grammars/sql1.fcfg') 7 | q=" What cities are in Greece" 8 | t = list(test.parse(q.split())) 9 | ans = t[0].label()['SEM'] 10 | ans = [s for s in ans if s] 11 | q = ' '.join(ans) 12 | print(q) 13 | from nltk.sem import chat80 14 | r = chat80.sql_query('corpora/city_database/city.db', q) 15 | for p in r: 16 | print(p[0], end=" ") 17 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentences1 = nltk.corpus.treebank.tagged_sents()[17] 3 | print(nltk.ne_chunk(sentences1, binary=True)) 4 | sentences2 = nltk.corpus.treebank.tagged_sents()[7] 5 | print(nltk.ne_chunk(sentences2, binary=True)) 6 | print(nltk.ne_chunk(sentences2)) 7 | 8 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_8.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import conll2002 3 | for documents in conll2002.chunked_sents('ned.train')[25]: 4 | print(documents) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 6/ch6_9.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | sentence = "I went to Greece to meet John"; 3 | tok=nltk.word_tokenize(sentence) 4 | pos_tag=nltk.pos_tag(tok) 5 | print(nltk.ne_chunk(pos_tag)) 6 | -------------------------------------------------------------------------------- /Module 3/Chapter 7/ch7_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)] 5 | random.shuffle(docs) 6 | all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words()) 7 | token_features = list(all_tokens.keys())[:2000] 8 | print(token_features[:100]) 9 | 10 | def doc_features(docs): 11 | doc_words = set(docs) 12 | features = {} 13 | for word in token_features: 14 | features['contains(%s)' % word] = (word in doc_words) 15 | return features 16 | 17 | print(doc_features(movie_reviews.words('pos/cv957_8737.txt'))) 18 | feature_sets = [(doc_features(d), c) for (d,c) in docs] 19 | train_sets, test_sets = feature_sets[100:], feature_sets[:100] 20 | classifiers = nltk.NaiveBayesClassifier.train(train_sets) 21 | print(nltk.classify.accuracy(classifiers, test_sets)) 22 | classifiers.show_most_informative_features(5) 23 | -------------------------------------------------------------------------------- /Module 3/Chapter 8/ch8_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import stopwords 3 | print(stopwords.words('english')) 4 | def not_stopwords(text): 5 | stopwords = nltk.corpus.stopwords.words('english') 6 | content = [w for w in text if w.lower() not in stopwords] 7 | return len(content) / len(text) 8 | print(not_stopwords(nltk.corpus.reuters.words())) 9 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr1 = expr_read('([x], [John(x), Went(x)])') 4 | print(expr1) 5 | expr1.draw() 6 | print(expr1.fol()) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_2.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr2 = expr_read('([x,y], [John(x), Went(x),Sam(y),Meet(x,y)])') 4 | print(expr2) 5 | expr2.draw() 6 | print(expr2.fol()) 7 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_3.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr3 = expr_read('([x], [John(x), eats(x)])+ ([y],[Sam(y),eats(y)])') 4 | print(expr3) 5 | print(expr3.simplify()) 6 | expr3.draw() 7 | 8 | 
-------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr4 = expr_read('([],[(([x],[student(x)])->([y],[book(y),read(x,y)]))])') 4 | print(expr4.fol()) 5 | -------------------------------------------------------------------------------- /Module 3/Chapter 9/ch9_5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | expr_read = nltk.sem.DrtExpression.fromstring 3 | expr5 = expr_read('([x,y],[ram(x),food(y),eats(x,y)])') 4 | expr6 = expr_read('([u,z],[PRO(u),coffee(z),drinks(u,z)])') 5 | expr7 = expr5 + expr6 6 | print(expr7.simplify()) 7 | print(expr7.simplify().resolve_anaphora()) 8 | -------------------------------------------------------------------------------- /Module 3/__pycache__/replacers.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Natural-Language-Processing-Python-and-NLTK/b34df3ceab78b3de29195a811696dcd06e77063a/Module 3/__pycache__/replacers.cpython-34.pyc -------------------------------------------------------------------------------- /Module 3/__pycache__/replacers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from nltk.corpus import wordnet 4 | 5 | replacement_patterns = [ 6 | (r'won\'t', 'will not'), 7 | (r'can\'t', 'cannot'), 8 | (r'i\'m', 'i am'), 9 | (r'ain\'t', 'is not'), 10 | (r'(\w+)\'ll', r'\g<1> will'), 11 | (r'(\w+)n\'t', r'\g<1> not'), 12 | (r'(\w+)\'ve', r'\g<1> have'), 13 | (r'(\w+)\'s', r'\g<1> is'), 14 | (r'(\w+)\'re', r'\g<1> are'), 15 | (r'(\w+)\'d', r'\g<1> would') 16 | ] 17 | class RegexpReplacer(object): 18 | def __init__(self, patterns=replacement_patterns): 19 | self.patterns = [(re.compile(regex), repl) for (regex, repl) in 20 | patterns] 21 | def replace(self, text): 22 | s = text 23 | for (pattern, repl) in self.patterns: 24 | (s, count) = re.subn(pattern, repl, s) 25 | return s 26 | 27 | class RepeatReplacer(object): 28 | def __init__(self): 29 | self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') 30 | self.repl = r'\1\2\3' 31 | def replace(self, word): 32 | if wordnet.synsets(word): 33 | return word 34 | repl_word = self.repeat_regexp.sub(self.repl, word) 35 | if repl_word != word: 36 | return self.replace(repl_word) 37 | else: 38 | return repl_word 39 | 40 | class WordReplacer(object): 41 | def __init__(self, word_map): 42 | self.word_map = word_map 43 | def replace(self, word): 44 | return self.word_map.get(word, word) 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Natural Language Processing Python and NLTK 5 | Code repository for Natural Language Processing Python and NLTK 6 | 7 | ## What You Will Learn: 8 | * Get a glimpse of the complexity of natural languages and how they are processed by machines 9 | * Clean and wrangle text using tokenization and chunking to help you better process data 10 | * Tokenize text into sentences, and sentences into words 11 | * Classify text and perform sentiment analysis 12 | * Implement string matching algorithms and normalization techniques 13 | * Understand and implement the concepts of information retrieval and text summarization 14 | * Find out how to 
implement various NLP tasks in Python 15 | 16 | ### Software and Hardware (Module 1): 17 | | Chapter number | Software required (with version) | Download links to the software | Hardware specifications | OS required | 18 | | -------------- | -------------- | -------------- | -------------- | -------------- | 19 | | 1-5 | Python/Anaconda, NLTK | https://www.python.org/, http://continuum.io/downloads, http://www.nltk.org/ | Common Unix Printing System | any | 20 | | 6 | scikit-learn and gensim | http://scikit-learn.org/stable/, https://radimrehurek.com/gensim/ | Common Unix Printing System | any | 21 | | 7 | Scrapy | http://scrapy.org/ | Common Unix Printing System | any | 22 | | 8 | NumPy, SciPy, pandas, and matplotlib | http://www.numpy.org/, http://www.scipy.org/, http://pandas.pydata.org/, http://matplotlib.org/ | Common Unix Printing System | any | 23 | | 9 | Twitter Python APIs and Facebook Python APIs | https://dev.twitter.com/overview/api/twitter-libraries, https://developers.facebook.com | Common Unix Printing System | any | 24 | 25 | 26 | 27 | ### Software and Hardware (Module 2): 28 | | Chapter number | Software required (with version) | Free/Proprietary | Download links to the software | 29 | | -------------- | -------------- | -------------- | -------------- | 30 | | 1 | NLTK>=3.0a4, NLTK Data | Free | http://www.nltk.org, http://www.nltk.org/data.html | 31 | | 2 | pyenchant>=1.6.5 | Free | http://pythonhosted.org/pyenchant/ | 32 | | 3 | lockfile>=0.9.1, MongoDB >= 2.6, pymongo>=2.6.3 | Free | https://pypi.python.org/pypi/lockfile, http://www.mongodb.org/, https://pypi.python.org/pypi/pymongo/ | 33 | | 4 | NLTK-Trainer >= 0.9 | Free | https://github.com/japerk/nltk-trainer | 34 | | 7 | scikit-learn>=0.14.1 | Free | http://scikit-learn.org/stable/ | 35 | | 8 | Redis >= 2.8, redis>=2.8.0, execnet>=1.1 | Free | http://redis.io/, https://pypi.python.org/pypi/redis/, https://codespeak.net/execnet/ | 36 | | 9 | python-dateutil>=2.0, beautifulsoup4>=4.3.2, lxml>=3.2.3, charade>=1.0.3 | Free | http://labix.org/python-dateutil, http://www.crummy.com/software/BeautifulSoup/, http://lxml.de/, https://pypi.python.org/pypi/charade | 37 | 38 | 39 | 40 | 41 | 42 | 43 | ### Software and Hardware (Module 3): 44 | | Chapter number | Software required (with version) | Hardware Specifications | OS required | 45 | | -------------- | -------------------------------- | ----------------------- | ----------- | 46 | | All chapters | Python 2.7 or 3.2+ | 32-bit or 64-bit machine (NLTK 3.0 installs on either) | Windows or Mac/Unix | 47 | 48 | 49 | 50 | 51 | 52 | ### Note 53 | Modules 1, 2 and 3 have code arranged by chapter (for the chapters that have code). The NLTK corpora and grammars that these scripts expect can be fetched with the sketch at the end of this README. [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 54 | ### Download a free PDF 55 | 56 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
57 | https://packt.link/free-ebook/9781787285101

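### Getting the NLTK data
The tables above list the Python packages for each module, but most of the chapter scripts also read NLTK corpora and grammars (for example `movie_reviews`, `treebank`, `conll2002`, `wordnet_ic`, and the `book_grammars`/`city_database` data used in Module 3). The snippet below is a minimal sketch of how you might fetch them; the package list is an assumption inferred from the corpora referenced in the scripts, so trim or extend it for the chapters you actually run.

```python
import nltk

# Assumed list of NLTK data packages, inferred from the corpora the chapter scripts import.
packages = [
    "punkt", "stopwords", "words", "wordnet", "wordnet_ic",
    "brown", "genesis", "treebank", "conll2002", "movie_reviews", "reuters",
    "book_grammars", "city_database",
    "maxent_ne_chunker", "averaged_perceptron_tagger",
]

for pkg in packages:
    nltk.download(pkg)  # reports "up-to-date" and skips anything already installed
```

Running the loop once per environment is enough; calling `nltk.download()` with no arguments opens the interactive downloader if you prefer to pick packages manually.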
--------------------------------------------------------------------------------