├── Section 4 ├── nlp-4.2-advanced-text-preprocessing.py ├── nlp-4-ngrams.py └── 4.2 Regular Expression for NLP.ipynb ├── Section 1 ├── nlp-nltk-scikit-learn-code.rar └── nlp-1-natural-language-data.py ├── LICENSE ├── Section 6 ├── nlp-6.3-lda.py ├── nlp-6.4-tfidf-svm.py ├── nlp-6.2-hashing-vs-count.py └── nlp-6.1-nlp-pipeline.py ├── Section 3 └── nlp-3-sentiment-analysis.py ├── Section 5 └── nlp-5-document-classification.py ├── Section 2 └── nlp-2-spam-classification.py └── README.md /Section 4/nlp-4.2-advanced-text-preprocessing.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Section 1/nlp-nltk-scikit-learn-code.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-NLP-with-NLTK-and-scikit-learn-/HEAD/Section 1/nlp-nltk-scikit-learn-code.rar -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Section 6/nlp-6.3-lda.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from sklearn import ( 4 | datasets, feature_extraction, model_selection, pipeline, 5 | decomposition, preprocessing, naive_bayes 6 | ) 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | if __name__ == '__main__': 11 | newsgroups_data = datasets.load_files( 12 | '20_newsgroups', shuffle=True, random_state=42, encoding='ISO-8859-1') 13 | 14 | print('Data loaded.\nClasses = {classes}\n{datapoints}'.format( 15 | classes=newsgroups_data.target_names, datapoints=len(newsgroups_data.data))) 16 | 17 | # sometimes the label is present in the training data 18 | print(newsgroups_data.data[0]) 19 | # remove any label present in the features 20 | 21 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 22 | newsgroups_data.data, newsgroups_data.target, test_size=0.33, 23 | random_state=42) 24 | 25 | model = pipeline.Pipeline([ 26 | ('counts', feature_extraction.text.CountVectorizer()), 27 | ('tfidf', feature_extraction.text.TfidfTransformer()), 28 | ('SVD', decomposition.TruncatedSVD(128)), 29 | ('normalize', preprocessing.Normalizer(copy=False)), 30 | ('naivebayes', naive_bayes.GaussianNB()) 31 | ]) 32 | 33 | model.fit(X_train, y_train) 34 | y_pred = model.predict(X_test) 35 | 36 | print(model.score(X_test, y_test)) 37 | -------------------------------------------------------------------------------- /Section 3/nlp-3-sentiment-analysis.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import nltk 3 | import os 4 | from sklearn import ( 5 | datasets, model_selection, feature_extraction, linear_model 6 | ) 7 | 8 | 9 | def extract_features(corpus): 10 | '''Extract TF-IDF features from corpus''' 11 | # vectorize means we turn non-numerical data into an array of numbers 12 | count_vectorizer = feature_extraction.text.CountVectorizer( 13 | lowercase=True, # for demonstration, True by default 14 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 15 | stop_words='english', # remove stop words 16 | min_df=1 # minimum document frequency, i.e. the word must appear more than once. 
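        # note: min_df is measured in documents, not raw word occurrences, and
        # the value 1 used here is scikit-learn's default, so no terms are
        # actually filtered out; raising it (e.g. min_df=2) would drop words
        # that appear in only a single review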
17 | ) 18 | processed_corpus = count_vectorizer.fit_transform(corpus) 19 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 20 | processed_corpus) 21 | 22 | return processed_corpus 23 | 24 | 25 | data_directory = 'movie_reviews' 26 | movie_sentiment_data = datasets.load_files(data_directory, shuffle=True) 27 | print('{} files loaded.'.format(len(movie_sentiment_data.data))) 28 | print('They contain the following classes: {}.'.format( 29 | movie_sentiment_data.target_names)) 30 | 31 | movie_tfidf = extract_features(movie_sentiment_data.data) 32 | 33 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 34 | movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42) 35 | 36 | # similar to nltk.NaiveBayesClassifier.train() 37 | model = linear_model.LogisticRegression() 38 | model.fit(X_train, y_train) 39 | print('Model performance: {}'.format(model.score(X_test, y_test))) 40 | 41 | y_pred = model.predict(X_test) 42 | for i in range(5): 43 | print('Review:\n{review}\n-\nCorrect label: {correct}; Predicted: {predict}'.format( 44 | review=X_test[i], correct=y_test[i], predict=y_pred[i] 45 | )) 46 | -------------------------------------------------------------------------------- /Section 6/nlp-6.4-tfidf-svm.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import nltk 4 | from sklearn import ( 5 | datasets, feature_extraction, model_selection, pipeline, 6 | svm, metrics 7 | ) 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def extract_features(corpus): 12 | '''Extract TF-IDF features from corpus''' 13 | 14 | stop_words = nltk.corpus.stopwords.words("english") 15 | 16 | # vectorize means we turn non-numerical data into an array of numbers 17 | count_vectorizer = feature_extraction.text.CountVectorizer( 18 | lowercase=True, # for demonstration, True by default 19 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 20 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 
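        # note: min_df counts documents, so min_df=2 keeps only terms that
        # appear in at least two different posts; ngram_range=(1, 2) below
        # adds word bigrams alongside single-word features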
21 | ngram_range=(1, 2), 22 | stop_words=stop_words 23 | ) 24 | processed_corpus = count_vectorizer.fit_transform(corpus) 25 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 26 | processed_corpus) 27 | 28 | return processed_corpus 29 | 30 | if __name__ == '__main__': 31 | newsgroups_data = datasets.load_files( 32 | '20_newsgroups', shuffle=True, random_state=42, encoding='ISO-8859-1') 33 | 34 | print('Data loaded.\nClasses = {classes}\n{datapoints}'.format( 35 | classes=newsgroups_data.target_names, 36 | datapoints=len(newsgroups_data.data))) 37 | 38 | print(newsgroups_data.data[0]) 39 | 40 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 41 | newsgroups_data.data, newsgroups_data.target, test_size=0.33, 42 | random_state=42) 43 | 44 | model = pipeline.Pipeline([ 45 | ('counts', feature_extraction.text.CountVectorizer()), 46 | ('tfidf', feature_extraction.text.TfidfTransformer()), 47 | ('svm', svm.LinearSVC()), 48 | ]) 49 | 50 | model.fit(X_train, y_train) 51 | y_pred = model.predict(X_test) 52 | 53 | print('Accuracy of SVM= {}'.format( 54 | np.mean(y_pred == y_test))) 55 | 56 | print(metrics.classification_report( 57 | y_test, y_pred, target_names=newsgroups_data.target_names)) 58 | -------------------------------------------------------------------------------- /Section 6/nlp-6.2-hashing-vs-count.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction 2 | 3 | 4 | corpus = [ 5 | 'Convert a collection of text documents to a matrix of token occurrences', 6 | 'It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’.', 7 | 'This text vectorizer implementation uses the hashing trick to find the token string name to feature integer index mapping.', 8 | 'This strategy has several advantages:', 9 | 'it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory', 10 | 'it is fast to pickle and un-pickle as it holds no state besides the constructor parameters', 11 | 'it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit.' 12 | ] 13 | 14 | print('Processing corpus: {} documents'.format(len(corpus))) 15 | 16 | print('Count Vectorizer:\n') 17 | vectorizer = feature_extraction.text.CountVectorizer() 18 | X = vectorizer.fit_transform(corpus) 19 | # Count Vectorizer stores a dictionary: a number per word 20 | print(vectorizer.vocabulary_) 21 | print('Resulting matrix has {} data points and {} features.\n'.format( 22 | X.shape[0], X.shape[1])) 23 | print('Document 1: \n{}'.format(X[0].toarray())) 24 | # as the number of words increase, you need a bigger and bigger dictionary! 25 | 26 | 27 | print('Hashing Vectorizer:\n') 28 | 29 | # norm=None means we don't normalize the values 30 | # alternative_sign=False means that we don't alternate the value's signs to 31 | # conserve any mathematical properties 32 | vectorizer = feature_extraction.text.HashingVectorizer( 33 | norm=None, alternate_sign=False) 34 | X = vectorizer.transform(corpus) # not fit_transform 35 | 36 | print('Resulting matrix has {} data points and {} features.\n'.format( 37 | X.shape[0], X.shape[1])) 38 | 39 | # > Resulting matrix has 7 data points and 1048576 features. 
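# 1048576 = 2**20 is HashingVectorizer's default n_features. As a rough sketch,
# a smaller hash space could be requested at the cost of more hash collisions,
# for example:
# vectorizer = feature_extraction.text.HashingVectorizer(
#     norm=None, alternate_sign=False, n_features=2**18)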
40 | 41 | print('Document 1: \n{}'.format(X[0])) 42 | 43 | # Document 1: 44 | # (0, 22468) 0.2886751345948129 45 | # (0, 124863) -0.2886751345948129 46 | # (0, 164975) -0.2886751345948129 47 | # (0, 174171) 0.2886751345948129 48 | # (0, 264705) 0.2886751345948129 49 | # (0, 479532) 0.5773502691896258 50 | # (0, 548700) -0.2886751345948129 51 | # (0, 676585) -0.2886751345948129 52 | # (0, 741852) -0.2886751345948129 53 | # Read the above as: 54 | # (document_index, feature_index) 55 | -------------------------------------------------------------------------------- /Section 6/nlp-6.1-nlp-pipeline.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | 4 | corpus = [ 5 | """ 6 | This strategy has several advantages: 7 | it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory 8 | it is fast to pickle and un-pickle as it holds no state besides the constructor parameters 9 | it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit. 10 | """, 11 | """ 12 | It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), 13 | possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’. 14 | """ 15 | ] 16 | 17 | 18 | def pipeline(f): 19 | '''pipeline decorator that calls next() on function f()''' 20 | def start_pipeline(*args, **kwargs): 21 | nf = f(*args, **kwargs) 22 | next(nf) 23 | return nf 24 | return start_pipeline 25 | 26 | 27 | def ingest(corpus, targets): 28 | for text in corpus: 29 | for t in targets: 30 | t.send(text) 31 | 32 | 33 | @pipeline 34 | def tokenize_sentences(targets): 35 | while True: 36 | text = (yield) # (yield) gets an item from an upstream step 37 | sentences = nltk.sent_tokenize(text) 38 | for sentence in sentences: 39 | for target in targets: 40 | target.send(sentence) # send() sends data downstream 41 | 42 | 43 | @pipeline 44 | def tokenize_words(targets): 45 | while True: 46 | sentence = (yield) 47 | words = nltk.word_tokenize(sentence) 48 | for target in targets: 49 | target.send(words) 50 | 51 | 52 | @pipeline 53 | def pos_tagging(targets): 54 | while True: 55 | words = (yield) 56 | tagged_words = nltk.pos_tag(words) 57 | 58 | for target in targets: 59 | target.send(tagged_words) 60 | 61 | 62 | @pipeline 63 | def ne_chunking(targets): 64 | while True: 65 | tagged_words = (yield) 66 | ner_tagged = nltk.ne_chunk(tagged_words) 67 | for target in targets: 68 | target.send(ner_tagged) 69 | 70 | 71 | @pipeline 72 | def printline(title): 73 | while True: 74 | line = (yield) 75 | print(title) 76 | print(line) 77 | 78 | ingest(corpus, [ 79 | tokenize_sentences([ 80 | tokenize_words([ 81 | printline('Word tokens:'), 82 | pos_tagging([ 83 | ne_chunking([ 84 | printline('Results:') 85 | ]) 86 | ]) 87 | ]) 88 | ]) 89 | ]) 90 | -------------------------------------------------------------------------------- /Section 5/nlp-5-document-classification.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import nltk 4 | from sklearn import ( 5 | datasets, feature_extraction, model_selection, pipeline, 6 | naive_bayes, metrics 7 | ) 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def extract_features(corpus): 12 | '''Extract TF-IDF features from corpus''' 13 | 14 | stop_words = nltk.corpus.stopwords.words("english") 15 | 16 | # 
vectorize means we turn non-numerical data into an array of numbers 17 | count_vectorizer = feature_extraction.text.CountVectorizer( 18 | lowercase=True, # for demonstration, True by default 19 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 20 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 21 | ngram_range=(1, 2), 22 | stop_words=stop_words 23 | ) 24 | processed_corpus = count_vectorizer.fit_transform(corpus) 25 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 26 | processed_corpus) 27 | 28 | return processed_corpus 29 | 30 | if __name__ == '__main__': 31 | newsgroups_data = datasets.load_files( 32 | '20_newsgroups', shuffle=True, random_state=42, encoding='ISO-8859-1') 33 | 34 | print('Data loaded.\nClasses = {classes}\n{datapoints}'.format( 35 | classes=newsgroups_data.target_names, 36 | datapoints=len(newsgroups_data.data))) 37 | 38 | print(newsgroups_data.data[0]) 39 | 40 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 41 | newsgroups_data.data, newsgroups_data.target, test_size=0.33, 42 | random_state=42) 43 | 44 | stop_words = nltk.corpus.stopwords.words("english") 45 | 46 | model = pipeline.Pipeline([ 47 | ('counts', feature_extraction.text.CountVectorizer( 48 | lowercase=True, # for demonstration, True by default 49 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 50 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 51 | ngram_range=(1, 2), 52 | stop_words=stop_words 53 | )), 54 | ('tfidf', feature_extraction.text.TfidfTransformer()), 55 | ('naivebayes', naive_bayes.MultinomialNB()), 56 | ]) 57 | 58 | model.fit(X_train, y_train) 59 | y_pred = model.predict(X_test) 60 | 61 | print('Accuracy of multinomial naive bayes= {}'.format( 62 | np.mean(y_pred == y_test))) 63 | 64 | print(metrics.classification_report( 65 | y_test, y_pred, target_names=newsgroups_data.target_names)) 66 | 67 | grid_search_model = model_selection.GridSearchCV( 68 | model, 69 | { 70 | 'counts__ngram_range': [(1, 1), (1, 2)], 71 | 'naivebayes__alpha': (0.1, 3.0) 72 | }, 73 | n_jobs=-1 # detect how many cores are installed and uses them all 74 | ) 75 | 76 | grid_search_model.fit(X_train, y_train) 77 | print(grid_search_model.cv_results_) 78 | -------------------------------------------------------------------------------- /Section 4/nlp-4-ngrams.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import nltk 3 | import os 4 | from sklearn import ( 5 | datasets, model_selection, feature_extraction, linear_model, naive_bayes, 6 | ensemble 7 | ) 8 | 9 | 10 | def extract_features(corpus): 11 | '''Extract TF-IDF features from corpus''' 12 | 13 | sa_stop_words = nltk.corpus.stopwords.words("english") 14 | 15 | # words that might invert a sentence's meaning 16 | white_list = [ 17 | 'what', 'but', 'if', 'because', 'as', 'until', 'against', 18 | 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 19 | 'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any', 20 | 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 21 | 'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should'] 22 | 23 | # take these out of the standard NLTK stop word list 24 | sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list] 25 | 26 | # vectorize means we turn non-numerical data into an array of numbers 27 | count_vectorizer = feature_extraction.text.CountVectorizer( 28 | 
lowercase=True, # for demonstration, True by default 29 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 30 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 31 | ngram_range=(1, 2), 32 | stop_words=sa_stop_words 33 | ) 34 | processed_corpus = count_vectorizer.fit_transform(corpus) 35 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 36 | processed_corpus) 37 | 38 | return processed_corpus 39 | 40 | 41 | data_directory = 'movie_reviews' 42 | movie_sentiment_data = datasets.load_files(data_directory, shuffle=True) 43 | print('{} files loaded.'.format(len(movie_sentiment_data.data))) 44 | print('They contain the following classes: {}.'.format( 45 | movie_sentiment_data.target_names)) 46 | 47 | movie_tfidf = extract_features(movie_sentiment_data.data) 48 | 49 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 50 | movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42) 51 | 52 | # similar to nltk.NaiveBayesClassifier.train() 53 | clf1 = linear_model.LogisticRegression() 54 | clf1.fit(X_train, y_train) 55 | print('Logistic Regression performance: {}'.format(clf1.score(X_test, y_test))) 56 | 57 | clf2 = linear_model.SGDClassifier() 58 | clf2.fit(X_train, y_train) 59 | print('SGDClassifier performance: {}'.format(clf2.score(X_test, y_test))) 60 | 61 | clf3 = naive_bayes.MultinomialNB() 62 | clf3.fit(X_train, y_train) 63 | print('MultinomialNB performance: {}'.format(clf3.score(X_test, y_test))) 64 | 65 | clf4 = naive_bayes.BernoulliNB() 66 | clf4.fit(X_train, y_train) 67 | print('BernoulliNB performance: {}'.format(clf4.score(X_test, y_test))) 68 | 69 | 70 | voting_model = ensemble.VotingClassifier( 71 | estimators=[('lr', clf1), ('sgd', clf2), ('mnb', clf3), ('bnb', clf4)], 72 | voting='hard') 73 | voting_model.fit(X_train, y_train) 74 | print('Voting classifier performance: {}'.format( 75 | voting_model.score(X_test, y_test))) 76 | 77 | -------------------------------------------------------------------------------- /Section 2/nlp-2-spam-classification.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import nltk 3 | import os 4 | import random 5 | 6 | 7 | # Define some stop words 8 | stop_words = { 9 | 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 10 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 11 | 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 12 | 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 13 | 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 14 | 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 15 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 16 | 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 17 | 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 18 | 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 19 | 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 20 | 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 21 | 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 22 | 'further', 'was', 'here', 'than'} 23 | 24 | 25 | def load_files(directory): 26 | result = [] 27 | for fname in os.listdir(directory): 28 | with open(directory + '/' + fname, 'r', encoding='ISO-8859-1') as f: 29 
| result.append(f.read()) 30 | return result 31 | 32 | 33 | def preprocess_sentence(sentence): 34 | lemmatizer = nltk.WordNetLemmatizer() 35 | # clearly list out our preprocessing pipeline 36 | processed_tokens = nltk.word_tokenize(sentence) 37 | processed_tokens = [w.lower() for w in processed_tokens] 38 | # find least common elements 39 | word_counts = collections.Counter(processed_tokens) 40 | uncommon_words = word_counts.most_common()[:-10:-1] 41 | # remove these tokens 42 | processed_tokens = [w for w in processed_tokens if w not in stop_words] 43 | processed_tokens = [w for w in processed_tokens if w not in uncommon_words] 44 | # lemmatize 45 | processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens] 46 | return processed_tokens 47 | 48 | 49 | def feature_extraction(tokens): 50 | '''Turn each word into a feature. The feature value is the word count.''' 51 | return dict(collections.Counter(tokens)) 52 | 53 | 54 | def train_test_split(dataset, train_size=0.8): 55 | num_training_examples = int(len(dataset) * train_size) 56 | return dataset[:num_training_examples], dataset[num_training_examples:] 57 | 58 | 59 | positive_examples = load_files('enron/spam') 60 | negative_examples = load_files('enron/ham') 61 | 62 | # Label the examples 63 | positive_examples = [preprocess_sentence(email) for email in positive_examples] 64 | negative_examples = [preprocess_sentence(email) for email in negative_examples] 65 | 66 | positive_examples = [(email, 1) for email in positive_examples] 67 | negative_examples = [(email, 0) for email in negative_examples] 68 | all_examples = positive_examples + negative_examples 69 | random.shuffle(all_examples) 70 | 71 | print('{} emails processed.'.format(len(all_examples))) 72 | 73 | featurized = [(feature_extraction(corpus), label) 74 | for corpus, label in all_examples] 75 | 76 | training_set, test_set = train_test_split(featurized, train_size=0.7) 77 | 78 | model = nltk.classify.NaiveBayesClassifier.train(training_set) 79 | training_error = nltk.classify.accuracy(model, training_set) 80 | print('Model training complete. Accuracy on training set: {}'.format( 81 | training_error)) 82 | 83 | testing_error = nltk.classify.accuracy(model, test_set) 84 | print('Accuracy on test set: {}'.format(testing_error)) 85 | -------------------------------------------------------------------------------- /Section 1/nlp-1-natural-language-data.py: -------------------------------------------------------------------------------- 1 | # 1.1 Use Python, NLTK and scikit-learn to build your NLP toolset. 2 | # pip install nltk 3 | # pip install scikit-learn 4 | import collections # 1.5 5 | import multiprocessing as mp # 1.2 6 | import re # 1.3 7 | 8 | 9 | # 1.2 Reading a simple natural language file into memory 10 | 11 | # def process(line): 12 | # print(line) 13 | 14 | # # try 1: readlines() 15 | # with open("natural-language-data.txt") as f: 16 | # data = f.readlines() # everything in memory! 17 | # for line in data: 18 | # process(line) 19 | 20 | # # try 2: use context managers to make sure file pointers are closed correctly. 21 | # with open("natural-language-data.txt") as f: 22 | # # to handle large text files, we use the file as an iterator 23 | # for line in f: 24 | # # each line is garbage collcted after the iteration 25 | # # unless it is referenced elsewhere. 26 | # process(line) 27 | 28 | # # # try 3: multiprocessing 29 | # pool = mp.Pool(2) # no. of pools = no. 
of CPU cores 30 | # jobs = [] 31 | 32 | # with open("natural-language-data.txt") as f: 33 | # for line in f: 34 | # jobs.append( 35 | # pool.apply_async(process, (line))) 36 | 37 | # for job in jobs: 38 | # job.get() # wait for all jobs to finish 39 | 40 | # pool.close() 41 | 42 | # # 1.3 Split the text into individual words with regular expression 43 | corpus = ("Andy is a data scientist. Andy's boss, Megan, was looking for him, " 44 | "but Andy was out to lunch. Megan texted Andy, 'How's the deadline" 45 | " coming along?'") 46 | 47 | # Simply splitting the sentence with spaces 48 | # print(corpus.split()) 49 | 50 | # Taking out punctuation 51 | punctuation = ".',?" # what is the universe of punctuation? How do we handle 's? 52 | for p in punctuation: 53 | corpus = corpus.replace(p, '') 54 | 55 | # print(corpus.split()) 56 | 57 | # Regex 58 | word_regex = r'\W+' # a raw str: one or more (+) non-word characters (\W) 59 | split_corpus = re.split(word_regex, corpus) 60 | # print(split_corpus) 61 | 62 | # a better regex 63 | # word character + zero or more word characters or 's + word character 64 | # OR 65 | # just a word character 66 | word_regex_improved = r"(\w[\w']*\w|\w)" 67 | word_matcher = re.compile(word_regex_improved) 68 | # print(word_matcher.findall(corpus)) 69 | 70 | 71 | # 1.4 Converting words into lists of lower case tokens 72 | 73 | def split_into_words(line): 74 | word_regex_improved = r"(\w[\w']*\w|\w)" 75 | word_matcher = re.compile(word_regex_improved) 76 | return word_matcher.findall(line) 77 | 78 | processed_corpus = [] 79 | 80 | with open("natural-language-data.txt") as f: 81 | # to handle large text files, we use the file as an iterator 82 | for line in f: 83 | processed_corpus.extend(split_into_words(line)) 84 | 85 | processed_corpus = [w.lower() for w in processed_corpus] 86 | 87 | print(processed_corpus) 88 | 89 | 90 | # # 1.5 Removing uncommon words and stop words 91 | 92 | # Before stop word removal 93 | word_counts = collections.Counter(processed_corpus) 94 | print(word_counts) 95 | 96 | # Define some stop words 97 | stop_words = { 98 | 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 99 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 100 | 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 101 | 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 102 | 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 103 | 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 104 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 105 | 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 106 | 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 107 | 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 108 | 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 109 | 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 110 | 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 111 | 'further', 'was', 'here', 'than'} 112 | 113 | # find least common elements 114 | uncommon_words = word_counts.most_common()[:-10:-1] 115 | 116 | processed_corpus = [w for w in processed_corpus if w not in stop_words] 117 | processed_corpus = [w for w in processed_corpus if w not in uncommon_words] 118 | print(processed_corpus) 119 | 
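
# A minimal alternative sketch, assuming the NLTK 'punkt' and 'stopwords' data
# packages have been downloaded via nltk.download(): the same cleanup using
# NLTK's bundled tokenizer and stop word list instead of the hand-rolled regex
# and stop word set above.
# import nltk
# nltk_stop_words = set(nltk.corpus.stopwords.words('english'))
# nltk_tokens = [w.lower() for w in nltk.word_tokenize(corpus)]
# nltk_tokens = [w for w in nltk_tokens if w not in nltk_stop_words]
# print(nltk_tokens)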
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-on-NLP-with-NLTK-and-scikit-learn- 5 | Hands-on NLP with NLTK and scikit-learn [Video], published by Packt 6 | # Hands-on NLP with NLTK and Scikit-learn [Video] 7 | This is the code repository for [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish. 8 | ## About the Video Course 9 | 10 | 11 | 12 | There is an overflow of text data online nowadays. As a Python developer, you need to create a new solution using Natural Language Processing for your next project. Your colleagues depend on you to monetize gigabytes of unstructured text data. What do you do? 13 | Hands-on NLP with NLTK and scikit-learn is the answer. This course puts you right on the spot, starting with building a spam classifier in the first video. By the end of the course, you will walk away with three NLP applications: a spam filter, a topic classifier, and a sentiment analyzer. There is no need for fancy mathematical theory, just plain-English explanations of core NLP concepts and how to apply them using Python libraries. 14 | Taking this course will help you create new applications with Python and NLP, and build practical solutions backed by machine learning and NLP models with ease. 15 | 16 |

## What You Will Learn
17 |
18 |
25 | 26 | ## Instructions and Navigation 27 | ### Assumed Knowledge 28 | To fully benefit from the coverage included in this course, you will need the following background.
29 | This course is for developers, data scientists, and programmers who want to learn about practical Natural Language Processing with Python in a hands-on way. Developers who have an upcoming project that needs NLP, or a pile of unstructured text data on their hands, and don't know what to do with it, will find this course useful. Prior programming experience with Python is assumed along with being comfortable dealing with machine learning terms such as supervised learning, regression, and classification. No prior Natural Language Processing or text mining experience is needed. 30 | ### Technical Requirements 31 | This course has the following software requirements:
32 | SETUP AND INSTALLATION 33 | Minimum Hardware Requirements 34 | For successful completion of this course, students will require the computer systems with at least the following: 35 | 36 | 37 | OS: Windows 7 SP1 64-bit, Windows 8.1 64-bit or Windows 10 64-bit 38 | 39 | 40 | 41 | Processor: Intel Core i5 or equivalent 42 | 43 | 44 | 45 | Memory: 8 GB RAM 46 | 47 | 48 | 49 | Storage: 35 GB available space 50 | 51 | 52 | 53 | 54 | Recommended Hardware Requirements 55 | For an optimal experience with hands-on labs and other practical activities, we recommend the following configuration: 56 | 57 | 58 | OS: Windows 7 SP1 64-bit, Windows 8.1 64-bit or Windows 10 64-bit 59 | 60 | 61 | 62 | Processor: Intel Core i7 or equivalent 63 | 64 | 65 | 66 | Memory: 16 GB RAM 67 | 68 | 69 | 70 | Storage: 35 GB available space 71 | 72 | 73 | Software Requirements 74 | 75 | OS: Windows 7 or Windows 10 76 | 77 | 78 | 79 | Browser: Google Chrome, Latest Version 80 | 81 | 82 | 83 | Code Editor: Atom IDE, Latest Version 84 | 85 | 86 | 87 | Others: Python3 installed using the Anaconda package or equivalent, Tensorflow r1.4 88 | 89 | 90 | 91 | 92 | Exercise Files 93 | 94 | Exercise files should have a start and an end state for each video that contains a demonstration of code. 95 | 96 | ## Related Products 97 | * [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612) 98 | 99 | * [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612) 100 | 101 | * [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612) 102 | 103 | -------------------------------------------------------------------------------- /Section 4/4.2 Regular Expression for NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "from nltk.corpus import words" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 13, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "english_words = words.raw().split('\\n')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 14, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "236737" 31 | ] 32 | }, 33 | "execution_count": 14, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "len(english_words)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "**Wildcards**" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 18, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "['tact',\n", 58 | " 'tactable',\n", 59 | " 'tactful',\n", 60 | " 'tactfully',\n", 61 | " 'tactfulness',\n", 62 | " 'tactic',\n", 63 | " 'tactical',\n", 64 | " 'tactically',\n", 65 | " 'tactician',\n", 66 | " 'tactics',\n", 67 | " 'tactile',\n", 68 | " 'tactilist',\n", 69 | " 'tactility',\n", 70 | " 'tactilogical',\n", 71 | " 'tactinvariant',\n", 72 | " 'taction',\n", 
73 | " 'tactite',\n", 74 | " 'tactive',\n", 75 | " 'tactless',\n", 76 | " 'tactlessly']" 77 | ] 78 | }, 79 | "execution_count": 18, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "pattern = re.compile('t..t')\n", 86 | "[w for w in english_words if pattern.match(w)][:20]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "**Endings**" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 20, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "['tact',\n", 105 | " 'taft',\n", 106 | " 'tait',\n", 107 | " 'takt',\n", 108 | " 'tart',\n", 109 | " 'taut',\n", 110 | " 'teat',\n", 111 | " 'teet',\n", 112 | " 'telt',\n", 113 | " 'tent',\n", 114 | " 'test',\n", 115 | " 'text',\n", 116 | " 'that',\n", 117 | " 'tift',\n", 118 | " 'tilt',\n", 119 | " 'tint',\n", 120 | " 'toat',\n", 121 | " 'toft',\n", 122 | " 'togt',\n", 123 | " 'toit']" 124 | ] 125 | }, 126 | "execution_count": 20, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "pattern = re.compile('t..t$')\n", 133 | "[w for w in english_words if pattern.match(w)][:20]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "**Optionality**" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 23, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "['humor', 'humour']" 152 | ] 153 | }, 154 | "execution_count": 23, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "pattern = re.compile('humou?r$')\n", 161 | "[w for w in english_words if pattern.match(w)][:20]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**One or more**" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 24, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "['col', 'cool']" 180 | ] 181 | }, 182 | "execution_count": 24, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "pattern = re.compile('co+l$')\n", 189 | "[w for w in english_words if pattern.match(w)][:20]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "**Grouped choices**" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 25, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "['analyse', 'analyser', 'analyses', 'analyze', 'analyzer']" 208 | ] 209 | }, 210 | "execution_count": 25, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "pattern = re.compile('analy[sz]e')\n", 217 | "[w for w in english_words if pattern.match(w)][:20]" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 27, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "['pat',\n", 229 | " 'paut',\n", 230 | " 'peat',\n", 231 | " 'pet',\n", 232 | " 'piet',\n", 233 | " 'pit',\n", 234 | " 'poet',\n", 235 | " 'poot',\n", 236 | " 'pot',\n", 237 | " 'pout',\n", 238 | " 'put',\n", 239 | " 'pot',\n", 240 | " 'put']" 241 | ] 242 | }, 243 | "execution_count": 27, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "pattern = 
re.compile('p[aeiou]+t$')\n", 250 | "[w for w in english_words if pattern.match(w)][:20]" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.6.2" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | --------------------------------------------------------------------------------