├── Section 4 ├── nlp-4.2-advanced-text-preprocessing.py ├── nlp-4-ngrams.py └── 4.2 Regular Expression for NLP.ipynb ├── Section 1 ├── nlp-nltk-scikit-learn-code.rar └── nlp-1-natural-language-data.py ├── LICENSE ├── Section 6 ├── nlp-6.3-lda.py ├── nlp-6.4-tfidf-svm.py ├── nlp-6.2-hashing-vs-count.py └── nlp-6.1-nlp-pipeline.py ├── Section 3 └── nlp-3-sentiment-analysis.py ├── Section 5 └── nlp-5-document-classification.py ├── Section 2 └── nlp-2-spam-classification.py └── README.md /Section 4/nlp-4.2-advanced-text-preprocessing.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Section 1/nlp-nltk-scikit-learn-code.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-on-NLP-with-NLTK-and-scikit-learn-/HEAD/Section 1/nlp-nltk-scikit-learn-code.rar -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Section 6/nlp-6.3-lda.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from sklearn import ( 4 | datasets, feature_extraction, model_selection, pipeline, 5 | decomposition, preprocessing, naive_bayes 6 | ) 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | if __name__ == '__main__': 11 | newsgroups_data = datasets.load_files( 12 | '20_newsgroups', shuffle=True, random_state=42, encoding='ISO-8859-1') 13 | 14 | print('Data loaded.\nClasses = {classes}\n{datapoints}'.format( 15 | classes=newsgroups_data.target_names, datapoints=len(newsgroups_data.data))) 16 | 17 | # sometimes the label is present in the training data 18 | print(newsgroups_data.data[0]) 19 | # remove any label present in the features 20 | 21 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 22 | newsgroups_data.data, newsgroups_data.target, test_size=0.33, 23 | random_state=42) 24 | 25 | model = pipeline.Pipeline([ 26 | ('counts', feature_extraction.text.CountVectorizer()), 27 | ('tfidf', feature_extraction.text.TfidfTransformer()), 28 | ('SVD', decomposition.TruncatedSVD(128)), 29 | ('normalize', preprocessing.Normalizer(copy=False)), 30 | ('naivebayes', naive_bayes.GaussianNB()) 31 | ]) 32 | 33 | model.fit(X_train, y_train) 34 | y_pred = model.predict(X_test) 35 | 36 | print(model.score(X_test, y_test)) 37 | -------------------------------------------------------------------------------- /Section 3/nlp-3-sentiment-analysis.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import nltk 3 | import os 4 | from sklearn import ( 5 | datasets, model_selection, feature_extraction, linear_model 6 | ) 7 | 8 | 9 | def extract_features(corpus): 10 | '''Extract TF-IDF features from corpus''' 11 | # vectorize means we turn non-numerical data into an array of numbers 12 | count_vectorizer = feature_extraction.text.CountVectorizer( 13 | lowercase=True, # for demonstration, True by default 14 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 15 | stop_words='english', # remove stop words 16 | min_df=1 # minimum document frequency, i.e. the word must appear more than once. 
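        # note: min_df is measured in documents, not raw word occurrences, and
        # the value 1 used here is scikit-learn's default, so no terms are
        # actually filtered out; raising it (e.g. min_df=2) would drop words
        # that appear in only a single review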
17 | ) 18 | processed_corpus = count_vectorizer.fit_transform(corpus) 19 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 20 | processed_corpus) 21 | 22 | return processed_corpus 23 | 24 | 25 | data_directory = 'movie_reviews' 26 | movie_sentiment_data = datasets.load_files(data_directory, shuffle=True) 27 | print('{} files loaded.'.format(len(movie_sentiment_data.data))) 28 | print('They contain the following classes: {}.'.format( 29 | movie_sentiment_data.target_names)) 30 | 31 | movie_tfidf = extract_features(movie_sentiment_data.data) 32 | 33 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 34 | movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42) 35 | 36 | # similar to nltk.NaiveBayesClassifier.train() 37 | model = linear_model.LogisticRegression() 38 | model.fit(X_train, y_train) 39 | print('Model performance: {}'.format(model.score(X_test, y_test))) 40 | 41 | y_pred = model.predict(X_test) 42 | for i in range(5): 43 | print('Review:\n{review}\n-\nCorrect label: {correct}; Predicted: {predict}'.format( 44 | review=X_test[i], correct=y_test[i], predict=y_pred[i] 45 | )) 46 | -------------------------------------------------------------------------------- /Section 6/nlp-6.4-tfidf-svm.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import nltk 4 | from sklearn import ( 5 | datasets, feature_extraction, model_selection, pipeline, 6 | svm, metrics 7 | ) 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def extract_features(corpus): 12 | '''Extract TF-IDF features from corpus''' 13 | 14 | stop_words = nltk.corpus.stopwords.words("english") 15 | 16 | # vectorize means we turn non-numerical data into an array of numbers 17 | count_vectorizer = feature_extraction.text.CountVectorizer( 18 | lowercase=True, # for demonstration, True by default 19 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 20 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 
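        # note: min_df counts documents, so min_df=2 keeps only terms that
        # appear in at least two different posts; ngram_range=(1, 2) below
        # adds word bigrams alongside single-word features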
21 | ngram_range=(1, 2), 22 | stop_words=stop_words 23 | ) 24 | processed_corpus = count_vectorizer.fit_transform(corpus) 25 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 26 | processed_corpus) 27 | 28 | return processed_corpus 29 | 30 | if __name__ == '__main__': 31 | newsgroups_data = datasets.load_files( 32 | '20_newsgroups', shuffle=True, random_state=42, encoding='ISO-8859-1') 33 | 34 | print('Data loaded.\nClasses = {classes}\n{datapoints}'.format( 35 | classes=newsgroups_data.target_names, 36 | datapoints=len(newsgroups_data.data))) 37 | 38 | print(newsgroups_data.data[0]) 39 | 40 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 41 | newsgroups_data.data, newsgroups_data.target, test_size=0.33, 42 | random_state=42) 43 | 44 | model = pipeline.Pipeline([ 45 | ('counts', feature_extraction.text.CountVectorizer()), 46 | ('tfidf', feature_extraction.text.TfidfTransformer()), 47 | ('svm', svm.LinearSVC()), 48 | ]) 49 | 50 | model.fit(X_train, y_train) 51 | y_pred = model.predict(X_test) 52 | 53 | print('Accuracy of SVM= {}'.format( 54 | np.mean(y_pred == y_test))) 55 | 56 | print(metrics.classification_report( 57 | y_test, y_pred, target_names=newsgroups_data.target_names)) 58 | -------------------------------------------------------------------------------- /Section 6/nlp-6.2-hashing-vs-count.py: -------------------------------------------------------------------------------- 1 | from sklearn import feature_extraction 2 | 3 | 4 | corpus = [ 5 | 'Convert a collection of text documents to a matrix of token occurrences', 6 | 'It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’.', 7 | 'This text vectorizer implementation uses the hashing trick to find the token string name to feature integer index mapping.', 8 | 'This strategy has several advantages:', 9 | 'it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory', 10 | 'it is fast to pickle and un-pickle as it holds no state besides the constructor parameters', 11 | 'it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit.' 12 | ] 13 | 14 | print('Processing corpus: {} documents'.format(len(corpus))) 15 | 16 | print('Count Vectorizer:\n') 17 | vectorizer = feature_extraction.text.CountVectorizer() 18 | X = vectorizer.fit_transform(corpus) 19 | # Count Vectorizer stores a dictionary: a number per word 20 | print(vectorizer.vocabulary_) 21 | print('Resulting matrix has {} data points and {} features.\n'.format( 22 | X.shape[0], X.shape[1])) 23 | print('Document 1: \n{}'.format(X[0].toarray())) 24 | # as the number of words increase, you need a bigger and bigger dictionary! 25 | 26 | 27 | print('Hashing Vectorizer:\n') 28 | 29 | # norm=None means we don't normalize the values 30 | # alternative_sign=False means that we don't alternate the value's signs to 31 | # conserve any mathematical properties 32 | vectorizer = feature_extraction.text.HashingVectorizer( 33 | norm=None, alternate_sign=False) 34 | X = vectorizer.transform(corpus) # not fit_transform 35 | 36 | print('Resulting matrix has {} data points and {} features.\n'.format( 37 | X.shape[0], X.shape[1])) 38 | 39 | # > Resulting matrix has 7 data points and 1048576 features. 
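# 1048576 = 2**20 is HashingVectorizer's default n_features. As a rough sketch,
# a smaller hash space could be requested at the cost of more hash collisions,
# for example:
# vectorizer = feature_extraction.text.HashingVectorizer(
#     norm=None, alternate_sign=False, n_features=2**18)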
40 | 41 | print('Document 1: \n{}'.format(X[0])) 42 | 43 | # Document 1: 44 | # (0, 22468) 0.2886751345948129 45 | # (0, 124863) -0.2886751345948129 46 | # (0, 164975) -0.2886751345948129 47 | # (0, 174171) 0.2886751345948129 48 | # (0, 264705) 0.2886751345948129 49 | # (0, 479532) 0.5773502691896258 50 | # (0, 548700) -0.2886751345948129 51 | # (0, 676585) -0.2886751345948129 52 | # (0, 741852) -0.2886751345948129 53 | # Read the above as: 54 | # (document_index, feature_index) 55 | -------------------------------------------------------------------------------- /Section 6/nlp-6.1-nlp-pipeline.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | 4 | corpus = [ 5 | """ 6 | This strategy has several advantages: 7 | it is very low memory scalable to large datasets as there is no need to store a vocabulary dictionary in memory 8 | it is fast to pickle and un-pickle as it holds no state besides the constructor parameters 9 | it can be used in a streaming (partial fit) or parallel pipeline as there is no state computed during fit. 10 | """, 11 | """ 12 | It turns a collection of text documents into a scipy.sparse matrix holding token occurrence counts (or binary occurrence information), 13 | possibly normalized as token frequencies if norm=’l1’ or projected on the euclidean unit sphere if norm=’l2’. 14 | """ 15 | ] 16 | 17 | 18 | def pipeline(f): 19 | '''pipeline decorator that calls next() on function f()''' 20 | def start_pipeline(*args, **kwargs): 21 | nf = f(*args, **kwargs) 22 | next(nf) 23 | return nf 24 | return start_pipeline 25 | 26 | 27 | def ingest(corpus, targets): 28 | for text in corpus: 29 | for t in targets: 30 | t.send(text) 31 | 32 | 33 | @pipeline 34 | def tokenize_sentences(targets): 35 | while True: 36 | text = (yield) # (yield) gets an item from an upstream step 37 | sentences = nltk.sent_tokenize(text) 38 | for sentence in sentences: 39 | for target in targets: 40 | target.send(sentence) # send() sends data downstream 41 | 42 | 43 | @pipeline 44 | def tokenize_words(targets): 45 | while True: 46 | sentence = (yield) 47 | words = nltk.word_tokenize(sentence) 48 | for target in targets: 49 | target.send(words) 50 | 51 | 52 | @pipeline 53 | def pos_tagging(targets): 54 | while True: 55 | words = (yield) 56 | tagged_words = nltk.pos_tag(words) 57 | 58 | for target in targets: 59 | target.send(tagged_words) 60 | 61 | 62 | @pipeline 63 | def ne_chunking(targets): 64 | while True: 65 | tagged_words = (yield) 66 | ner_tagged = nltk.ne_chunk(tagged_words) 67 | for target in targets: 68 | target.send(ner_tagged) 69 | 70 | 71 | @pipeline 72 | def printline(title): 73 | while True: 74 | line = (yield) 75 | print(title) 76 | print(line) 77 | 78 | ingest(corpus, [ 79 | tokenize_sentences([ 80 | tokenize_words([ 81 | printline('Word tokens:'), 82 | pos_tagging([ 83 | ne_chunking([ 84 | printline('Results:') 85 | ]) 86 | ]) 87 | ]) 88 | ]) 89 | ]) 90 | -------------------------------------------------------------------------------- /Section 5/nlp-5-document-classification.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import nltk 4 | from sklearn import ( 5 | datasets, feature_extraction, model_selection, pipeline, 6 | naive_bayes, metrics 7 | ) 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def extract_features(corpus): 12 | '''Extract TF-IDF features from corpus''' 13 | 14 | stop_words = nltk.corpus.stopwords.words("english") 15 | 16 | # 
vectorize means we turn non-numerical data into an array of numbers 17 | count_vectorizer = feature_extraction.text.CountVectorizer( 18 | lowercase=True, # for demonstration, True by default 19 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 20 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 21 | ngram_range=(1, 2), 22 | stop_words=stop_words 23 | ) 24 | processed_corpus = count_vectorizer.fit_transform(corpus) 25 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 26 | processed_corpus) 27 | 28 | return processed_corpus 29 | 30 | if __name__ == '__main__': 31 | newsgroups_data = datasets.load_files( 32 | '20_newsgroups', shuffle=True, random_state=42, encoding='ISO-8859-1') 33 | 34 | print('Data loaded.\nClasses = {classes}\n{datapoints}'.format( 35 | classes=newsgroups_data.target_names, 36 | datapoints=len(newsgroups_data.data))) 37 | 38 | print(newsgroups_data.data[0]) 39 | 40 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 41 | newsgroups_data.data, newsgroups_data.target, test_size=0.33, 42 | random_state=42) 43 | 44 | stop_words = nltk.corpus.stopwords.words("english") 45 | 46 | model = pipeline.Pipeline([ 47 | ('counts', feature_extraction.text.CountVectorizer( 48 | lowercase=True, # for demonstration, True by default 49 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 50 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 51 | ngram_range=(1, 2), 52 | stop_words=stop_words 53 | )), 54 | ('tfidf', feature_extraction.text.TfidfTransformer()), 55 | ('naivebayes', naive_bayes.MultinomialNB()), 56 | ]) 57 | 58 | model.fit(X_train, y_train) 59 | y_pred = model.predict(X_test) 60 | 61 | print('Accuracy of multinomial naive bayes= {}'.format( 62 | np.mean(y_pred == y_test))) 63 | 64 | print(metrics.classification_report( 65 | y_test, y_pred, target_names=newsgroups_data.target_names)) 66 | 67 | grid_search_model = model_selection.GridSearchCV( 68 | model, 69 | { 70 | 'counts__ngram_range': [(1, 1), (1, 2)], 71 | 'naivebayes__alpha': (0.1, 3.0) 72 | }, 73 | n_jobs=-1 # detect how many cores are installed and uses them all 74 | ) 75 | 76 | grid_search_model.fit(X_train, y_train) 77 | print(grid_search_model.cv_results_) 78 | -------------------------------------------------------------------------------- /Section 4/nlp-4-ngrams.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import nltk 3 | import os 4 | from sklearn import ( 5 | datasets, model_selection, feature_extraction, linear_model, naive_bayes, 6 | ensemble 7 | ) 8 | 9 | 10 | def extract_features(corpus): 11 | '''Extract TF-IDF features from corpus''' 12 | 13 | sa_stop_words = nltk.corpus.stopwords.words("english") 14 | 15 | # words that might invert a sentence's meaning 16 | white_list = [ 17 | 'what', 'but', 'if', 'because', 'as', 'until', 'against', 18 | 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 19 | 'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any', 20 | 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 21 | 'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should'] 22 | 23 | # take these out of the standard NLTK stop word list 24 | sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list] 25 | 26 | # vectorize means we turn non-numerical data into an array of numbers 27 | count_vectorizer = feature_extraction.text.CountVectorizer( 28 | 
lowercase=True, # for demonstration, True by default 29 | tokenizer=nltk.word_tokenize, # use the NLTK tokenizer 30 | min_df=2, # minimum document frequency, i.e. the word must appear more than once. 31 | ngram_range=(1, 2), 32 | stop_words=sa_stop_words 33 | ) 34 | processed_corpus = count_vectorizer.fit_transform(corpus) 35 | processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform( 36 | processed_corpus) 37 | 38 | return processed_corpus 39 | 40 | 41 | data_directory = 'movie_reviews' 42 | movie_sentiment_data = datasets.load_files(data_directory, shuffle=True) 43 | print('{} files loaded.'.format(len(movie_sentiment_data.data))) 44 | print('They contain the following classes: {}.'.format( 45 | movie_sentiment_data.target_names)) 46 | 47 | movie_tfidf = extract_features(movie_sentiment_data.data) 48 | 49 | X_train, X_test, y_train, y_test = model_selection.train_test_split( 50 | movie_tfidf, movie_sentiment_data.target, test_size=0.30, random_state=42) 51 | 52 | # similar to nltk.NaiveBayesClassifier.train() 53 | clf1 = linear_model.LogisticRegression() 54 | clf1.fit(X_train, y_train) 55 | print('Logistic Regression performance: {}'.format(clf1.score(X_test, y_test))) 56 | 57 | clf2 = linear_model.SGDClassifier() 58 | clf2.fit(X_train, y_train) 59 | print('SGDClassifier performance: {}'.format(clf2.score(X_test, y_test))) 60 | 61 | clf3 = naive_bayes.MultinomialNB() 62 | clf3.fit(X_train, y_train) 63 | print('MultinomialNB performance: {}'.format(clf3.score(X_test, y_test))) 64 | 65 | clf4 = naive_bayes.BernoulliNB() 66 | clf4.fit(X_train, y_train) 67 | print('BernoulliNB performance: {}'.format(clf4.score(X_test, y_test))) 68 | 69 | 70 | voting_model = ensemble.VotingClassifier( 71 | estimators=[('lr', clf1), ('sgd', clf2), ('mnb', clf3), ('bnb', clf4)], 72 | voting='hard') 73 | voting_model.fit(X_train, y_train) 74 | print('Voting classifier performance: {}'.format( 75 | voting_model.score(X_test, y_test))) 76 | 77 | -------------------------------------------------------------------------------- /Section 2/nlp-2-spam-classification.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import nltk 3 | import os 4 | import random 5 | 6 | 7 | # Define some stop words 8 | stop_words = { 9 | 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 10 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 11 | 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 12 | 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 13 | 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 14 | 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 15 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 16 | 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 17 | 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 18 | 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 19 | 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 20 | 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 21 | 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 22 | 'further', 'was', 'here', 'than'} 23 | 24 | 25 | def load_files(directory): 26 | result = [] 27 | for fname in os.listdir(directory): 28 | with open(directory + '/' + fname, 'r', encoding='ISO-8859-1') as f: 29 
| result.append(f.read()) 30 | return result 31 | 32 | 33 | def preprocess_sentence(sentence): 34 | lemmatizer = nltk.WordNetLemmatizer() 35 | # clearly list out our preprocessing pipeline 36 | processed_tokens = nltk.word_tokenize(sentence) 37 | processed_tokens = [w.lower() for w in processed_tokens] 38 | # find least common elements 39 | word_counts = collections.Counter(processed_tokens) 40 | uncommon_words = word_counts.most_common()[:-10:-1] 41 | # remove these tokens 42 | processed_tokens = [w for w in processed_tokens if w not in stop_words] 43 | processed_tokens = [w for w in processed_tokens if w not in uncommon_words] 44 | # lemmatize 45 | processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens] 46 | return processed_tokens 47 | 48 | 49 | def feature_extraction(tokens): 50 | '''Turn each word into a feature. The feature value is the word count.''' 51 | return dict(collections.Counter(tokens)) 52 | 53 | 54 | def train_test_split(dataset, train_size=0.8): 55 | num_training_examples = int(len(dataset) * train_size) 56 | return dataset[:num_training_examples], dataset[num_training_examples:] 57 | 58 | 59 | positive_examples = load_files('enron/spam') 60 | negative_examples = load_files('enron/ham') 61 | 62 | # Label the examples 63 | positive_examples = [preprocess_sentence(email) for email in positive_examples] 64 | negative_examples = [preprocess_sentence(email) for email in negative_examples] 65 | 66 | positive_examples = [(email, 1) for email in positive_examples] 67 | negative_examples = [(email, 0) for email in negative_examples] 68 | all_examples = positive_examples + negative_examples 69 | random.shuffle(all_examples) 70 | 71 | print('{} emails processed.'.format(len(all_examples))) 72 | 73 | featurized = [(feature_extraction(corpus), label) 74 | for corpus, label in all_examples] 75 | 76 | training_set, test_set = train_test_split(featurized, train_size=0.7) 77 | 78 | model = nltk.classify.NaiveBayesClassifier.train(training_set) 79 | training_error = nltk.classify.accuracy(model, training_set) 80 | print('Model training complete. Accuracy on training set: {}'.format( 81 | training_error)) 82 | 83 | testing_error = nltk.classify.accuracy(model, test_set) 84 | print('Accuracy on test set: {}'.format(testing_error)) 85 | -------------------------------------------------------------------------------- /Section 1/nlp-1-natural-language-data.py: -------------------------------------------------------------------------------- 1 | # 1.1 Use Python, NLTK and scikit-learn to build your NLP toolset. 2 | # pip install nltk 3 | # pip install scikit-learn 4 | import collections # 1.5 5 | import multiprocessing as mp # 1.2 6 | import re # 1.3 7 | 8 | 9 | # 1.2 Reading a simple natural language file into memory 10 | 11 | # def process(line): 12 | # print(line) 13 | 14 | # # try 1: readlines() 15 | # with open("natural-language-data.txt") as f: 16 | # data = f.readlines() # everything in memory! 17 | # for line in data: 18 | # process(line) 19 | 20 | # # try 2: use context managers to make sure file pointers are closed correctly. 21 | # with open("natural-language-data.txt") as f: 22 | # # to handle large text files, we use the file as an iterator 23 | # for line in f: 24 | # # each line is garbage collcted after the iteration 25 | # # unless it is referenced elsewhere. 26 | # process(line) 27 | 28 | # # # try 3: multiprocessing 29 | # pool = mp.Pool(2) # no. of pools = no. 
of CPU cores 30 | # jobs = [] 31 | 32 | # with open("natural-language-data.txt") as f: 33 | # for line in f: 34 | # jobs.append( 35 | # pool.apply_async(process, (line))) 36 | 37 | # for job in jobs: 38 | # job.get() # wait for all jobs to finish 39 | 40 | # pool.close() 41 | 42 | # # 1.3 Split the text into individual words with regular expression 43 | corpus = ("Andy is a data scientist. Andy's boss, Megan, was looking for him, " 44 | "but Andy was out to lunch. Megan texted Andy, 'How's the deadline" 45 | " coming along?'") 46 | 47 | # Simply splitting the sentence with spaces 48 | # print(corpus.split()) 49 | 50 | # Taking out punctuation 51 | punctuation = ".',?" # what is the universe of punctuation? How do we handle 's? 52 | for p in punctuation: 53 | corpus = corpus.replace(p, '') 54 | 55 | # print(corpus.split()) 56 | 57 | # Regex 58 | word_regex = r'\W+' # a raw str: one or more (+) non-word characters (\W) 59 | split_corpus = re.split(word_regex, corpus) 60 | # print(split_corpus) 61 | 62 | # a better regex 63 | # word character + zero or more word characters or 's + word character 64 | # OR 65 | # just a word character 66 | word_regex_improved = r"(\w[\w']*\w|\w)" 67 | word_matcher = re.compile(word_regex_improved) 68 | # print(word_matcher.findall(corpus)) 69 | 70 | 71 | # 1.4 Converting words into lists of lower case tokens 72 | 73 | def split_into_words(line): 74 | word_regex_improved = r"(\w[\w']*\w|\w)" 75 | word_matcher = re.compile(word_regex_improved) 76 | return word_matcher.findall(line) 77 | 78 | processed_corpus = [] 79 | 80 | with open("natural-language-data.txt") as f: 81 | # to handle large text files, we use the file as an iterator 82 | for line in f: 83 | processed_corpus.extend(split_into_words(line)) 84 | 85 | processed_corpus = [w.lower() for w in processed_corpus] 86 | 87 | print(processed_corpus) 88 | 89 | 90 | # # 1.5 Removing uncommon words and stop words 91 | 92 | # Before stop word removal 93 | word_counts = collections.Counter(processed_corpus) 94 | print(word_counts) 95 | 96 | # Define some stop words 97 | stop_words = { 98 | 'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 99 | 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 100 | 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 101 | 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 102 | 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 103 | 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 104 | 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 105 | 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 106 | 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 107 | 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 108 | 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 109 | 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 110 | 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 111 | 'further', 'was', 'here', 'than'} 112 | 113 | # find least common elements 114 | uncommon_words = word_counts.most_common()[:-10:-1] 115 | 116 | processed_corpus = [w for w in processed_corpus if w not in stop_words] 117 | processed_corpus = [w for w in processed_corpus if w not in uncommon_words] 118 | print(processed_corpus) 119 | 
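
# A minimal alternative sketch, assuming the NLTK 'punkt' and 'stopwords' data
# packages have been downloaded via nltk.download(): the same cleanup using
# NLTK's bundled tokenizer and stop word list instead of the hand-rolled regex
# and stop word set above.
# import nltk
# nltk_stop_words = set(nltk.corpus.stopwords.words('english'))
# nltk_tokens = [w.lower() for w in nltk.word_tokenize(corpus)]
# nltk_tokens = [w for w in nltk_tokens if w not in nltk_stop_words]
# print(nltk_tokens)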
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Hands-on-NLP-with-NLTK-and-scikit-learn- 5 | Hands-on NLP with NLTK and scikit-learn [Video], published by Packt 6 | # Hands-on NLP with NLTK and Scikit-learn [Video] 7 | This is the code repository for [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish. 8 | ## About the Video Course 9 | 10 | 11 | 12 | There is an overflow of text data online nowadays. As a Python developer, you need to create a new solution using Natural Language Processing for your next project. Your colleagues depend on you to monetize gigabytes of unstructured text data. What do you do? 13 | Hands-on NLP with NLTK and scikit-learn is the answer. This course puts you right on the spot, starting with building a spam classifier in the first video. By the end of the course, you will walk away with three NLP applications: a spam filter, a topic classifier, and a sentiment analyzer. There is no need for fancy mathematical theory, just plain-English explanations of core NLP concepts and how to apply them using Python libraries. 14 | Taking this course will help you create new applications with Python and NLP, and build practical solutions backed by machine learning and NLP models with ease. 15 | 16 |

## What You Will Learn
17 |
18 |
25 | 26 | ## Instructions and Navigation 27 | ### Assumed Knowledge 28 | To fully benefit from the coverage included in this course, you will need the following background.
29 | This course is for developers, data scientists, and programmers who want to learn about practical Natural Language Processing with Python in a hands-on way. Developers who have an upcoming project that needs NLP, or a pile of unstructured text data on their hands, and don't know what to do with it, will find this course useful. Prior programming experience with Python is assumed along with being comfortable dealing with machine learning terms such as supervised learning, regression, and classification. No prior Natural Language Processing or text mining experience is needed. 30 | ### Technical Requirements 31 | This course has the following software requirements:
32 | SETUP AND INSTALLATION 33 | Minimum Hardware Requirements 34 | For successful completion of this course, students will require the computer systems with at least the following: 35 | 36 | 37 | OS: Windows 7 SP1 64-bit, Windows 8.1 64-bit or Windows 10 64-bit 38 | 39 | 40 | 41 | Processor: Intel Core i5 or equivalent 42 | 43 | 44 | 45 | Memory: 8 GB RAM 46 | 47 | 48 | 49 | Storage: 35 GB available space 50 | 51 | 52 | 53 | 54 | Recommended Hardware Requirements 55 | For an optimal experience with hands-on labs and other practical activities, we recommend the following configuration: 56 | 57 | 58 | OS: Windows 7 SP1 64-bit, Windows 8.1 64-bit or Windows 10 64-bit 59 | 60 | 61 | 62 | Processor: Intel Core i7 or equivalent 63 | 64 | 65 | 66 | Memory: 16 GB RAM 67 | 68 | 69 | 70 | Storage: 35 GB available space 71 | 72 | 73 | Software Requirements 74 | 75 | OS: Windows 7 or Windows 10 76 | 77 | 78 | 79 | Browser: Google Chrome, Latest Version 80 | 81 | 82 | 83 | Code Editor: Atom IDE, Latest Version 84 | 85 | 86 | 87 | Others: Python3 installed using the Anaconda package or equivalent, Tensorflow r1.4 88 | 89 | 90 | 91 | 92 | Exercise Files 93 | 94 | Exercise files should have a start and an end state for each video that contains a demonstration of code. 95 | 96 | ## Related Products 97 | * [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612) 98 | 99 | * [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612) 100 | 101 | * [Hands-on NLP with NLTK and Scikit-learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/hands-nlp-nltk-and-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789345612) 102 | 103 | -------------------------------------------------------------------------------- /Section 4/4.2 Regular Expression for NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "from nltk.corpus import words" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 13, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "english_words = words.raw().split('\\n')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 14, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "236737" 31 | ] 32 | }, 33 | "execution_count": 14, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "len(english_words)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "**Wildcards**" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 18, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "['tact',\n", 58 | " 'tactable',\n", 59 | " 'tactful',\n", 60 | " 'tactfully',\n", 61 | " 'tactfulness',\n", 62 | " 'tactic',\n", 63 | " 'tactical',\n", 64 | " 'tactically',\n", 65 | " 'tactician',\n", 66 | " 'tactics',\n", 67 | " 'tactile',\n", 68 | " 'tactilist',\n", 69 | " 'tactility',\n", 70 | " 'tactilogical',\n", 71 | " 'tactinvariant',\n", 72 | " 'taction',\n", 
73 | " 'tactite',\n", 74 | " 'tactive',\n", 75 | " 'tactless',\n", 76 | " 'tactlessly']" 77 | ] 78 | }, 79 | "execution_count": 18, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "pattern = re.compile('t..t')\n", 86 | "[w for w in english_words if pattern.match(w)][:20]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "**Endings**" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 20, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "['tact',\n", 105 | " 'taft',\n", 106 | " 'tait',\n", 107 | " 'takt',\n", 108 | " 'tart',\n", 109 | " 'taut',\n", 110 | " 'teat',\n", 111 | " 'teet',\n", 112 | " 'telt',\n", 113 | " 'tent',\n", 114 | " 'test',\n", 115 | " 'text',\n", 116 | " 'that',\n", 117 | " 'tift',\n", 118 | " 'tilt',\n", 119 | " 'tint',\n", 120 | " 'toat',\n", 121 | " 'toft',\n", 122 | " 'togt',\n", 123 | " 'toit']" 124 | ] 125 | }, 126 | "execution_count": 20, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "pattern = re.compile('t..t$')\n", 133 | "[w for w in english_words if pattern.match(w)][:20]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "**Optionality**" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 23, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "['humor', 'humour']" 152 | ] 153 | }, 154 | "execution_count": 23, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "pattern = re.compile('humou?r$')\n", 161 | "[w for w in english_words if pattern.match(w)][:20]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**One or more**" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 24, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "['col', 'cool']" 180 | ] 181 | }, 182 | "execution_count": 24, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "pattern = re.compile('co+l$')\n", 189 | "[w for w in english_words if pattern.match(w)][:20]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "**Grouped choices**" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 25, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "['analyse', 'analyser', 'analyses', 'analyze', 'analyzer']" 208 | ] 209 | }, 210 | "execution_count": 25, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "pattern = re.compile('analy[sz]e')\n", 217 | "[w for w in english_words if pattern.match(w)][:20]" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 27, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "['pat',\n", 229 | " 'paut',\n", 230 | " 'peat',\n", 231 | " 'pet',\n", 232 | " 'piet',\n", 233 | " 'pit',\n", 234 | " 'poet',\n", 235 | " 'poot',\n", 236 | " 'pot',\n", 237 | " 'pout',\n", 238 | " 'put',\n", 239 | " 'pot',\n", 240 | " 'put']" 241 | ] 242 | }, 243 | "execution_count": 27, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "pattern = 
re.compile('p[aeiou]+t$')\n", 250 | "[w for w in english_words if pattern.match(w)][:20]" 251 | ] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Python 3", 257 | "language": "python", 258 | "name": "python3" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": { 262 | "name": "ipython", 263 | "version": 3 264 | }, 265 | "file_extension": ".py", 266 | "mimetype": "text/x-python", 267 | "name": "python", 268 | "nbconvert_exporter": "python", 269 | "pygments_lexer": "ipython3", 270 | "version": "3.6.2" 271 | } 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | --------------------------------------------------------------------------------