├── letters_dict.pickle
├── sentiment_analysis_dict.pickle
├── requirements.txt
├── constants.py
├── LICENSE
├── doc_sim_main.py
├── README.md
├── document_similarity.py
└── utils.py

/letters_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jairNeto/warren_buffet_letters/HEAD/letters_dict.pickle
--------------------------------------------------------------------------------
/sentiment_analysis_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jairNeto/warren_buffet_letters/HEAD/sentiment_analysis_dict.pickle
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | pandas==1.1.5
 2 | tika==1.24
 3 | matplotlib==3.2.2
 4 | nltk==3.5
 5 | seaborn==0.11.0
 6 | transformers==4.5.1
 7 | numpy==1.19.5
 8 | wordcloud==1.8.1
 9 | beautifulsoup4==4.9.3
10 | scikit_learn==0.24.1
11 | gensim  # imported by document_similarity.py; version left unpinned here
12 | sentence-transformers  # imported by document_similarity.py; version left unpinned here
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
 1 | ENGLISH = 'english'
 2 | FEATURES = 'features'
 3 | TEXT_PIPELINE = 'text_pipeline'
 4 | VECT = 'vect'
 5 | TFIDF = 'tfidf'
 6 | TOKENIZED = 'tokenized'
 7 | LETTER_TEXT = 'letter_text'
 8 | COSINE = 'cosine'
 9 | EUCLIDEAN = 'euclidean'
10 | WORD2VEC = 'word2vec'
11 | DOC2VECT = 'doc2vect'
12 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Jair Guedes Ferreira Neto
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/doc_sim_main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from document_similarity import get_letters_df, get_tfidf, get_word2vec, get_doc2vec, get_transformers
 3 | import constants as const
 4 | import time
 5 | 
 6 | 
 7 | def main(algorithm, distance, letters_dict_pickle, year, n, pre_trained_model=''):
 8 |     """Main function
 9 | 
10 |     Parameters
11 |     ----------
12 |     algorithm: string
13 |         The chosen algorithm to compute the similarity/distance ('tfidf', 'word2vec', 'doc2vect' or 'transformer').
14 |     distance: string
15 |         euclidean or cosine.
16 |     letters_dict_pickle: string
17 |         Pickle path to the letters dict.
18 |     year: int
19 |         The target letter year.
20 |     n: int
21 |         The number of letters to return.
22 |     pre_trained_model: string
23 |         The pretrained model to use in transformers.
24 | 
25 |     Returns
26 |     -------
27 |     List
28 |         The n letter years most similar to the target year, sorted by similarity.
29 |     """
30 |     letters_df = get_letters_df(letters_dict_pickle)
31 |     if algorithm == const.TFIDF:
32 |         return get_tfidf(letters_df, year, n, distance)
33 |     elif algorithm == const.WORD2VEC:
34 |         return get_word2vec(letters_df, year, n, distance)
35 |     elif algorithm == const.DOC2VECT:
36 |         return get_doc2vec(letters_df, year, n)
37 |     else:
38 |         return get_transformers(pre_trained_model, letters_df, year, n, distance)
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     PARSER = argparse.ArgumentParser(description="Execute the distance similarity")
43 |     PARSER.add_argument("-alg", "--algorithm", help="The chosen algorithm to"
44 |                         " compute the similarity/distance.")
45 |     PARSER.add_argument("-dist", "--distance", help="euclidean or cosine.")
46 |     PARSER.add_argument(
47 |         "-p", "--path", help="Pickle path to the letters dict.")
48 |     PARSER.add_argument(
49 |         "-t", "--target", help="The target letter year.")
50 |     PARSER.add_argument(
51 |         "-n", "--number", help="The number of letters to return.")
52 |     PARSER.add_argument(
53 |         "-pre", "--pretrained", help="The pretrained model to use in transformers.")
54 | 
55 |     ARGS = PARSER.parse_args()
56 |     start = time.time()
57 |     print(main(ARGS.algorithm, ARGS.distance, ARGS.path, int(ARGS.target), int(ARGS.number), ARGS.pretrained))
58 |     print(f'Execution time = {time.time() - start} seconds.')
59 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Warren Buffett Letters Analysis
 2 | 
 3 | ### Table of Contents
 4 | 
 5 | 1. [Overview](#overview)
 6 | 2. [Installation](#installation)
 7 | 3. [Running](#running)
 8 | 4. [Final Considerations](#final-considerations-and-acknowledgments)
 9 | 
10 | ## Overview
11 | 
12 | The goal of this project is to use NLP techniques such as question answering,
13 | sentiment analysis, word clouds, document similarity and others to extract meaningful insights from
14 | Warren Buffett's annual letters to the Berkshire Hathaway shareholders.
15 | 
16 | ## Installation
17 | 
18 | Create a virtual environment named **warren_venv**.
19 | 
20 | ```
21 | $ python3 -m venv warren_venv -- for Linux and macOS
22 | $ python -m venv warren_venv -- for Windows
23 | ```
24 | 
25 | After that, activate the Python virtual environment
26 | 
27 | ```
28 | $ source warren_venv/bin/activate -- for Linux and macOS
29 | $ warren_venv\Scripts\activate -- for Windows
30 | ```
31 | 
32 | Install the requirements
33 | 
34 | ```
35 | $ pip install -r requirements.txt
36 | ```
37 | 
38 | ## Running
39 | 
40 | ### Running the QA notebook
41 | 
42 | First, download the letters from 2000 onward at
43 | https://www.berkshirehathaway.com/letters/letters.html. Then point the
44 | letters_pdf_path parameter of the function get_letters_corpus_dict to the directory
45 | containing the downloaded letters, and run the desired cells of
46 | the notebook.
47 | 
48 | ### Running the document similarity
49 | 
50 | You can get the letters most similar to a specific letter year by running
51 | doc_sim_main.py.
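For example, to list the five letters closest to the 2008 letter using TF-IDF and cosine similarity (the flag values below are only illustrative; letters_dict.pickle ships with the repository):

```
python doc_sim_main.py --algorithm tfidf --distance cosine --path letters_dict.pickle --target 2008 --number 5
```

The general form is: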
52 | 
53 | ```
54 | python doc_sim_main.py --algorithm <algorithm> --distance <distance> --path <path> --target <target> --number <number> --pretrained <pretrained>
55 | ```
56 | 
57 | Where:
58 | * algorithm: One of tfidf, word2vec, doc2vect or transformer
59 | * distance: Either cosine or euclidean
60 | * path: Pickle path to the letters dict
61 | * target: The target letter year
62 | * number: The number of letters to return
63 | * pretrained: The pretrained model to use with the transformer algorithm
64 | 
65 | ## Final Considerations and Acknowledgments
66 | 
67 | To see the full analysis behind this code, check my Medium posts:
68 | https://medium.com/analytics-vidhya/best-nlp-algorithms-to-get-document-similarity-a5559244b23b
69 | https://medium.com/analytics-vidhya/using-nlp-to-get-inside-warren-buffet-mind-part-2-8e3557810a39
70 | https://medium.com/analytics-vidhya/using-nlp-to-get-inside-warren-buffet-mind-part-i-666d717d0c2e
71 | 
--------------------------------------------------------------------------------
/document_similarity.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import nltk
 3 | import re
 4 | import pandas as pd
 5 | import numpy as np
 6 | from nltk.tokenize import word_tokenize
 7 | from nltk.stem import WordNetLemmatizer
 8 | from nltk.stem.porter import PorterStemmer
 9 | from nltk.corpus import stopwords
10 | from sklearn.pipeline import Pipeline, FeatureUnion
11 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
12 | from sklearn.metrics.pairwise import euclidean_distances
13 | from gensim.models.word2vec import Word2Vec
14 | import constants as const
15 | from gensim.models.doc2vec import Doc2Vec
16 | from gensim.models.doc2vec import TaggedDocument
17 | from sentence_transformers import SentenceTransformer
18 | nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])
19 | 
20 | 
21 | def tokenize(text):
22 |     """Tokenize the text
23 | 
24 |     Parameters
25 |     ----------
26 |     text: String
27 |         The message to be tokenized
28 | 
29 |     Returns
30 |     -------
31 |     List
32 |         List with the clean tokens
33 |     """
34 |     text = text.lower()
35 |     text = re.sub("[^a-zA-Z0-9]", " ", text)
36 |     tokens = word_tokenize(text)
37 |     tokens = [w for w in tokens if w not in stopwords.words(const.ENGLISH)]
38 | 
39 |     lemmatizer = WordNetLemmatizer()
40 |     stemmer = PorterStemmer()
41 | 
42 |     clean_tokens_list = []
43 |     for tok in tokens:
44 |         lemmatizer_tok = lemmatizer.lemmatize(tok).strip()
45 |         clean_tok = stemmer.stem(lemmatizer_tok)
46 |         clean_tokens_list.append(clean_tok)
47 | 
48 |     return clean_tokens_list
49 | 
50 | 
51 | def build_model():
52 |     """Build the model
53 | 
54 |     Returns
55 |     -------
56 |     sklearn.pipeline.Pipeline
57 |         The model
58 |     """
59 |     pipeline = Pipeline([
60 |         (const.FEATURES, FeatureUnion([
61 | 
62 |             (const.TEXT_PIPELINE, Pipeline([
63 |                 (const.VECT, CountVectorizer(tokenizer=tokenize)),
64 |                 (const.TFIDF, TfidfTransformer())
65 |             ]))
66 |         ]))])
67 | 
68 |     return pipeline
69 | 
70 | 
71 | def get_avg_document_vector(model, df, year):
72 |     """Get a vector representation of a document using word2vec
73 | 
74 |     Parameters
75 |     ----------
76 |     model: Word2Vec
77 |         Trained Word2Vec model
78 |     df: pandas DataFrame
79 |         Pandas DataFrame with a column with the tokens
80 |     year: int
81 |         The target year
82 | 
83 |     Returns
84 |     -------
85 |     Tuple
86 |         The vector representation of the document and the number of words that are not in the model
87 |         vocabulary
88 |     """
89 |     word_vecs = []
90 |     count = 0
91 |     for word in df[const.TOKENIZED].loc[year]:
92 | 
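        # Words that are not in the Word2Vec vocabulary raise a KeyError below and are
        # simply skipped; `count` tracks how many words were skipped.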
try: 93 | vector = model[word] 94 | word_vecs.append(vector) 95 | except KeyError: 96 | count += 1 97 | pass 98 | vector_avg = np.mean(word_vecs, axis=0) 99 | 100 | return vector_avg, count 101 | 102 | 103 | def get_letters_df(letters_dict_pickle): 104 | """Get the letters Pandas Dataframe 105 | 106 | Parameters 107 | ---------- 108 | letters_dict_pickle: string 109 | Path to the dict with the letters text 110 | 111 | Returns 112 | ------- 113 | Pandas DataFrame 114 | Pandas DataFrame with a columns with the tokens 115 | """ 116 | with open(letters_dict_pickle, 'rb') as handle: 117 | letters_dict = pickle.load(handle) 118 | 119 | letters_df = pd.DataFrame(letters_dict, index=[const.LETTER_TEXT]).T 120 | letters_df[const.TOKENIZED] = letters_df[const.LETTER_TEXT].apply(tokenize) 121 | 122 | return letters_df 123 | 124 | 125 | def get_most_similar_docs(pairwise_similarities, letter_year, distance_method, transformers=False, initial_year=1977): 126 | """Get the most similar letters to a target one 127 | 128 | Parameters 129 | ---------- 130 | pairwise_similarities: np.array 131 | Numpy array of the pairwise similarities 132 | letter_year: int 133 | The target letter year 134 | distance_method: string 135 | Euclidean or cosine 136 | transformers: boolean 137 | True if you are calling from transformers or False otherwise 138 | initial_year: int 139 | The initial letter year 140 | 141 | Returns 142 | ------- 143 | List 144 | List with the letter year sorted descending by similarity 145 | """ 146 | letter_i = letter_year - initial_year 147 | if distance_method == const.COSINE: 148 | if transformers: 149 | similarity_index = np.array(np.argsort(-pairwise_similarities[letter_i])) 150 | else: 151 | similarity_index = np.array(np.argsort(-pairwise_similarities[letter_i].todense()))[0] 152 | else: 153 | similarity_index = np.argsort(pairwise_similarities[letter_i]) 154 | 155 | similar_docs_sorted = [] 156 | for index in similarity_index: 157 | if index == letter_i: 158 | continue 159 | similar_docs_sorted.append(index + initial_year) 160 | 161 | return similar_docs_sorted 162 | 163 | 164 | def get_pipe_vector(letters_df): 165 | """Get the tfidf vector 166 | 167 | Parameters 168 | ---------- 169 | letters_df: pandas DataFrame 170 | The pandas Dataframe with text from the letters 171 | 172 | Returns 173 | ------- 174 | Np.array 175 | The tfidf vector representation of the text 176 | """ 177 | pipeline = build_model() 178 | pipeline.fit(letters_df[const.LETTER_TEXT]) 179 | vectors = pipeline.transform(letters_df[const.LETTER_TEXT]) 180 | 181 | return vectors 182 | 183 | 184 | def get_tfidf(letters_df, year, n, distance): 185 | """Get the tfidf most similar years 186 | 187 | Parameters 188 | ---------- 189 | letters_df: pandas DataFrame 190 | The pandas Dataframe with text from the letters 191 | year: int 192 | The target letter year 193 | n: int 194 | The number of letters to return 195 | distance: string 196 | Euclidean or cosine 197 | 198 | Returns 199 | ------- 200 | List 201 | List with the letter year sorted descending by similarity 202 | """ 203 | vectors = get_pipe_vector(letters_df) 204 | if distance == const.COSINE: 205 | pairwise_dis = vectors @ vectors.T 206 | else: 207 | pairwise_dis = euclidean_distances(vectors) 208 | 209 | return get_most_similar_docs(pairwise_dis, year, distance)[:n] 210 | 211 | 212 | def get_most_similar_docs_docs2vec(letter_year, model, corpus, n, initial_year=1977): 213 | """Get the docs2vec most similar years 214 | 215 | Parameters 216 | ---------- 217 | letter_year: 
int 218 | The target letter year 219 | model: docs2vec 220 | The trained Docs2vec model 221 | corpus: List 222 | TaggedDocument list 223 | n: int 224 | The number of letters to return 225 | initial_year: int 226 | The initial letter year 227 | 228 | Returns 229 | ------- 230 | List 231 | List with the letter year sorted descending by similarity 232 | """ 233 | doc_id = letter_year - initial_year 234 | inferred_vector = model.infer_vector(corpus[doc_id].words) 235 | sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs)) 236 | sims = [index + initial_year for index, _ in sims] 237 | 238 | return sims[1:n + 1] 239 | 240 | 241 | def get_doc2vec(letters_df, year, n): 242 | """Get the doc2vec most similar years 243 | 244 | Parameters 245 | ---------- 246 | letters_df: pandas DataFrame 247 | The pandas Dataframe with text from the letters 248 | year: int 249 | The target letter year 250 | n: int 251 | The number of letters to return 252 | 253 | Returns 254 | ------- 255 | List 256 | List with the letter year sorted descending by similarity 257 | """ 258 | EPOCHS = 40 259 | doc2_model = Doc2Vec(min_count=2) 260 | corpus = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(list(letters_df[const.TOKENIZED]))] 261 | doc2_model.build_vocab(corpus) 262 | doc2_model.train(corpus, total_examples=doc2_model.corpus_count, epochs=EPOCHS) 263 | return get_most_similar_docs_docs2vec(year, doc2_model, corpus, n) 264 | 265 | 266 | def get_word2vec(letters_df, year, n, distance): 267 | """Get the word2vec most similar years 268 | 269 | Parameters 270 | ---------- 271 | letters_df: pandas DataFrame 272 | The pandas Dataframe with text from the letters 273 | year: int 274 | The target letter year 275 | n: int 276 | The number of letters to return 277 | distance: string 278 | Euclidean or cosine 279 | 280 | Returns 281 | ------- 282 | List 283 | List with the letter year sorted descending by similarity 284 | """ 285 | model = Word2Vec(letters_df[const.TOKENIZED]) 286 | target, _ = get_avg_document_vector(model, letters_df, year) 287 | distances = [] 288 | for y in list(letters_df.index): 289 | if y != year: 290 | vector_year, _ = get_avg_document_vector(model, letters_df, y) 291 | if distance == const.COSINE: 292 | distances.append(target @ vector_year.T / np.linalg.norm(target) / np.linalg.norm(vector_year)) 293 | else: 294 | distances.append(np.linalg.norm(target - vector_year)) 295 | 296 | distances = np.array(distances) 297 | if distance == const.COSINE: 298 | return letters_df.index[(-distances).argsort()][:n] 299 | else: 300 | return letters_df.index[distances.argsort()][:n] 301 | 302 | 303 | def get_transformers(pre_trained_model, letters_df, year, n, distance): 304 | """Get the word2vec most similar years 305 | 306 | Parameters 307 | ---------- 308 | pre_trained_model: string 309 | The name of the pre trained transform 310 | letters_df: pandas DataFrame 311 | The pandas Dataframe with text from the letters 312 | year: int 313 | The target letter year 314 | n: int 315 | The number of letters to return 316 | distance: string 317 | Euclidean or cosine 318 | 319 | Returns 320 | ------- 321 | List 322 | List with the letter year sorted descending by similarity 323 | """ 324 | model = SentenceTransformer(pre_trained_model) 325 | embeddings = model.encode(letters_df[const.TOKENIZED].values) 326 | if distance == const.COSINE: 327 | pairwise = embeddings @ embeddings.T / np.linalg.norm(embeddings) / np.linalg.norm(embeddings) 328 | return get_most_similar_docs(pairwise, year, 
const.COSINE, transformers=True)[:n] 329 | else: 330 | euclidean = euclidean_distances(embeddings) 331 | return get_most_similar_docs(euclidean, year, const.EUCLIDEAN)[:n] 332 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ Utils module """ 2 | 3 | from urllib.request import urlopen 4 | from pathlib import Path 5 | import re 6 | from bs4 import BeautifulSoup 7 | from tika import parser 8 | import nltk 9 | from nltk.tokenize import word_tokenize 10 | from nltk.stem import WordNetLemmatizer 11 | from nltk.corpus import stopwords 12 | from nltk.sentiment import SentimentIntensityAnalyzer 13 | from transformers import pipeline 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | from wordcloud import WordCloud 18 | 19 | nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 20 | 'stopwords', 'vader_lexicon']) 21 | 22 | 23 | def get_text_from_html(url, tags_to_ignore=["script", "style"]): 24 | """Extract the text from a webpage 25 | 26 | Parameters 27 | ---------- 28 | url: String 29 | The url 30 | tags_to_ignore: List 31 | List with the tags to skip when getting the text 32 | 33 | Returns 34 | ------- 35 | String 36 | A string file with the text from the webpage 37 | """ 38 | text = '' 39 | try: 40 | html = urlopen(url).read() 41 | soup = BeautifulSoup(html, features="html.parser") 42 | for script in soup(tags_to_ignore): 43 | script.extract() # rip it out 44 | 45 | # get text 46 | text = soup.get_text() 47 | 48 | # break into lines and remove leading and trailing space on each 49 | lines = (line.strip() for line in text.splitlines()) 50 | # break multi-headlines into a line each 51 | chunks = (phrase.strip() 52 | for line in lines for phrase in line.split(" ")) 53 | # drop blank lines 54 | text = '\n'.join(chunk for chunk in chunks if chunk) 55 | except: 56 | print(f'Could not open the url {url}') 57 | 58 | return text 59 | 60 | 61 | def get_text_from_pdf(path): 62 | """Extract the text from a pdf 63 | 64 | Parameters 65 | ---------- 66 | path: String 67 | Path to a pdf file 68 | 69 | Returns 70 | ------- 71 | String 72 | A string file with the text from the pdf file 73 | """ 74 | text = '' 75 | try: 76 | raw = parser.from_file(path) 77 | text = raw['content'] 78 | except: 79 | print(f'Could not open the path {path}') 80 | 81 | return text 82 | 83 | 84 | def get_letters_corpus_dict(letters_pdf_path, init_year=1977, end_year=2020): 85 | """Build the dict where the keys are the years and the values are 86 | the text from the Warren Buffet letters 87 | 88 | Parameters 89 | ---------- 90 | letters_pdf_path: String 91 | Path to the directory containing the pdf letters 92 | init_year: int 93 | The initial year to start getting the letters 94 | end_year: int 95 | The finial year to start getting the letters 96 | 97 | Returns 98 | ------- 99 | Dictionary 100 | Dict where the keys are the years and the values are 101 | the text from the Warren Buffet letters 102 | """ 103 | if init_year < 1977 or end_year > 2020: 104 | print('The range supported is between 1977 and 2020') 105 | return {} 106 | 107 | letters_dict = dict() 108 | letters_years = [year for year in range(init_year, end_year + 1)] 109 | for year in letters_years: 110 | if year >= 2000: 111 | filename = f'{year}ltr.pdf' 112 | path = Path(letters_pdf_path).joinpath(filename) 113 | letter_corpus = get_text_from_pdf(str(path)) 114 | else: 115 | if year > 1997: 
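                # The 1998 and 1999 letters are published as '<year>htm.html',
                # while older letters use plain '<year>.html' (handled by the else branch below).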
116 | url = f'https://www.berkshirehathaway.com/letters/{year}htm.html' 117 | else: 118 | url = f'https://www.berkshirehathaway.com/letters/{year}.html' 119 | letter_corpus = get_text_from_html(url) 120 | 121 | letters_dict[year] = letter_corpus 122 | 123 | return letters_dict 124 | 125 | 126 | def draw_heatmap(df, figsize=(15, 6), cmap='YlOrBr', ylabel='', xlabel='', title=''): 127 | """Draw a heatmap using seaborn 128 | 129 | Parameters 130 | ---------- 131 | df: Pandas Dataframe 132 | Pandas Dataframe with the data to show at the heatmap 133 | figsize: Tuple 134 | The plot figure size 135 | cmap: matplotlib colormap name or object, or list of colors, optional 136 | The mapping from data values to color space. If not provided, 137 | the default will depend on whether center is set. 138 | ylabel: String 139 | The y label of the plot 140 | xlabel: String 141 | The x label of the plot 142 | title: String 143 | The title of the plot 144 | """ 145 | _, ax = plt.subplots(figsize=figsize) 146 | sns.heatmap(df, cmap=cmap, annot=False) 147 | ax.set_ylabel(ylabel, fontsize=15) 148 | ax.set_xlabel(xlabel, fontsize=15) 149 | ax.set_title(title, fontsize=20, weight='bold') 150 | plt.show() 151 | 152 | 153 | def tokenize(text, freq_words=[]): 154 | """Tokenize the text 155 | 156 | Parameters 157 | ---------- 158 | text: String 159 | The message to be tokenized 160 | freq_words: List 161 | List with words that appears frequent at the text 162 | 163 | Returns 164 | ------- 165 | List 166 | List with the clean tokens 167 | """ 168 | text = text.lower() 169 | text = re.sub("[^a-zA-Z]", " ", text) 170 | tokens = word_tokenize(text) 171 | tokens = [w for w in tokens if w not in stopwords.words('english')] 172 | tokens = [w for w in tokens if w not in freq_words] 173 | 174 | lemmatizer = WordNetLemmatizer() 175 | 176 | clean_tokens_list = [] 177 | for tok in tokens: 178 | lemmatizer_tok = lemmatizer.lemmatize(tok).strip() 179 | clean_tokens_list.append(lemmatizer_tok) 180 | 181 | return clean_tokens_list 182 | 183 | 184 | def get_most_frequent_combinatation(tokens, freq=10, num_word_combination=-1): 185 | """Get a dict with the most frequent onegram, bigram, trigram and quadgrams 186 | 187 | Parameters 188 | ---------- 189 | text: List 190 | List with the tokens 191 | freq: Int 192 | How many combination to return 193 | num_word_combination: Int 194 | 1 to onegram 195 | 2 to bigram 196 | 3 to trigram 197 | 4 to quadgrams 198 | -1 All 199 | 200 | Returns 201 | ------- 202 | Dict 203 | Dict with the frequencies 204 | """ 205 | if num_word_combination < -1 or num_word_combination > 4: 206 | raise Exception( 207 | f'The num_word_combination shall be greater than -2 and lesser than 5 the values passes was {num_word_combination}') 208 | 209 | freq_dict = {} 210 | if num_word_combination in [1, -1]: 211 | freq_dist = nltk.FreqDist(tokens) 212 | freq_dict['FreqDist_onegram'] = freq_dist.most_common(freq) 213 | 214 | if num_word_combination in [2, -1]: 215 | bigrams = nltk.collocations.BigramCollocationFinder.from_words(tokens) 216 | freq_dict['FreqDist_bigram'] = bigrams.ngram_fd.most_common(freq) 217 | 218 | if num_word_combination in [3, -1]: 219 | trigram = nltk.collocations.TrigramCollocationFinder.from_words(tokens) 220 | freq_dict['FreqDist_trigram'] = trigram.ngram_fd.most_common(freq) 221 | 222 | if num_word_combination in [4, -1]: 223 | quadgrams = nltk.collocations.QuadgramCollocationFinder.from_words( 224 | tokens) 225 | freq_dict['FreqDist_quadgrams'] = quadgrams.ngram_fd.most_common(freq) 226 | 227 | 
return freq_dict 228 | 229 | 230 | def drawn_wordcloud(corpus, save_path, figsize=(15, 6)): 231 | """Get a dict with the most frequent onegram, bigram, trigram and quadgrams 232 | 233 | Parameters 234 | ---------- 235 | corpus: List 236 | List with the words 237 | save_path: String 238 | Path to the file where the wordcloud will be saved at 239 | figsize: Tuple 240 | The figsize 241 | """ 242 | _, _ = plt.subplots(figsize=figsize) 243 | combined_text = " ".join(text for text in corpus) 244 | wordcloud = WordCloud().generate(combined_text) 245 | plt.imshow(wordcloud, interpolation='bilinear') 246 | plt.axis("off") 247 | plt.savefig(save_path) 248 | plt.show() 249 | 250 | 251 | def tokenize_sent(text): 252 | """Tokenize the sentence 253 | 254 | Parameters 255 | ---------- 256 | text: String 257 | The text to be tokenized 258 | 259 | Returns 260 | ------- 261 | List 262 | List with the tokenized sentences 263 | """ 264 | sentence_list = nltk.tokenize.sent_tokenize(text) 265 | tokenized_list = [] 266 | for sentence in sentence_list: 267 | sentence_after_regex = re.sub("[^a-z0-9A-Z]", " ", sentence) 268 | # Remove sentences where there was only numbers 269 | if len(re.sub("[^a-zA-Z]", "", sentence_after_regex)) > 6: 270 | tokenized_list.append(sentence_after_regex) 271 | 272 | return tokenized_list 273 | 274 | 275 | def calculate_text_sentiment_using_transform(sentence_list): 276 | """Calculate the test sentiment using transforms 277 | 278 | Parameters 279 | ---------- 280 | sentence_list: List 281 | List with the tokenizes sentences 282 | 283 | Returns 284 | ------- 285 | Dictonary 286 | Dict with the cumulative sentiment of all the sentences at the list 287 | """ 288 | sentiment_dict = {'POSITIVE': 0, 'NEGATIVE': 0} 289 | classifier = pipeline('sentiment-analysis') 290 | for sentence in sentence_list: 291 | sentiment_result = classifier(sentence) 292 | sentiment_dict[sentiment_result[0]['label'] 293 | ] += sentiment_result[0]['score'] 294 | 295 | return sentiment_dict 296 | 297 | 298 | def calculate_text_sia(sentence_list): 299 | """Calculate the test sentiment using Sentiment Intensity Analyzer 300 | 301 | Parameters 302 | ---------- 303 | sentence_list: List 304 | List with the tokenizes sentences 305 | 306 | Returns 307 | ------- 308 | Dictonary 309 | Dict with the cumulative sentiment of all the sentences at the list 310 | """ 311 | sentiment_dict = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0} 312 | sia = SentimentIntensityAnalyzer() 313 | for sentence in sentence_list: 314 | sentiment_result = sia.polarity_scores(sentence) 315 | for k in sentiment_result: 316 | sentiment_dict[k] += sentiment_result[k] 317 | 318 | return sentiment_dict 319 | 320 | 321 | def get_sentiment_analysis_df(letters_dict, 322 | calculate_text_sentiment, 323 | tokenize_sent, 324 | normalized=True): 325 | """Get the DataFrame with the sentiment of each Warren letter 326 | 327 | Parameters 328 | ---------- 329 | letters_dict: Dictonary 330 | Dict with the letters text 331 | calculate_text_sentiment: function 332 | Function used to calculate the sentiment of the text 333 | tokenize_sent: function 334 | Function to tokenize the text into a list of sentences 335 | normalized: bool 336 | If the values of the df will be normalized or not 337 | 338 | Returns 339 | ------- 340 | Pandas DataFrame 341 | Pandas DataFrame with the sentiment analysis for each letter 342 | """ 343 | sentiment_analysis_dict = {} 344 | for k in letters_dict: 345 | sentiment_analysis_dict[k] = calculate_text_sentiment( 346 | 
tokenize_sent(letters_dict[k])) 347 | sentiment_analysis_df = pd.DataFrame(sentiment_analysis_dict) 348 | if normalized: 349 | return sentiment_analysis_df / sentiment_analysis_df.sum(axis=0) 350 | return sentiment_analysis_df 351 | 352 | 353 | def get_answer_using_qa(nlp, question, context): 354 | """Get answer using a classifier trained with the QA technique 355 | 356 | Parameters 357 | ---------- 358 | nlp: Pipeline 359 | Trained QA Pipeline 360 | question: String 361 | Question that the model will answer 362 | context: String 363 | The Context of the question 364 | 365 | Returns 366 | ------- 367 | Tuple 368 | The answer, the score, the start position of the answer at the text and the final 369 | position of the answer at the text 370 | """ 371 | result = nlp(question=question, context=context) 372 | 373 | return result['answer'], round(result['score'], 4), result['start'], result['end'] 374 | 375 | 376 | def format_spines(ax, right_border=True): 377 | """ 378 | This function sets up borders from an axis and personalize colors 379 | 380 | Parameters 381 | ---------- 382 | Axis: Matplotlib axis 383 | The plot axis 384 | right_border: Boolean 385 | Whether to plot or not the right border 386 | """ 387 | # Setting up colors 388 | ax.spines['bottom'].set_color('#CCCCCC') 389 | ax.spines['left'].set_color('#CCCCCC') 390 | ax.spines['top'].set_visible(False) 391 | if right_border: 392 | ax.spines['right'].set_color('#CCCCCC') 393 | else: 394 | ax.spines['right'].set_color('#FFFFFF') 395 | ax.patch.set_facecolor('#FFFFFF') 396 | 397 | 398 | def get_ngram_plot_data(df, type, sentiment): 399 | """Format the data to the ngram plot 400 | 401 | Parameters 402 | ---------- 403 | df: Pandas DataFrame 404 | Pandas dataframe with the ngrams sentiment data 405 | type: String 406 | Type of the ngram to filter 407 | sentiment: String 408 | POSITIVE or NEGATIVE 409 | 410 | Returns 411 | ------- 412 | Pandas Dataframe 413 | The dataframe filtered 414 | """ 415 | return df.query("type == @type and sentiment == @sentiment").sort_values('score', ascending=False) 416 | --------------------------------------------------------------------------------
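For reference, here is a minimal sketch (not part of the repository) of how the pieces in utils.py fit together: build the letters corpus, score each letter with the VADER sentiment analyzer, and plot the result. The `letters/` directory and the plot labels are assumptions for illustration only.

```
from utils import (get_letters_corpus_dict, get_sentiment_analysis_df,
                   calculate_text_sia, tokenize_sent, draw_heatmap)

# 'letters/' is an assumed local directory holding the PDF letters from 2000 onward;
# older letters are fetched from berkshirehathaway.com by get_letters_corpus_dict itself.
letters_dict = get_letters_corpus_dict('letters/', init_year=1977, end_year=2020)

# One column per year and one row per VADER score (neg/neu/pos/compound),
# normalized column-wise because normalized=True is the default.
sentiment_df = get_sentiment_analysis_df(letters_dict, calculate_text_sia, tokenize_sent)

draw_heatmap(sentiment_df, ylabel='VADER score', xlabel='Letter year',
             title='Sentiment of the Berkshire Hathaway letters')
```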