├── letters_dict.pickle
├── sentiment_analysis_dict.pickle
├── requirements.txt
├── constants.py
├── LICENSE
├── doc_sim_main.py
├── README.md
├── document_similarity.py
└── utils.py
/letters_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jairNeto/warren_buffet_letters/HEAD/letters_dict.pickle
--------------------------------------------------------------------------------
/sentiment_analysis_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jairNeto/warren_buffet_letters/HEAD/sentiment_analysis_dict.pickle
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==1.1.5
2 | tika==1.24
3 | matplotlib==3.2.2
4 | nltk==3.5
5 | seaborn==0.11.0
6 | transformers==4.5.1
7 | numpy==1.19.5
8 | wordcloud==1.8.1
9 | beautifulsoup4==4.9.3
10 | scikit_learn==0.24.1
11 |
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
1 | ENGLISH = 'english'
2 | FEATURES = 'features'
3 | TEXT_PIPELINE = 'text_pipeline'
4 | VECT = 'vect'
5 | TFIDF = 'tfidf'
6 | TOKENIZED = 'tokenized'
7 | LETTER_TEXT = 'letter_text'
8 | COSINE = 'cosine'
9 | EUCLIDEAN = 'euclidean'
10 | WORD2VEC = 'word2vec'
11 | DOC2VECT = 'doc2vect'
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jair Guedes Ferreira Neto
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/doc_sim_main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from document_similarity import get_letters_df, get_tfidf, get_word2vec, get_doc2vec, get_transformers
3 | import constants as const
4 | import time
5 |
6 |
7 | def main(algorithm, distance, letters_dict_pickle, year, n, pre_trained_model=''):
8 | """Main function
9 |
10 | Parameters
11 | ----------
12 | algorithm: string
13 | The chosen algorithm to compute the similarity/distance.
14 | distance: string
15 | euclidean or cosine.
16 | letters_dict_pickle: string
17 | Pickle path to the letters dict.
18 | year: int
19 | The target letter year.
20 | n: int
21 | The number of letters to return.
22 | pre_trained_model: string
23 | The pretrained model to use in transformers.
24 |
25 | Returns
26 | -------
27 |     List
28 |         The n letter years most similar to the target letter
29 | """
30 | letters_df = get_letters_df(letters_dict_pickle)
31 | if algorithm == const.TFIDF:
32 | return get_tfidf(letters_df, year, n, distance)
33 | elif algorithm == const.WORD2VEC:
34 | return get_word2vec(letters_df, year, n, distance)
35 | elif algorithm == const.DOC2VECT:
36 | return get_doc2vec(letters_df, year, n)
37 | else:
38 | return get_transformers(pre_trained_model, letters_df, year, n, distance)
39 |
40 |
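# Example invocation (illustrative values; assumes the pickled letters dict that
# ships in the repository root):
#   python doc_sim_main.py -alg tfidf -dist cosine -p letters_dict.pickle -t 2008 -n 5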
41 | if __name__ == '__main__':
42 | PARSER = argparse.ArgumentParser(description="Execute the distance similarity")
43 | PARSER.add_argument("-alg", "--algorithm", help="The chosen algorithm to"
44 | " compute the similarity/distance.")
45 | PARSER.add_argument("-dist", "--distance", help="euclidean or cosine.")
46 | PARSER.add_argument(
47 | "-p", "--path", help="Pickle path to the letters dict.")
48 | PARSER.add_argument(
49 | "-t", "--target", help="The target letter year.")
50 | PARSER.add_argument(
51 | "-n", "--number", help="The number of letters to return.")
52 | PARSER.add_argument(
53 | "-pre", "--pretrained", help="The pretrained model to use in transformers.")
54 |
55 | ARGS = PARSER.parse_args()
56 | start = time.time()
57 | print(main(ARGS.algorithm, ARGS.distance, ARGS.path, int(ARGS.target), int(ARGS.number), ARGS.pretrained))
58 | print(f'Execution time = {time.time() - start} seconds.')
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Warren Buffett Letters Analysis
2 |
3 | ### Table of Contents
4 |
5 | 1. [Overview](#overview)
6 | 2. [Installation](#installation)
7 | 3. [Running](#running)
8 | 4. [Final Considerations](#final-considerations-and-acknowledgments)
9 |
10 | ## Overview
11 |
12 | The goal of this project is to use NLP techniques such as question answering,
13 | sentiment analysis, word clouds, and document similarity to extract meaningful
14 | insights from Warren Buffett's annual letters to Berkshire Hathaway shareholders.
15 |
16 | ## Installation
17 |
18 | Create a virtual environment named **warren_venv**.
19 |
20 | ```
21 | $ python3 -m venv warren_venv    # Linux and macOS
22 | $ python -m venv warren_venv     # Windows
23 | ```
24 |
25 | After that, activate the Python virtual environment:
26 |
27 | ```
28 | $ source warren_venv/bin/activate    # Linux and macOS
29 | $ warren_venv\Scripts\activate       # Windows
30 | ```
31 |
32 | Install the requirements
33 |
34 | ```
35 | $ pip install -r requirements.txt
36 | ```
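
Note that `document_similarity.py` also imports `gensim` and `sentence-transformers`,
which are not listed in `requirements.txt`. If you plan to run `doc_sim_main.py`, you
will likely need to install them as well, for example:

```
$ pip install gensim sentence-transformers
```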
37 |
38 | ## Running
39 |
40 | ### Running the QA notebook
41 |
42 | To run it, first download the letters from 2000 onwards at
43 | https://www.berkshirehathaway.com/letters/letters.html. Then point the
44 | parameters of the `get_letters_corpus_dict` function to the directory
45 | containing the downloaded letters and run the desired cells of the
46 | notebook, as in the sketch below.
47 |
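A minimal sketch of that step (the `letters` folder name is illustrative, and the
downloaded PDFs are assumed to keep their original `<year>ltr.pdf` file names):

```
from utils import get_letters_corpus_dict
import pickle

# Scrapes the 1977-1999 letters from berkshirehathaway.com and reads the
# downloaded PDFs for the years from 2000 onwards
letters_dict = get_letters_corpus_dict('letters')

# Optionally cache the result in the same pickle format consumed by doc_sim_main.py
with open('letters_dict.pickle', 'wb') as handle:
    pickle.dump(letters_dict, handle)
```
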
48 | ### Running the document similarity
49 |
50 | You can get the letters most similar to a specific letter year by running
51 | `doc_sim_main.py` (see the example below):
52 |
53 | ```
54 | python doc_sim_main.py --algorithm <algorithm> --distance <distance> --path <path> --target <target> --number <number> --pretrained <pretrained>
55 | ```
56 |
57 | Where:
58 | * algorithm: One of tfidf, word2vec, doc2vect, or transformer
59 | * distance: Either cosine or euclidean
60 | * path: Pickle path to the letters dict
61 | * target: The target letter year
62 | * number: The number of letters to return
63 | * pretrained: The pretrained model to use in transformers
64 |
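For example, to get the five letters most similar to the 2008 letter using tfidf
and cosine similarity (the target year and count here are just illustrative):

```
python doc_sim_main.py --algorithm tfidf --distance cosine --path letters_dict.pickle --target 2008 --number 5
```
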
65 | ## Final Considerations and acknowledgments
66 |
67 | To see the full analysis behind this code, check out my Medium posts:
68 | https://medium.com/analytics-vidhya/best-nlp-algorithms-to-get-document-similarity-a5559244b23b
69 | https://medium.com/analytics-vidhya/using-nlp-to-get-inside-warren-buffet-mind-part-2-8e3557810a39
70 | https://medium.com/analytics-vidhya/using-nlp-to-get-inside-warren-buffet-mind-part-i-666d717d0c2e
71 |
--------------------------------------------------------------------------------
/document_similarity.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import nltk
3 | import re
4 | import pandas as pd
5 | import numpy as np
6 | from nltk.tokenize import word_tokenize
7 | from nltk.stem import WordNetLemmatizer
8 | from nltk.stem.porter import PorterStemmer
9 | from nltk.corpus import stopwords
10 | from sklearn.pipeline import Pipeline, FeatureUnion
11 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
12 | from sklearn.metrics.pairwise import euclidean_distances
13 | from gensim.models.word2vec import Word2Vec
14 | import constants as const
15 | from gensim.models.doc2vec import Doc2Vec
16 | from gensim.models.doc2vec import TaggedDocument
17 | from sentence_transformers import SentenceTransformer
18 | nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])
19 |
20 |
21 | def tokenize(text):
22 | """Tokenize the text
23 |
24 | Parameters
25 | ----------
26 | text: String
27 | The message to be tokenized
28 |
29 | Returns
30 | -------
31 | List
32 | List with the clean tokens
33 | """
34 | text = text.lower()
35 | text = re.sub("[^a-zA-Z0-9]", " ", text)
36 | tokens = word_tokenize(text)
37 | tokens = [w for w in tokens if w not in stopwords.words(const.ENGLISH)]
38 |
39 | lemmatizer = WordNetLemmatizer()
40 | stemmer = PorterStemmer()
41 |
42 | clean_tokens_list = []
43 | for tok in tokens:
44 | lemmatizer_tok = lemmatizer.lemmatize(tok).strip()
45 | clean_tok = stemmer.stem(lemmatizer_tok)
46 | clean_tokens_list.append(clean_tok)
47 |
48 | return clean_tokens_list
49 |
50 |
51 | def build_model():
52 | """Build the model
53 |
54 | Returns
55 | -------
56 | sklearn.pipeline.Pipeline
57 | The model
58 | """
59 | pipeline = Pipeline([
60 | (const.FEATURES, FeatureUnion([
61 |
62 | (const.TEXT_PIPELINE, Pipeline([
63 | (const.VECT, CountVectorizer(tokenizer=tokenize)),
64 | (const.TFIDF, TfidfTransformer())
65 | ]))
66 | ]))])
67 |
68 | return pipeline
69 |
70 |
71 | def get_avg_document_vector(model, df, year):
72 | """Get the a vector representation of a document using word2vec
73 |
74 | Parameters
75 | ----------
76 | model: Word2Vec
77 | Trained Word2Vec model
78 | df: pandas DataFrame
79 |         Pandas DataFrame with a column containing the tokens
80 | year: int
81 | The target year
82 |
83 | Returns
84 | -------
85 |     Tuple
86 |         The average document vector and the number of words that are not in the
87 |         model vocabulary
88 | """
89 | word_vecs = []
90 | count = 0
91 | for word in df[const.TOKENIZED].loc[year]:
92 | try:
93 |             vector = model.wv[word]  # KeyedVectors lookup (works on gensim 3.x and 4.x)
94 | word_vecs.append(vector)
95 | except KeyError:
96 | count += 1
97 | pass
98 | vector_avg = np.mean(word_vecs, axis=0)
99 |
100 | return vector_avg, count
101 |
102 |
103 | def get_letters_df(letters_dict_pickle):
104 | """Get the letters Pandas Dataframe
105 |
106 | Parameters
107 | ----------
108 | letters_dict_pickle: string
109 | Path to the dict with the letters text
110 |
111 | Returns
112 | -------
113 | Pandas DataFrame
114 |         Pandas DataFrame with the letter text and a column with the tokens
115 | """
116 | with open(letters_dict_pickle, 'rb') as handle:
117 | letters_dict = pickle.load(handle)
118 |
119 | letters_df = pd.DataFrame(letters_dict, index=[const.LETTER_TEXT]).T
120 | letters_df[const.TOKENIZED] = letters_df[const.LETTER_TEXT].apply(tokenize)
121 |
122 | return letters_df
123 |
124 |
125 | def get_most_similar_docs(pairwise_similarities, letter_year, distance_method, transformers=False, initial_year=1977):
126 | """Get the most similar letters to a target one
127 |
128 | Parameters
129 | ----------
130 | pairwise_similarities: np.array
131 | Numpy array of the pairwise similarities
132 | letter_year: int
133 | The target letter year
134 | distance_method: string
135 | Euclidean or cosine
136 | transformers: boolean
137 | True if you are calling from transformers or False otherwise
138 | initial_year: int
139 | The initial letter year
140 |
141 | Returns
142 | -------
143 | List
144 |         List with the letter years sorted in descending order of similarity
145 | """
146 | letter_i = letter_year - initial_year
147 | if distance_method == const.COSINE:
148 | if transformers:
149 | similarity_index = np.array(np.argsort(-pairwise_similarities[letter_i]))
150 | else:
151 | similarity_index = np.array(np.argsort(-pairwise_similarities[letter_i].todense()))[0]
152 | else:
153 | similarity_index = np.argsort(pairwise_similarities[letter_i])
154 |
155 | similar_docs_sorted = []
156 | for index in similarity_index:
157 | if index == letter_i:
158 | continue
159 | similar_docs_sorted.append(index + initial_year)
160 |
161 | return similar_docs_sorted
162 |
163 |
164 | def get_pipe_vector(letters_df):
165 | """Get the tfidf vector
166 |
167 | Parameters
168 | ----------
169 | letters_df: pandas DataFrame
170 | The pandas Dataframe with text from the letters
171 |
172 | Returns
173 | -------
174 | Np.array
175 | The tfidf vector representation of the text
176 | """
177 | pipeline = build_model()
178 | pipeline.fit(letters_df[const.LETTER_TEXT])
179 | vectors = pipeline.transform(letters_df[const.LETTER_TEXT])
180 |
181 | return vectors
182 |
183 |
184 | def get_tfidf(letters_df, year, n, distance):
185 | """Get the tfidf most similar years
186 |
187 | Parameters
188 | ----------
189 | letters_df: pandas DataFrame
190 | The pandas Dataframe with text from the letters
191 | year: int
192 | The target letter year
193 | n: int
194 | The number of letters to return
195 | distance: string
196 | Euclidean or cosine
197 |
198 | Returns
199 | -------
200 | List
201 |         List with the letter years sorted in descending order of similarity
202 | """
203 | vectors = get_pipe_vector(letters_df)
204 | if distance == const.COSINE:
205 | pairwise_dis = vectors @ vectors.T
206 | else:
207 | pairwise_dis = euclidean_distances(vectors)
208 |
209 | return get_most_similar_docs(pairwise_dis, year, distance)[:n]
210 |
211 |
212 | def get_most_similar_docs_docs2vec(letter_year, model, corpus, n, initial_year=1977):
213 | """Get the docs2vec most similar years
214 |
215 | Parameters
216 | ----------
217 | letter_year: int
218 | The target letter year
219 |     model: Doc2Vec
220 |         The trained Doc2Vec model
221 | corpus: List
222 | TaggedDocument list
223 | n: int
224 | The number of letters to return
225 | initial_year: int
226 | The initial letter year
227 |
228 | Returns
229 | -------
230 | List
231 |         List with the letter years sorted in descending order of similarity
232 | """
233 | doc_id = letter_year - initial_year
234 | inferred_vector = model.infer_vector(corpus[doc_id].words)
235 | sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
236 | sims = [index + initial_year for index, _ in sims]
237 |
238 | return sims[1:n + 1]
239 |
240 |
241 | def get_doc2vec(letters_df, year, n):
242 | """Get the doc2vec most similar years
243 |
244 | Parameters
245 | ----------
246 | letters_df: pandas DataFrame
247 | The pandas Dataframe with text from the letters
248 | year: int
249 | The target letter year
250 | n: int
251 | The number of letters to return
252 |
253 | Returns
254 | -------
255 | List
256 |         List with the letter years sorted in descending order of similarity
257 | """
258 | EPOCHS = 40
259 | doc2_model = Doc2Vec(min_count=2)
260 | corpus = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(list(letters_df[const.TOKENIZED]))]
261 | doc2_model.build_vocab(corpus)
262 | doc2_model.train(corpus, total_examples=doc2_model.corpus_count, epochs=EPOCHS)
263 | return get_most_similar_docs_docs2vec(year, doc2_model, corpus, n)
264 |
265 |
266 | def get_word2vec(letters_df, year, n, distance):
267 | """Get the word2vec most similar years
268 |
269 | Parameters
270 | ----------
271 | letters_df: pandas DataFrame
272 | The pandas Dataframe with text from the letters
273 | year: int
274 | The target letter year
275 | n: int
276 | The number of letters to return
277 | distance: string
278 | Euclidean or cosine
279 |
280 | Returns
281 | -------
282 | List
283 |         List with the letter years sorted in descending order of similarity
284 | """
285 | model = Word2Vec(letters_df[const.TOKENIZED])
286 | target, _ = get_avg_document_vector(model, letters_df, year)
287 |     candidate_years = [y for y in letters_df.index if y != year]
288 |     distances = []
289 |     for y in candidate_years:
290 |         vector_year, _ = get_avg_document_vector(model, letters_df, y)
291 |         if distance == const.COSINE:
292 |             distances.append(target @ vector_year / (np.linalg.norm(target) * np.linalg.norm(vector_year)))
293 |         else:
294 |             distances.append(np.linalg.norm(target - vector_year))
295 |     # Index candidate_years (not letters_df.index) so years stay aligned with distances
296 |     distances = np.array(distances)
297 |     if distance == const.COSINE:
298 |         return list(np.array(candidate_years)[(-distances).argsort()][:n])
299 |     else:
300 |         return list(np.array(candidate_years)[distances.argsort()][:n])
301 |
302 |
303 | def get_transformers(pre_trained_model, letters_df, year, n, distance):
304 | """Get the word2vec most similar years
305 |
306 | Parameters
307 | ----------
308 | pre_trained_model: string
309 | The name of the pre trained transform
310 | letters_df: pandas DataFrame
311 | The pandas Dataframe with text from the letters
312 | year: int
313 | The target letter year
314 | n: int
315 | The number of letters to return
316 | distance: string
317 | Euclidean or cosine
318 |
319 | Returns
320 | -------
321 | List
322 |         List with the letter years sorted in descending order of similarity
323 | """
324 | model = SentenceTransformer(pre_trained_model)
325 |     embeddings = model.encode(list(letters_df[const.LETTER_TEXT]))  # encode raw text; SentenceTransformer expects strings
326 | if distance == const.COSINE:
327 |         pairwise = (embeddings @ embeddings.T) / np.outer(np.linalg.norm(embeddings, axis=1), np.linalg.norm(embeddings, axis=1))
328 | return get_most_similar_docs(pairwise, year, const.COSINE, transformers=True)[:n]
329 | else:
330 | euclidean = euclidean_distances(embeddings)
331 | return get_most_similar_docs(euclidean, year, const.EUCLIDEAN)[:n]
332 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | """ Utils module """
2 |
3 | from urllib.request import urlopen
4 | from pathlib import Path
5 | import re
6 | from bs4 import BeautifulSoup
7 | from tika import parser
8 | import nltk
9 | from nltk.tokenize import word_tokenize
10 | from nltk.stem import WordNetLemmatizer
11 | from nltk.corpus import stopwords
12 | from nltk.sentiment import SentimentIntensityAnalyzer
13 | from transformers import pipeline
14 | import pandas as pd
15 | import seaborn as sns
16 | import matplotlib.pyplot as plt
17 | from wordcloud import WordCloud
18 |
19 | nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger',
20 | 'stopwords', 'vader_lexicon'])
21 |
22 |
23 | def get_text_from_html(url, tags_to_ignore=["script", "style"]):
24 | """Extract the text from a webpage
25 |
26 | Parameters
27 | ----------
28 | url: String
29 | The url
30 | tags_to_ignore: List
31 | List with the tags to skip when getting the text
32 |
33 | Returns
34 | -------
35 | String
36 |         A string with the text from the webpage
37 | """
38 | text = ''
39 | try:
40 | html = urlopen(url).read()
41 | soup = BeautifulSoup(html, features="html.parser")
42 | for script in soup(tags_to_ignore):
43 | script.extract() # rip it out
44 |
45 | # get text
46 | text = soup.get_text()
47 |
48 | # break into lines and remove leading and trailing space on each
49 | lines = (line.strip() for line in text.splitlines())
50 | # break multi-headlines into a line each
51 | chunks = (phrase.strip()
52 |                   for line in lines for phrase in line.split("  "))
53 | # drop blank lines
54 | text = '\n'.join(chunk for chunk in chunks if chunk)
55 |     except Exception:
56 | print(f'Could not open the url {url}')
57 |
58 | return text
59 |
60 |
61 | def get_text_from_pdf(path):
62 | """Extract the text from a pdf
63 |
64 | Parameters
65 | ----------
66 | path: String
67 | Path to a pdf file
68 |
69 | Returns
70 | -------
71 | String
72 |         A string with the text from the pdf file
73 | """
74 | text = ''
75 | try:
76 | raw = parser.from_file(path)
77 | text = raw['content']
78 |     except Exception:
79 | print(f'Could not open the path {path}')
80 |
81 | return text
82 |
83 |
84 | def get_letters_corpus_dict(letters_pdf_path, init_year=1977, end_year=2020):
85 | """Build the dict where the keys are the years and the values are
86 |     the text from the Warren Buffett letters
87 |
88 | Parameters
89 | ----------
90 | letters_pdf_path: String
91 | Path to the directory containing the pdf letters
92 | init_year: int
93 | The initial year to start getting the letters
94 | end_year: int
95 |         The final year to get the letters (inclusive)
96 |
97 | Returns
98 | -------
99 | Dictionary
100 | Dict where the keys are the years and the values are
101 |         the text from the Warren Buffett letters
102 | """
103 | if init_year < 1977 or end_year > 2020:
104 | print('The range supported is between 1977 and 2020')
105 | return {}
106 |
107 | letters_dict = dict()
108 | letters_years = [year for year in range(init_year, end_year + 1)]
109 | for year in letters_years:
110 | if year >= 2000:
111 | filename = f'{year}ltr.pdf'
112 | path = Path(letters_pdf_path).joinpath(filename)
113 | letter_corpus = get_text_from_pdf(str(path))
114 | else:
115 | if year > 1997:
116 | url = f'https://www.berkshirehathaway.com/letters/{year}htm.html'
117 | else:
118 | url = f'https://www.berkshirehathaway.com/letters/{year}.html'
119 | letter_corpus = get_text_from_html(url)
120 |
121 | letters_dict[year] = letter_corpus
122 |
123 | return letters_dict
124 |
125 |
126 | def draw_heatmap(df, figsize=(15, 6), cmap='YlOrBr', ylabel='', xlabel='', title=''):
127 | """Draw a heatmap using seaborn
128 |
129 | Parameters
130 | ----------
131 | df: Pandas Dataframe
132 | Pandas Dataframe with the data to show at the heatmap
133 | figsize: Tuple
134 | The plot figure size
135 | cmap: matplotlib colormap name or object, or list of colors, optional
136 | The mapping from data values to color space. If not provided,
137 | the default will depend on whether center is set.
138 | ylabel: String
139 | The y label of the plot
140 | xlabel: String
141 | The x label of the plot
142 | title: String
143 | The title of the plot
144 | """
145 | _, ax = plt.subplots(figsize=figsize)
146 | sns.heatmap(df, cmap=cmap, annot=False)
147 | ax.set_ylabel(ylabel, fontsize=15)
148 | ax.set_xlabel(xlabel, fontsize=15)
149 | ax.set_title(title, fontsize=20, weight='bold')
150 | plt.show()
151 |
152 |
153 | def tokenize(text, freq_words=[]):
154 | """Tokenize the text
155 |
156 | Parameters
157 | ----------
158 | text: String
159 | The message to be tokenized
160 | freq_words: List
161 |         List with words that appear frequently in the text
162 |
163 | Returns
164 | -------
165 | List
166 | List with the clean tokens
167 | """
168 | text = text.lower()
169 | text = re.sub("[^a-zA-Z]", " ", text)
170 | tokens = word_tokenize(text)
171 | tokens = [w for w in tokens if w not in stopwords.words('english')]
172 | tokens = [w for w in tokens if w not in freq_words]
173 |
174 | lemmatizer = WordNetLemmatizer()
175 |
176 | clean_tokens_list = []
177 | for tok in tokens:
178 | lemmatizer_tok = lemmatizer.lemmatize(tok).strip()
179 | clean_tokens_list.append(lemmatizer_tok)
180 |
181 | return clean_tokens_list
182 |
183 |
184 | def get_most_frequent_combinatation(tokens, freq=10, num_word_combination=-1):
185 | """Get a dict with the most frequent onegram, bigram, trigram and quadgrams
186 |
187 | Parameters
188 | ----------
189 |     tokens: List
190 |         List with the tokens
191 |     freq: Int
192 |         How many combinations to return
193 | num_word_combination: Int
194 | 1 to onegram
195 | 2 to bigram
196 | 3 to trigram
197 | 4 to quadgrams
198 | -1 All
199 |
200 | Returns
201 | -------
202 | Dict
203 | Dict with the frequencies
204 | """
205 |     if num_word_combination < -1 or num_word_combination > 4:
206 |         raise ValueError(
207 |             f'num_word_combination must be between -1 and 4; the value passed was {num_word_combination}')
208 |
209 | freq_dict = {}
210 | if num_word_combination in [1, -1]:
211 | freq_dist = nltk.FreqDist(tokens)
212 | freq_dict['FreqDist_onegram'] = freq_dist.most_common(freq)
213 |
214 | if num_word_combination in [2, -1]:
215 | bigrams = nltk.collocations.BigramCollocationFinder.from_words(tokens)
216 | freq_dict['FreqDist_bigram'] = bigrams.ngram_fd.most_common(freq)
217 |
218 | if num_word_combination in [3, -1]:
219 | trigram = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
220 | freq_dict['FreqDist_trigram'] = trigram.ngram_fd.most_common(freq)
221 |
222 | if num_word_combination in [4, -1]:
223 | quadgrams = nltk.collocations.QuadgramCollocationFinder.from_words(
224 | tokens)
225 | freq_dict['FreqDist_quadgrams'] = quadgrams.ngram_fd.most_common(freq)
226 |
227 | return freq_dict
228 |
229 |
230 | def drawn_wordcloud(corpus, save_path, figsize=(15, 6)):
231 | """Get a dict with the most frequent onegram, bigram, trigram and quadgrams
232 |
233 | Parameters
234 | ----------
235 | corpus: List
236 | List with the words
237 | save_path: String
238 | Path to the file where the wordcloud will be saved at
239 | figsize: Tuple
240 | The figsize
241 | """
242 | _, _ = plt.subplots(figsize=figsize)
243 | combined_text = " ".join(text for text in corpus)
244 | wordcloud = WordCloud().generate(combined_text)
245 | plt.imshow(wordcloud, interpolation='bilinear')
246 | plt.axis("off")
247 | plt.savefig(save_path)
248 | plt.show()
249 |
250 |
251 | def tokenize_sent(text):
252 | """Tokenize the sentence
253 |
254 | Parameters
255 | ----------
256 | text: String
257 | The text to be tokenized
258 |
259 | Returns
260 | -------
261 | List
262 | List with the tokenized sentences
263 | """
264 | sentence_list = nltk.tokenize.sent_tokenize(text)
265 | tokenized_list = []
266 | for sentence in sentence_list:
267 | sentence_after_regex = re.sub("[^a-z0-9A-Z]", " ", sentence)
268 | # Remove sentences where there was only numbers
269 | if len(re.sub("[^a-zA-Z]", "", sentence_after_regex)) > 6:
270 | tokenized_list.append(sentence_after_regex)
271 |
272 | return tokenized_list
273 |
274 |
275 | def calculate_text_sentiment_using_transform(sentence_list):
276 | """Calculate the test sentiment using transforms
277 |
278 | Parameters
279 | ----------
280 | sentence_list: List
281 |         List with the tokenized sentences
282 |
283 | Returns
284 | -------
285 |     Dictionary
286 | Dict with the cumulative sentiment of all the sentences at the list
287 | """
288 | sentiment_dict = {'POSITIVE': 0, 'NEGATIVE': 0}
289 | classifier = pipeline('sentiment-analysis')
290 | for sentence in sentence_list:
291 | sentiment_result = classifier(sentence)
292 | sentiment_dict[sentiment_result[0]['label']
293 | ] += sentiment_result[0]['score']
294 |
295 | return sentiment_dict
296 |
297 |
298 | def calculate_text_sia(sentence_list):
299 | """Calculate the test sentiment using Sentiment Intensity Analyzer
300 |
301 | Parameters
302 | ----------
303 | sentence_list: List
304 |         List with the tokenized sentences
305 |
306 | Returns
307 | -------
308 |     Dictionary
309 | Dict with the cumulative sentiment of all the sentences at the list
310 | """
311 | sentiment_dict = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}
312 | sia = SentimentIntensityAnalyzer()
313 | for sentence in sentence_list:
314 | sentiment_result = sia.polarity_scores(sentence)
315 | for k in sentiment_result:
316 | sentiment_dict[k] += sentiment_result[k]
317 |
318 | return sentiment_dict
319 |
320 |
321 | def get_sentiment_analysis_df(letters_dict,
322 | calculate_text_sentiment,
323 | tokenize_sent,
324 | normalized=True):
325 | """Get the DataFrame with the sentiment of each Warren letter
326 |
327 | Parameters
328 | ----------
329 |     letters_dict: Dictionary
330 | Dict with the letters text
331 | calculate_text_sentiment: function
332 | Function used to calculate the sentiment of the text
333 | tokenize_sent: function
334 | Function to tokenize the text into a list of sentences
335 | normalized: bool
336 |         Whether or not to normalize the values of the DataFrame
337 |
338 | Returns
339 | -------
340 | Pandas DataFrame
341 | Pandas DataFrame with the sentiment analysis for each letter
342 | """
343 | sentiment_analysis_dict = {}
344 | for k in letters_dict:
345 | sentiment_analysis_dict[k] = calculate_text_sentiment(
346 | tokenize_sent(letters_dict[k]))
347 | sentiment_analysis_df = pd.DataFrame(sentiment_analysis_dict)
348 | if normalized:
349 | return sentiment_analysis_df / sentiment_analysis_df.sum(axis=0)
350 | return sentiment_analysis_df
351 |
352 |
353 | def get_answer_using_qa(nlp, question, context):
354 | """Get answer using a classifier trained with the QA technique
355 |
356 | Parameters
357 | ----------
358 | nlp: Pipeline
359 | Trained QA Pipeline
360 | question: String
361 | Question that the model will answer
362 | context: String
363 | The Context of the question
364 |
365 | Returns
366 | -------
367 | Tuple
368 | The answer, the score, the start position of the answer at the text and the final
369 | position of the answer at the text
370 | """
371 | result = nlp(question=question, context=context)
372 |
373 | return result['answer'], round(result['score'], 4), result['start'], result['end']
374 |
375 |
376 | def format_spines(ax, right_border=True):
377 | """
378 |     Set up the borders of an axis and personalize their colors
379 |
380 | Parameters
381 | ----------
382 |     ax: Matplotlib axis
383 |         The plot axis
384 |     right_border: Boolean
385 |         Whether or not to draw the right border
386 | """
387 | # Setting up colors
388 | ax.spines['bottom'].set_color('#CCCCCC')
389 | ax.spines['left'].set_color('#CCCCCC')
390 | ax.spines['top'].set_visible(False)
391 | if right_border:
392 | ax.spines['right'].set_color('#CCCCCC')
393 | else:
394 | ax.spines['right'].set_color('#FFFFFF')
395 | ax.patch.set_facecolor('#FFFFFF')
396 |
397 |
398 | def get_ngram_plot_data(df, type, sentiment):
399 | """Format the data to the ngram plot
400 |
401 | Parameters
402 | ----------
403 | df: Pandas DataFrame
404 | Pandas dataframe with the ngrams sentiment data
405 | type: String
406 | Type of the ngram to filter
407 | sentiment: String
408 | POSITIVE or NEGATIVE
409 |
410 | Returns
411 | -------
412 | Pandas Dataframe
413 | The dataframe filtered
414 | """
415 | return df.query("type == @type and sentiment == @sentiment").sort_values('score', ascending=False)
416 |
--------------------------------------------------------------------------------