├── letters_dict.pickle
├── sentiment_analysis_dict.pickle
├── requirements.txt
├── constants.py
├── LICENSE
├── doc_sim_main.py
├── README.md
├── document_similarity.py
└── utils.py

/letters_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jairNeto/warren_buffet_letters/HEAD/letters_dict.pickle
--------------------------------------------------------------------------------
/sentiment_analysis_dict.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jairNeto/warren_buffet_letters/HEAD/sentiment_analysis_dict.pickle
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | pandas==1.1.5
 2 | tika==1.24
 3 | matplotlib==3.2.2
 4 | nltk==3.5
 5 | seaborn==0.11.0
 6 | transformers==4.5.1
 7 | numpy==1.19.5
 8 | wordcloud==1.8.1
 9 | beautifulsoup4==4.9.3
10 | scikit_learn==0.24.1
11 | gensim  # imported by document_similarity.py; version left unpinned here
12 | sentence-transformers  # imported by document_similarity.py; version left unpinned here
--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
 1 | ENGLISH = 'english'
 2 | FEATURES = 'features'
 3 | TEXT_PIPELINE = 'text_pipeline'
 4 | VECT = 'vect'
 5 | TFIDF = 'tfidf'
 6 | TOKENIZED = 'tokenized'
 7 | LETTER_TEXT = 'letter_text'
 8 | COSINE = 'cosine'
 9 | EUCLIDEAN = 'euclidean'
10 | WORD2VEC = 'word2vec'
11 | DOC2VECT = 'doc2vect'
12 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Jair Guedes Ferreira Neto
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/doc_sim_main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from document_similarity import get_letters_df, get_tfidf, get_word2vec, get_doc2vec, get_transformers
 3 | import constants as const
 4 | import time
 5 | 
 6 | 
 7 | def main(algorithm, distance, letters_dict_pickle, year, n, pre_trained_model=''):
 8 |     """Main function
 9 | 
10 |     Parameters
11 |     ----------
12 |     algorithm: string
13 |         The chosen algorithm to compute the similarity/distance ('tfidf', 'word2vec', 'doc2vect' or 'transformer').
14 |     distance: string
15 |         euclidean or cosine.
16 |     letters_dict_pickle: string
17 |         Pickle path to the letters dict.
18 |     year: int
19 |         The target letter year.
20 |     n: int
21 |         The number of letters to return.
22 |     pre_trained_model: string
23 |         The pretrained model to use in transformers.
24 | 
25 |     Returns
26 |     -------
27 |     List
28 |         The n letter years most similar to the target year, sorted by similarity.
29 |     """
30 |     letters_df = get_letters_df(letters_dict_pickle)
31 |     if algorithm == const.TFIDF:
32 |         return get_tfidf(letters_df, year, n, distance)
33 |     elif algorithm == const.WORD2VEC:
34 |         return get_word2vec(letters_df, year, n, distance)
35 |     elif algorithm == const.DOC2VECT:
36 |         return get_doc2vec(letters_df, year, n)
37 |     else:
38 |         return get_transformers(pre_trained_model, letters_df, year, n, distance)
39 | 
40 | 
41 | if __name__ == '__main__':
42 |     PARSER = argparse.ArgumentParser(description="Execute the distance similarity")
43 |     PARSER.add_argument("-alg", "--algorithm", help="The chosen algorithm to"
44 |                         " compute the similarity/distance.")
45 |     PARSER.add_argument("-dist", "--distance", help="euclidean or cosine.")
46 |     PARSER.add_argument(
47 |         "-p", "--path", help="Pickle path to the letters dict.")
48 |     PARSER.add_argument(
49 |         "-t", "--target", help="The target letter year.")
50 |     PARSER.add_argument(
51 |         "-n", "--number", help="The number of letters to return.")
52 |     PARSER.add_argument(
53 |         "-pre", "--pretrained", help="The pretrained model to use in transformers.")
54 | 
55 |     ARGS = PARSER.parse_args()
56 |     start = time.time()
57 |     print(main(ARGS.algorithm, ARGS.distance, ARGS.path, int(ARGS.target), int(ARGS.number), ARGS.pretrained))
58 |     print(f'Execution time = {time.time() - start} seconds.')
59 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Warren Buffett Letters Analysis
 2 | 
 3 | ### Table of Contents
 4 | 
 5 | 1. [Overview](#overview)
 6 | 2. [Installation](#installation)
 7 | 3. [Running](#running)
 8 | 4. [Final Considerations](#final-considerations-and-acknowledgments)
 9 | 
10 | ## Overview
11 | 
12 | The goal of this project is to use NLP techniques such as question answering,
13 | sentiment analysis, word clouds, document similarity and others to extract meaningful insights from
14 | Warren Buffett's annual letters to the Berkshire Hathaway shareholders.
15 | 
16 | ## Installation
17 | 
18 | Create a virtual environment named **warren_venv**.
19 | 
20 | ```
21 | $ python3 -m venv warren_venv -- for Linux and macOS
22 | $ python -m venv warren_venv -- for Windows
23 | ```
24 | 
25 | After that, activate the Python virtual environment
26 | 
27 | ```
28 | $ source warren_venv/bin/activate -- for Linux and macOS
29 | $ warren_venv\Scripts\activate -- for Windows
30 | ```
31 | 
32 | Install the requirements
33 | 
34 | ```
35 | $ pip install -r requirements.txt
36 | ```
37 | 
38 | ## Running
39 | 
40 | ### Running the QA notebook
41 | 
42 | First, download the letters from 2000 onward at
43 | https://www.berkshirehathaway.com/letters/letters.html. Then point the
44 | letters_pdf_path parameter of the function get_letters_corpus_dict to the directory
45 | containing the downloaded letters, and run the desired cells of
46 | the notebook.
47 | 
48 | ### Running the document similarity
49 | 
50 | You can get the letters most similar to a specific letter year by running
51 | doc_sim_main.py.
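For example, to list the five letters closest to the 2008 letter using TF-IDF and cosine similarity (the flag values below are only illustrative; letters_dict.pickle ships with the repository):

```
python doc_sim_main.py --algorithm tfidf --distance cosine --path letters_dict.pickle --target 2008 --number 5
```

The general form is: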
52 | 
53 | ```
54 | python doc_sim_main.py --algorithm <algorithm> --distance <distance> --path <path> --target <target> --number <number> --pretrained <pretrained>
55 | ```
56 | 
57 | Where:
58 | * algorithm: One of tfidf, word2vec, doc2vect or transformer
59 | * distance: Either cosine or euclidean
60 | * path: Pickle path to the letters dict
61 | * target: The target letter year
62 | * number: The number of letters to return
63 | * pretrained: The pretrained model to use with the transformer algorithm
64 | 
65 | ## Final Considerations and Acknowledgments
66 | 
67 | To see the full analysis behind this code, check my Medium posts:
68 | https://medium.com/analytics-vidhya/best-nlp-algorithms-to-get-document-similarity-a5559244b23b
69 | https://medium.com/analytics-vidhya/using-nlp-to-get-inside-warren-buffet-mind-part-2-8e3557810a39
70 | https://medium.com/analytics-vidhya/using-nlp-to-get-inside-warren-buffet-mind-part-i-666d717d0c2e
71 | 
--------------------------------------------------------------------------------
/document_similarity.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import nltk
 3 | import re
 4 | import pandas as pd
 5 | import numpy as np
 6 | from nltk.tokenize import word_tokenize
 7 | from nltk.stem import WordNetLemmatizer
 8 | from nltk.stem.porter import PorterStemmer
 9 | from nltk.corpus import stopwords
10 | from sklearn.pipeline import Pipeline, FeatureUnion
11 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
12 | from sklearn.metrics.pairwise import euclidean_distances
13 | from gensim.models.word2vec import Word2Vec
14 | import constants as const
15 | from gensim.models.doc2vec import Doc2Vec
16 | from gensim.models.doc2vec import TaggedDocument
17 | from sentence_transformers import SentenceTransformer
18 | nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])
19 | 
20 | 
21 | def tokenize(text):
22 |     """Tokenize the text
23 | 
24 |     Parameters
25 |     ----------
26 |     text: String
27 |         The message to be tokenized
28 | 
29 |     Returns
30 |     -------
31 |     List
32 |         List with the clean tokens
33 |     """
34 |     text = text.lower()
35 |     text = re.sub("[^a-zA-Z0-9]", " ", text)
36 |     tokens = word_tokenize(text)
37 |     tokens = [w for w in tokens if w not in stopwords.words(const.ENGLISH)]
38 | 
39 |     lemmatizer = WordNetLemmatizer()
40 |     stemmer = PorterStemmer()
41 | 
42 |     clean_tokens_list = []
43 |     for tok in tokens:
44 |         lemmatizer_tok = lemmatizer.lemmatize(tok).strip()
45 |         clean_tok = stemmer.stem(lemmatizer_tok)
46 |         clean_tokens_list.append(clean_tok)
47 | 
48 |     return clean_tokens_list
49 | 
50 | 
51 | def build_model():
52 |     """Build the model
53 | 
54 |     Returns
55 |     -------
56 |     sklearn.pipeline.Pipeline
57 |         The model
58 |     """
59 |     pipeline = Pipeline([
60 |         (const.FEATURES, FeatureUnion([
61 | 
62 |             (const.TEXT_PIPELINE, Pipeline([
63 |                 (const.VECT, CountVectorizer(tokenizer=tokenize)),
64 |                 (const.TFIDF, TfidfTransformer())
65 |             ]))
66 |         ]))])
67 | 
68 |     return pipeline
69 | 
70 | 
71 | def get_avg_document_vector(model, df, year):
72 |     """Get a vector representation of a document using word2vec
73 | 
74 |     Parameters
75 |     ----------
76 |     model: Word2Vec
77 |         Trained Word2Vec model
78 |     df: pandas DataFrame
79 |         Pandas DataFrame with a column with the tokens
80 |     year: int
81 |         The target year
82 | 
83 |     Returns
84 |     -------
85 |     Tuple
86 |         The vector representation of the document and the number of words that are not in the model
87 |         vocabulary
88 |     """
89 |     word_vecs = []
90 |     count = 0
91 |     for word in df[const.TOKENIZED].loc[year]:
92 | 
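        # Words that are not in the Word2Vec vocabulary raise a KeyError below and are
        # simply skipped; `count` tracks how many words were skipped.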
try: 93 | vector = model[word] 94 | word_vecs.append(vector) 95 | except KeyError: 96 | count += 1 97 | pass 98 | vector_avg = np.mean(word_vecs, axis=0) 99 | 100 | return vector_avg, count 101 | 102 | 103 | def get_letters_df(letters_dict_pickle): 104 | """Get the letters Pandas Dataframe 105 | 106 | Parameters 107 | ---------- 108 | letters_dict_pickle: string 109 | Path to the dict with the letters text 110 | 111 | Returns 112 | ------- 113 | Pandas DataFrame 114 | Pandas DataFrame with a columns with the tokens 115 | """ 116 | with open(letters_dict_pickle, 'rb') as handle: 117 | letters_dict = pickle.load(handle) 118 | 119 | letters_df = pd.DataFrame(letters_dict, index=[const.LETTER_TEXT]).T 120 | letters_df[const.TOKENIZED] = letters_df[const.LETTER_TEXT].apply(tokenize) 121 | 122 | return letters_df 123 | 124 | 125 | def get_most_similar_docs(pairwise_similarities, letter_year, distance_method, transformers=False, initial_year=1977): 126 | """Get the most similar letters to a target one 127 | 128 | Parameters 129 | ---------- 130 | pairwise_similarities: np.array 131 | Numpy array of the pairwise similarities 132 | letter_year: int 133 | The target letter year 134 | distance_method: string 135 | Euclidean or cosine 136 | transformers: boolean 137 | True if you are calling from transformers or False otherwise 138 | initial_year: int 139 | The initial letter year 140 | 141 | Returns 142 | ------- 143 | List 144 | List with the letter year sorted descending by similarity 145 | """ 146 | letter_i = letter_year - initial_year 147 | if distance_method == const.COSINE: 148 | if transformers: 149 | similarity_index = np.array(np.argsort(-pairwise_similarities[letter_i])) 150 | else: 151 | similarity_index = np.array(np.argsort(-pairwise_similarities[letter_i].todense()))[0] 152 | else: 153 | similarity_index = np.argsort(pairwise_similarities[letter_i]) 154 | 155 | similar_docs_sorted = [] 156 | for index in similarity_index: 157 | if index == letter_i: 158 | continue 159 | similar_docs_sorted.append(index + initial_year) 160 | 161 | return similar_docs_sorted 162 | 163 | 164 | def get_pipe_vector(letters_df): 165 | """Get the tfidf vector 166 | 167 | Parameters 168 | ---------- 169 | letters_df: pandas DataFrame 170 | The pandas Dataframe with text from the letters 171 | 172 | Returns 173 | ------- 174 | Np.array 175 | The tfidf vector representation of the text 176 | """ 177 | pipeline = build_model() 178 | pipeline.fit(letters_df[const.LETTER_TEXT]) 179 | vectors = pipeline.transform(letters_df[const.LETTER_TEXT]) 180 | 181 | return vectors 182 | 183 | 184 | def get_tfidf(letters_df, year, n, distance): 185 | """Get the tfidf most similar years 186 | 187 | Parameters 188 | ---------- 189 | letters_df: pandas DataFrame 190 | The pandas Dataframe with text from the letters 191 | year: int 192 | The target letter year 193 | n: int 194 | The number of letters to return 195 | distance: string 196 | Euclidean or cosine 197 | 198 | Returns 199 | ------- 200 | List 201 | List with the letter year sorted descending by similarity 202 | """ 203 | vectors = get_pipe_vector(letters_df) 204 | if distance == const.COSINE: 205 | pairwise_dis = vectors @ vectors.T 206 | else: 207 | pairwise_dis = euclidean_distances(vectors) 208 | 209 | return get_most_similar_docs(pairwise_dis, year, distance)[:n] 210 | 211 | 212 | def get_most_similar_docs_docs2vec(letter_year, model, corpus, n, initial_year=1977): 213 | """Get the docs2vec most similar years 214 | 215 | Parameters 216 | ---------- 217 | letter_year: 
int 218 | The target letter year 219 | model: docs2vec 220 | The trained Docs2vec model 221 | corpus: List 222 | TaggedDocument list 223 | n: int 224 | The number of letters to return 225 | initial_year: int 226 | The initial letter year 227 | 228 | Returns 229 | ------- 230 | List 231 | List with the letter year sorted descending by similarity 232 | """ 233 | doc_id = letter_year - initial_year 234 | inferred_vector = model.infer_vector(corpus[doc_id].words) 235 | sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs)) 236 | sims = [index + initial_year for index, _ in sims] 237 | 238 | return sims[1:n + 1] 239 | 240 | 241 | def get_doc2vec(letters_df, year, n): 242 | """Get the doc2vec most similar years 243 | 244 | Parameters 245 | ---------- 246 | letters_df: pandas DataFrame 247 | The pandas Dataframe with text from the letters 248 | year: int 249 | The target letter year 250 | n: int 251 | The number of letters to return 252 | 253 | Returns 254 | ------- 255 | List 256 | List with the letter year sorted descending by similarity 257 | """ 258 | EPOCHS = 40 259 | doc2_model = Doc2Vec(min_count=2) 260 | corpus = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(list(letters_df[const.TOKENIZED]))] 261 | doc2_model.build_vocab(corpus) 262 | doc2_model.train(corpus, total_examples=doc2_model.corpus_count, epochs=EPOCHS) 263 | return get_most_similar_docs_docs2vec(year, doc2_model, corpus, n) 264 | 265 | 266 | def get_word2vec(letters_df, year, n, distance): 267 | """Get the word2vec most similar years 268 | 269 | Parameters 270 | ---------- 271 | letters_df: pandas DataFrame 272 | The pandas Dataframe with text from the letters 273 | year: int 274 | The target letter year 275 | n: int 276 | The number of letters to return 277 | distance: string 278 | Euclidean or cosine 279 | 280 | Returns 281 | ------- 282 | List 283 | List with the letter year sorted descending by similarity 284 | """ 285 | model = Word2Vec(letters_df[const.TOKENIZED]) 286 | target, _ = get_avg_document_vector(model, letters_df, year) 287 | distances = [] 288 | for y in list(letters_df.index): 289 | if y != year: 290 | vector_year, _ = get_avg_document_vector(model, letters_df, y) 291 | if distance == const.COSINE: 292 | distances.append(target @ vector_year.T / np.linalg.norm(target) / np.linalg.norm(vector_year)) 293 | else: 294 | distances.append(np.linalg.norm(target - vector_year)) 295 | 296 | distances = np.array(distances) 297 | if distance == const.COSINE: 298 | return letters_df.index[(-distances).argsort()][:n] 299 | else: 300 | return letters_df.index[distances.argsort()][:n] 301 | 302 | 303 | def get_transformers(pre_trained_model, letters_df, year, n, distance): 304 | """Get the word2vec most similar years 305 | 306 | Parameters 307 | ---------- 308 | pre_trained_model: string 309 | The name of the pre trained transform 310 | letters_df: pandas DataFrame 311 | The pandas Dataframe with text from the letters 312 | year: int 313 | The target letter year 314 | n: int 315 | The number of letters to return 316 | distance: string 317 | Euclidean or cosine 318 | 319 | Returns 320 | ------- 321 | List 322 | List with the letter year sorted descending by similarity 323 | """ 324 | model = SentenceTransformer(pre_trained_model) 325 | embeddings = model.encode(letters_df[const.TOKENIZED].values) 326 | if distance == const.COSINE: 327 | pairwise = embeddings @ embeddings.T / np.linalg.norm(embeddings) / np.linalg.norm(embeddings) 328 | return get_most_similar_docs(pairwise, year, 
const.COSINE, transformers=True)[:n] 329 | else: 330 | euclidean = euclidean_distances(embeddings) 331 | return get_most_similar_docs(euclidean, year, const.EUCLIDEAN)[:n] 332 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ Utils module """ 2 | 3 | from urllib.request import urlopen 4 | from pathlib import Path 5 | import re 6 | from bs4 import BeautifulSoup 7 | from tika import parser 8 | import nltk 9 | from nltk.tokenize import word_tokenize 10 | from nltk.stem import WordNetLemmatizer 11 | from nltk.corpus import stopwords 12 | from nltk.sentiment import SentimentIntensityAnalyzer 13 | from transformers import pipeline 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | from wordcloud import WordCloud 18 | 19 | nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 20 | 'stopwords', 'vader_lexicon']) 21 | 22 | 23 | def get_text_from_html(url, tags_to_ignore=["script", "style"]): 24 | """Extract the text from a webpage 25 | 26 | Parameters 27 | ---------- 28 | url: String 29 | The url 30 | tags_to_ignore: List 31 | List with the tags to skip when getting the text 32 | 33 | Returns 34 | ------- 35 | String 36 | A string file with the text from the webpage 37 | """ 38 | text = '' 39 | try: 40 | html = urlopen(url).read() 41 | soup = BeautifulSoup(html, features="html.parser") 42 | for script in soup(tags_to_ignore): 43 | script.extract() # rip it out 44 | 45 | # get text 46 | text = soup.get_text() 47 | 48 | # break into lines and remove leading and trailing space on each 49 | lines = (line.strip() for line in text.splitlines()) 50 | # break multi-headlines into a line each 51 | chunks = (phrase.strip() 52 | for line in lines for phrase in line.split(" ")) 53 | # drop blank lines 54 | text = '\n'.join(chunk for chunk in chunks if chunk) 55 | except: 56 | print(f'Could not open the url {url}') 57 | 58 | return text 59 | 60 | 61 | def get_text_from_pdf(path): 62 | """Extract the text from a pdf 63 | 64 | Parameters 65 | ---------- 66 | path: String 67 | Path to a pdf file 68 | 69 | Returns 70 | ------- 71 | String 72 | A string file with the text from the pdf file 73 | """ 74 | text = '' 75 | try: 76 | raw = parser.from_file(path) 77 | text = raw['content'] 78 | except: 79 | print(f'Could not open the path {path}') 80 | 81 | return text 82 | 83 | 84 | def get_letters_corpus_dict(letters_pdf_path, init_year=1977, end_year=2020): 85 | """Build the dict where the keys are the years and the values are 86 | the text from the Warren Buffet letters 87 | 88 | Parameters 89 | ---------- 90 | letters_pdf_path: String 91 | Path to the directory containing the pdf letters 92 | init_year: int 93 | The initial year to start getting the letters 94 | end_year: int 95 | The finial year to start getting the letters 96 | 97 | Returns 98 | ------- 99 | Dictionary 100 | Dict where the keys are the years and the values are 101 | the text from the Warren Buffet letters 102 | """ 103 | if init_year < 1977 or end_year > 2020: 104 | print('The range supported is between 1977 and 2020') 105 | return {} 106 | 107 | letters_dict = dict() 108 | letters_years = [year for year in range(init_year, end_year + 1)] 109 | for year in letters_years: 110 | if year >= 2000: 111 | filename = f'{year}ltr.pdf' 112 | path = Path(letters_pdf_path).joinpath(filename) 113 | letter_corpus = get_text_from_pdf(str(path)) 114 | else: 115 | if year > 1997: 
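                # The 1998 and 1999 letters are published as '<year>htm.html',
                # while older letters use plain '<year>.html' (handled by the else branch below).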
116 | url = f'https://www.berkshirehathaway.com/letters/{year}htm.html' 117 | else: 118 | url = f'https://www.berkshirehathaway.com/letters/{year}.html' 119 | letter_corpus = get_text_from_html(url) 120 | 121 | letters_dict[year] = letter_corpus 122 | 123 | return letters_dict 124 | 125 | 126 | def draw_heatmap(df, figsize=(15, 6), cmap='YlOrBr', ylabel='', xlabel='', title=''): 127 | """Draw a heatmap using seaborn 128 | 129 | Parameters 130 | ---------- 131 | df: Pandas Dataframe 132 | Pandas Dataframe with the data to show at the heatmap 133 | figsize: Tuple 134 | The plot figure size 135 | cmap: matplotlib colormap name or object, or list of colors, optional 136 | The mapping from data values to color space. If not provided, 137 | the default will depend on whether center is set. 138 | ylabel: String 139 | The y label of the plot 140 | xlabel: String 141 | The x label of the plot 142 | title: String 143 | The title of the plot 144 | """ 145 | _, ax = plt.subplots(figsize=figsize) 146 | sns.heatmap(df, cmap=cmap, annot=False) 147 | ax.set_ylabel(ylabel, fontsize=15) 148 | ax.set_xlabel(xlabel, fontsize=15) 149 | ax.set_title(title, fontsize=20, weight='bold') 150 | plt.show() 151 | 152 | 153 | def tokenize(text, freq_words=[]): 154 | """Tokenize the text 155 | 156 | Parameters 157 | ---------- 158 | text: String 159 | The message to be tokenized 160 | freq_words: List 161 | List with words that appears frequent at the text 162 | 163 | Returns 164 | ------- 165 | List 166 | List with the clean tokens 167 | """ 168 | text = text.lower() 169 | text = re.sub("[^a-zA-Z]", " ", text) 170 | tokens = word_tokenize(text) 171 | tokens = [w for w in tokens if w not in stopwords.words('english')] 172 | tokens = [w for w in tokens if w not in freq_words] 173 | 174 | lemmatizer = WordNetLemmatizer() 175 | 176 | clean_tokens_list = [] 177 | for tok in tokens: 178 | lemmatizer_tok = lemmatizer.lemmatize(tok).strip() 179 | clean_tokens_list.append(lemmatizer_tok) 180 | 181 | return clean_tokens_list 182 | 183 | 184 | def get_most_frequent_combinatation(tokens, freq=10, num_word_combination=-1): 185 | """Get a dict with the most frequent onegram, bigram, trigram and quadgrams 186 | 187 | Parameters 188 | ---------- 189 | text: List 190 | List with the tokens 191 | freq: Int 192 | How many combination to return 193 | num_word_combination: Int 194 | 1 to onegram 195 | 2 to bigram 196 | 3 to trigram 197 | 4 to quadgrams 198 | -1 All 199 | 200 | Returns 201 | ------- 202 | Dict 203 | Dict with the frequencies 204 | """ 205 | if num_word_combination < -1 or num_word_combination > 4: 206 | raise Exception( 207 | f'The num_word_combination shall be greater than -2 and lesser than 5 the values passes was {num_word_combination}') 208 | 209 | freq_dict = {} 210 | if num_word_combination in [1, -1]: 211 | freq_dist = nltk.FreqDist(tokens) 212 | freq_dict['FreqDist_onegram'] = freq_dist.most_common(freq) 213 | 214 | if num_word_combination in [2, -1]: 215 | bigrams = nltk.collocations.BigramCollocationFinder.from_words(tokens) 216 | freq_dict['FreqDist_bigram'] = bigrams.ngram_fd.most_common(freq) 217 | 218 | if num_word_combination in [3, -1]: 219 | trigram = nltk.collocations.TrigramCollocationFinder.from_words(tokens) 220 | freq_dict['FreqDist_trigram'] = trigram.ngram_fd.most_common(freq) 221 | 222 | if num_word_combination in [4, -1]: 223 | quadgrams = nltk.collocations.QuadgramCollocationFinder.from_words( 224 | tokens) 225 | freq_dict['FreqDist_quadgrams'] = quadgrams.ngram_fd.most_common(freq) 226 | 227 | 
return freq_dict 228 | 229 | 230 | def drawn_wordcloud(corpus, save_path, figsize=(15, 6)): 231 | """Get a dict with the most frequent onegram, bigram, trigram and quadgrams 232 | 233 | Parameters 234 | ---------- 235 | corpus: List 236 | List with the words 237 | save_path: String 238 | Path to the file where the wordcloud will be saved at 239 | figsize: Tuple 240 | The figsize 241 | """ 242 | _, _ = plt.subplots(figsize=figsize) 243 | combined_text = " ".join(text for text in corpus) 244 | wordcloud = WordCloud().generate(combined_text) 245 | plt.imshow(wordcloud, interpolation='bilinear') 246 | plt.axis("off") 247 | plt.savefig(save_path) 248 | plt.show() 249 | 250 | 251 | def tokenize_sent(text): 252 | """Tokenize the sentence 253 | 254 | Parameters 255 | ---------- 256 | text: String 257 | The text to be tokenized 258 | 259 | Returns 260 | ------- 261 | List 262 | List with the tokenized sentences 263 | """ 264 | sentence_list = nltk.tokenize.sent_tokenize(text) 265 | tokenized_list = [] 266 | for sentence in sentence_list: 267 | sentence_after_regex = re.sub("[^a-z0-9A-Z]", " ", sentence) 268 | # Remove sentences where there was only numbers 269 | if len(re.sub("[^a-zA-Z]", "", sentence_after_regex)) > 6: 270 | tokenized_list.append(sentence_after_regex) 271 | 272 | return tokenized_list 273 | 274 | 275 | def calculate_text_sentiment_using_transform(sentence_list): 276 | """Calculate the test sentiment using transforms 277 | 278 | Parameters 279 | ---------- 280 | sentence_list: List 281 | List with the tokenizes sentences 282 | 283 | Returns 284 | ------- 285 | Dictonary 286 | Dict with the cumulative sentiment of all the sentences at the list 287 | """ 288 | sentiment_dict = {'POSITIVE': 0, 'NEGATIVE': 0} 289 | classifier = pipeline('sentiment-analysis') 290 | for sentence in sentence_list: 291 | sentiment_result = classifier(sentence) 292 | sentiment_dict[sentiment_result[0]['label'] 293 | ] += sentiment_result[0]['score'] 294 | 295 | return sentiment_dict 296 | 297 | 298 | def calculate_text_sia(sentence_list): 299 | """Calculate the test sentiment using Sentiment Intensity Analyzer 300 | 301 | Parameters 302 | ---------- 303 | sentence_list: List 304 | List with the tokenizes sentences 305 | 306 | Returns 307 | ------- 308 | Dictonary 309 | Dict with the cumulative sentiment of all the sentences at the list 310 | """ 311 | sentiment_dict = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0} 312 | sia = SentimentIntensityAnalyzer() 313 | for sentence in sentence_list: 314 | sentiment_result = sia.polarity_scores(sentence) 315 | for k in sentiment_result: 316 | sentiment_dict[k] += sentiment_result[k] 317 | 318 | return sentiment_dict 319 | 320 | 321 | def get_sentiment_analysis_df(letters_dict, 322 | calculate_text_sentiment, 323 | tokenize_sent, 324 | normalized=True): 325 | """Get the DataFrame with the sentiment of each Warren letter 326 | 327 | Parameters 328 | ---------- 329 | letters_dict: Dictonary 330 | Dict with the letters text 331 | calculate_text_sentiment: function 332 | Function used to calculate the sentiment of the text 333 | tokenize_sent: function 334 | Function to tokenize the text into a list of sentences 335 | normalized: bool 336 | If the values of the df will be normalized or not 337 | 338 | Returns 339 | ------- 340 | Pandas DataFrame 341 | Pandas DataFrame with the sentiment analysis for each letter 342 | """ 343 | sentiment_analysis_dict = {} 344 | for k in letters_dict: 345 | sentiment_analysis_dict[k] = calculate_text_sentiment( 346 | 
tokenize_sent(letters_dict[k])) 347 | sentiment_analysis_df = pd.DataFrame(sentiment_analysis_dict) 348 | if normalized: 349 | return sentiment_analysis_df / sentiment_analysis_df.sum(axis=0) 350 | return sentiment_analysis_df 351 | 352 | 353 | def get_answer_using_qa(nlp, question, context): 354 | """Get answer using a classifier trained with the QA technique 355 | 356 | Parameters 357 | ---------- 358 | nlp: Pipeline 359 | Trained QA Pipeline 360 | question: String 361 | Question that the model will answer 362 | context: String 363 | The Context of the question 364 | 365 | Returns 366 | ------- 367 | Tuple 368 | The answer, the score, the start position of the answer at the text and the final 369 | position of the answer at the text 370 | """ 371 | result = nlp(question=question, context=context) 372 | 373 | return result['answer'], round(result['score'], 4), result['start'], result['end'] 374 | 375 | 376 | def format_spines(ax, right_border=True): 377 | """ 378 | This function sets up borders from an axis and personalize colors 379 | 380 | Parameters 381 | ---------- 382 | Axis: Matplotlib axis 383 | The plot axis 384 | right_border: Boolean 385 | Whether to plot or not the right border 386 | """ 387 | # Setting up colors 388 | ax.spines['bottom'].set_color('#CCCCCC') 389 | ax.spines['left'].set_color('#CCCCCC') 390 | ax.spines['top'].set_visible(False) 391 | if right_border: 392 | ax.spines['right'].set_color('#CCCCCC') 393 | else: 394 | ax.spines['right'].set_color('#FFFFFF') 395 | ax.patch.set_facecolor('#FFFFFF') 396 | 397 | 398 | def get_ngram_plot_data(df, type, sentiment): 399 | """Format the data to the ngram plot 400 | 401 | Parameters 402 | ---------- 403 | df: Pandas DataFrame 404 | Pandas dataframe with the ngrams sentiment data 405 | type: String 406 | Type of the ngram to filter 407 | sentiment: String 408 | POSITIVE or NEGATIVE 409 | 410 | Returns 411 | ------- 412 | Pandas Dataframe 413 | The dataframe filtered 414 | """ 415 | return df.query("type == @type and sentiment == @sentiment").sort_values('score', ascending=False) 416 | --------------------------------------------------------------------------------
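For reference, here is a minimal sketch (not part of the repository) of how the pieces in utils.py fit together: build the letters corpus, score each letter with the VADER sentiment analyzer, and plot the result. The `letters/` directory and the plot labels are assumptions for illustration only.

```
from utils import (get_letters_corpus_dict, get_sentiment_analysis_df,
                   calculate_text_sia, tokenize_sent, draw_heatmap)

# 'letters/' is an assumed local directory holding the PDF letters from 2000 onward;
# older letters are fetched from berkshirehathaway.com by get_letters_corpus_dict itself.
letters_dict = get_letters_corpus_dict('letters/', init_year=1977, end_year=2020)

# One column per year and one row per VADER score (neg/neu/pos/compound),
# normalized column-wise because normalized=True is the default.
sentiment_df = get_sentiment_analysis_df(letters_dict, calculate_text_sia, tokenize_sent)

draw_heatmap(sentiment_df, ylabel='VADER score', xlabel='Letter year',
             title='Sentiment of the Berkshire Hathaway letters')
```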