├── src ├── data │ ├── subsample.py │ └── extractor.py └── bert_class │ └── bert_class.py ├── README.md ├── .gitignore └── notebooks ├── Test_Colab_Categorical.ipynb ├── Processing_test_dataset.ipynb └── GettingRealAboutFake.ipynb /src/data/subsample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Módulo para procesar un dataset grande (varios GB) con pocos recursos de memoria. 3 | 4 | Autor: Álvaro Ibrain 5 | Fecha: 4 de marzo de 2018 6 | """ 7 | 8 | import sys 9 | import random 10 | import csv 11 | csv.field_size_limit(99999999999) 12 | 13 | 14 | def sample_file(input_data, output_data, percent = 0.2): 15 | """ 16 | Función para extraer muestras aleatorias de un csv grande. Sampleando de esta 17 | manera sólo se carga en memoria una fila cada vez, lo que permite que sin muchos 18 | recursos se pueda procesar un dataser csv muy grande. 19 | 20 | :param (string) input_data: Path al fichero de lectura 21 | :param (string) output_data: Path al fichero de escritura 22 | :param (float) percent: Porcentaje del fichero a analizar. Número entre (0, 1] 23 | """ 24 | with open(input_data) as file: 25 | with open(output_data, 'w+') as out: 26 | header = True 27 | reader = csv.reader(file) 28 | writer = csv.writer(out) 29 | for r in reader: 30 | if header: 31 | #Keep header 32 | writer.writerow(r) 33 | header = False 34 | else: 35 | if random.random() < prob: 36 | writer.writerow(r) 37 | 38 | 39 | def main(): 40 | inp_file = "" 41 | out_file = "" 42 | percentage = -1 43 | 44 | if len(sys.argv) < 3: 45 | sys.stderr.write("Error. Argumentos necesarios: "+ 46 | " [percentage] ") 47 | return 48 | if len(sys.argv) <= 3: 49 | inp_file = sys.argv[1] 50 | out_file = sys.argv[2] 51 | sample_file(inp_file, out_file) 52 | 53 | if len(sys.argv) == 4: 54 | percentage = sys.argv[3] 55 | sample_file(inp_file, out_file, percent = percentage) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /src/data/extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Módulo para extraer un csv con los tipos de noticias infrarrepresentados 3 | 4 | Autor: Álvaro Ibrain 5 | Fecha: 12 de marzo de 2018 6 | """ 7 | 8 | import sys 9 | import random 10 | import csv 11 | csv.field_size_limit(99999999999) 12 | 13 | 14 | def extract(input_data, output_data, types = ['clickbait', 'bias'], offset_col = 3): 15 | """ 16 | Función para extraer solo los articulos de un tipo del dataset Fake News Corpus 17 | 18 | :param (string) input_data: Path al fichero de lectura 19 | :param (string) output_data: Path al fichero de escritura 20 | :param (float) types: Tipos de noticias que se desean seleccionar 21 | """ 22 | with open(input_data) as file: 23 | with open(output_data, 'w+') as out: 24 | header = True 25 | reader = csv.reader(file) 26 | writer = csv.writer(out) 27 | for r in reader: 28 | if header: 29 | #Keep header 30 | writer.writerow(r) 31 | header = False 32 | else: 33 | #Select only if it is in types 34 | try: 35 | if r[offset_col] in types: 36 | writer.writerow(r) 37 | except: 38 | pass #If the column is empty 39 | 40 | def main(): 41 | inp_file = "" 42 | out_file = "" 43 | 44 | if len(sys.argv) < 3: 45 | sys.stderr.write("Error. Argumentos necesarios: "+ 46 | " ") 47 | return 48 | 49 | types = ['clickbait', 'bias'] 50 | if len(sys.argv) == 4: 51 | types = sys.argv[3].split(',') 52 | 53 | inp_file = sys.argv[1] 54 | out_file = sys.argv[2] 55 | print("Processing...") 56 | print("Extracting news of type:") 57 | print(types) 58 | extract(input_data =inp_file, output_data = out_file, types = types) 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fake news detection using deep learning 2 | ## Final master thesis project 3 | 4 | This repository is focused on finding fake news using deep learning 5 | 6 | There are multiple methods focused on achieving this goal, but the objective 7 | of this work is discriminating the fake ones by only looking at the text. No graphs, 8 | no social network analysis neither images. 9 | 10 | In this work three deep learning architectures are proposed and later tested over two datasets (Fake news corpus and TI-CNN), obtaining state of the art results. 11 | 12 | 1. **LSTM Based architecture**: $91\%$ accuracy (TI-CNN) || $76\%$ accuracy (FNC) 13 | 2. **CNN Based architecture**: $97\%$ accuracy (TI-CNN) || $82\%$ accuracy (FNC) 14 | 3. **BERT Based architecture**: $97\%$ accuracy (TI-CNN) || $76\%$ accuracy (FNC) 15 | 16 | This repository contains several Python notebooks with the developed code 17 | 18 | ### Data sources 19 | * Fake News Corpus: https://github.com/several27/FakeNewsCorpus 20 | * Getting Real About Fake News: https://www.kaggle.com/mrisdal/fake-news 21 | * Fake News Detection: https://www.kaggle.com/jruvika/fake-news-detection 22 | * News Dataset from TI-CNN: https://arxiv.org/abs/1806.00749 23 | 24 | ### Folder structure 25 | * **data**: This directory must be created with the necessary data for scripts to work. 26 | (Not uploaded to GH due to filesize restrictions). 27 | - GoogleNews-vectors-negative300.bin.gz: Word2Vec news trained model weights 28 | - Other_datasets 29 | - GettingRealAboutFake/ 30 | - ```fake.csv``` (*Getting Real About Fake News Dataset*) 31 | - ``all_data.csv`` (*TI-CNN dataset*) 32 | - ``real_or_fake.csv`` 33 | - `FakeNewsCorpus.csv` (*Fake News Corpus*) 34 | * **notebooks**: Notebooks for prototyping 35 | * **src**: Code with utils 36 | * **data**: Code to generate datasets / process data 37 | * **bert_class**: Fine-tuned classifier built over Google's BERT to detect fake/true news. 38 | 39 | ### Notebook explanation 40 | * `FakeNewsCorpus.ipynb:` Cleaning and preprocessing the dataset 'Fake News Corpus'. 41 | * `GettingRealAboutFake.ipynb:` Cleaning and preprocessing the dataset 'Getting Real 42 | About Fake News' from Kaggle. 43 | * `TI_CNN-Dataset:` Cleaning and preprocessing the tadaset 'TI-CNN'. 44 | * `Processing_test_dataset.ipynb:` Cleaning and preprocessing the dataset 'True or Fake' from Kaggle. 45 | * `BayesianOpt.pynb:**` Obtaining model hyperparameters using Bayesian Optimization 46 | * `Train-Colab-Categorical.ipynb:` Train DNN to categorize 4 types of news. 47 | * `Train\_Colab_Binary.ipynb:` Train DNN to categorize only **True** or **Fake** 48 | classes. 49 | * `Test\_Colab_Categorical.ipynb:` Testing the previously trained categorical models on TI-CNN. 50 | * `Test\_Colab_Binary.ipynb:` Testing the previously trained binary models on FNC. 51 | * `data\_analysis/Data_analysis-FNC.ipynb:` EDA of the Fake News Corpus. 52 | * `data\_analysis/Data_analysis-TI-CNN.ipynb:` EDA of the TI-CNN Dataset 53 | * `data\_analysis/Data_analysis-Getting-Real.ipynb:` EDA of the Getting Real About 54 | fake news Dataset 55 | 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### JupyterNotebook ### 3 | .ipynb_checkpoints 4 | */.ipynb_checkpoints/* 5 | 6 | # Remove previous ipynb_checkpoints 7 | # git rm -r .ipynb_checkpoints/ 8 | # 9 | 10 | ### LaTeX ### 11 | ## Core latex/pdflatex auxiliary files: 12 | *.aux 13 | *.lof 14 | *.log 15 | *.lot 16 | *.fls 17 | *.out 18 | *.toc 19 | *.fmt 20 | *.fot 21 | *.cb 22 | *.cb2 23 | .*.lb 24 | 25 | ## Intermediate documents: 26 | *.dvi 27 | *.xdv 28 | *-converted-to.* 29 | # these rules might exclude image files for figures etc. 30 | # *.ps 31 | # *.eps 32 | # *.pdf 33 | 34 | ## Generated if empty string is given at "Please type another file name for output:" 35 | .pdf 36 | 37 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 38 | *.bbl 39 | *.bcf 40 | *.blg 41 | *-blx.aux 42 | *-blx.bib 43 | *.run.xml 44 | 45 | ## Build tool auxiliary files: 46 | *.fdb_latexmk 47 | *.synctex 48 | *.synctex(busy) 49 | *.synctex.gz 50 | *.synctex.gz(busy) 51 | *.pdfsync 52 | 53 | ## Build tool directories for auxiliary files 54 | # latexrun 55 | latex.out/ 56 | 57 | ## Auxiliary and intermediate files from other packages: 58 | # algorithms 59 | *.alg 60 | *.loa 61 | 62 | # achemso 63 | acs-*.bib 64 | 65 | # amsthm 66 | *.thm 67 | 68 | # beamer 69 | *.nav 70 | *.pre 71 | *.snm 72 | *.vrb 73 | 74 | # changes 75 | *.soc 76 | 77 | # comment 78 | *.cut 79 | 80 | # cprotect 81 | *.cpt 82 | 83 | # elsarticle (documentclass of Elsevier journals) 84 | *.spl 85 | 86 | # endnotes 87 | *.ent 88 | 89 | # fixme 90 | *.lox 91 | 92 | # feynmf/feynmp 93 | *.mf 94 | *.mp 95 | *.t[1-9] 96 | *.t[1-9][0-9] 97 | *.tfm 98 | 99 | #(r)(e)ledmac/(r)(e)ledpar 100 | *.end 101 | *.?end 102 | *.[1-9] 103 | *.[1-9][0-9] 104 | *.[1-9][0-9][0-9] 105 | *.[1-9]R 106 | *.[1-9][0-9]R 107 | *.[1-9][0-9][0-9]R 108 | *.eledsec[1-9] 109 | *.eledsec[1-9]R 110 | *.eledsec[1-9][0-9] 111 | *.eledsec[1-9][0-9]R 112 | *.eledsec[1-9][0-9][0-9] 113 | *.eledsec[1-9][0-9][0-9]R 114 | 115 | # glossaries 116 | *.acn 117 | *.acr 118 | *.glg 119 | *.glo 120 | *.gls 121 | *.glsdefs 122 | 123 | # gnuplottex 124 | *-gnuplottex-* 125 | 126 | # gregoriotex 127 | *.gaux 128 | *.gtex 129 | 130 | # htlatex 131 | *.4ct 132 | *.4tc 133 | *.idv 134 | *.lg 135 | *.trc 136 | *.xref 137 | 138 | # hyperref 139 | *.brf 140 | 141 | # knitr 142 | *-concordance.tex 143 | # TODO Comment the next line if you want to keep your tikz graphics files 144 | *.tikz 145 | *-tikzDictionary 146 | 147 | # listings 148 | *.lol 149 | 150 | # makeidx 151 | *.idx 152 | *.ilg 153 | *.ind 154 | *.ist 155 | 156 | # minitoc 157 | *.maf 158 | *.mlf 159 | *.mlt 160 | *.mtc[0-9]* 161 | *.slf[0-9]* 162 | *.slt[0-9]* 163 | *.stc[0-9]* 164 | 165 | # minted 166 | _minted* 167 | *.pyg 168 | 169 | # morewrites 170 | *.mw 171 | 172 | # nomencl 173 | *.nlg 174 | *.nlo 175 | *.nls 176 | 177 | # pax 178 | *.pax 179 | 180 | # pdfpcnotes 181 | *.pdfpc 182 | 183 | # sagetex 184 | *.sagetex.sage 185 | *.sagetex.py 186 | *.sagetex.scmd 187 | 188 | # scrwfile 189 | *.wrt 190 | 191 | # sympy 192 | *.sout 193 | *.sympy 194 | sympy-plots-for-*.tex/ 195 | 196 | # pdfcomment 197 | *.upa 198 | *.upb 199 | 200 | # pythontex 201 | *.pytxcode 202 | pythontex-files-*/ 203 | 204 | # tcolorbox 205 | *.listing 206 | 207 | # thmtools 208 | *.loe 209 | 210 | # TikZ & PGF 211 | *.dpth 212 | *.md5 213 | *.auxlock 214 | 215 | # todonotes 216 | *.tdo 217 | 218 | # vhistory 219 | *.hst 220 | *.ver 221 | 222 | # easy-todo 223 | *.lod 224 | 225 | # xcolor 226 | *.xcp 227 | 228 | # xmpincl 229 | *.xmpi 230 | 231 | # xindy 232 | *.xdy 233 | 234 | # xypic precompiled matrices 235 | *.xyc 236 | 237 | # endfloat 238 | *.ttt 239 | *.fff 240 | 241 | # Latexian 242 | TSWLatexianTemp* 243 | 244 | ## Editors: 245 | # WinEdt 246 | *.bak 247 | *.sav 248 | 249 | # Texpad 250 | .texpadtmp 251 | 252 | # LyX 253 | *.lyx~ 254 | 255 | # Kile 256 | *.backup 257 | 258 | # KBibTeX 259 | *~[0-9]* 260 | 261 | # auto folder when using emacs and auctex 262 | ./auto/* 263 | *.el 264 | 265 | # expex forward references with \gathertags 266 | *-tags.tex 267 | 268 | # standalone packages 269 | *.sta 270 | 271 | ### LaTeX Patch ### 272 | # glossaries 273 | *.glstex 274 | 275 | ### macOS ### 276 | # General 277 | .DS_Store 278 | .AppleDouble 279 | .LSOverride 280 | 281 | # Icon must end with two \r 282 | Icon 283 | 284 | # Thumbnails 285 | ._* 286 | 287 | # Files that might appear in the root of a volume 288 | .DocumentRevisions-V100 289 | .fseventsd 290 | .Spotlight-V100 291 | .TemporaryItems 292 | .Trashes 293 | .VolumeIcon.icns 294 | .com.apple.timemachine.donotpresent 295 | 296 | # Directories potentially created on remote AFP share 297 | .AppleDB 298 | .AppleDesktop 299 | Network Trash Folder 300 | Temporary Items 301 | .apdisk 302 | 303 | ### Python ### 304 | # Byte-compiled / optimized / DLL files 305 | __pycache__/ 306 | *.py[cod] 307 | *$py.class 308 | 309 | # C extensions 310 | *.so 311 | 312 | # Distribution / packaging 313 | .Python 314 | build/ 315 | develop-eggs/ 316 | dist/ 317 | downloads/ 318 | eggs/ 319 | .eggs/ 320 | lib/ 321 | lib64/ 322 | parts/ 323 | sdist/ 324 | var/ 325 | wheels/ 326 | pip-wheel-metadata/ 327 | share/python-wheels/ 328 | *.egg-info/ 329 | .installed.cfg 330 | *.egg 331 | MANIFEST 332 | 333 | # PyInstaller 334 | # Usually these files are written by a python script from a template 335 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 336 | *.manifest 337 | *.spec 338 | 339 | # Installer logs 340 | pip-log.txt 341 | pip-delete-this-directory.txt 342 | 343 | # Unit test / coverage reports 344 | htmlcov/ 345 | .tox/ 346 | .nox/ 347 | .coverage 348 | .coverage.* 349 | .cache 350 | nosetests.xml 351 | coverage.xml 352 | *.cover 353 | .hypothesis/ 354 | .pytest_cache/ 355 | 356 | # Translations 357 | *.mo 358 | *.pot 359 | 360 | # Django stuff: 361 | local_settings.py 362 | db.sqlite3 363 | 364 | # Flask stuff: 365 | instance/ 366 | .webassets-cache 367 | 368 | # Scrapy stuff: 369 | .scrapy 370 | 371 | # Sphinx documentation 372 | docs/_build/ 373 | 374 | # PyBuilder 375 | target/ 376 | 377 | # Jupyter Notebook 378 | 379 | # IPython 380 | profile_default/ 381 | ipython_config.py 382 | 383 | # pyenv 384 | .python-version 385 | 386 | # celery beat schedule file 387 | celerybeat-schedule 388 | 389 | # SageMath parsed files 390 | *.sage.py 391 | 392 | # Environments 393 | .env 394 | .venv 395 | env/ 396 | venv/ 397 | ENV/ 398 | env.bak/ 399 | venv.bak/ 400 | 401 | # Spyder project settings 402 | .spyderproject 403 | .spyproject 404 | 405 | # Rope project settings 406 | .ropeproject 407 | 408 | # mkdocs documentation 409 | /site 410 | 411 | # mypy 412 | .mypy_cache/ 413 | .dmypy.json 414 | dmypy.json 415 | 416 | # Pyre type checker 417 | .pyre/ 418 | 419 | ### Python Patch ### 420 | .venv/ 421 | 422 | ### Ignored folder 423 | */ignored/* 424 | 425 | ### Data folder 426 | data/* 427 | 428 | 429 | -------------------------------------------------------------------------------- /src/bert_class/bert_class.py: -------------------------------------------------------------------------------- 1 | from bert.run_classifier import convert_examples_to_features, InputExample, input_fn_builder 2 | from bert.tokenization import FullTokenizer 3 | from bert.optimization import create_optimizer 4 | 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow_hub as hub 9 | 10 | 11 | class BertClassifier(object): 12 | 13 | def __init__(self, ): 14 | self.BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1" 15 | self.tokenizer = None 16 | self.model = None 17 | 18 | self.max_seq_len = 128 19 | self.tokenizer = self.__create_tokenizer_from_hub_module() 20 | 21 | def __create_tokenizer_from_hub_module(self): 22 | if self.tokenizer is not None: 23 | return self.tokenizer 24 | 25 | with tf.Graph().as_default(): 26 | bert_module = hub.Module(self.BERT_MODEL_HUB) 27 | tokenization_info = bert_module(signature="tokenization_info", 28 | as_dict=True) 29 | with tf.Session() as sess: 30 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 31 | tokenization_info["do_lower_case"]]) 32 | 33 | tokenizer = FullTokenizer(vocab_file=vocab_file, 34 | do_lower_case=do_lower_case) 35 | 36 | self.tokenizer = tokenizer 37 | return tokenizer 38 | 39 | def __create_features(self, pd_dataset, label_list, 40 | max_seq_len, tokenizer, 41 | data_column, label_column): 42 | input_examples = pd_dataset.apply(lambda x: InputExample(guid=None, 43 | text_a=x[data_column], 44 | text_b=None, 45 | label=x[label_column]), axis=1) 46 | return convert_examples_to_features(input_examples, label_list, 47 | max_seq_len, tokenizer) 48 | 49 | def __create_model(self, input_ids, input_mask, segment_ids, 50 | labels, num_labels, is_predicting=True): 51 | 52 | bert_module = hub.Module( 53 | self.BERT_MODEL_HUB, 54 | trainable=True) 55 | 56 | bert_inputs = dict( 57 | input_ids=input_ids, 58 | input_mask=input_mask, 59 | segment_ids=segment_ids) 60 | 61 | bert_outputs = bert_module( 62 | inputs=bert_inputs, 63 | signature="tokens", 64 | as_dict=True) 65 | 66 | output_layer = bert_outputs["pooled_output"] 67 | 68 | hidden_size = output_layer.shape[-1].value 69 | 70 | # Create our own layer to tune for politeness data. 71 | output_weights = tf.get_variable( 72 | "output_weights", [num_labels, hidden_size], 73 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 74 | 75 | output_bias = tf.get_variable( 76 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 77 | 78 | with tf.variable_scope("loss"): 79 | # Dropout helps prevent overfitting 80 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 81 | 82 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 83 | logits = tf.nn.bias_add(logits, output_bias) 84 | log_probs = tf.nn.log_softmax(logits, axis=-1) 85 | 86 | # Convert labels into one-hot encoding 87 | one_hot_labels = tf.one_hot(labels, depth=num_labels, 88 | dtype=tf.float32) 89 | 90 | predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, 91 | output_type=tf.int32)) 92 | # If we're predicting, we want predicted labels 93 | # and the probabiltiies. 94 | if is_predicting: 95 | return (predicted_labels, log_probs) 96 | 97 | # If we're train/eval, compute loss between predicted and actual label 98 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 99 | loss = tf.reduce_mean(per_example_loss) 100 | return (loss, predicted_labels, log_probs) 101 | 102 | def __model_fn_builder(self, num_labels, learning_rate, 103 | num_train_steps, 104 | num_warmup_steps): 105 | """Returns `model_fn` closure for TPUEstimator.""" 106 | 107 | def model_fn(features, labels, mode, params): 108 | input_ids = features["input_ids"] 109 | input_mask = features["input_mask"] 110 | segment_ids = features["segment_ids"] 111 | label_ids = features["label_ids"] 112 | 113 | is_predicting = (mode == tf.estimator.ModeKeys.PREDICT) 114 | 115 | # TRAIN and EVAL 116 | if not is_predicting: 117 | 118 | (loss, predicted_labels, log_probs) = self.__create_model( 119 | input_ids, 120 | input_mask, segment_ids, label_ids, num_labels, 121 | is_predicting=is_predicting 122 | ) 123 | 124 | train_op = create_optimizer( 125 | loss, learning_rate, num_train_steps, num_warmup_steps, 126 | use_tpu=False) 127 | 128 | # Calculate evaluation metrics. 129 | def metric_fn(label_ids, predicted_labels): 130 | accuracy = tf.metrics.accuracy(label_ids, predicted_labels) 131 | f1_score = tf.contrib.metrics.f1_score( 132 | label_ids, 133 | predicted_labels) 134 | auc = tf.metrics.auc( 135 | label_ids, 136 | predicted_labels) 137 | recall = tf.metrics.recall( 138 | label_ids, 139 | predicted_labels) 140 | precision = tf.metrics.precision( 141 | label_ids, 142 | predicted_labels) 143 | true_pos = tf.metrics.true_positives( 144 | label_ids, 145 | predicted_labels) 146 | true_neg = tf.metrics.true_negatives( 147 | label_ids, 148 | predicted_labels) 149 | false_pos = tf.metrics.false_positives( 150 | label_ids, 151 | predicted_labels) 152 | false_neg = tf.metrics.false_negatives( 153 | label_ids, 154 | predicted_labels) 155 | return { 156 | "eval_accuracy": accuracy, 157 | "f1_score": f1_score, 158 | "auc": auc, 159 | "precision": precision, 160 | "recall": recall, 161 | "true_positives": true_pos, 162 | "true_negatives": true_neg, 163 | "false_positives": false_pos, 164 | "false_negatives": false_neg 165 | } 166 | 167 | eval_metrics = metric_fn(label_ids, predicted_labels) 168 | 169 | if mode == tf.estimator.ModeKeys.TRAIN: 170 | return tf.estimator.EstimatorSpec(mode=mode, 171 | loss=loss, 172 | train_op=train_op) 173 | else: 174 | return tf.estimator.EstimatorSpec(mode=mode, 175 | loss=loss, 176 | eval_metric_ops=eval_metrics) 177 | else: 178 | (predicted_labels, log_probs) = self.__create_model( 179 | input_ids, 180 | input_mask, segment_ids, label_ids, num_labels, 181 | is_predicting=is_predicting 182 | ) 183 | 184 | predictions = { 185 | 'probabilities': log_probs, 186 | 'labels': predicted_labels 187 | } 188 | return tf.estimator.EstimatorSpec(mode, predictions=predictions) 189 | 190 | # Return the actual model function in the closure 191 | return model_fn 192 | 193 | def __create_estimator(self, label_list, lr, batch_size, n_train, n_warm): 194 | model_fn = self.__model_fn_builder( 195 | num_labels=len(label_list), 196 | learning_rate=lr, 197 | num_train_steps=n_train, 198 | num_warmup_steps=n_warm 199 | ) 200 | 201 | estimator = tf.estimator.Estimator(model_fn=model_fn, 202 | params={"batch_size": batch_size}) 203 | 204 | return estimator, model_fn 205 | 206 | def train(self, train, test, data_col, lbl_col, 207 | batch_size=32, 208 | lr=2e-5, 209 | epochs=3, 210 | warmup=0.1): 211 | """ 212 | Trains a BERT based model to classify fake/true news 213 | 214 | Params: 215 | 216 | train -- Pandas dataframe to train with at least (text, type) columns 217 | test -- Pandas dataframe to evaluate with at least (text, type) columns 218 | data_col -- Name of the Text column 219 | lbl_col -- Name of the Type column 220 | batch_size -- Training batch size (default = 32) 221 | epochs -- Epochs to train (default = 3) 222 | warmup -- Warmup percent to train. Defined in BERT paper (default = 0.1) 223 | 224 | Returns: 225 | 226 | Rictionary with evaluation results 227 | """ 228 | label_list = train[lbl_col].unique().tolist() 229 | tokenizer = self.__create_tokenizer_from_hub_module() 230 | 231 | train_features = self.__create_features( 232 | train, label_list, 233 | self.max_seq_len, tokenizer, data_col, lbl_col 234 | ) 235 | test_features = self.__create_features( 236 | test, label_list, 237 | self.max_seq_len, tokenizer, data_col, lbl_col 238 | ) 239 | 240 | num_train_steps = int(len(train_features) / batch_size * epochs) 241 | num_warmup_steps = int(num_train_steps * warmup) 242 | 243 | estimator, model_fn = self.__create_estimator( 244 | label_list, 245 | lr, 246 | batch_size, 247 | num_train_steps, 248 | num_warmup_steps 249 | ) 250 | 251 | train_input_fn = input_fn_builder( 252 | features=train_features, 253 | seq_length=self.max_seq_len, 254 | is_training=True, 255 | drop_remainder=False) 256 | 257 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 258 | 259 | test_input_fn = input_fn_builder( 260 | features=test_features, 261 | seq_length=self.max_seq_len, 262 | is_training=False, 263 | drop_remainder=False) 264 | 265 | result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None) 266 | 267 | self.model = estimator 268 | 269 | return result_dict 270 | 271 | def predict(self, df): 272 | """ 273 | Predicts over a pandas dataframe. 274 | 275 | Params: 276 | df -- Pandas dataframe to train with at least (text, type) columns 277 | 278 | Returns: 279 | 280 | Dictionary with predicted labels and probabilities. 281 | """ 282 | # TODO: REMOVE type column 283 | 284 | tokenizer = self.__create_tokenizer_from_hub_module() 285 | label_list = test_other[LABEL_COLUMN].unique().tolist() 286 | #label_list = [0, 1] 287 | test_features = self.__create_features( 288 | df, label_list, 289 | self.max_seq_len, tokenizer, 'text', 'type' 290 | ) 291 | 292 | preds = [] 293 | if type(self.model) == tf.estimator.Estimator: 294 | # Is trained 295 | input_fn = input_fn_builder( 296 | features=test_features, 297 | seq_length=self.max_seq_len, 298 | is_training=False, 299 | drop_remainder=False) 300 | pred = self.model.predict(input_fn=input_fn) 301 | for p in pred: 302 | preds.append(p) 303 | else: 304 | # Is loaded from a SavedModel 305 | # Format inputs 306 | inpu = { 307 | 'label_ids': np.array([x.label_id for x in test_features]).reshape(-1,), 308 | 'input_ids': np.array([x.input_ids for x in test_features]).reshape(-1, self.max_seq_len), 309 | 'input_mask': np.array([x.input_mask for x in test_features]).reshape(-1, self.max_seq_len), 310 | 'segment_ids': np.array([x.segment_ids for x in test_features]).reshape(-1, self.max_seq_len) 311 | } 312 | preds = self.model(inpu) 313 | 314 | return preds 315 | 316 | def save_model(self, directory): 317 | """Saves model in the specified path""" 318 | def serving_input_fn(): 319 | label_ids = tf.placeholder(tf.int32, [None], name='label_ids') 320 | input_ids = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_ids') 321 | input_mask = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_mask') 322 | segment_ids = tf.placeholder(tf.int32, [None, self.max_seq_len], name='segment_ids') 323 | input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 324 | 'label_ids': label_ids, 325 | 'input_ids': input_ids, 326 | 'input_mask': input_mask, 327 | 'segment_ids': segment_ids, 328 | })() 329 | return input_fn 330 | 331 | self.model._export_to_tpu = False # this is important 332 | self.model.export_savedmodel(directory, serving_input_fn) 333 | 334 | def load_model(self, directory): 335 | """ 336 | Restores a previously saved model. 337 | 338 | Params: 339 | 340 | directory -- Folder in which is the .pb file 341 | """ 342 | with tf.Session() as sess: 343 | tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], directory) 344 | self.model = tf.contrib.predictor.from_saved_model(directory) 345 | -------------------------------------------------------------------------------- /notebooks/Test_Colab_Categorical.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Test-Colab-Categorical.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "8D_Equ4vDsT7", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "# Prueba de los modelos\n", 26 | "\n", 27 | "En este notebook se probarán los modelos entrenados (CNN y LSTM) sobre los datasets de prueba *\"Getting Real About Fake News\"* y *\"Fake news detection\"*.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "cM03nLDQDx50", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | "## Imports" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "yTnhOxpiEMtP", 44 | "colab_type": "code", 45 | "colab": { 46 | "base_uri": "https://localhost:8080/", 47 | "height": 209 48 | }, 49 | "outputId": "e00a7543-4089-4493-9d06-8106969ce2ec" 50 | }, 51 | "source": [ 52 | "from google.colab import drive\n", 53 | "drive.mount('/content/drive')" 54 | ], 55 | "execution_count": 1, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n", 61 | "\n", 62 | "Enter your authorization code:\n", 63 | "··········\n", 64 | "Mounted at /content/drive\n" 65 | ], 66 | "name": "stdout" 67 | } 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "-h0PLOjXEUGh", 74 | "colab_type": "code", 75 | "colab": { 76 | "base_uri": "https://localhost:8080/", 77 | "height": 151 78 | }, 79 | "outputId": "a7a38658-cb94-4750-bfc3-8ae7c3199b12" 80 | }, 81 | "source": [ 82 | "!pip install --upgrade pandas" 83 | ], 84 | "execution_count": 2, 85 | "outputs": [ 86 | { 87 | "output_type": "stream", 88 | "text": [ 89 | "Requirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (0.24.2)\n", 90 | "Requirement already satisfied, skipping upgrade: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas) (2018.9)\n", 91 | "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.0 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.5.3)\n", 92 | "Requirement already satisfied, skipping upgrade: numpy>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.16.3)\n", 93 | "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n" 94 | ], 95 | "name": "stdout" 96 | } 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "sGymmzMYEb9y", 103 | "colab_type": "code", 104 | "colab": { 105 | "base_uri": "https://localhost:8080/", 106 | "height": 51 107 | }, 108 | "outputId": "9154b81a-b7a4-418b-b6ea-7bd3a0e6bca6" 109 | }, 110 | "source": [ 111 | "!ls \"drive/My Drive/Colab Notebooks/data\"" 112 | ], 113 | "execution_count": 3, 114 | "outputs": [ 115 | { 116 | "output_type": "stream", 117 | "text": [ 118 | "data_kaggle_proc.pickle\t\t news_getting_real.pickle\n", 119 | "GoogleNews-vectors-negative300.bin.gz news_proc_12_3_19.pickle\n" 120 | ], 121 | "name": "stdout" 122 | } 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "obqRzUUdEeRy", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "import pandas as pd\n", 134 | "import numpy as np\n", 135 | "\n", 136 | "#Progress bars\n", 137 | "from tqdm import tqdm\n", 138 | "tqdm.pandas()\n", 139 | "\n", 140 | "#Paralelize pandas apply on multiple cores\n", 141 | "#import swifter\n", 142 | "\n", 143 | "from matplotlib import pyplot as plt\n", 144 | "from matplotlib import style\n", 145 | "#Nicer style\n", 146 | "style.use('seaborn')\n", 147 | "import seaborn as sns\n", 148 | "\n", 149 | "from tensorflow import keras as k\n", 150 | "\n", 151 | "from sklearn.model_selection import train_test_split\n", 152 | "\n", 153 | "from gensim.models import KeyedVectors\n", 154 | "\n", 155 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report" 156 | ], 157 | "execution_count": 0, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "Phzn0O7dEkxK", 164 | "colab_type": "code", 165 | "colab": {} 166 | }, 167 | "source": [ 168 | "LSTM_PATH = 'drive/My Drive/Colab Notebooks/weights/lstm.h5'\n", 169 | "CONV_PATH = 'drive/My Drive/Colab Notebooks/weights/conv.h5'" 170 | ], 171 | "execution_count": 0, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "id": "u2VhzVjJEvW4", 178 | "colab_type": "text" 179 | }, 180 | "source": [ 181 | "## Prepare data" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "r6sw8lCcExJ9", 188 | "colab_type": "code", 189 | "colab": {} 190 | }, 191 | "source": [ 192 | "getting_real = pd.read_pickle('drive/My Drive/Colab Notebooks/data/news_getting_real2.pickle')\n", 193 | "true_or_fake = pd.read_pickle('drive/My Drive/Colab Notebooks/data/data_kaggle_proc.pickle')" 194 | ], 195 | "execution_count": 0, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "lckZjRSZGMSB", 202 | "colab_type": "code", 203 | "colab": { 204 | "base_uri": "https://localhost:8080/", 205 | "height": 111 206 | }, 207 | "outputId": "c2c3e26b-ce7e-4ff0-e4f6-467cd6fa2bb2" 208 | }, 209 | "source": [ 210 | "getting_real.head(2)" 211 | ], 212 | "execution_count": 127, 213 | "outputs": [ 214 | { 215 | "output_type": "execute_result", 216 | "data": { 217 | "text/html": [ 218 | "
\n", 219 | "\n", 232 | "\n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
titletypecontentone_hot_label
0[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...fake[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...[0, 1, 0]
1[5977, 4211, 7726, 11538, 9311, 8469, 4211, 12...fake[5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 1, 0]
\n", 259 | "
" 260 | ], 261 | "text/plain": [ 262 | " title type \\\n", 263 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... fake \n", 264 | "1 [5977, 4211, 7726, 11538, 9311, 8469, 4211, 12... fake \n", 265 | "\n", 266 | " content one_hot_label \n", 267 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... [0, 1, 0] \n", 268 | "1 [5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 1, 0] " 269 | ] 270 | }, 271 | "metadata": { 272 | "tags": [] 273 | }, 274 | "execution_count": 127 275 | } 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "metadata": { 281 | "id": "xtUJOJSwIwOE", 282 | "colab_type": "code", 283 | "colab": { 284 | "base_uri": "https://localhost:8080/", 285 | "height": 111 286 | }, 287 | "outputId": "a958b02e-9b69-4b5f-9e57-e009f56e3117" 288 | }, 289 | "source": [ 290 | "true_or_fake.head(2)" 291 | ], 292 | "execution_count": 128, 293 | "outputs": [ 294 | { 295 | "output_type": "execute_result", 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 312 | "\n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
HeadlineBodyLabel
0[2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0...[15680, 8429, 28683, 14257, 312, 281, 565, 611...[0, 0, 0, 1]
1[556, 2168, 3912, 5042, 2360, 508, 115, 948, 0...[4949, 20876, 17535, 3912, 46, 3610, 556, 3230...[0, 0, 0, 1]
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " Headline \\\n", 340 | "0 [2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0... \n", 341 | "1 [556, 2168, 3912, 5042, 2360, 508, 115, 948, 0... \n", 342 | "\n", 343 | " Body Label \n", 344 | "0 [15680, 8429, 28683, 14257, 312, 281, 565, 611... [0, 0, 0, 1] \n", 345 | "1 [4949, 20876, 17535, 3912, 46, 3610, 556, 3230... [0, 0, 0, 1] " 346 | ] 347 | }, 348 | "metadata": { 349 | "tags": [] 350 | }, 351 | "execution_count": 128 352 | } 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "id": "AuK3kmSMQ4i-", 359 | "colab_type": "text" 360 | }, 361 | "source": [ 362 | "### Reprocesar variables objetivo\n", 363 | "Los modelos están preparados para distinguir 4 tipos de noticia (clickbait, bias, fake y true). Sin embargo, estos datasets no cuentan con las mismas categorias.\n", 364 | " \n", 365 | " * Getting Real About FN: *fake, bias y true*\n", 366 | " * Fake or True: *true y fake*\n", 367 | " \n", 368 | "**Nota:** *(ver datasets de procesado)*" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "metadata": { 374 | "id": "cFcp51Z7Fs3j", 375 | "colab_type": "code", 376 | "colab": {} 377 | }, 378 | "source": [ 379 | "fit_getting_real = [np.asarray(getting_real['title'].tolist()), np.asarray(getting_real['content'].tolist())]\n", 380 | "fit_true_or_fake = [np.asarray(true_or_fake['Headline'].tolist()), np.asarray(true_or_fake['Body'].tolist())]\n", 381 | "\n", 382 | "target_or_fake_tgt = np.asarray(true_or_fake['Label'].tolist())[:,2:]\n", 383 | "target_getting_real = np.asarray(getting_real['one_hot_label'].tolist())" 384 | ], 385 | "execution_count": 0, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "id": "ygFIAbW6F45h", 392 | "colab_type": "text" 393 | }, 394 | "source": [ 395 | "## Predict CNN" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "metadata": { 401 | "id": "y2mJM53VF67K", 402 | "colab_type": "code", 403 | "colab": { 404 | "base_uri": "https://localhost:8080/", 405 | "height": 210 406 | }, 407 | "outputId": "b6f434fd-b9b5-4817-8b8b-c0cdb0fa1e33" 408 | }, 409 | "source": [ 410 | "cnn_model = k.models.load_model(CONV_PATH)" 411 | ], 412 | "execution_count": 11, 413 | "outputs": [ 414 | { 415 | "output_type": "stream", 416 | "text": [ 417 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 418 | "Instructions for updating:\n", 419 | "Colocations handled automatically by placer.\n", 420 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/layers/core.py:143: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 421 | "Instructions for updating:\n", 422 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 423 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 424 | "Instructions for updating:\n", 425 | "Use tf.cast instead.\n" 426 | ], 427 | "name": "stdout" 428 | } 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "id": "HaiJxOz8czgH", 435 | "colab_type": "text" 436 | }, 437 | "source": [ 438 | "#### Predicción sobre *Fake or true*" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "metadata": { 444 | "id": "vRCzMJUaF_mj", 445 | "colab_type": "code", 446 | "colab": {} 447 | }, 448 | "source": [ 449 | "pred = cnn_model.predict(fit_true_or_fake)\n", 450 | "\n", 451 | "pr = pred.round()\n", 452 | "#Set Bias to Fake\n", 453 | "mask = np.all((pred.round() == [1.,0.,0.,0.]), axis=1)\n", 454 | "pr[mask] = np.repeat(np.array([0.,0.,1.,0.]).reshape(-1,4), pr[mask].shape[0], axis=0)\n", 455 | "\n", 456 | "#Set Fake\n", 457 | "mask = np.all((pred.round() == [0.,1.,0.,0.]), axis=1)\n", 458 | "pr[mask] = np.repeat(np.array([0.,0.,1.,0.]).reshape(-1,4), pr[mask].shape[0], axis=0)" 459 | ], 460 | "execution_count": 0, 461 | "outputs": [] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "metadata": { 466 | "id": "RmVc6C71WVmt", 467 | "colab_type": "code", 468 | "colab": { 469 | "base_uri": "https://localhost:8080/", 470 | "height": 34 471 | }, 472 | "outputId": "db82b969-2291-4aef-c332-12f6c5dcf4c4" 473 | }, 474 | "source": [ 475 | "accuracy_score(target_or_fake_tgt, pr[:,2:])" 476 | ], 477 | "execution_count": 132, 478 | "outputs": [ 479 | { 480 | "output_type": "execute_result", 481 | "data": { 482 | "text/plain": [ 483 | "0.5484924623115578" 484 | ] 485 | }, 486 | "metadata": { 487 | "tags": [] 488 | }, 489 | "execution_count": 132 490 | } 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "metadata": { 496 | "id": "Fj2gAUk0YJEJ", 497 | "colab_type": "code", 498 | "colab": { 499 | "base_uri": "https://localhost:8080/", 500 | "height": 261 501 | }, 502 | "outputId": "756f3f0a-4e23-40fa-83ec-6045182c7f39" 503 | }, 504 | "source": [ 505 | "report = classification_report(target_or_fake_tgt, pr[:,2:])\n", 506 | "print(report)" 507 | ], 508 | "execution_count": 133, 509 | "outputs": [ 510 | { 511 | "output_type": "stream", 512 | "text": [ 513 | " precision recall f1-score support\n", 514 | "\n", 515 | " 0 0.61 0.65 0.63 2113\n", 516 | " 1 0.59 0.43 0.50 1867\n", 517 | "\n", 518 | " micro avg 0.61 0.55 0.58 3980\n", 519 | " macro avg 0.60 0.54 0.57 3980\n", 520 | "weighted avg 0.60 0.55 0.57 3980\n", 521 | " samples avg 0.55 0.55 0.55 3980\n", 522 | "\n" 523 | ], 524 | "name": "stdout" 525 | }, 526 | { 527 | "output_type": "stream", 528 | "text": [ 529 | "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.\n", 530 | " 'precision', 'predicted', average, warn_for)\n" 531 | ], 532 | "name": "stderr" 533 | } 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "metadata": { 539 | "id": "VsBjJGRCY8tV", 540 | "colab_type": "code", 541 | "colab": { 542 | "base_uri": "https://localhost:8080/", 543 | "height": 398 544 | }, 545 | "outputId": "2bf1f4ed-bfc6-447c-9a8d-e67d25fa4f76" 546 | }, 547 | "source": [ 548 | "matrix = confusion_matrix(target_or_fake_tgt.argmax(axis=1), pr[:,2:].argmax(axis=1))\n", 549 | "print(matrix)\n", 550 | "\n", 551 | "df_cm = pd.DataFrame(matrix, columns=['true', 'fake'], index=['true', 'fake'])\n", 552 | "sns.heatmap(df_cm, annot=True)" 553 | ], 554 | "execution_count": 134, 555 | "outputs": [ 556 | { 557 | "output_type": "stream", 558 | "text": [ 559 | "[[1566 547]\n", 560 | " [1064 803]]\n" 561 | ], 562 | "name": "stdout" 563 | }, 564 | { 565 | "output_type": "execute_result", 566 | "data": { 567 | "text/plain": [ 568 | "" 569 | ] 570 | }, 571 | "metadata": { 572 | "tags": [] 573 | }, 574 | "execution_count": 134 575 | }, 576 | { 577 | "output_type": "display_data", 578 | "data": { 579 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAccAAAFKCAYAAABo0pS0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XtcVHX+x/H3DDOIFxQxxgKz1Exs\nF9113VVRt9y0Vv3tZpkXWNDKLhZq21JeyIrEVqm0UknNtjIJM602flai9TOrFbONMi3vl1RQLgqi\noDIw8/vDbTZOIopnuIyv5z7m8dg5Z+Z8v2e3R28+38sZi9vtdgsAAHhY67oDAADUN4QjAAAGhCMA\nAAaEIwAABoQjAAAGhCMAAAY2bzfgLNjj7SYAr2sc2reuuwCYorws22vXvph/39sva29iTy6e18MR\nAHCJcFXUdQ9Mw7AqAAAGVI4AAHO4XXXdA9MQjgAAc7gIRwAAKnFTOQIAYEDlCACAAZUjAAAGbOUA\nAMB3UTkCAMzBsCoAAAYsyAEAoDK2cgAAYETlCACAgQ9VjqxWBQDAgMoRAGAOH9rnSDgCAMzhQ8Oq\nhCMAwBwsyAEAwIDKEQAAAypHAAAqc7t9Z0EOWzkAADCgcgQAmIM5RwAADJhzBADAgMoRAAADnpAD\nAIABlSMAAAY+NOfIVg4AAAyoHAEA5mBYFQAAAx8aViUcAQDmIBwBAKjMl56tSjgCAMxB5QgAgIEP\nLchhKwcAAAZUjgAAczCsCgCAgQ8NqxKOAABzUDkCAGBA5QgAgAGVIwAABj4UjmzlAADAgMoRAGAO\nH5pzpHIEAJjD5ar5qxo7duxQ//79lZqaWun4Z599pk6dOnnep6ena+jQoRo2bJiWL18uSXI6nYqP\nj1dUVJRiYmJ04MCBatsjHAEA5nC7av46h9LSUiUlJalXr16Vjp8+fVovvfSSQkJCPJ9LSUnRa6+9\npiVLlmjx4sUqKirSypUr1bx5cy1dulRjx47VrFmzqr0VwhEAYA4vVY7+/v5atGiRHA5HpeMLFixQ\ndHS0/P39JUmbNm1SRESEAgMDFRAQoG7duikrK0uZmZkaMGCAJCkyMlJZWVnV3grhCAAwh5cqR5vN\npoCAgErH9u7dq23btmngwIGeYwUFBQoODva8Dw4OVn5+fqXjVqtVFotFZWVl527zQu8dAICzqsWt\nHDNmzNDUqVPP+Rm3231Bx3+KyhEA0KDk5uZqz549evjhhzV8+HDl5eUpJiZGDodDBQUFns/l5eXJ\n4XDI4XAoPz9f0pnFOW632zMUWxUqRwCAOWqpcmzdurU++ugjz/s//OEPSk1N1alTpzR16lQVFxfL\nz89PWVlZSkhI0IkTJ7Rq1Sr17dtXa9euVY8ePaptg3AEAJjjPIYra2LLli1KTk5Wdna2bDabMjIy\nNHfuXAUFBVX6XEBAgOLj4zVmzBhZLBbFxcUpMDBQgwYN0vr16xUVFSV/f3/NnDmz2jYt7vMZfL0I\nzoI93rw8UCsah/at6y4Apigvy/batU8ufaLG320c9aSJPbl4VI4AAHP40LNVCUcAgDl86PFxhCMA\nwBw+VDmylQMAAAMqRwCAOby7vrNWEY4AAHP40LAq4QgAMAfhCACAAatVAQCozO1izhEAgMp8aFiV\nrRwAABhQOQIAzMGcIwAABsw5AgBg4ENzjoQjAMAchCMAAAY8Pg5m2rlnn8ZPelKjRtyq6Nv/XOnc\nodx8TUycKaezXJ2vvUZPTBx/wdfftnOPkp6dJ4vFoms7XK3HHxkvl8ulp2a/qB2796q8vEK3/3mg\nhv7pZrNuCZeY63/fS28uXajvv98uSdq8ZZv++tBjnvO7dmzQwYM5qqiokCTFjh6vnJzDF9RGly7X\nKWXuDLndbn27eavGjZ8iSRo/boyio26VLBYtXvyWFixcbNJd4YJROcIspSdP6e+z56tn91+d9fyz\n8xZp9Mjb1P/63po+K0WHDufpissdF9RG8gsLNfmv9ymicydNTEzWZ5lfqmmTxrLZbFoyf5ZKS0/q\nj8Pu1K2DB8hqZXcPaubTzzZoxMh7qzw/+E8xKikprfH1Zz/7pB762+P691ebtOT1efrjzf20fcdu\njR49Qj16DpTVatXW7z5T2tJ3VFx8vMbtABL7HOucv92u+bOmKeSyVj8753K59NWmLerXp6ckaWp8\nnK643KGKigo9NuM53TlukmLvj9cXX31T6Xt3jJvo+e9Op1PZhw4ronMnSdINvXtow7+/Ubeuv9SU\nv46VJB0pLFKL5oEEI2qV1WrVSwuf1Uerl2vd2nfV74belc5/vGa557/b7XZdffWV+vdXmyRJK99f\noxv/0Ff79h3Q9TcMUUVFhZxOp0pPnlTz5oG1eh/4CZe75q96ptp/G544cUILFizQU089JUnasGGD\niouLvd6xS4XN5qeARo3Oeu5o0TE1bdJEyXNeUuz98Xpu/quSpPfXfKKQVsF6dV6y5sx4XDNfWFjl\n9QuLitU8sJnnfXDLFso/ctTz/m9Tn1Ls/fF6NP4Bk+4Il6rOnTvq3Xde1bq176r/jX1/dv7FlJla\nt/Zd/f2pM8OhUVG36vDhPPW/aZhuu/0uzZqVWOW1L7ssWIVFxzzv8/OO6PIrHHK73Z5qdED/3+tI\nwVEdPJhj7o3h/LldNX/VM9UOq06ePFmRkZH65JNPJElHjx5VfHy8Fi1a5O2+we1WXn6BYobdorAr\nWuuBR57QuvUb9c3m75W16TtlffudJOn06TI5nU49mDBdpSdPavvOPbpj3EQFNGqkaZP/arxkJbOn\nP6qcw7m676GpevPlF9S0aZPaujv4kJ279ipp+nNavjxd7dtfpY9WL1enzr3ldDolSYnTnlVGxlod\nPVqkd1a8ottuG6xePburT5/fqXfkbyVJjQMay263a8VbL6tZsybq2vUX+njNcp08eUr33PdwpfYs\nFkul9z1+103JyY/pz7eMrp0bxtnVwwqwpqoNx5KSEkVHR+vDDz+UJA0aNEhLly71escgBbVooSsu\nd6htm1BJUo/f/Eq79/4gu92ue0eP1KABN1T6/IvPPCnpzLDqa/OeliQ5y8tV9JP5l7z8AoVcFqw9\nPxyQ2+1Wh6vbKvTy1moTern2/HBAEdd1qp2bg0/JyTms5cvTJUl79vyg3Nw8hYVdrn37DkiSUlNX\neD774ar/U8Qvw1VWVqYZM+do2bL3Kl3rllvPBNzHa5brxgHDJEk2m02tglt6PhMadrkO5eRKOrNQ\nZ+HCZ3TLkNFUjXXM7UMLcqodVnW5XNq/f7/nL7VPP/1ULh/6H6A+s9n81Cb0Cv1wIFuS9P32nbq6\nbRt1ua6T/u+zTEln5gufX/Baldew22xq17aNsjZtkSR9tG69+vTorj379uuFhWe+d/LUKe3df1Bh\nV7T26v3Ad0VF3aq/PXSfJKl16xA5HCHKzj6zGrV580B9sPIN2e12SdLvf99TW77bro1ffq0//2eF\ndEhIK01Pmlzl9cvLy7V9+y5PlXnrkIHKWP2JrFarFr00S8NH3KsffjjozVvE+fChOUeL233ujSm7\ndu3S9OnT9e2336px48YKDw9XQkKCOnTocF4NOAv2mNJRX/Xdtp16Zt4i5RzKlc1mkyOklfr16amw\nK1qr//W9tf9gjh59apZcLreu7XC1Hnt4nFwut6Y9M1e79+2Xy+XSA3f9RX17/bbKNnbv/UFPPj1X\nLrdbXa7rpIkT7pXb7daM5+bru+275HQ6NXzIIN3+54G1eOcNS+PQn8+h4b+aNWuq1CUpCmrRXP7+\ndiVNf04hIZfpWHGx3ntvlcaPG6PY2GE6dfKUvv5mix7861T5+fnpxZSZuq7ztfLzs2pa0mytylhb\nZRudO3fU/JRkWa1Wbdz4tR6e+KQG9P+93kh9UZs3b/V8bvKUp/Tlv7+p8jqXuvKybK9du2R6TI2/\n23Rqqok9uXjVhuPFIhzhCwhH+ArC8fxUO+fYs2dPz5BqeXm5SkpKFBYWpjVr1ni9cwCABqQeDo/W\nVLXhuGHDhkrvt23bpvT0dK91CADQQPnQepQL3vUdHh6ur7/+2ht9AQA0ZD60IKfaynHChAmV9hTl\n5+erSRP2wgEADOrhZv6aqjYco6Ki5OfnJ+nMxttmzZopPDzc6x0DADQw9bACrKlqwzElJUWpqfVr\nFREAoP7xpYcAVBuOYWFhio+PV0REhGcTryT95S9/8WrHAACoK9UuyElPT1e7du104sQJFRYWqrCw\nUAcOHKiNvgEAGpJLYUHO6tWrtXLlSgUFBWnnzp368VkBFRUV2rp1qyZPrvpRTwCAS1A9DLmaqjIc\nb7rpJl133XVKSkqqNIRqtVrVvn37WukcAKABuVRWq7Zp00YLF1b9W4EAAHhcCpUjAAAXwk04AgBg\n4EPheMGPjwMAwNdROQIAzHEpPQQAAIDz4kPDqoQjAMAchCMAAJX9+LAYX0A4AgDMQeUIAICBD4Uj\nWzkAADCgcgQAmIIn5AAAYEQ4AgBg4DvPACAcAQDm8KVhVRbkAADM4XLX/FWNHTt2qH///kpNTZUk\nHTp0SHfccYdiYmJ0xx13KD8/X5KUnp6uoUOHatiwYVq+fLkkyel0Kj4+XlFRUYqJidGBAweqbY9w\nBACYw3URr3MoLS1VUlKSevXq5Tn2/PPPa/jw4UpNTdWAAQP06quvqrS0VCkpKXrttde0ZMkSLV68\nWEVFRVq5cqWaN2+upUuXauzYsZo1a1a1t0I4AgDqNX9/fy1atEgOh8Nz7IknntDNN98sSWrZsqWK\nioq0adMmRUREKDAwUAEBAerWrZuysrKUmZmpAQMGSJIiIyOVlZVVbZuEIwDAFG6Xu8avc7HZbAoI\nCKh0rEmTJvLz81NFRYXS0tL0pz/9SQUFBQoODvZ8Jjg4WPn5+ZWOW61WWSwWlZWVnbNNwhEAYA4v\nDatWpaKiQhMnTlTPnj0rDbn+qKpnvZ7PM2AJRwCAKbxVOVZlypQpuuqqqzRu3DhJksPhUEFBged8\nXl6eHA6HHA6HZ8GO0+mU2+2Wv7//Oa9NOAIAzFGLlWN6errsdrsmTJjgOda1a1dt3rxZxcXFKikp\nUVZWlrp3767evXtr1apVkqS1a9eqR48e1V6ffY4AAFO4vfQQgC1btig5OVnZ2dmy2WzKyMjQkSNH\n1KhRI8XGxkqSOnTooMTERMXHx2vMmDGyWCyKi4tTYGCgBg0apPXr1ysqKkr+/v6aOXNmtW1a3F7+\nAS5nwR5vXh6oFY1D+9Z1FwBTlJdle+3aRwZfX+Pvtnp/nYk9uXgMqwIAYMCwKgDAFN4aVq0LhCMA\nwByEIwAAlVE5AgBgQDgCAGBAOAIAYOS21HUPTMNWDgAADKgcAQCmYFgVAAADt8t3hlUJRwCAKagc\nAQAwcPvQghzCEQBgCipHAAAMfGnOka0cAAAYUDkCAEzh3V8Hrl2EIwDAFL40rEo4AgBMQTgCAGDA\nsCoAAAZUjgAAGPjSQwDYygEAgAGVIwDAFDwhBwAAA5cPDasSjgAAU/jSnCPhCAAwBatVAQAwYJ8j\nAAAGvlQ5spUDAAADKkcAgClYrQoAgAGrVQEAMGBBDgAABgyrAgBgwLAqAAAGvjSsylYOAAAMvF45\nbv71Q95uAvC6UaG96roLQL3HnCMAAAbMOQIAYEDlCACAgQ+txyEcAQDmoHIEAMDAl+Yc2coBAIAB\nlSMAwBSuuu6AiQhHAIAp3PKdYVXCEQBgCpcPLVclHAEApnBROQIAUBnDqgAAGPjSghy2cgAA6r0d\nO3aof//+Sk1NlSQdOnRIsbGxio6O1oMPPqiysjJJUnp6uoYOHaphw4Zp+fLlkiSn06n4+HhFRUUp\nJiZGBw4cqLY9whEAYAq3LDV+nUtpaamSkpLUq9d/fx1nzpw5io6OVlpamq666iqtWLFCpaWlSklJ\n0WuvvaYlS5Zo8eLFKioq0sqVK9W8eXMtXbpUY8eO1axZs6q9F8IRAGAK10W8zsXf31+LFi2Sw+Hw\nHPviiy904403SpL69eunzMxMbdq0SREREQoMDFRAQIC6deumrKwsZWZmasCAAZKkyMhIZWVlVXsv\nzDkCAEzhrTlHm80mm61yXJ08eVL+/v6SpFatWik/P18FBQUKDg72fCY4OPhnx61WqywWi8rKyjzf\nP2ubXrgPAMAlqK5Wq7rdZ99geaHHf4phVQCAKVyWmr8uVJMmTXTq1ClJUm5urhwOhxwOhwoKCjyf\nycvL8xzPz8+XdGZxjtvtPmfVKBGOAACTuGSp8etCRUZGKiMjQ5K0evVq9e3bV127dtXmzZtVXFys\nkpISZWVlqXv37urdu7dWrVolSVq7dq169OhR7fUZVgUA1GtbtmxRcnKysrOzZbPZlJGRoWeffVaT\nJ0/WsmXLFBoaqiFDhshutys+Pl5jxoyRxWJRXFycAgMDNWjQIK1fv15RUVHy9/fXzJkzq23T4j6f\nwdeLkHXlLd68PFAr5vnZ67oLgCle2bfCa9f+5+XRNf7ukMNpJvbk4lE5AgBM4UtPyCEcAQCmcFl4\ntioAAJX40C9WEY4AAHP40rAqWzkAADCgcgQAmKImm/nrK8IRAGCKmmzmr68IRwCAKViQAwCAAcOq\nAAAY+NJqVcIRAGAKXxpWZSsHAAAGVI4AAFMw5wgAgAFzjgAAGBCOAAAYuBlWBQCgMipHAAAMfCkc\n2coBAIABlSMAwBS+9BAAwhEAYAr2OQIAYOBLc46EIwDAFIQjAAAGzDkCAGDgS3OObOUAAMCAyhEA\nYArmHAEAMGDOEQAAA5cPxSPhCAAwBcOqAAAY+E7dSDgCAEziS5UjWzkAADCgcgQAmMKXHgJAOAIA\nTMFqVQAADHwnGglHAIBJfGlBDuEIADAFw6oAABj4TjSylQMAgJ+hcgQAmII5RwAADJhzBADAwHei\nkXAEAJiEYVUAAAzcPlQ7Eo4AAFP4UuXIVg4AAAyoHAEApmC1KkwV0KmtOvwjQXmL0pW/+INK5yyN\n7Go74wEFdGqr7YPja3T9xp2v1pV/v19yu3Vy2z4dSFggWSy6cvq9ahx+tSx2mwreyNCRZR+ZcTuA\nGjUJ0N2zx6tJi6ay+9v13gtv6btPN13QNUY+dofa/7qj5JbSnnxF+77drZZXtNKYZ+LkZ/dThbNC\nLz00R8X5RV66C1wo34lGhlXrnLVxI1057V4d//zbs54Pe/QOnfx+70W10Sbxbh1MXKQdt02WX2AT\nNb+hm5p2D5fbWaEdQ6do58jHFDo5VrL40I+xoU71vv0GHd6TrWeiEvXi/c8q+om7Luj71/a4Tq2v\nvkJ/v+1RvTrxRUUnnvn+bQ9Had3SNUoe8YSyMjbq5jH/443uo4Zcctf4Vd9QOdYxV5lTu0ZP0+X3\n33bW8znJqbK1DFTLW6//70GrVW2TH1CjtpfLYvNTzqw0nVi/2XO641vTtXP4VEmSxW6T/5UOlW7a\nJUk69tGXCuzbVdlJr6rky62SJNtlLVRRdEJy179/QNEwnSg8rivDr5IkNWnRVCeOFqvjbztr6CPR\nqigv19FDR/Ta5AWqcJZLOhOmkvSvFZ9Ikq6LjFDW6o2SpEO7s9W0RTMFNGusJVMXyXnaKUk6fvSY\nrvplu9q9MZyTtxbklJSUaNKkSTp27JicTqfi4uIUEhKixMRESVKnTp305JNPSpJefvllrVq1ShaL\nRePGjdP1119/jitX7bzC8euvv1ZOTo4GDx6svLw8ORyOGjWGs6hwyV1RVuVpV8lJqWVgpWPBQ34v\nZ16h9j8yT34tA3XtsunaetODZ/2+Lbi5Ko6VeN6XFxyT3dHS877d/Ilq9tvO2vfgcxd5I8B/bfzf\nf6n37Tdoxidz1bRFMz1/198VO/1ePRv9pEqOndCwyTH67aBe2vDeZ2f9fouQIO3bssfz/viRYrUI\nCVLu3kOSJIvVqj+M+qPSX1hRK/eD8+OtrRzvvvuu2rVrp/j4eOXm5mr06NEKCQlRQkKCunTpovj4\neK1bt07t27fXBx98oDfffFMnTpxQdHS0+vTpIz8/vwtus9pwTE5O1qFDh7R//34NHjxYy5Yt07Fj\nxzR16tQa3SQuXtPu4Wr2u+vU7LedJUmWAH9Z7Da1f2myrE0D1Pi6dur41nS5TpVp/yPzKn/ZMHS6\n9/6n5R8WomtSE7Xtfx4+E8bAReo5pK+OZhfoudFP6crOV2ncSxMVGNxccQsfkSQ1atxIxwuP6zcD\ne+rG0QPVIiRIktT79n5am5rxs+v99B9bi9Wqe54br63rt2jrT0ZMUPe8VTm2bNlS27dvlyQVFxcr\nKChI2dnZ6tKliySpX79+yszMVH5+vvr27St/f38FBwcrLCxMu3btUqdOnS64zWrDccuWLVqyZIli\nY2MlSePHj1d0dPQFNwTzuMvKdXjuchUa/urefed0SZWHVWXzk+0nlaf98mA5c4+qUYcwWSwWndp1\nUGXZ+Tq9/7ACOrZR6Tc7a+0+4Ls6dg/Xlk+/kSQd2PqDWoQEqfDQUT098omfffarDzf8bFg19Jo2\nnsCUpKDWwTqWVyhJGvNMnHL3HVL6C8u9exOoNwYPHqx33nlHAwYMUHFxsebPn69p06Z5zrdq1Ur5\n+fkKCgpScHCw53hwcLDy8/NrFI7VLsgpLy+X0+mU5T9/uh09elSnT5++4IZgnpJvdqjFTT0kSbZW\nLRQ6KabqD5dX6NTug2r6nyozaGAvFX/ytQI6XqnQSWf+4LEE+CugQ5jK9ud6ve+4NOTtO6z2v+oo\nSWoVdpmO5hyR2+VS6DVtJEk3jh6oNv+ZkzybLZ9uUveBPSVJbX/RTkW5R3Wq5JR63tJX5U6n3nvu\nLe/fBC6Y+yL+cy7vvfeeQkNDtWbNGi1evFiPPPJI5XarWC9R1fHzUW3leOedd2rEiBHKycnR3Xff\nrT179ighIaHGDaKyxhEd1OaxO+XfxiF3eYWCBkfq2JqNOn0gT8dWbVC7+RPlH3qZAtqHqeNb01WQ\ntlqF//u5AiO76Np3k2Xxs+rQ7KWVrumpGv/jYOI/1Hbm/ZLFqpJvduj452eW1AdGRujad5Nl9bfp\ncMrbKj9aXGv3Dd/2Sdoa3fX0A5q07ElZ/fz0+qMvqcJZrruejVN5WbmK8o5q3dI1ns//WDH+aHfW\ndv2wZY8S3n5KbpdLqY+/LEn6w6g/yt7Irolvnll8kbPzgFIfe7nW7gvn5q1h1aysLPXp00eSFB4e\nrtOnT6u8vNxzPjc3Vw6HQw6HQ3v37v3Z8ZqwuKuJ1pMnT8rtdmvXrl2y2+1q166djh07ptatW59X\nA1lX3lKjjgH1yTw/e113ATDFK/u8t4gp9qqzr7o/H0t+eKfKc6+88ooKCgo0ceJEZWdn66677lJY\nWJgeeOABde/eXffff79iY2N19dVX67777tPbb7+twsJCjRo1Sh9++KGs1gvftVht5XjXXXfp+eef\n90x8Ll++XK+++qo++OCDar4JALiUeGsz2IgRI5SQkKCYmBiVl5crMTFRISEhevzxx+VyudS1a1dF\nRkZKkoYPH66YmBhZLBYlJibWKBil86gct27dqsTERN1zzz1aunSpHA6HpkyZoubNm59XA1SO8AVU\njvAV3qwco6+6tcbfTfvhXRN7cvGqjdTOnTtrwYIFeuONN9SxY0fNmDHjvIMRAHDp8NaCnLpQ5bBq\nz549PStUJcnlcmnjxo365z//KYvFoszMzFrpIAAAta3KcNywYUOVX/rXv/7llc4AABouX/o9x2oX\n5Bw4cEBpaWkqKjrz5Hun06kvv/xS69at83rnAAANR318gHhNVTvnOHnyZF1zzTX67rvvdMMNN8hq\ntVZ6MgEAAJJvzTlWG442m01Dhw5V8+bNdfPNN+vpp59WampqbfQNANCAuC7iVd9UO6zqdru1ceNG\nBQUFadmyZWrbtq0OHjxYG30DADQgF/O4tvqmyspxypQpkqSwsDA1adJEU6dO1TfffKPXX39dkydP\nrrUOAgAahkvix453796tW2+9Vfv379fOnWd+qeHHvwrmzp1b4x+QBACgvqsyHNPS0pSXl6eZM2dq\n0qRJtdknAEADVB/nDmuqynC02WwKDQ3VnDlzarM/AIAGqj6uOq2pahfkAABwPurj3GFNEY4AAFP4\n0mpVwhEAYIpLYs4RAIAL4UtzjjX7FUgAAHwYlSMAwBQsyAEAwIAFOQAAGFA5AgBg4EsLcghHAIAp\nXAyrAgBQme9EI1s5AAD4GSpHAIApWJADAIAB4QgAgAH7HAEAMKByBADAgH2OAAAY+NKwKls5AAAw\noHIEAJiCOUcAAAx8aViVcAQAmILKEQAAA1arAgBgwK9yAABg4EuVI1s5AAAwoHIEAJiCYVUAAAx8\naViVcAQAmILKEQAAAypHAAAMqBwBADDwpcqRrRwAABhQOQIATOF2u+q6C6YhHAEApuDB4wAAGPCT\nVQAAGFA5AgBgQOUIAIAB+xwBAKhF6enpevnll2Wz2TRhwgR16tRJEydOVEVFhUJCQvTMM8/I399f\n6enpWrx4saxWq4YPH65hw4bVqD3CEQBgCm89BKCwsFApKSl6++23VVpaqrlz5yojI0PR0dEaOHCg\nZs+erRUrVmjIkCFKSUnRihUrZLfbdfvtt2vAgAEKCgq64DZ5CAAAwBRut7vGr3PJzMxUr1691KxZ\nMzkcDiUlJemLL77QjTfeKEnq16+fMjMztWnTJkVERCgwMFABAQHq1q2bsrKyanQvVI4AAFN4a7Xq\nwYMHderUKY0dO1bFxcUaP368Tp48KX9/f0lSq1atlJ+fr4KCAgUHB3u+FxwcrPz8/Bq1STgCAEzh\nzdWqRUVFmjdvnnJycjRq1KhKbVXV7sX0h2FVAIApXG53jV/n0qpVK/3617+WzWZT27Zt1bRpUzVt\n2lSnTp2SJOXm5srhcMjhcKigoMDzvby8PDkcjhrdC+EIADCFt+Yc+/Tpow0bNsjlcqmwsFClpaWK\njIxURkaGJGn16tXq27evunbtqs2bN6u4uFglJSXKyspS9+7da3QvDKsCAOq11q1b6+abb9bw4cMl\nSVOnTlVERIQmTZqkZcuWKTQ0VEOGDJHdbld8fLzGjBkji8WiuLg4BQYG1qhNi9vLjzTIuvIWb14e\nqBXz/Ox13QXAFK/sW+G1a7cQCwa9AAADYklEQVRo1qHG3z12YreJPbl4VI4AAFPw+DgAAAx4fBwA\nAAbeekJOXSAcAQCmoHIEAMDAl+Yc2ecIAIABlSMAwBTMOQIAYOBLw6qEIwDAFIQjAAAGvhONtfD4\nOAAAGhpWqwIAYEA4AgBgQDgCAGBAOAIAYEA4AgBgQDgCAGBAODYQGRkZdd0F4KI4nU4NGzZMkyZN\nOuv5Hj161HKPgKoRjg3AwYMH9f7779d1N4CLkp+fr7KyMiUnJ9d1V4BqEY4NwLRp07Rx40aFh4dr\n4sSJio6OVmZmpiZMmOD5zI9/de/atUujRo3S6NGj9cADD6i4uLiuug1UMmPGDO3fv19TpkxRbGys\nYmNjFRUVpf3791f63NatWzVy5EiVlJRo9erVGjlypGJiYjRz5sw66jkuRYRjAzBmzBj97ne/U1xc\nnJxOp9LS0mS1nv3/uqSkJE2bNk2LFy9W79699cYbb9Ryb4GzmzRpktq1a6eoqCjFxcVpyZIlGjp0\nqNLS0jyfOXr0qJ544gnNnj1bkjR//ny9/vrrSk1N1aFDh/TVV1/VVfdxieHZqg1Mly5dznn+22+/\n1WOPPSZJKisrU0RERG10CzhvISEhmj59uubOnavi4mL94he/kHTmodUPPfSQ7r77boWGhmrTpk3K\nycnRmDFjJEnHjx9XTk6OfvOb39Rl93GJIBwbGLvdLkmyWCyVjpeXl0uSGjdurNdff/1n54H6Ys6c\nOerTp4+ioqK0atUqffLJJ5KkEydOqFOnTnrzzTd10003yW6365e//KX+8Y9/1G2HcUliWLUBsFqt\nnvD7UbNmzZSXlydJ2rZtm0pKSiRJ4eHh+vTTTyVJ77//vjIzM2u3s0A1CgsL1bZtW7ndbn388cdy\nOp2SpMDAQCUkJCgkJERvvfWW2rVrp927d+vIkSOSzoRqbm5uXXYdlxDCsQHo0KGDvv/+ex0/ftxz\nLDw8XE2aNNHIkSP13nvvKSwsTJL06KOPauHChYqJidE777yjzp0711W3gbMaMWKEkpKSdPfdd2vw\n4MHauHGjPv/8c8/5hIQEvfLKKyoqKlJCQoLuuecejRw5UkVFRXI4HHXYc1xK+MkqAAAMqBwBADAg\nHAEAMCAcAQAwIBwBADAgHAEAMCAcAQAwIBwBADAgHAEAMPh/i8hNYbN28u8AAAAASUVORK5CYII=\n", 580 | "text/plain": [ 581 | "
" 582 | ] 583 | }, 584 | "metadata": { 585 | "tags": [] 586 | } 587 | } 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": { 593 | "id": "Xam8w_ELZcdZ", 594 | "colab_type": "text" 595 | }, 596 | "source": [ 597 | "#### Predicción sobre Getting Real\n" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "metadata": { 603 | "id": "_699fuOldBa0", 604 | "colab_type": "code", 605 | "colab": {} 606 | }, 607 | "source": [ 608 | "pred = cnn_model.predict(fit_getting_real)\n", 609 | "\n", 610 | "pr = pred.round()\n", 611 | "#Set Bias to Fake\n", 612 | "mask = np.all((pred.round() == [1.,0.,0.,0.]), axis=1)\n", 613 | "pr[mask] = np.repeat(np.array([0.,0.,1.,0.]).reshape(-1,4), pr[mask].shape[0], axis=0)\n", 614 | "\n", 615 | "\n", 616 | "#Set target bias to fake\n", 617 | "mask = np.all((target_getting_real == [1.,0.,0.]), axis=1)\n", 618 | "target_getting_real[mask] = np.repeat(np.array([0.,1.,0.]).reshape(-1,3), target_getting_real[mask].shape[0], axis=0)" 619 | ], 620 | "execution_count": 0, 621 | "outputs": [] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "metadata": { 626 | "id": "H-yjOsv6e7Vd", 627 | "colab_type": "code", 628 | "colab": { 629 | "base_uri": "https://localhost:8080/", 630 | "height": 34 631 | }, 632 | "outputId": "bc3e502f-f07a-4c61-bb44-f8414570b9c1" 633 | }, 634 | "source": [ 635 | "accuracy_score(target_getting_real[:,1:], pr[:,2:])" 636 | ], 637 | "execution_count": 157, 638 | "outputs": [ 639 | { 640 | "output_type": "execute_result", 641 | "data": { 642 | "text/plain": [ 643 | "0.4734318817419097" 644 | ] 645 | }, 646 | "metadata": { 647 | "tags": [] 648 | }, 649 | "execution_count": 157 650 | } 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "metadata": { 656 | "id": "_-7lOOV0fjia", 657 | "colab_type": "code", 658 | "colab": { 659 | "base_uri": "https://localhost:8080/", 660 | "height": 261 661 | }, 662 | "outputId": "ca296e58-a222-4dcc-9dd8-6bc7da04fb32" 663 | }, 664 | "source": [ 665 | "report = classification_report(target_getting_real[:,1:], pr[:,2:])\n", 666 | "print(report)" 667 | ], 668 | "execution_count": 158, 669 | "outputs": [ 670 | { 671 | "output_type": "stream", 672 | "text": [ 673 | " precision recall f1-score support\n", 674 | "\n", 675 | " 0 0.57 0.66 0.61 1503\n", 676 | " 1 0.30 0.19 0.23 1000\n", 677 | "\n", 678 | " micro avg 0.49 0.47 0.48 2503\n", 679 | " macro avg 0.43 0.43 0.42 2503\n", 680 | "weighted avg 0.46 0.47 0.46 2503\n", 681 | " samples avg 0.47 0.47 0.47 2503\n", 682 | "\n" 683 | ], 684 | "name": "stdout" 685 | }, 686 | { 687 | "output_type": "stream", 688 | "text": [ 689 | "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.\n", 690 | " 'precision', 'predicted', average, warn_for)\n" 691 | ], 692 | "name": "stderr" 693 | } 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "metadata": { 699 | "id": "c5_wJ3I3fmma", 700 | "colab_type": "code", 701 | "colab": { 702 | "base_uri": "https://localhost:8080/", 703 | "height": 398 704 | }, 705 | "outputId": "203639a3-605e-4e68-99d3-2b3f47468aa5" 706 | }, 707 | "source": [ 708 | "matrix = confusion_matrix(target_getting_real[:,1:].argmax(axis=1), pr[:,2:].argmax(axis=1))\n", 709 | "print(matrix)\n", 710 | "\n", 711 | "df_cm = pd.DataFrame(matrix, columns=['true', 'fake'], index=['true', 'fake'])\n", 712 | "sns.heatmap(df_cm, annot=True)" 713 | ], 714 | "execution_count": 160, 715 | "outputs": [ 716 | { 717 | "output_type": "stream", 718 | "text": [ 719 | "[[1047 456]\n", 720 | " [ 808 192]]\n" 721 | ], 722 | "name": "stdout" 723 | }, 724 | { 725 | "output_type": "execute_result", 726 | "data": { 727 | "text/plain": [ 728 | "" 729 | ] 730 | }, 731 | "metadata": { 732 | "tags": [] 733 | }, 734 | "execution_count": 160 735 | }, 736 | { 737 | "output_type": "display_data", 738 | "data": { 739 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcEAAAFKCAYAAABlzOTzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XtcVHX+x/H3DAwCiiLGWJKUeUFM\nNM0tM81b1JptYiopqZVkF13ddd0VU1PzsqaW22qkdrHVvK50o9zS1tXcCqUW85bmLUPFgFEUBVFg\n5veHv6VYA7ycgQ7n9ewxjwczc853vvNIefv5Xs6xeTwejwAAsCB7VXcAAICqQggCACyLEAQAWBYh\nCACwLEIQAGBZhCAAwLJ8vf0Bha6D3v4IwOsmt5tQ1V0ADDH90HKvtX01v+8d19xkYE8unddDEABg\nEe7iqu7BZWM4FABgWVSCAABjeNxV3YPLRggCAIzhJgQBABbloRIEAFgWlSAAwLKoBAEAlsUWCQAA\nzINKEABgDIZDAQCWxcIYAIBVsUUCAGBdVIIAAMsyYSXI6lAAgGVRCQIAjGHCfYKEIADAGCYcDiUE\nAQDGYGEMAMCyqAQBAJZFJQgAsCqPx3wLY9giAQCwLCpBAIAxmBMEAFgWc4IAAMuiEgQAWBZXjAEA\nWBaVIADAskw4J8gWCQCAZVEJAgCMwXAoAMCyTDgcSggCAIxBCAIArMqM1w4lBAEAxqASBABYlgkX\nxrBFAgBgWVSCAABjMBwKALAsEw6HEoIAAGNQCQIALItKEABgWVSCAADLMmEIskUCAGBZVIIAAGMw\nJwgAsCwTDocSggAAY1AJAgAsi0oQAGBZXqoE3W63Jk2apH379snhcGjy5MkKDAzUmDFjVFxcrNDQ\nUM2ePVt+fn5KTk7W4sWLZbfbFRsbq379+pXbNiEIADCGlyrB9evX6/Tp01q5cqXS09M1ffp0hYSE\nKC4uTj169NCcOXOUlJSkmJgYJSYmKikpSQ6HQ3379lV0dLSCg4PLbJstEgCAX7RDhw6pVatWkqTw\n8HBlZGRoy5Yt6t69uySpa9euSklJ0bZt2xQVFaWgoCD5+/urbdu2SktLK7dtQhAAYAy3+8of5WjW\nrJk+++wzFRcX6+DBgzp8+LCOHj0qPz8/SVK9evWUnZ0tl8ulkJCQkvNCQkKUnZ1dbtsMhwIAjOHx\neKXZzp07Ky0tTQ8//LAiIiJ00003ae/evT/52J//3LJe/ylCEABgDC+uDh01alTJz3fffbfq16+v\ngoIC+fv7KzMzU06nU06nUy6Xq+S4rKws3XLLLeW2y3AoAMAYXhoO3bNnj5555hlJ0qZNm9SiRQt1\n6NBBa9eulSStW7dOnTp1UuvWrbVjxw7l5uYqLy9PaWlpateuXbltUwkCAIzhpS0SzZo1k8fjUd++\nfVWjRg298MIL8vHxUUJCglatWqUGDRooJiZGDodDo0ePVnx8vGw2m4YPH66goKBy27Z5LmXQ9CoU\nug56s3mgUkxuN6GquwAYYvqh5V5r++ySZ6743IDBMwzsyaVjOBQAYFkMhwIAjOHdgUWvIAQBAMbg\n2qEAAMsiBAEAlsWtlAAAVuVxMycIALAqEw6HskUCAGBZVIIAAGMwJwgAsCzmBAEAlmXCOUFCEABg\nDEIQAGBZXDYNRth38JBGJDynwQ/1VlzfBy77/EXLkrRuw79ls9n09GNxuqvDbTqUfkTPzZonSfLI\no+cSfqcbGoYZ3XVYnG8Nh0aum6UN897V1qRNJa/XuS5EsXNHyNfhq4xd3+n98Ysuu+1rI8P1wLQh\nksejH/YcVvKEC23c8divdUuvOyWblLb6U21Z+k/Dvg8ukwkrQbZI/MLkny3Qn+fMV/t25d8NuSxH\nMn7Qx+s/1VvzX1DirMmaPe81FRcXa9W7azT88YF68+WZ6n1ftN5cnmRwzwGp64jeOnvyzEWv9xg/\nUJ+/vkbzY56Vu9itOg3qXXbbPScO0prnlujVvs/JPyhAzbq0Vt2GTt3a7y4t7DNJr/adrE5P/UY1\nggKM+CqwCELwF8bP4dD8F6co9Joff0kc+O57DRkxVvEjx2rk2CnKPf3jL5nUtO1KfGPpT55vU8f2\n7eRwOBRSN1jXXevUgUPpSvjdk2p3S5Qk6Ycsl+qHXlN5XwqWcE3jBnI2DdO3G74u9brNZtONt0Vo\n9yf/kSR9MPFvOpVxXDa7Tb1nDlX8ivEaunqSbrqjRanz4lf+eA9HH4eP6jZ06uj2C/cn3bM+TY3v\nbKmTR7L1at/n5C52q7iwWIVnz8m/FiFYZdyeK39UkQpD8MyZM1qwYIGmT58uSdq8ebNyc3O93jGr\n8vX1kX+NGqVe+/Nf5mvSmBF6Y+7z6nBbW61454Myz3cdz1Hd4Dolz0PqBivbdUKStGfvAfUe/LQ+\n/XyLHhnQxztfAJZ13/iH9Y+pSy96PbBekM6dKdB9zw7S0NWTdM+YhyRJrXvdqdNZJ/XGgOla9sQc\n9Zw4uMy2A+sG6eypvJLnZ1y5CnIGy+Px6Hz+OUlSk05RyjtxWqeOnTD4m+GSedxX/qgiFc4Jjh07\nVh06dNDGjRslSSdOnNDo0aP12muvebtv+H87vtmrSTP/KkkqPF+omyObKW3bTs19bYlOn87T6TNn\n9OXW7ep+V4eLzvX8ZKK6ebPGenfJfK1850PNmrtQkxN+V2nfAdXbLQ92UnraPuUcyb7oPZvNptrX\n1lXKmx8r50i2Br85RhFdb1H4rc10468idMOvIiRJvv5+8nH4KG7BKPnV9Nd1LW5Q/MoJKio4r3fG\nvHpRmz/VsE0T9Rj3sJYMmeW9L4mKVcd9gnl5eYqLi9NHH30kSbrvvvu0YsUKr3cMP/L3r6E35828\n6C/+316epdS07fpy63YNjx8oSXpvzSf6Lv1IyTFZ2cflDK2nT79IVYfb2srh66vorh21/O3kSv0O\nqN4iut2ikIZORXRvozrXhqjofJFyj53Qgc93Kv/EaZ086tKJ9CxJ0oHPd8nZ7HoVFxZpY+J72p6c\nUqqtt+JfkHRhOPSN/tMkSXZfHwUG1yo5pva1dZWbmSPpwoKZ3s8P1ZL42VSBVcxTHRfGuN1upaen\nl/wC3rRpk9wm/KJmFtGkkT7b/JUk6R//3KjNX20t89jbb22tTSmpKiwsVFb2cWW5jqvxjeFKev8j\nbfoiVZK0Y9ce3Rh+faX0Hdaw6rfzNL/Xs1rYe5K+WrVRG+a9qwOf75QkuYvdOpGepXo3XitJCotq\nJNfBYzq8db8io9tJkmrWq63oPz1UZvvuomJlH8jQDe0uVI03//pX2vfpdtnsNj0460ktf/olnTzi\n8vK3RIVMOCdo83jK39ixf/9+TZs2Tdu3b1dAQICaN2+ucePGqXHjxpf0AYWug4Z01Cp27dmn2S+/\npoxjmfL19ZUztJ5GPvGIXpr/pux2m2rUqKFZkxNUp3ZQmW0sW/2+1qzbIJvNphFPDFb7dm2UfiRD\nk57/qzwetzwe6bmxvyMIL8PkdhMqPgiSpG6/71MyLHrudL6+WfuVQm6orz4vPCWb3abMbw8refwi\n2ew29Zoer9CmYbL72PWvl97W3o3bymw3tEmYYv4cL5vdpsNfH9BH05aqSacoPTRvhH7Yk15y3NoZ\nK3Rk2wGvf0+zmn5oudfazps28IrPrTnh4vnkylBhCF4tQhDVASGI6oIQLK3COcH27duXDIUWFRUp\nLy9PYWFh+uSTT7zeOQCAiVTHhTGbN28u9XzPnj1KTmZRBQDgf5hwvchlb5Zv3ry5tm4te2EGAMCi\nTLgwpsJKcOTIkaWW5mdnZyswMNCrnQIAmFB1vKnugAED5OPjI+nCBtVatWqpefPmXu8YAMBkquOc\nYGJiopYurZpVOwAA8zDjZvkKQzAsLEyjR49WVFSUHA5HyesPP/ywVzsGAIC3VbgwJjk5WY0aNdKZ\nM2eUk5OjnJwcHT58uDL6BgAwk+q0MGbdunX68MMPFRwcrH379pVciLm4uFi7d+/W2LFjK62TAAAT\nqE5zgvfcc49atGihqVOnlhr6tNvtuummmyqlcwAAE6luq0Ovv/56LVy4sLL6AgAws+pUCQIAcDk8\nhCAAwLJMGIKXfdk0AACqCypBAIAxquNmeQAALokJh0MJQQCAMQhBAIBV/feiKmZCCAIAjEElCACw\nLBOGIFskAACWRSUIADCEt64Ys3r1aiUnJ5c837lzp1q2bKn8/HwFBgZKkhISEtSyZUu9/vrr+vjj\nj2Wz2fTb3/5WnTt3LrdtQhAAYAwvhWC/fv3Ur18/SVJqaqo++ugj7d+/XzNmzFCzZs1Kjjt8+LD+\n8Y9/aOXKlTpz5ozi4uLUsWNH+fj4lNk2w6EAAGO4r+JxiRITEzVs2LCffW/Lli3q1KmT/Pz8FBIS\norCwMO3fv7/c9qgEAQCG8PYFtLdv367rrrtOoaGhkqS5c+cqJydHjRs31rhx4+RyuRQSElJyfEhI\niLKzsxUREVFmm4QgAMAYXg7BpKQk9e7dW5I0ePBgRUREKDw8XJMmTdKyZcsuOv5S9i0yHAoAMIaX\nh0O3bNmiNm3aSJKio6MVHh4uSerWrZv27t0rp9Mpl8tVcnxmZqacTme5bRKCAIBfvMzMTNWsWVN+\nfn7yeDx69NFHlZubK+lCODZt2lTt27fXxo0bdf78eWVmZiorK0tNmjQpt12GQwEAhvDmnGB2dnbJ\nfJ/NZlNsbKweffRRBQQEqH79+hoxYoQCAgIUGxurgQMHymazafLkybLby6/1bB4vX+yt0HXQm80D\nlWJyuwlV3QXAENMPLfda2zl9ulzxuXXf3mhYPy4HlSAAwBDeXh3qDYQgAMAY5runLiEIADCGhxAE\nAFiWCUOQLRIAAMuiEgQAGILhUACAdRGCAACrohIEAFgWIQgAsCxCEABgXR5bVffgsrFFAgBgWVSC\nAABDMBwKALAsj9t8w6GEIADAEFSCAADL8phwYQwhCAAwBJUgAMCyzDgnyBYJAIBlUQkCAAzh8VR1\nDy4fIQgAMIQZh0MJQQCAIQhBAIBlMRwKALAsKkEAgGWZcbM8WyQAAJZFJQgAMARXjAEAWJbbhMOh\nhCAAwBBmnBMkBAEAhmB1KADAstgnCACwLDNWgmyRAABYFpUgAMAQrA4FAFgWq0MBAJbFwhgAgGUx\nHAoAsCyGQwEAlmXG4VC2SAAALMvrleD5vyR4+yMAr5uZ8UVVdwEwxHQvts2cIADAspgTBABYFpUg\nAMCyTLguhhAEABjDm5VgcnKyXn/9dfn6+mrkyJGKiIjQmDFjVFxcrNDQUM2ePVt+fn5KTk7W4sWL\nZbfbFRsbq379+pXbLiEIADCEt+YEc3JylJiYqLffflv5+fmaN2+e1q5dq7i4OPXo0UNz5sxRUlKS\nYmJilJiYqKSkJDkcDvXt21fR0dEKDg4us222SAAAftFSUlJ0xx13qFatWnI6nZo6daq2bNmi7t27\nS5K6du2qlJQUbdu2TVFRUQoKCpK/v7/atm2rtLS0ctumEgQAGMLtpXaPHDmigoICPfXUU8rNzdWI\nESN09uxZ+fn5SZLq1aun7OxsuVwuhYSElJwXEhKi7OzsctsmBAEAhvDIe3OCJ0+e1Msvv6yMjAwN\nHjxYnp9cnsZTxqVqynr9pxgOBQAYwu258kd56tWrpzZt2sjX11fh4eGqWbOmatasqYKCAklSZmam\nnE6nnE6nXC5XyXlZWVlyOp3ltk0IAgAM4Zbtih/l6dixozZv3iy3262cnBzl5+erQ4cOWrt2rSRp\n3bp16tSpk1q3bq0dO3YoNzdXeXl5SktLU7t27cptm+FQAIAhvDUcWr9+fd17772KjY2VJE2YMEFR\nUVFKSEjQqlWr1KBBA8XExMjhcGj06NGKj4+XzWbT8OHDFRQUVG7bNs+lDJpehbzx5e/RAMygzmyu\nHYrqoej8Ua+1/Un9h6743OjMVQb25NIxHAoAsCyGQwEAhvDm6lBvIQQBAIbw1j5BbyIEAQCGIAQB\nAJbFcCgAwLLc5stAQhAAYIyKNr3/ErFFAgBgWVSCAABDcGd5AIBlsToUAGBZbpv55gQJQQCAIRgO\nBQBYlhmHQ1kdCgCwLCpBAIAh2CwPALAsM26WJwQBAIZgYQwAwLIYDgUAWJYZV4cSggAAQ5hxOJQt\nEgAAy6ISBAAYgjlBAIBlMScIALAsQhAAYFkehkMBAFZFJQgAsCwzhiBbJAAAlkUlCAAwhBk3yxOC\nAABDsE8QAGBZZpwTJAQBAIYgBAEAlsWcIADAssw4J8gWCQCAZVEJAgAMwZwgAMCymBMEAFiW24Qx\nSAgCAAzBcCgAwLLMVwcSggAAg5ixEmSLBADAsqgEAQCGMONmeUIQAGAIb68OLSgo0P33369hw4Yp\nNTVVu3btUnBwsCQpPj5eXbp0UXJyshYvXiy73a7Y2Fj169ev3DYJQQCAIby9MGb+/PmqU6dOyfM/\n/OEP6tq1a8nz/Px8JSYmKikpSQ6HQ3379lV0dHRJUP4c5gQBAIZwX8WjIgcOHND+/fvVpUuXMo/Z\ntm2boqKiFBQUJH9/f7Vt21ZpaWnltksIAgAM4Zbnih8VmTlzpsaOHVvqtaVLl2rw4MEaNWqUTpw4\nIZfLpZCQkJL3Q0JClJ2dXW67DIcCAAzhreHQ9957T7fccosaNmxY8lqvXr0UHBysyMhIvfrqq3r5\n5ZfVpk2b0v3xVNwjKkEAwC/axo0btX79esXGxmr16tV65ZVX5PF4FBkZKUnq1q2b9u7dK6fTKZfL\nVXJeVlaWnE5nuW1TCQIADOGtzfIvvfRSyc/z5s1TWFiYVqxYoYYNG6phw4basmWLmjZtqtatW2vC\nhAnKzc2Vj4+P0tLSNG7cuHLbJgQBAIaozAtoP/zww/r973+vgIAABQYGasaMGfL399fo0aMVHx8v\nm82m4cOHKygoqNx2bJ5LGTS9Cnnjy9+jAZhBndlfVHUXAEMUnT/qtbZH3dj/is/9y6GVBvbk0lEJ\nAgAMYcZrhxKCAABDeEx4HwlCEABgCDNWgmyRAABYFpUgAMAQlbk61ChUglXNz1814v4o//hJ8n9i\nmnyatC79vn9N1XhkvGoMGH3FH2FvHCX/p2fI/8npcnTtU/K6496B8n9yuvyfniGfFrddcfuAJN18\nc4S+3f25hj396EXv/eY39yjlizX6dMO7P/v+pWjVqoX+/en72rTxPb08b0bJ6yN+G6+Uzz9Uyhdr\n9NSTj1xh72EEz1U8qgohWMV823aR25Whgjee07kVL8rv/sdKvV+j1xNyf7/7qj6jxv1DdG75Cyp4\ndYJ8mrSWLfR62RvdLHv9hipYOF4Ff5suv56PVdwQUIbAwAD99S/T9K8Nn130ns1m09yXpus3DwxS\nl24P6v6e0QoLu+6yP2POC89p1B8m6q4uMapTJ0i/vrerGjUK1yOPPKSOd/XSXZ1j9MfRT6t27fL3\nhcF7vHntUG8hBKuYJy9XtsD//0sbUFOevNOl3j/37nwVf7+n9El+/qoxYLT8h0yU/+PPyVY/vNTb\n/vGTS3621XXKk39GnlPHJY9Hxd+myadxS7kP7da5FXMuHFSQL5tfDcnGHwdcmXPnzuv+Bwbp2LHM\ni9675poQnTx1Si7XCXk8Hv1rw2fq3q2T7Ha7Xl34gv65brU+3fCuuna5s9R56z9ZXfKzw+HQjTc2\n1Ff/2SZJ+nDNJ+rerZMOHTqszl1iVFxcrMLCQuWfPUsIViFv3kXCWy7pt97WrVu1Zs0aSReuxQbj\nFO/4QrY61yjgD/MUMHSKzn+8pPQB5wsuOsdxZ08V79uqgkVTdO7911TjvrKHgGxBwfLk55Y89+Tl\nyhZUV/K4pcJzkiTfdt1U/O3WC68BV6C4uFgFBRf/WZWk7OzjCqpVS02aNJKvr6+6dO6g+vWv0YAB\nvfXDD1m6+55+erDvEL344uQy27/mmhDlnDz1Y5tZx3XtdU55PB7l5eVLkqLvvkvHXSd05EiGod8N\nl85zFf9VlQoXxsycOVPHjh1Tenq6evbsqVWrVunUqVOaMGFCZfSv2vNp3UmeUy6dXTxd9mtvkN+D\nT6vglbHlnmMPj5CtZm35tr7rwgt+NSQ/f/kPunCe/bob5R8/We6cLBV99c/SJ9v+5/Mj28n31m4q\neHOaUV8JuMiQ+N/r9Vdf1KlTp/XdocOy2Wy6o307dex4m+7s8CtJUoB/gBwOh5L+/rpq1QpU69Y3\na/0nq3X2bIGGPvnHUu3ZbKX/IN9+W1vNnPmsHujFnGBVMuM/oysMwZ07d+qtt97SoEGDJEkjRoxQ\nXFyc1ztmFT43NFfxvq8lSe4fvr9Qpdns5VdlxUU6/8EiuQ/vLfVywRuTJV0YDv3vz7bgUNlq/XhX\nZVvtEHlO51z47Cat5ejSRwV/my6dyzfuSwH/Y9O/N6tLtwclSdOnjdWh7w/r2mudmvH8XK1a9X6p\nY3v1vhBk6z9Zre7RFy676Ovrq3ohdUuOaRB2rY5lXBh6bdWqhRYunK1eMY9QBeKyVTgcWlRUpMLC\nwpJ/eZ04cULnzp3zeseswn38mOzXN5Uk2YKvuTD8WcGwpPvwPvm0uPCvZ1vo9fK98/4yj/WczJat\nRoBswaGS3S6fiFtVvG+bVCNQfj0GqWDJDOnsGeO+EPAzPkx+S6Gh9RQYGKCePaO1fv2/lfrlVj3w\nm3slSaGh9TRtatkjIEVFRfr22/0lVWPvmB5au26j7Ha7Xnv1RcU+9IS+//5IpXwXlM2Mw6EVXkB7\n3bp1WrBggTIyMtSyZUsdPHhQ48aN0913331JH8AFtCvg568aDz59oVqz23X+n6vkc30TFX/3jdxH\n9st/yEQpoKbstUPkzjys8xuS5D6yXzX6DJetVh3JZtf5NYvkPnqwzI+w3xgpv3sHSpKKdm1W0Wcf\nyPdXd8vRrZ88rmMlx51LelmeU66ymrE0LqBdvrZtojR71kTdcENDFRYWKSPjmD748BN9dyhd77//\nsWJiemjC+FHyeDya85cFWrHiXfn4+OiVxOfVIrKZfHzsmjJ1jj5eu6HMz4iMbKr5iTNlt9uVmrpV\nfxzznKLvvkvLlr6iHTt+XEE99pnp+vKrryvja5uSNy+g/ciNfSo+qAyLD71tYE8uXYUhePbsWXk8\nHu3fv18Oh0ONGjXSqVOnVL9+/Uv6AEIQ1QEhiOrCmyE46IYHr/jct75/x8CeXLoKh0OHDBmi06dP\nq1WrVoqMjNQHH3ygxx5jTxkAoDQzbpavcGHMxIkTNXLkSA0dOlQrVqyQ0+nUypVVc98nAMAvV7W8\nbFpkZKQWLFigZcuWqWnTppoxY4Zq165dGX0DAJiIGRfGlFkJtm/fvtReHLfbrdTUVL333nuy2WxK\nSUmplA4CAOAtZYbg5s2byzzp888/90pnAADmVS03yx8+fFjLly/XyZMnJUmFhYX68ssv9emnn3q9\ncwAA86iWc4Jjx45VkyZNtGvXLnXp0kV2u11TpkypjL4BAEzEjHOCFYagr6+v+vTpo9q1a+vee+/V\nrFmztHTp0sroGwDARMx4F4kKh0M9Ho9SU1MVHBysVatWKTw8XEeOcHkiAEBpFVx75RepzErwmWee\nkSSFhYUpMDBQEyZM0Ndff60lS5Zo7Njy73IAALAeM95Ut8xK8MCBA+rdu7fS09O1b98+ST+m/Lx5\n89S5c+fK6SEAAF5SZgguX75cWVlZev7555WQkFCZfQIAmFC12iLh6+urBg0aaO7cuZXZHwCASVXl\nKs8rVeHCGAAALoUZ9wkSggAAQ5hxdSghCAAwRLWaEwQA4HKYcU6wwivGAABQXVEJAgAMwcIYAIBl\nsTAGAGBZVIIAAMsy48IYQhAAYAg3w6EAAKsyXwSyRQIAYGFUggAAQ7AwBgBgWYQgAMCy2CcIALAs\nKkEAgGV5a5/g2bNnNXbsWB0/flznzp3TsGHD1Lx5c40ZM0bFxcUKDQ3V7Nmz5efnp+TkZC1evFh2\nu12xsbHq169fuW0TggAAQ3hrOHTDhg1q2bKlhg4dqqNHj2rIkCFq27at4uLi1KNHD82ZM0dJSUmK\niYlRYmKikpKS5HA41LdvX0VHRys4OLjMttkiAQD4Rbvvvvs0dOhQSdKxY8dUv359bdmyRd27d5ck\nde3aVSkpKdq2bZuioqIUFBQkf39/tW3bVmlpaeW2TSUIADCEt+cE+/fvrx9++EELFizQY489Jj8/\nP0lSvXr1lJ2dLZfLpZCQkJLjQ0JClJ2dXW6bhCAAwBDeXh26cuVK7d69W3/6059KfVZZn3sp/WE4\nFABgCLc8V/woz86dO3Xs2DFJUmRkpIqLi1WzZk0VFBRIkjIzM+V0OuV0OuVyuUrOy8rKktPpLLdt\nQhAAYAjPVfxXnq+++kqLFi2SJLlcLuXn56tDhw5au3atJGndunXq1KmTWrdurR07dig3N1d5eXlK\nS0tTu3btym2b4VAAgCG8dReJ/v37a/z48YqLi1NBQYEmTpyoli1bKiEhQatWrVKDBg0UExMjh8Oh\n0aNHKz4+XjabTcOHD1dQUFC5bds8Xh7EzRtf/h4NwAzqzP6iqrsAGKLo/FGvtX1z/duv+NxdmVsM\n7MmlYzgUAGBZDIcCAAzBTXUBAJblrcumeRMhCAAwBJUgAMCyqAQBAJZFJQgAsCwzVoJskQAAWBaV\nIADAEB6Pu6q7cNkIQQCAIbx9KyVvIAQBAIbw9q2UvIEQBAAYgkoQAGBZVIIAAMsy4z5BtkgAACyL\nShAAYAgzbpYnBAEAhmBOEABgWawOBQBYFpUgAMCyzLg6lBAEABjCjJUgWyQAAJZFJQgAMAQLYwAA\nlmXG4VBCEABgCBbGAAAsiyvGAAAsi0oQAGBZZpwTZIsEAMCyqAQBAIZgThAAYFlmHA4lBAEAhiAE\nAQCWZb4IlGweM0Y3AAAGYHUoAMCyCEEAgGURggAAyyIEAQCWRQgCACyLEAQAWBYhaBJr166t6i4A\nV6WwsFD9+vVTQkLCz75/++23V3KPAELQFI4cOaI1a9ZUdTeAq5Kdna3z589r5syZVd0VoAQhaAJT\npkxRamqqmjdvrjFjxiguLk4pKSkaOXJkyTH//Vf0/v37NXjwYD3yyCMaNmyYcnNzq6rbQCkzZsxQ\nenq6nnnmGQ0aNEiDBg3SgAGEfC1HAAACXUlEQVQDlJ6eXuq43bt3q3///srLy9O6devUv39/DRw4\nUM8//3wV9RzVGSFoAvHx8brttts0fPhwFRYWavny5bLbf/5/3dSpUzVlyhQtXrxYd955p5YtW1bJ\nvQV+XkJCgho1aqQBAwZo+PDheuutt9SnTx8tX7685JgTJ05o0qRJmjNnjiRp/vz5WrJkiZYuXapj\nx47pP//5T1V1H9UU1w41mVatWpX7/vbt2/Xss89Kks6fP6+oqKjK6BZwyUJDQzVt2jTNmzdPubm5\nuvnmmyVduPjyqFGj9Pjjj6tBgwbatm2bMjIyFB8fL0k6ffq0MjIydOutt1Zl91HNEIIm43A4JEk2\nm63U60VFRZKkgIAALVmy5KL3gV+KuXPnqmPHjhowYIA+/vhjbdy4UZJ05swZRUREaOXKlbrnnnvk\ncDjUsmVLvfHGG1XbYVRrDIeagN1uLwm5/6pVq5aysrIkSXv27FFeXp4kqXnz5tq0aZMkac2aNUpJ\nSanczgIVyMnJUXh4uDwej9avX6/CwkJJUlBQkMaNG6fQ0FD9/e9/V6NGjXTgwAEdP35c0oXwzMzM\nrMquoxoiBE2gcePG+uabb3T69OmS15o3b67AwED1799f77//vsLCwiRJ48eP18KFCzVw4EC98847\nioyMrKpuAz/roYce0tSpU/X444+rZ8+eSk1N1WeffVby/rhx47Ro0SKdPHlS48aN09ChQ9W/f3+d\nPHlSTqezCnuO6ohbKQEALItKEABgWYQgAMCyCEEAgGURggAAyyIEAQCWRQgCACyLEAQAWBYhCACw\nrP8Dh28eVk3t6v0AAAAASUVORK5CYII=\n", 740 | "text/plain": [ 741 | "
" 742 | ] 743 | }, 744 | "metadata": { 745 | "tags": [] 746 | } 747 | } 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "metadata": { 753 | "id": "kjKxqHSMgJJb", 754 | "colab_type": "code", 755 | "colab": {} 756 | }, 757 | "source": [ 758 | "" 759 | ], 760 | "execution_count": 0, 761 | "outputs": [] 762 | } 763 | ] 764 | } -------------------------------------------------------------------------------- /notebooks/Processing_test_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Procesar el dataset de pruebas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Este dataset está obtenido de [Kaggle](https://www.kaggle.com/jruvika/fake-news-detection). Contiene los titulares y cuerpos de las noticias, que son de dos posibles tipos de categorías. \"Falsa (0)\" y \"True (1)\". " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 16, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "\n", 25 | "#Progress bars\n", 26 | "from tqdm import tqdm\n", 27 | "tqdm.pandas()\n", 28 | "\n", 29 | "#Paralelize pandas apply on multiple cores\n", 30 | "import swifter\n", 31 | "\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "from matplotlib import style\n", 34 | "\n", 35 | "#Nicer style\n", 36 | "style.use('seaborn') \n", 37 | "\n", 38 | "import re #regexp\n", 39 | "from nltk.tokenize import RegexpTokenizer\n", 40 | "\n", 41 | "#word2vec\n", 42 | "from gensim.models import KeyedVectors" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Constantes necesarias" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 17, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "PATH_DATASET = \"../data/Other_datasets/fake-news-detection/data.csv\"\n", 59 | "PATH_PROCESSED = \"../data/Other_datasets/fake-news-detection/data_kaggle_proc.pickle\"\n", 60 | "\n", 61 | "#Padding number for title and content\n", 62 | "MAX_LEN_TITLE = 13\n", 63 | "MAX_LEN_CONTENT = 1598\n", 64 | "\n", 65 | "#Convert labels\n", 66 | "LBL_TRUE = [0,0,0,1]\n", 67 | "LBL_FAKE = [0,0,1,0]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 18, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "df = pd.read_csv(PATH_DATASET).dropna()[['Headline', 'Body', 'Label']]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 19, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | "
HeadlineBodyLabel
0Four ways Bob Corker skewered Donald TrumpImage copyright Getty Images\\nOn Sunday mornin...1
1Linklater's war veteran comedy speaks to moder...LONDON (Reuters) - “Last Flag Flying”, a comed...1
2Trump’s Fight With Corker Jeopardizes His Legi...The feud broke into public view last week when...1
3Egypt's Cheiron wins tie-up with Pemex for Mex...MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...1
4Jason Aldean opens 'SNL' with Vegas tributeCountry singer Jason Aldean, who was performin...1
\n", 143 | "
" 144 | ], 145 | "text/plain": [ 146 | " Headline \\\n", 147 | "0 Four ways Bob Corker skewered Donald Trump \n", 148 | "1 Linklater's war veteran comedy speaks to moder... \n", 149 | "2 Trump’s Fight With Corker Jeopardizes His Legi... \n", 150 | "3 Egypt's Cheiron wins tie-up with Pemex for Mex... \n", 151 | "4 Jason Aldean opens 'SNL' with Vegas tribute \n", 152 | "\n", 153 | " Body Label \n", 154 | "0 Image copyright Getty Images\\nOn Sunday mornin... 1 \n", 155 | "1 LONDON (Reuters) - “Last Flag Flying”, a comed... 1 \n", 156 | "2 The feud broke into public view last week when... 1 \n", 157 | "3 MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin... 1 \n", 158 | "4 Country singer Jason Aldean, who was performin... 1 " 159 | ] 160 | }, 161 | "execution_count": 19, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "df.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "Número de noticias. por tipo en el dataset." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 20, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "" 186 | ] 187 | }, 188 | "execution_count": 20, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | }, 192 | { 193 | "data": { 194 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3AAAAGzCAYAAABuCfXkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3Xm4VnW9///XZm8RZQgI9eRXITWH1JQIBwTxOBBlGqbkjCnWOZlDWhhIKA6AONGASqan65xDecTx5C87DVpKCqJhaJBD5YiSoaLCVmGz7/X7o8t9JAdyC2fzcT8ef7nuve57vRd4sa7n/qz7vuuqqqoCAADAOq9DWw8AAADAP0bAAQAAFELAAQAAFELAAQAAFELAAQAAFELAAQAAFELAAbxPLVy4MNtuu22uu+66VR7/t3/7t4wZM2aNHWefffbJ73//+zX2eu9k2bJlOfzww/OZz3wmv/jFL/5PjlmSSy+9NLfeemtbjwHAWiTgAN7HOnTokAsuuCCPPvpoW4+yRjz44IN5/vnnc8stt+STn/xkW4+zzpkzZ05WrlzZ1mMAsBY1tPUAAKw9nTp1ynHHHZdRo0blmmuuSceOHVf5+ZgxY7L11lvn+OOPf9P2PvvskwMOOCB33313XnrppXzxi1/MfffdlwULFqShoSHTpk3LJptskiS5+uqr89BDD2XFihU57rjjMnz48CTJr371q0ybNi1NTU3p1KlTRo8enY9//OOZOnVq5s2bl7/+9a/Zdtttc/HFF68y16233ppLL700tVotnTt3zhlnnJEuXbpk7NixefbZZzNs2LDMmDEjnTp1annOiBEj0rdv39x3331ZtGhRBgwYkPPOOy8dOnTIfffdl4svvjivvvpqOnTokJNOOimDBw/OwIEDM2PGjPTp0ydXXHFFrrnmmvz6179Okhx77LE57rjjsnz58kybNi11dXWpr6/PN77xjeyyyy6rzHvjjTfmZz/7WWq1Wp555plssskmmTx5cjbZZJMsXbo0EydOzCOPPJKmpqYMGDAg3/jGN9LQ0JAdd9wx++67bx566KFcfPHF+djHPtbymo2NjZkwYULuu+++1NfXZ7/99stpp52Wxx9/POeee24aGxuzePHibLfddvn2t7+d66+/PvPnz8+FF16Y+vr67LXXXrn44otz7733prm5Odtvv33GjRuXLl265IEHHsjZZ5+dpqam9O7dO88880zGjBmT3XbbLTNmzMj06dPToUOH9OrVK2eeeWa22GKLjBkzJi+++GKeeuqpDB48ONdff32uvfbabLHFFi1/XkcffXT222+/NfG/LgBvpwLgfempp56q+vbtWzU3N1dHHXVUNXny5Kqqquqqq66qRo8eXVVVVY0ePbq66qqrWp7zxu299967mjRpUlVVVXXLLbdU2223XfXggw9WVVVVX/nKV6pp06a17Dd+/PiqqqrqL3/5SzVgwIDqkUceqR577LHqgAMOqF544YWqqqrqkUceqQYOHFg1NjZW3/3ud6uhQ4dWTU1Nb5r7T3/6U7XHHntUTz75ZFVVVTVr1qxq4MCB1dKlS6u77767+sxnPvOW53v00UdXp5xyStXc3FwtXbq0GjRoUDV79uzqxRdfrD75yU9WTz31VMuMgwcPrp5++ulqzJgx1fTp06uqqqqjjjqqGjhwYPXoo49WL7/8crXbbrtVy5cvr/bdd9/qd7/7XVVVVfWb3/ymmjp16puOfcMNN1R9+/atHn300aqqquqiiy6qTj755KqqqmrMmDHVf/7nf1ZVVVUrV66sRo0aVX3/+9+vqqqqttlmm+qmm256y/OZNGlSddppp1UrV66sli9fXh111FHV3XffXU2ePLn67//+76qqqmrFihXVAQccUP3sZz9r+TP4n//5n6qqqmrq1KnV5MmTq1qtVlVVVV1yySXV+PHjq6ampmrw4MHV7bffXlVVVc2ePbvadtttq7vvvruaNWtWtd9++1XPP/98y3l9+tOfrmq1WjV69OjqC1/4Qst8EyZMqC644IKqqqrqiSeeqPbaa69q5cqVb3kuAKw5VuAA3uc6dOiQiy66KAcddFAGDRr0rp77+m2Km2++eXr16pXtttsuSdK7d++89NJLLfsdfvjhSZJNNtkkAwcOzOzZs1NfX5+//vWvOfbYY1v2q6ury5NPPpkk6du3bxoa3nwZuvvuu7P77rtn8803T5IMGDAgPXv2zPz581NXV/eO8+69997p0KFDunTpkj59+uSll17KvHnzsnjx4px44omrzPHwww9nyJAhueaaa3LQQQdl8eLFOeCAAzJr1qx84AMfyJ577pmOHTvmM5/5TE466aTstddeGThwYL70pS+95bEHDhzYshp16KGHZtiwYUmS22+/Pb///e9z/fXXJ0lee+21VZ7Xv3//t3y9WbNm5Ywzzkh9fX3q6+vzwx/+MEmyyy675K677sqVV16Zxx9/PH/961/zyiuvvOn5t99+e5YuXZpZs2YlSZqamvLBD34wjzzySJJkr732SpLsvvvu2XrrrZMkv/nNb7L//vunZ8+eSZKDDz44EydOzMKFC5Mkn/jEJ1pe/8gjj8zRRx+d0047LTNmzMjw4cNTX1//1n8xAKwxAg6gHfjQhz6Uc845J6NHj85BBx3U8nhdXV2qqmrZbmpqWuV5b7zlcr311nvb1+/Q4X/fUl2r1dLQ0JDm5uYMGDAg3/72t1t+tmjRomy88cb55S9/mQ033PAtX6tWq70p1KqqysqVK99xhiSr3FL5+rk1Nzdnq622WuXDXJ599tn07NkztVot48aNyx133JHddtste+yxR/7rv/4rG2ywQfbff/8kyWmnnZZDDjkkd911V2688cb84Ac/aImxN3pjvNRqtZbtWq2W73znO9lqq62SJC+//PIq5/d2fw4NDQ2r7Ldo0aJ06tQp55xzTpqbm/PpT386//zP/5xFixat8nf4xhnGjh3bEmqNjY1Zvnx5Fi9e/Kb93zjr33v9z/7vZ91iiy2y7bbb5rbbbstPfvKTXHvttW95HgCsWT7EBKCd+NSnPpXBgwfnP/7jP1oe69GjR+bPn5/kb1Fzzz33tOq1b7rppiTJM888k9mzZ2fAgAEZMGBA7rrrrvz5z39Oktxxxx357Gc/+6YVqL83YMCA3HnnnXnqqaeSJLNnz86iRYuy8847t2q2vn375oknnsi9996b5G8fhDJ06NA8++yzWX/99bPLLrvk0ksvzcCBA7Prrrtm3rx5+e1vf5s999wzK1euzD777JNXX301RxxxRMaPH5+HH344K1aseNNx7r777jz77LNJkmuuuSZ77713kmTQoEH593//91RVlRUrVuSEE05oWU1b3Z/DTTfdlFqtlhUrVuSUU07JvffemzvvvDMnnnhiS2Def//9aW5uTvK3EHs9tgYNGpQf/ehHWbFiRWq1Ws4888xMmTIlW221VTp27JiZM2cmSR544IE88sgjqaury5577pmf/vSneeGFF5IkN9xwQ7p3754+ffq85YxHHnlkLrzwwuy0004t74cEYO2yAgfQjowbNy5z585t2R4xYkRGjRqVoUOHZrPNNsvuu+/eqtddvnx5Pve5z6WpqSnjxo1ruZXw3HPPzde+9rVUVdXywSedO3d+x9f6yEc+kvHjx+ekk05Kc3NzOnXqlO9973vp2rVrq2br2bNnvvvd7+bCCy/M8uXLU1VVLrzwwmy22WZJkiFDhuQXv/hFdt9993Tq1CnbbbddPvCBD2T99ddPkowdOzajRo1qWRGbNGnSmz4MJvnb7aOnn356Fi9enI985CM599xzkyTf/OY3M3HixBx44IFpamrKHnvskS9+8Yurnfukk07KxIkTM2zYsDQ3N2f//ffPJz/5yZbbQTfccMN06dIlu+yyS8ttqfvss0+mTJmSpqamfOUrX8kFF1yQz33uc2lubs5HP/rRjBkzJg0NDZk6dWrGjx+fKVOm5MMf/nB69eqVTp06Zdddd82xxx6bL3zhC6nVaunZs2euuOKKVVZY32jvvffOuHHjWm6hBWDtq6ve6r4LAOAfduONN+bnP/95rrjiirYe5R9ywQUX5Pjjj0+vXr2yaNGiDBs2LLfeemu6dev2rl7nd7/7XcaNG5ef/OQnq31/IgBrhhU4AGhn/t//+3859thj09DQkKqqMmHChHcdb6NHj84999yTb33rW+IN4P+QFTgAAIBC+BATAACAQgg4AACAQgg4AACAQqxzH2KyePHSth4Bitejx4ZZsuSVth4DgHbKdQjem402evuvzrECB+9DDQ31bT0CAO2Y6xCsPQIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEAIOAACgEA1tPQDlGTn5V209AhTtB2P2aesRAIBCWYEDAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAohIADAAAoRMPqdmhqasrYsWPz9NNPZ8WKFTnhhBPykY98JGPGjEldXV223nrrjB8/Ph06dMill16a22+/PQ0NDRk7dmx22mmnPPHEE2+5LwAAAO/Oakvq5ptvTvfu3XP11VfnyiuvzHnnnZfzzz8/p556aq6++upUVZXbbrstCxYsyD333JPrrrsuU6ZMyTnnnJMkb7kvAAAA795qA+5Tn/pUvvrVr7Zs19fXZ8GCBdl1112TJIMHD86sWbMyd+7cDBo0KHV1ddl0003T3NycF1544S33BQAA4N1b7S2UnTt3TpIsW7Ysp5xySk499dRccMEFqaura/n50qVLs2zZsnTv3n2V5y1dujRVVb1p33fSo8eGaWiob/UJAazrNtqoa1uPALDW+bcO1o7VBlySLFq0KCeeeGKOPPLIHHjggbnoootaftbY2Jhu3bqlS5cuaWxsXOXxrl27rvJ+t9f3fSdLlrzybs8BoCiLF7/zL7IASrfRRl39WwfvwTv9AmS1t1A+99xzGTlyZE4//fQMHz48SbL99ttnzpw5SZKZM2emf//+6devX+68887UarU888wzqdVq6dmz51vuCwAAwLu32hW4733ve3n55Zdz+eWX5/LLL0+SfPOb38yECRMyZcqUbLnllhk6dGjq6+vTv3//HHbYYanVajnrrLOSJKNHj86ZZ565yr4AAAC8e3VVVVVtPcQbWW5f942c/Ku2HgGK9oMx+7T1CABrlVso4b15T7dQAgAAsG4QcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIVoaOsBAADerZGTf9XWI0DRfjBmn7YegVayAgcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFCIhn9kp/vvvz8XX3xxpk+fntNOOy3PPfdckuTpp5/OzjvvnG9961v58pe/nBdffDHrrbde1l9//Vx11VV54oknMmbMmNTV1WXrrbfO+PHj06GDZgQAAGiN1QbclVdemZtvvjkbbLBBkuRb3/pWkuSll17KMccckzPOOCNJ8uSTT+aWW25JXV1dy3PPP//8nHrqqdltt91y1lln5bbbbsuQIUPWxnkAAAC87612Oax3796ZOnXqmx6fOnVqjj766Gy88cZ57rnn8vLLL+fLX/5yjjjiiPz6179OkixYsCC77rprkmTw4MGZNWvWGh4fAACg/VjtCtzQoUOzcOHCVR57/vnnM3v27JbVt6ampowcOTLHHHNMXnrppRxxxBHZaaedUlVVy4pc586ds3Tp0tUO1KPHhmloqG/NuQAUYaONurb1CAC0c65F5fqH3gP39372s5/lgAMOSH3930KrV69eOfzww9PQ0JAPfvCD+ehHP5rHHntslfe7NTY2plu3bqt97SVLXmnNSADFWLx49b/MAoC1ybVo3fZOgd2qTxSZPXt2Bg8e3LI9a9asnHrqqUn+Fmp//OMfs+WWW2b77bfPnDlzkiQzZ85M//79W3M4AAAA0sqAe+yxx7L55pu3bO+1117p06dPDj300Bx//PH52te+lp49e2b06NGZOnVqDjvssDQ1NWXo0KFrbHAAAID2pq6qqqqth3gjy7nrvpGTf9XWI0DRfjBmn7YeAYrnWgTvjWvRum2N30IJAADA/z0BBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUAgBBwAAUIh/KODuv//+jBgxIkmyYMGC7LnnnhkxYkRGjBiRn/70p0mSSy+9NMOHD8/hhx+eBx54IEnyxBNP5IgjjsiRRx6Z8ePHp1arraXTAAAAeP9rWN0OV155ZW6++eZssMEGSZI//OEPOe644zJy5MiWfRYsWJB77rkn1113XRYtWpSTTz45N9xwQ84///yceuqp2W233XLWWWfltttuy5AhQ9be2QAAALyPrXYFrnfv3pk6dWrL9vz583P77bfnqKOOytixY7Ns2bLMnTs3gwYNSl1dXTbddNM0NzfnhRdeyIIFC7LrrrsmSQYPHpxZs2atvTMBAAB4n1vtCtzQoUOzcOHClu2ddtopn//857Pjjjtm2rRpueyyy9K1a9d07969ZZ/OnTtn6dKlqaoqdXV1qzy2Oj16bJiGhvrWnAtAETbaqGtbjwBAO+daVK7VBtzfGzJkSLp169by3+edd1723XffNDY2tuzT2NiYrl27pkOHDqs89vrz3smSJa+825EAirJ48ep/mQUAa5Nr0brtnQL7XX8K5fHHH9/yISWzZ8/ODjvskH79+uXOO+9MrVbLM888k1qtlp49e2b77bfPnDlzkiQzZ85M//79W3kKAAAAvOsVuLPPPjvnnXde1ltvvfTq1SvnnXdeunTpkv79++ewww5LrVbLWWedlSQZPXp0zjzzzEyZMiVbbrllhg4dusZPAAAAoL2oq6qqaush3shy7rpv5ORftfUIULQfjNmnrUeA4rkWwXvjWrRuW6O3UAIAANA2BBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhBBwAAEAhGv6Rne6///5cfPHFmT59eh588MGcd955qa+vT8eOHXPBBRekV69emTBhQu6777507tw5SXL55Zenqakpo0aNymuvvZaNN944559/fjbYYIO1ekIAAADvV6tdgbvyyiszbty4LF++PEkyceLEnHnmmZk+fXqGDBmSK6+8MkmyYMGCXHXVVZk+fXqmT5+erl275vLLL88BBxyQq6++Ottvv31mzJixds8GAADgfWy1K3C9e/fO1KlT841vfCNJMmXKlGy88cZJkubm5qy//vqp1Wp54oknctZZZ+W5557L8OHDM3z48MydOzf/+q//miQZPHhwpkyZkmOPPfYdj9ejx4ZpaKh/j6cFsO7aaKOubT0CAO2ca1G5VhtwQ4cOzcKFC1u2X4+3++67Lz/84Q/zox/9KK+88kqOPvroHHfccWlubs4xxxyTHXfcMcuWLUvXrn/7n6Nz585ZunTpagdasuSV1p4LQBEWL179v4UAsDa5Fq3b3imw/6H3wP29n/70p5k2bVq+//3vp2fPni3R9vr723bfffc89NBD6dKlSxobG9OpU6c0NjamW7durTsDAAAA3v2nUP74xz/OD3/4w0yfPj2bb755kuTxxx/PkUcemebm5jQ1NeW+++7LDjvskH79+uWOO+5IksycOTOf+MQn1uz0AAAA7ci7WoFrbm7OxIkT86EPfSgnn3xykmSXXXbJKaeckgMPPDCHHnpo1ltvvQwbNixbb711TjjhhIwePTrXXnttevTokUsuuWStnAQAAEB7UFdVVdXWQ7yR+3HXfSMn/6qtR4Ci/WDMPm09AhTPtQjeG9eidds7vQfOF3kDAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAUQsABAAAU4h8KuPvvvz8jRoxIkjzxxBM54ogjcuSRR2b8+PGp1WpJkksvvTTDhw/P4YcfngceeOAd9wUAAODdW23AXXnllRk3blyWL1+eJDn//PNz6qmn5uqrr05VVbntttuyYMGC3HPPPbnuuusyZcqUnHPOOW+7LwAAAK2z2oDr3bt3pk6d2rK9YMGC7LrrrkmSwYMHZ9asWZk7d24GDRqUurq6bLrppmlubs4LL7zwlvsCAADQOg2r22Ho0KFZuHBhy3ZVVamrq0uSdO7cOUuXLs2yZcvSvXv3ln1ef/yt9l2dHj02TEND/bs+EYBSbLRR17YeAYB2zrWoXKsNuL/XocP/Lto1NjamW7du6dKlSxobG1d5vGvXrm+57+osWfLKux0JoCiLF6/+l1kAsDa5Fq3b3imw3/WnUG6//faZM2dOkmTmzJnp379/+vXrlzvvvDO1Wi3PPPNMarVaevbs+Zb7AgAA0DrvegVu9OjROfPMMzNlypRsueWWGTp0aOrr69O/f/8cdthhqdVqOeuss952XwAAAFqnrqqqqq2HeCPLueu+kZN/1dYjQNF+MGafth4BiudaBO+Na9G6bY3eQgkAAEDbEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFEHAAAACFaGjNk2688cbcdNNNSZLly5fnwQcfzCWXXJILL7wwH/rQh5IkJ598cvr375+zzz47Dz/8cDp27JgJEyakT58+a256AACAdqRVAXfwwQfn4IMPTpKcc845OeSQQ7JgwYKcfvrpGTp0aMt+v/jFL7JixYrMmDEj8+bNy+TJkzNt2rQ1MzkAAEA7855uofz973+fP/3pTznssMOyYMGC3HDDDTnyyCMzefLkrFy5MnPnzs2ee+6ZJOnbt2/mz5+/RoYGAABoj1q1Ave6K664IieeeGKSZODAgdlvv/2y2WabZfz48bnmmmuybNmydOnSpWX/+vr6rFy5Mg0Nb3/YHj02TEND/XsZC2CdttFGXdt6BADaOdeicrU64F5++eU8+uij2X333ZMkhxxySLp165Yk2XffffPzn/88Xbt2TWNjY8tzarXaO8ZbkixZ8kprRwIowuLFS9t6BADaOdeidds7BXarb6G89957s8ceeyRJqqrKZz/72fzlL39JksyePTs77LBD+vXrl5kzZyZJ5s2bl2222aa1hwMAAGj3Wr0C99hjj2WzzTZLktTV1WXChAk56aST0qlTp2y11VY59NBDU19fn7vuuiuHH354qqrKpEmT1tjgAAAA7U2rA+6LX/ziKtuDBg3KoEGD3rTfueee29pDAAAA8Aa+yBsAAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQAg4AAKAQDa194kEHHZSuXbsmSTbbbLMcdthhmThxYurr6zNo0KCcdNJJqdVqOfvss/Pwww+nY8eOmTBhQvr06bPGhgcAAGhPWhVwy5cvT5JMnz695bFhw4Zl6tSp2XzzzfMv//IvWbBgQZ5++umsWLEiM2bMyLx58zJ58uRMmzZtzUwOAADQzrQq4B566KG8+uqrGTlyZFauXJmTTz45K1asSO/evZMkgwYNyuzZs7N48eLsueeeSZK+fftm/vz5a25yAACAdqZVAdepU6ccf/zx+fznP5/HH388X/rSl9KtW7eWn3fu3DlPPfVUli1bli5durQ8Xl9fn5UrV6ah4e0P26PHhmloqG/NWABF2Gijrm09AgDtnGtRuVoVcFtssUX69OmTurq6bLHFFunatWtefPHFlp83NjamW7duee2119LY2NjyeK1We8d7SXnXAAAIwklEQVR4S5IlS15pzUgAxVi8eGlbjwBAO+datG57p8Bu1adQXn/99Zk8eXKS5Nlnn82rr76aDTfcME8++WSqqsqdd96Z/v37p1+/fpk5c2aSZN68edlmm21aczgAAADSyhW44cOH54wzzsgRRxyRurq6TJo0KR06dMioUaPS3NycQYMGZeedd87HPvax3HXXXTn88MNTVVUmTZq0pucHAABoN1oVcB07dswll1zypsevvfbaVbY7dOiQc889t3WTAQAAsApf5A0AAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFAIAQcAAFCIhtY8qampKWPHjs3TTz+dFStW5IQTTsg//dM/5ctf/nI+/OEPJ0mOOOKI7L///rn00ktz++23p6GhIWPHjs1OO+20JucHAABoN1oVcDfffHO6d++eiy66KEuWLMnnPve5nHjiiTnuuOMycuTIlv0WLFiQe+65J9ddd10WLVqUk08+OTfccMMaGx4AAKA9aVXAfepTn8rQoUNbtuvr6zN//vw89thjue2229KnT5+MHTs2c+fOzaBBg1JXV5dNN900zc3NeeGFF9KzZ8+3fe0ePTZMQ0N9a8YCKMJGG3Vt6xEAaOdci8rVqoDr3LlzkmTZsmU55ZRTcuqpp2bFihX5/Oc/nx133DHTpk3LZZddlq5du6Z79+6rPG/p0qXvGHBLlrzSmpEAirF48dK2HgGAds61aN32ToHd6g8xWbRoUY455pgMGzYsBx54YIYMGZIdd9wxSTJkyJD84Q9/SJcuXdLY2NjynMbGxnTtqvYBAABao1UB99xzz2XkyJE5/fTTM3z48CTJ8ccfnwceeCBJMnv27Oywww7p169f7rzzztRqtTzzzDOp1WrvuPoGAADA22vVLZTf+9738vLLL+fyyy/P5ZdfniQZM2ZMJk2alPXWWy+9evXKeeedly5duqR///457LDDUqvVctZZZ63R4QEAANqTuqqqqrYe4o3cj7vuGzn5V209AhTtB2P2aesRoHiuRfDeuBat29bKe+AAAAD4vyXgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACiHgAAAACtGwtg9Qq9Vy9tln5+GHH07Hjh0zYcKE9OnTZ20fFgAA4H1nra/A3XrrrVmxYkVmzJiRr3/965k8efLaPiQAAMD70loPuLlz52bPPfdMkvTt2zfz589f24cEAAB4X1rrt1AuW7YsXbp0admur6/PypUr09Dw1ofeaKOua3sk3qP/75JhbT0CAO2caxHQXq31FbguXbqksbGxZbtWq71tvAEAAPD21nrA9evXLzNnzkySzJs3L9tss83aPiQAAMD7Ul1VVdXaPMDrn0L5yCOPpKqqTJo0KVtttdXaPCQAAMD70loPOAAAANYMX+QNAABQCAEHAABQCAEHAABQCAEH7yO1Wq2tRwAAYC3yhWxQuKeeeirnn39+5s+fn4aGhtRqtWyzzTY544wzssUWW7T1eAAArEE+hRIKd8wxx+TrX/96dt5555bH5s2bl8mTJ+eaa65pw8kAAFjTrMBB4VasWLFKvCVJ375922gaANqjESNGpKmpaZXHqqpKXV2dXybCGibgoHDbbrttzjjjjOy5557p2rVrGhsbc8cdd2Tbbbdt69EAaCdGjRqVcePG5bLLLkt9fX1bjwPva26hhMJVVZVbb701c+fOzbJly9KlS5f069cvQ4YMSV1dXVuPB0A7cdVVV6VPnz4ZMmRIW48C72sCDgAAoBC+RgAAAKAQAg4AAKAQAg6A97U5c+ZkxIgR/9C+CxcuzD777POuXn/EiBGZM2dOa0YDgHdNwAEAABTC1wgA0O6sXLkyZ599dv74xz/mueeey7bbbpspU6YkSZYvX56vfvWreeyxx9K7d+9MnDgxH/jAB/LAAw/k/PPPz2uvvZYePXrknHPOyeabb97GZwJAe2MFDoB253e/+13WW2+9zJgxI7/85S+zdOnS3HHHHUmS559/PiNGjMjNN9+czTffPJdddllWrFiRcePG5ZJLLslNN92U4447LmeeeWYbnwUA7ZEVOADanV122SXdu3fPj370ozz66KN5/PHH88orryRJtthii/Tv3z9JMmzYsIwZMyaPP/54nnrqqZxwwgktr7Fs2bI2mR2A9k3AAdDu3Hbbbfnud7+bY445JgcffHCWLFmS178WtaHhfy+NVVWloaEhtVotm222WX784x8nSZqbm/Pcc8+1yewAtG9uoQSg3Zk9e3Y+/elP55BDDkm3bt0yZ86cNDc3J0n+/Oc/5w9/+EOS5IYbbsgee+yRLbfcMi+99FJ++9vftjw+atSoNpsfgPbLChwA73u//e1v8/GPf7xle6eddsqcOXNyyy23ZL311ku/fv2ycOHCJEnv3r1z2WWX5cknn8w222yT0047LR07dsx3vvOdTJw4McuXL0+XLl1ywQUXtNXpANCO1VWv3zMCAADAOs0tlAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIUQcAAAAIX4/wEHeAw+b0td2AAAAABJRU5ErkJggg==\n", 195 | "text/plain": [ 196 | "" 197 | ] 198 | }, 199 | "metadata": {}, 200 | "output_type": "display_data" 201 | } 202 | ], 203 | "source": [ 204 | "plt.figure(figsize=(15,7))\n", 205 | "df.groupby('Label').count()['Headline'].plot.bar(title=\"Number of news per category\")" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Procesamos el dataset" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 21, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "def clean_text(text):\n", 222 | " #Remove URLs\n", 223 | " text = re.sub(r\"http\\S+\", \"\", text)\n", 224 | " #Tokenize\n", 225 | " tokenizer = RegexpTokenizer('\\w+|\\$[\\d\\.]+|\\S+')\n", 226 | " tokens = tokenizer.tokenize(text)\n", 227 | " #Remove non alphanumerica characters\n", 228 | " words = [word for word in tokens if word.isalpha()] \n", 229 | " return words" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 22, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "def pad_array(array, token_len):\n", 239 | " diff_token = token_len - len(array)\n", 240 | " if(diff_token < 0):\n", 241 | " array = array[:token_len] #Truncate\n", 242 | " else:\n", 243 | " #Pad\n", 244 | " array += [0]*diff_token #Pad\n", 245 | " \n", 246 | " return array " 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 23, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stderr", 256 | "output_type": "stream", 257 | "text": [ 258 | "Pandas Apply: 100%|██████████| 3988/3988 [00:01<00:00, 3963.97it/s]\n", 259 | "Pandas Apply: 100%|██████████| 3988/3988 [00:00<00:00, 34533.49it/s]\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "#Clean content\n", 265 | "df['Body'] = df.swifter.apply(lambda row: clean_text(row['Body']), axis=1)\n", 266 | "#Clean title\n", 267 | "df['Headline'] = df.swifter.apply(lambda row: clean_text(row['Headline']), axis=1)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "Convertir a los indices de **word2vec**" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 24, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "model = KeyedVectors.load_word2vec_format(\"../data/GoogleNews-vectors-negative300.bin.gz\", binary=True,\n", 284 | " limit=50000)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 25, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stderr", 294 | "output_type": "stream", 295 | "text": [ 296 | "Pandas Apply: 100%|██████████| 3988/3988 [00:00<00:00, 37513.90it/s]\n", 297 | "Pandas Apply: 100%|██████████| 3988/3988 [00:00<00:00, 6349.86it/s]\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "df['Headline'] = df.swifter.apply(lambda r: \n", 303 | " [model.vocab[x].index for x in r['Headline'] if x in model.vocab], axis=1)\n", 304 | "df['Body'] = df.swifter.apply(lambda r: \n", 305 | " [model.vocab[x].index for x in r['Body'] if x in model.vocab], axis=1)\n", 306 | "\n", 307 | "\n", 308 | "#Drop news with short or no title\n", 309 | "df = df[df['Headline'].map(len) >= 1]\n", 310 | "#Reset index\n", 311 | "df = df.reset_index().drop(\"index\", axis=1)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "Distribución para el cuerpo de los artículos." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 26, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stderr", 328 | "output_type": "stream", 329 | "text": [ 330 | "Pandas Apply: 100%|██████████| 3980/3980 [00:00<00:00, 49281.68it/s]\n" 331 | ] 332 | }, 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "" 337 | ] 338 | }, 339 | "execution_count": 26, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | }, 343 | { 344 | "data": { 345 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA7YAAAFyCAYAAADBHEYTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3X2Y1XWdP/7nYUZkuRNIbCO1S5Q0MzUkkgLazCssNdQoiL3QJGtjU8TIuFGGDBC59EuWZKabW6viDQmoWZZihICCSysqa2VehSnkHagwpgxzzu+Pfs5KIMyIc/PRx+Ov+dy/zpnXKM/zfn8+p1SpVCoBAACAgmrX2gUAAADA7hBsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWgBb3xBNP5OCDD868efO2Wf+jH/0oEydOfNOuc8wxx+Shhx560863M5s3b86IESNy/PHH51e/+tU22xYvXpzvfve7uzzHqFGjcscddzRXiTs0dOjQvPjii43ef9OmTTn11FMblg8++OBs2LChOUrbzoMPPpiampokyUMPPZSxY8fudP+JEyfmRz/6UUuUBkArq27tAgB4e2rXrl1mzZqVo446Kr17927tcnbbI488kueeey533nnndtseeuihvPDCC61Q1a7dcsstTdr/hRdeaLEPC/7RH//4xzz11FNJkg984AP53ve+1yp1AND2CLYAtIoOHTrk9NNPzze+8Y3ccMMNad++/TbbJ06cmD59+uRLX/rSdsvHHHNMTjjhhNx333154YUXcsYZZ+S3v/1t1qxZk+rq6vzgBz/IO9/5ziTJ3Llz87vf/S5btmzJ6aefnmHDhiVJ7r777vzgBz9IXV1dOnTokAkTJuSDH/xgLrvssjzwwAN5+umnc/DBB+eSSy7Zpq677rorc+bMSblcTqdOnTJp0qR07tw5kydPzlNPPZWhQ4fmxhtvTIcOHZIkq1evzg033JD6+vp06dIl55xzTr7//e/n9ttvT1VVVQ444IBMmTIlPXv2bLjG1q1bM378+FRXV2fWrFn529/+lhkzZuQPf/hD6urqMmDAgHzzm99MdXV1PvCBD+QrX/lKli1blqeffjpnnHFGRo4cmWeeeSYTJkzIxo0bkyQf+9jHMm7cuO1+DwcffHDuvffeLF68OHfeeWfatWuXtWvXpkOHDpk1a1YOPPDAbfafNGlSXn755QwdOjTz589Pklx22WVZvXp1nn/++XzpS1/Kv/7rvyZJ5s2bl+uvvz7lcjndunXLlClTtjtfuVzOhRdemNWrV6e2tjaVSiXTp0/PUUcdlYkTJ+b555/PX/7ylxxxxBFZvnx5Nm3alEmTJuWkk07KtGnT8rOf/Sy1tbWZPn16fvvb36aqqirHHntszjnnnG2u89hjj2XGjBl5/vnnU19fn1GjRjX0AgDFZyoyAK1mzJgx6dixY77zne80+dhXXnklN910U84+++zU1NTktNNOy6233pp3vetdWbBgQcN+e+65ZxYsWJCrr746s2fPzqOPPpo///nP+c53vpMrr7wyCxcuzLRp03LWWWflpZdeSpI8+eSTWbBgwXah9rHHHsvUqVNz2WWX5dZbb83YsWPz7//+79lnn30yffr07L///rnlllsaQm2SHHHEERkxYkQ+/elP55xzzsnNN9+ce+65Jz/96U9z2223pU+fPttMv66rq8vZZ5+dd7zjHbnkkktSXV2dCy+8MO9///szf/78LFy4MBs3bsx//ud/Jkm2bNmS7t2754Ybbsj3vve9zJw5s+G92XfffbNgwYJcd911Wbt2bTZt2rTT9/T+++/PlClT8rOf/SxHHHFErrzyyu32mTlzZjp06JBbbrklVVVVSZL99tsv8+fPz5w5c3LRRRelrq4uK1euzMKFC3Pddddl4cKFOeOMM3LmmWdud77Vq1fn6aefzo033pif//znOfnkk3PVVVc1bH/55Zdz++2358ILL8zYsWPTr1+/zJw5c5tzfO9738srr7ySn//851m4cGF++9vfZuXKlQ3bt27dmrFjx2b8+PGZP39+rr322lx99dV54IEHdvp+AFAcRmwBaDXt2rXLxRdfnJNOOikDBw5s0rGf/OQnk/w9VO2999455JBDkiT777//NtN+R4wYkSR55zvfmY9+9KO59957U1VVlaeffjpf/OIXG/YrlUp5/PHHkyRHHnlkqqu3/1/kfffdl6OPPjr77bdfkmTAgAHp0aNHHn744ZRKpUbVvWTJkpxyyinp2LFjkuTUU0/NFVdckS1btiRJZs2aldra2tx5550N51y8eHEeeuih/PSnP03y97D3Wp/4xCeSJO9///uzZcuWvPTSSxk0aFC+8pWvZP369fnIRz6S8ePHp0uXLjut7f3vf3/++Z//OUly6KGH7nBa9Y6ccMIJSZL3ve992bJlSzZv3pzFixdn7dq1De9/krz44ot5/vnn061bt4Z1H/zgB7PXXnvlhhtuyF/+8pesWLEinTp1ath+1FFH7fL6y5cvz6RJk1JVVZWqqqpce+21SdLwAcef//znPP7445k8eXLDMS+//HL+93//N0ceeWSjXiMAbZtgC0Crete73pULLrggEyZMyEknndSwvlQqpVKpNCzX1dVtc9xrpy7vsccer3v+du3+b3JSuVxOdXV16uvrM2DAgFx66aUN29avX5999tknd955Z0Po/Eflcnm7AFupVLJ169ad1rCzc5TL5WzdurVh+TOf+UwqlUrOP//8XHHFFQ37fPe7322Yxvviiy9uc44999wzSRrWVSqVHH744Vm0aFHuvffe3Hffffnc5z6Xq666Kocddtjr1vbakeZ/fP935tUPAV57/XK5nKFDh+bcc89teA1PP/109tprr22OXbx4cWbMmJHTTz89n/jEJ9K7d+/ceuutDdtf73fxj9d/7fuxfv36bV7Lq9PAX3s/8bPPPrvLoA9AcZiKDECrO+644zJ48OD85Cc/aVjXvXv3PPzww0mSp556apuppU3x6qjdunXrcu+992bAgAEZMGBAli1blsceeyxJ8pvf/Caf+cxnthsJ/UcDBgzI0qVL85e//CVJcu+992b9+vU54ogjdnpcVVVVQ3gdNGhQbr755oZpz9dcc00+9KEPNQT1ww8/POPGjcvjjz+em266KUkycODA/PjHP06lUsmWLVsyZsyYhlHJ13PJJZfk8ssvz7HHHpvzzjsvBx10UB599NGdHtMYr34wsKvQO3DgwNx+++15+umnkyTXX399TjvttO32W7ZsWT7+8Y9n5MiROeyww3LXXXelvr5+h+d87fv4WgMGDMiCBQtSLpezZcuWjB07Nvfff3/D9gMOOKBh+nTy9+B7wgknNPQXAMVnxBaANuH888/PqlWrGpZHjRqVb3zjGxkyZEj23XffHH300W/ovK+88kpOPvnk1NXV5fzzz88BBxyQJPn2t7+dr3/966lUKg0PnHrtFNgdOeiggzJ16tSceeaZqa+vT4cOHXLFFVfscuTv6KOPzje+8Y1MmzYt5513XtavX5/Pfe5zKZfLec973rPdvbx77rlnLrrooowePTpHH310zjvvvMyYMSMnnnhi6urq8pGPfCRnnHHGTq952mmnZeLEiTnhhBPSvn37HHzwwTn++OMb8Y7tXM+ePXP44Yfn+OOPz3XXXfe6+w0cODBf/vKXM3r06JRKpXTu3Dlz5szZbsR7xIgRGT9+fE488cRs3bo1H/3oR/OrX/0q5XJ5u3MeeeSR+f73v58zzzwzo0aNalh/5plnZsaMGRk6dGjq6+vz6U9/Op/85Cdz9913J/n76P7ll1+eGTNm5D/+4z+ydevWnH322Y2a5gxAMZQqjZ1nBAAAAG2QqcgAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABTaW+brfp55ZlNrl/C6unfvmI0bX2rtMmC36GOKTg9TdHqYotPD7K6ePV//6/WM2LaA6uqq1i4Bdps+puj0MEWnhyk6PUxzEmwBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAotOrWLuDtZPRFdzdp/6snHtNMlQAAALx1NOuI7erVqzNq1Kgkydq1a/OFL3whI0eOzNSpU1Mul5Mkc+bMybBhwzJixIg8+OCDO90XAAAA/lGzBdurrroq559/fl555ZUkycyZMzNu3LjMnTs3lUolixYtypo1a7Jy5crMmzcvs2fPzgUXXPC6+wIAAMCONFuw3X///XPZZZc1LK9Zsyb9+/dPkgwePDjLly/PqlWrMnDgwJRKpfTq1Sv19fXZsGHDDvcFAACAHWm2e2yHDBmSJ554omG5UqmkVColSTp16pRNmzZl8+bN6datW8M+r67f0b670r17x1RXV73Jr+LN07NnlxY5BpqTnqTo9DBFp4cpOj1Mc2mxh0e1a/d/g8O1tbXp2rVrOnfunNra2m3Wd+nSZYf77srGjS+9uQW/iXr27JJnntl1OP9Hb+QYaC5vtI+hrdDDFJ0epuj0MLtrZx+MtNjX/Rx66KFZsWJFkmTJkiXp169f+vbtm6VLl6ZcLmfdunUpl8vp0aPHDvcFAACAHWmxEdsJEyZkypQpmT17dnr37p0hQ4akqqoq/fr1y/Dhw1Mul1NTU/O6+wIAAMCOlCqVSqW1i3gztOVpDa9Ou/A9thSZ6UMUnR6m6PQwRaeH2V1tYioyAAAANAfBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAotOqWvFhdXV0mTpyYJ598Mu3atcu0adNSXV2diRMnplQqpU+fPpk6dWratWuXOXPmZPHixamurs7kyZNz+OGHt2SpAAAAFESLBtvf/OY32bp1a2644YYsW7Ysl156aerq6jJu3Lh8+MMfTk1NTRYtWpRevXpl5cqVmTdvXtavX5+zzjorN998c0uWCgAAQEG06FTkAw44IPX19SmXy9m8eXOqq6uzZs2a9O/fP0kyePDgLF++PKtWrcrAgQNTKpXSq1ev1NfXZ8OGDS1ZKgAAAAXRoiO2HTt2zJNPPplPfepT2bhxY6644orcf//9KZVKSZJOnTpl06ZN2bx5c7p169Zw3Kvre/To8brn7t69Y6qrq5r9NbxRPXt2aZFjoDnpSYpOD1N0epii08M0lxYNtj/+8Y8zcODAjB8/PuvXr89pp52Wurq6hu21tbXp2rVrOnfunNra2m3Wd+my8z+CjRtfara6d1fPnl3yzDObmnzcGzkGmssb7WNoK/QwRaeHKTo9zO7a2QcjLToVuWvXrg0Bda+99srWrVtz6KGHZsWKFUmSJUuWpF+/funbt2+WLl2acrmcdevWpVwu73S0FgAAgLevFh2x/eIXv5jJkydn5MiRqauryznnnJPDDjssU6ZMyezZs9O7d+8MGTIkVVVV6devX4YPH55yuZyampqWLBMAAIACKVUqlUprF/FmaMvTGl6ddjH6orubdNzVE49ppoqg6Uwfouj0MEWnhyk6PczuajNTkQEAAODNJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBojQq2X/7yl/OLX/wiW7Zs2e0L/vCHP8zw4cNzyimnZN68eVm7dm2+8IUvZOTIkZk6dWrK5XKSZM6cORk2bFhGjBiRBx98cLevCwAAwFtTo4PtPffck+OOOy4XXHDBGw6aK1asyP/8z//k+uuvzzXXXJO//vWvmTlzZsaNG5e5c+emUqlk0aJFWbNmTVauXJl58+Zl9uzZueCCC97Q9QAAAHjrq27MTv3790///v3z8ssv54477sjYsWPTuXPnDBs2LCNHjkz79u0bdbGlS5fmve99b772ta9l8+bN+eY3v5mbbrop/fv3T5IMHjw4y5YtywEHHJCBAwemVCqlV69eqa+vz4YNG9KjR4/XPXf37h1TXV3VqDpaQ8+eXVrkGGhOepKi08MUnR6m6PQwzaVRwTb5+2jrLbfckmXLlmXw4MH59Kc/neXLl2fMmDH50Y9+1KhzbNy4MevWrcsVV1yRJ554ImPGjEmlUkmpVEqSdOrUKZs2bcrmzZvTrVu3huNeXb+zYLtx40uNfSktrmfPLnnmmU1NPu6NHAPN5Y32MbQVepii08MUnR5md+3sg5FGBduPf/zj2XffffPZz342NTU16dChQ5Lkwx/+cD772c82upBu3bqld+/ead++fXr37p0999wzf/3rXxu219bWpmvXruncuXNqa2u3Wd+li093AAAA2F6j7rH9yU9+kksvvTQnnXRSkmTt2rV/P7hduyxYsKDRFzvqqKNyzz33pFKp5Kmnnsrf/va3DBgwICtWrEiSLFmyJP369Uvfvn2zdOnSlMvlrFu3LuVyeaejtQAAALx9NWrEdvHixVmwYEEWLFiQ5557Ll/96lfzxS9+McOHD2/SxT7+8Y/n/vvvz7Bhw1KpVFJTU5N99903U6ZMyezZs9O7d+8MGTIkVVVV6devX4YPH55yuZyampo39OIAAAB46ytVKpXKrnY64YQTctNNN6Vjx45Jkr/97W/5/Oc/n9tuu63ZC2ystjxf/9X7CUZfdHeTjrt64jHNVBE0nftiKDo9TNHpYYpOD7O7dnaPbaOmItfV1W3z5OM99thj96sCAACAN0GjpiIfe+yxOe200/KpT30qpVIpv/zlL3PMMUYTAQAAaH2NCrbnnntu7rjjjtx///2prq7OqaeemmOPPba5awMAAIBdavT32B544IHZe++98+otuffff38+9KEPNVthAAAA0BiNCrYXXHBBfv3rX2e//fZrWFcqlfJf//VfzVYYAAAANEajgu2yZctyxx13pEOHDs1dDwAAADRJo56KvN9++6UR3woEAAAALa5RI7Z77bVXjj/++Hzwgx/c5mt/Zs6c2WyFAQAAQGM0KtgOGjQogwYNau5aAAAAoMkaFWxPPvnkPPHEE/njH/+YgQMHZv369ds8SAoAAABaS6Pusf35z3+eMWPGZMaMGXnhhRcyYsSI3HLLLc1dGwAAAOxSo4LtVVddleuvvz6dOnXKO97xjixYsCBXXnllc9cGAAAAu9SoYNuuXbt07ty5YXmfffZJu3aNOhQAAACaVaPuse3Tp0+uvfbabN26NY888kjmzp2bQw45pLlrAwAAgF1q1LBrTU1Nnnrqqey5556ZPHlyOnfunKlTpzZ3bQAAALBLjRqx7dixY8aPH5/x48c3dz0AAADQJI0KtoccckhKpdI263r27JklS5Y0S1EAAADQWI0Ktr/73e8afq6rq8tdd92VBx54oNmKAgAAgMZq8qON99hjj3zqU5/Kfffd1xz1AAAAQJM0asR24cKFDT9XKpU8+uijqa5u1KEAAADQrBqVTlesWLHNcvfu3XPppZc2S0EAAADQFI0KtjNnzmzuOgAAAOANaVSwPeaYY7Z7KnLy92nJpVIpixYtetMLAwAAgMZoVLA98cQTs8cee+Tzn/98qqurc9ttt+Whhx7KOeec09z1AQAAwE41Ktjec889mT9/fsPyaaedllNOOSXvfve7m60wAAAAaIxGf93P8uXLG37+9a9/nU6dOjVLQQAAANAUjRqx/fa3v50JEybk2WefTZL07t07s2bNatbCAAAAoDEaFWwPO+yw3H777dmwYUM6dOiQjh07NnddAAAA0CiNmor85JNP5vTTT8+IESNSW1ubU089NU888URz1wYAAAC71KhgW1NTky996Uvp2LFj9t5775xwwgmZMGFCc9cGAAAAu9SoYLtx48YMHDgwSVIqlfL5z38+mzdvbtbCAAAAoDEaFWw7dOiQv/71rymVSkmS//7v/0779u2btTAAAABojEY9PGrSpEn5t3/7tzz++OMZOnRoXnjhhXz3u99t7toAAABglxoVbJ977rn89Kc/zZ///OfU19end+/eRmwBAABoExo1Ffniiy/OHnvskT59+uSQQw4RagEAAGgzGjViu99++2XSpEk54ogj0qFDh4b1J510UrMVBgAAAI2x02D71FNP5Z3vfGe6d++eJFm9evU22wVbAAAAWttOg+1Xv/rVLFiwIDNnzszVV1+d0aNHt1RdAAAA0Cg7vce2Uqk0/Hzbbbc1ezEAAADQVDsNtq9+b22ybcgFAACAtqJRT0VOtg25AAAA0Fbs9B7bRx99NJ/4xCeS/P1BUq/+XKlUUiqVsmjRouavEAAAAHZip8H2l7/8ZUvVAQAAAG/IToPtu9/97paqAwAAAN6QRt9jCwAAAG2RYAsAAEChCbYAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAEChCbYAAAAUWqsE2+eeey4f+9jH8thjj2Xt2rX5whe+kJEjR2bq1Kkpl8tJkjlz5mTYsGEZMWJEHnzwwdYoEwAAgAJo8WBbV1eXmpqadOjQIUkyc+bMjBs3LnPnzk2lUsmiRYuyZs2arFy5MvPmzcvs2bNzwQUXtHSZAAAAFESLB9tZs2ZlxIgR2WeffZIka9asSf/+/ZMkgwcPzvLly7Nq1aoMHDgwpVIpvXr1Sn19fTZs2NDSpQIAAFAA1S15sfnz56dHjx4ZNGhQrrzyyiRJpVJJqVRKknTq1CmbNm3K5s2b061bt4bjXl3fo0eP1z139+4dU11d1bwvYDf07NmlRY6B5qQnKTo9TNHpYYpOD9NcWjTY3nzzzSmVSrn33nvzyCOPZMKECduMxNbW1qZr167p3Llzamtrt1nfpcvO/wg2bnyp2ereXT17dskzz2xq8nFv5BhoLm+0j6Gt0MMUnR6m6PQwu2tnH4y06FTk6667Ltdee22uueaavO9978usWbMyePDgrFixIkmyZMmS9OvXL3379s3SpUtTLpezbt26lMvlnY7WAgAA8PbVoiO2OzJhwoRMmTIls2fPTu/evTNkyJBUVVWlX79+GT58eMrlcmpqalq7TAAAANqoVgu211xzTcPP11577XbbzzrrrJx11lktWRIAAAAF1CrfYwsAAABvFsEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEKrbu0CeHOMvujuJu1/9cRjmqkSAACAlmXEFgAAgEITbAEAACg0U5Hfppoyddm0ZQAAoC0zYgsAAEChCbYAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAEChCbYAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAEChVbd2Abz1jL7o7kbve/XEY5qxEgAA4O3AiC0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBovu6HXWrK1/cAAAC0NCO2AAAAFJpgCwAAQKGZityGmQIMAACwa0ZsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACi06pa8WF1dXSZPnpwnn3wyW7ZsyZgxY3LQQQdl4sSJKZVK6dOnT6ZOnZp27dplzpw5Wbx4caqrqzN58uQcfvjhLVkqAAAABdGiwfbWW29Nt27dcvHFF2fjxo05+eSTc8ghh2TcuHH58Ic/nJqamixatCi9evXKypUrM2/evKxfvz5nnXVWbr755pYsFQAAgIJo0WB73HHHZciQIQ3LVVVVWbNmTfr3758kGTx4cJYtW5YDDjggAwcOTKlUSq9evVJfX58NGzakR48eLVkuAAAABdCiwbZTp05Jks2bN2fs2LEZN25cZs2alVKp1LB906ZN2bx5c7p167bNcZs2bdppsO3evWOqq6ua9wXshp49u7R2CW3S6IvubtL+t/2/oc1UCY2hjyk6PUzR6WGKTg/TXFo02CbJ+vXr87WvfS0jR47MiSeemIsvvrhhW21tbbp27ZrOnTuntrZ2m/Vduuz8j2Djxpearebd1bNnlzzzzKbWLuMtwfvYevQxRaeHKTo9TNHpYXbXzj4YadGnIj/77LMZPXp0zj333AwbNixJcuihh2bFihVJkiVLlqRfv37p27dvli5dmnK5nHXr1qVcLpuGDAAAwA616IjtFVdckRdffDGXX355Lr/88iTJeeedl+nTp2f27Nnp3bt3hgwZkqqqqvTr1y/Dhw9PuVxOTU1KnTOFAAAHMElEQVRNS5YJAABAgZQqlUqltYt4M7TlaQ2vTrto6v2kbO/qice0dglvW6YPUXR6mKLTwxSdHmZ3tZmpyAAAAPBmE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEJr0a/7gd3V1CdLe4oyAAC89RmxBQAAoNAEWwAAAApNsAUAAKDQ3GPLW1pT7sl1Py4AABSTYAv/Pw+mAgCAYjIVGQAAgEITbAEAACg0wRYAAIBCE2wBAAAoNA+PgjfIE5cBAKBtMGILAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAECheSoytICmPEE58RRlAABoCiO2AAAAFJoRW2iDjPACAEDjGbEFAACg0ARbAAAACk2wBQAAoNAEWwAAAApNsAUAAKDQPBUZ3mY8cRkAgLcaI7YAAAAUmhFbeAto6igsAAC8lRixBQAAoNCM2AI71ZTRYPfjAgDQGozYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFVt3aBQBvHaMvurtJ+1898ZhmqgQAgLcTI7YAAAAUmmALAABAoZmKDLQaU5cBAHgzGLEFAACg0IzYAoXR1BHepjAaDABQXIItQDMz5RoAoHkJtgBN1JwjxwAANJ17bAEAACg0wRYAAIBCa7NTkcvlcr71rW/l97//fdq3b5/p06fnPe95T2uXBbxFmV68PfcGAwBF0WaD7V133ZUtW7bkxhtvzAMPPJCLLrooP/jBD1q7LIBmV9SQ7anVO9aU96XIrxMAWlObDbarVq3KoEGDkiRHHnlkHn744VauCIDW0pZGj5szwBf1Q42maurvpzk/HHi7fPDQVt7Dpp6/Lf3ts2NN+R3d9v+GNmMlTfN2+TD27fQ3VKpUKpXWLmJHzjvvvHzyk5/Mxz72sSTJv/zLv+Suu+5KdXWbzeIAAAC0gjb78KjOnTuntra2YblcLgu1AAAAbKfNBtu+fftmyZIlSZIHHngg733ve1u5IgAAANqiNjsV+dWnIv/hD39IpVLJhRdemAMPPLC1ywIAAKCNabPBFgAAABqjzU5FBgAAgMYQbAEAACg0jxluRq/eJ/z73/8+7du3z/Tp0/Oe97yntcuC7axevTqXXHJJrrnmmqxduzYTJ05MqVRKnz59MnXq1LRr1y5z5szJ4sWLU11dncmTJ+fwww9/3X2hpdTV1WXy5Ml58skns2XLlowZMyYHHXSQHqZQ6uvrc/755+dPf/pTqqqqMnPmzFQqFX1MoTz33HM55ZRTcvXVV6e6ulr/0uJ0TTO66667smXLltx4440ZP358LrrootYuCbZz1VVX5fzzz88rr7ySJJk5c2bGjRuXuXPnplKpZNGiRVmzZk1WrlyZefPmZfbs2bngggted19oSbfeemu6deuWuXPn5qqrrsq0adP0MIXz61//Oklyww03ZOzYsZk5c6Y+plDq6upSU1OTDh06JPFvCVqHYNuMVq1alUGDBiVJjjzyyDz88MOtXBFsb//9989ll13WsLxmzZr0798/STJ48OAsX748q1atysCBA1MqldKrV6/U19dnw4YNO9wXWtJxxx2Xs88+u2G5qqpKD1M4xx57bKZNm5YkWbduXfbee299TKHMmjUrI0aMyD777JPEvyVoHYJtM9q8eXM6d+7csFxVVZWtW7e2YkWwvSFDhqS6+v/uSqhUKimVSkmSTp06ZdOmTdv18qvrd7QvtKROnTqlc+fO2bx5c8aOHZtx48bpYQqpuro6EyZMyLRp0zJkyBB9TGHMnz8/PXr0aBjMSfxbgtYh2Dajzp07p7a2tmG5XC5vEyCgLXrtfS21tbXp2rXrdr1cW1ubLl267HBfaGnr16/PqaeemqFDh+bEE0/UwxTWrFmz8stf/jJTpkxpuD0k0ce0bTfffHOWL1+eUaNG5ZFHHsmECROyYcOGhu36l5Yi2Dajvn37ZsmSJUmSBx54IO9973tbuSLYtUMPPTQrVqxIkixZsiT9+vVL3759s3Tp0pTL5axbty7lcjk9evTY4b7Qkp599tmMHj065557boYNG5ZED1M8CxcuzA9/+MMkyT/90z+lVCrlsMMO08cUwnXXXZdrr70211xzTd73vvdl1qxZGTx4sP6lxZUqlUqltYt4q3r1qch/+MMfUqlUcuGFF+bAAw9s7bJgO0888US+/vWv56abbsqf/vSnTJkyJXV1dendu3emT5+eqqqqXHbZZVmyZEnK5XImTZqUfv36ve6+0FKmT5+eX/ziF+ndu3fDuvPOOy/Tp0/XwxTGSy+9lEmTJuXZZ5/N1q1b8+UvfzkHHnig/xZTOKNGjcq3vvWttGvXTv/S4gRbAAAACs1UZAAAAApNsAUAAKDQBFsAAAAKTbAFAACg0ARbAAAACk2wBQAAoNAEWwAAAApNsAUAAKDQ/j9hRrA6KsVZQgAAAABJRU5ErkJggg==\n", 346 | "text/plain": [ 347 | "" 348 | ] 349 | }, 350 | "metadata": {}, 351 | "output_type": "display_data" 352 | } 353 | ], 354 | "source": [ 355 | "# Longitud en tokens de cada noticia\n", 356 | "tokens_numbers = df.swifter.apply(lambda row: len(row['Body']), axis = 1)\n", 357 | "\n", 358 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 359 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 70, ax=ax)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 15, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "Número medio de tokens por artículo: 404\n", 372 | "Desviación estándar de tokens por artículo: 424\n", 373 | "Mediana de tokens por artículo: 289\n", 374 | "Ventana de tokens escogida: 1254 - Cubre el 96.20603015075378% del dataset\n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "print(\"Número medio de tokens por artículo: {}\".format(int(np.mean(tokens_numbers))))\n", 380 | "print(\"Desviación estándar de tokens por artículo: {}\".format(int(np.std(tokens_numbers))))\n", 381 | "print(\"Mediana de tokens por artículo: {}\".format(int(np.median(tokens_numbers))))\n", 382 | "\n", 383 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 384 | "max_tokens = int(max_tokens)\n", 385 | "\n", 386 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 387 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "Para los títulos" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 28, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "Número medio de tokens por título: 7\n", 407 | "Desviación estándar de tokens por título: 3\n", 408 | "Mediana de tokens por título: 8\n", 409 | "Ventana de tokens escogida: 13 - Cubre el 97.9145728643216% del dataset\n" 410 | ] 411 | }, 412 | { 413 | "data": { 414 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA7gAAAFyCAYAAADf1XagAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3XuUlQW9//HPZkYkGBRI7GRelqOoqakpkRRSmSs0NS9ZImepidbRoyCGxkUZIkHk6KFMMi/pqQPeFdSyLMUIEUKOJSrHyvyVpuAVVBgvDLP374+Wc0QUZ5SZwWder7+Yvffs5zvznYH15nlmT6lSqVQCAAAAH3Cd2nsAAAAA2BAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXgDb35JNPZuedd86NN9641u1XXnllRo8evcGOs//+++ehhx7aYM+3PqtWrcrgwYNz8MEH5ze/+c1a982ZMycXXXTRuz7HsccemzvuuKO1Rnxbhx12WF5++eVmP37lypU57rjjmt7eeeeds3z58tYYbR0PPvhg6urqkiQPPfRQhg8fvt7Hjx49OldeeWVbjAbARqK6vQcAoGPq1KlTpkyZkn322Se1tbXtPc779sgjj+SFF17InXfeuc59Dz30UF566aV2mOrd3XrrrS16/EsvvdRm/2nwVn/961/zzDPPJEk+8YlP5Ic//GG7zAHAxkvgAtAuunTpkhNOOCFnnnlmrrvuunTu3Hmt+0ePHp0+ffrkxBNPXOft/fffP4ccckh+//vf56WXXspJJ52UP/zhD1myZEmqq6vz4x//OB/5yEeSJNdcc03+9Kc/ZfXq1TnhhBNy1FFHJUnuvvvu/PjHP05DQ0O6dOmSUaNG5ZOf/GQuvvjiPPDAA3n22Wez884758ILL1xrrrvuuivTpk1LuVxOt27dMmbMmNTU1GTs2LF55plncthhh+X6669Ply5dkiSLFy/Oddddl8bGxnTv3j1nnHFGfvSjH+X2229PVVVVtt9++4wbNy69e/duOsaaNWsycuTIVFdXZ8qUKXn11VczadKk/OUvf0lDQ0P69++f73znO6murs4nPvGJfOtb38q9996bZ599NieddFKGDBmS5557LqNGjcqKFSuSJJ/73OcyYsSIdfaw8847Z8GCBZkzZ07uvPPOdOrUKY8//ni6dOmSKVOmZIcddljr8WPGjMlrr72Www47LDNnzkySXHzxxVm8eHFefPHFnHjiifnXf/3XJMmNN96Ya6+9NuVyOT169Mi4cePWeb5yuZzzzjsvixcvTn19fSqVSiZOnJh99tkno0ePzosvvph//OMf2XPPPTN//vysXLkyY8aMyeGHH55zzz03v/jFL1JfX5+JEyfmD3/4Q6qqqnLAAQfkjDPOWOs4jz32WCZNmpQXX3wxjY2NOfbYY5u+FgAoDpcoA9BuTjnllHTt2jXf//73W/y+r7/+em644Yacfvrpqaury/HHH5/bbrstH/3oRzNr1qymx2266aaZNWtWrrrqqkydOjWPPvpo/v73v+f73/9+Lr/88txyyy0599xzM2zYsLzyyitJkqeeeiqzZs1aJ24fe+yxjB8/PhdffHFuu+22DB8+PP/+7/+eLbfcMhMnTsy2226bW2+9tSluk2TPPffM4MGD8+UvfzlnnHFGbr755txzzz256aab8vOf/zx9+vRZ67LshoaGnH766fnwhz+cCy+8MNXV1TnvvPOy2267ZebMmbnllluyYsWK/Nd//VeSZPXq1enZs2euu+66/PCHP8zkyZObPjdbb711Zs2alauvvjqPP/54Vq5cud7P6aJFizJu3Lj84he/yJ577pnLL798ncdMnjw5Xbp0ya233pqqqqokyTbbbJOZM2dm2rRpOf/889PQ0JD77rsvt9xyS66++urccsstOemkk3Laaaet83yLFy/Os88+m+uvvz6//OUvc8QRR+SKK65ouv+1117L7bffnvPOOy/Dhw9P3759M3ny5LWe44c//GFef/31/PKXv8wtt9ySP/zhD7nvvvua7l+zZk2GDx+ekSNHZubMmZkxY0auuuqqPPDAA+v9fADwweMMLgDtplOnTrngggty+OGHZ8CAAS163y996UtJ/hlXW2yxRXbZZZckybbbbrvW5cCDBw9OknzkIx/JZz/72SxYsCBVVVV59tln841vfKPpcaVSKU888USSZK+99kp19br/RP7+97/Pvvvum2222SZJ0r9///Tq1SsPP/xwSqVSs+aeO3dujjzyyHTt2jVJctxxx+XSSy/N6tWrkyRTpkxJfX197rzzzqbnnDNnTh566KHcdNNNSf4ZfW/2xS9+MUmy2267ZfXq1XnllVey33775Vvf+laWLVuWz3zmMxk5cmS6d+++3tl22223/Mu//EuSZNddd33by63fziGHHJIk+fjHP57Vq1dn1apVmTNnTh5//PGmz3+SvPzyy3nxxRfTo0ePpts++clPZvPNN891112Xf/zjH1m4cGG6devWdP8+++zzrsefP39+xowZk6qqqlRVVWXGjBlJ0vQfHX//+9/zxBNPZOzYsU3v89prr+V///d/s9deezXrYwTgg0HgAtCuPvrRj2bChAkZNWpUDj/88KbbS6VSKpVK09sNDQ1rvd+bL2neZJNN3vH5O3X6v4uVyuVyqqur09jYmP79++cHP/hB033Lli3LlltumTvvvLMpPt+qXC6vE7KVSiVr1qxZ7wzre45yuZw1a9Y0vf2Vr3wllUol55xzTi699NKmx1x00UVNl/e+/PLLaz3HpptumiRNt1Uqleyxxx6ZPXt2FixYkN///vf52te+liuuuCK77777O8725jPPb/38r88b/xnw5uOXy+UcdthhOeuss5o+hmeffTabb775Wu87Z86cTJo0KSeccEK++MUvpra2NrfddlvT/e+0i7ce/82fj2XLlq31sbxxefibf974+eeff9fgB+CDxyXKALS7Aw88MAMHDszPfvazptt69uyZhx9+OEnyzDPPrHXJaUu8cRZv6dKlWbBgQfr375/+/fvn3nvvzWOPPZYk+d3vfpevfOUr65wZfav+/ftn3rx5+cc//pEkWbBgQZYtW5Y999xzve9XVVXVFLH77bdfbr755qbLoadPn55PfepTTcG+xx57ZMSIEXniiSdyww03JEkGDBiQn/70p6lUKlm9enVOOeWUprOU7+TCCy/MJZdckgMOOCBnn312dtxxxzz66KPrfZ/meOM/CN4tfgcMGJDbb789zz77bJLk2muvzfHHH7/O4+6999584QtfyJAhQ7L77rvnrrvuSmNj49s+55s/j2/Wv3//zJo1K+VyOatXr87w4cOzaNGipvu33377psuqk38G8CGHHNL09QVAcTiDC8BG4Zxzzsn999/f9Paxxx6bM888M4MGDcrWW2+dfffd9z097+uvv54jjjgiDQ0NOeecc7L99tsnSb73ve/l29/+diqVStMLU7350ti3s+OOO2b8+PE57bTT0tjYmC5duuTSSy991zOB++67b84888yce+65Ofvss7Ns2bJ87WtfS7lcznbbbbfOz/puuummOf/88zN06NDsu+++OfvsszNp0qQceuihaWhoyGc+85mcdNJJ6z3m8ccfn9GjR+eQQw5J586ds/POO+fggw9uxmds/Xr37p099tgjBx98cK6++up3fNyAAQPyzW9+M0OHDk2pVEpNTU2mTZu2zhnwwYMHZ+TIkTn00EOzZs2afPazn81vfvOblMvldZ5zr732yo9+9KOcdtppOfbYY5tuP+200zJp0qQcdthhaWxszJe//OV86Utfyt13353kn2f7L7nkkkyaNCk/+clPsmbNmpx++unNuvwZgA+WUqW51x8BAADARswlygAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIRTm1wQ999zKNj9mz55ds2LFK21+XNqeXXccdt1x2HXHYdcdgz13HHbdcbx11717r//X8iXO4L4v1dVV7T0CbcSuOw677jjsuuOw647BnjsOu+443suuBS4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAjV7T0AtMTQ8+9us2NdNXr/NjsWAADw/jmDCwAAQCEIXAAAAApB4AIAAFAIAhcAAIBCELgAAAAUgsAFAACgEAQuAAAAhSBwAQAAKASBCwAAQCEIXAAAAApB4AIAAFAIAhcAAIBCELgAAAAUgsAFAACgEAQuAAAAhSBwAQAAKASBCwAAQCEIXAAAAApB4AIAAFAIAhcAAIBCELgAAAAUgsAFAACgEAQuAAAAhSBwAQAAKASBCwAAQCFUt/cAfPANPf/u9h4BAADAGVwAAACKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhtGrgLl68OMcee2yS5PHHH88xxxyTIUOGZPz48SmXy0mSadOm5aijjsrgwYPz4IMPrvexAAAA8E5aLXCvuOKKnHPOOXn99deTJJMnT86IESNyzTXXpFKpZPbs2VmyZEnuu+++3HjjjZk6dWomTJjwjo8FAACA9Wm1wN12221z8cUXN729ZMmS9OvXL0kycODAzJ8/P/fff38GDBiQUqmUrbbaKo2NjVm+fPnbPhYAAADWp7q1nnjQoEF58sknm96uVCoplUpJkm7dumXlypVZtWpVevTo0fSYN25/u8e+m549u6a6umoDfxTvrnfv7m1+TNrGW3dr1x2HXXccdt1x2HXHYM8dh113HC3ddasF7lt16vR/J4vr6+uz2WabpaamJvX19Wvd3r1797d97LtZseKVDTtwM/Tu3T3PPffu8c0H05t3a9cdh113HHbdcdh1x2DPHYdddxxv3XVzYrfNXkV51113zcKFC5Mkc+fOTd++fbP33ntn3rx5KZfLWbp0acrlcnr16vW2jwUAAID1abMzuKNGjcq4ceMyderU1NbWZtCgQamqqkrfvn1z9NFHp1wup66u7h0fCwAAAOtTqlQqlfYeYkNoj8sUXB7xT0PPv7u9R2gVV43ev+nPdt1x2HXHYdcdh113DPbccdh1x7FRX6IMAAAArUngAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAgCFwAAgEIQuAAAABSCwAUAAKAQBC4AAACFUN2WB2toaMjo0aPz1FNPpVOnTjn33HNTXV2d0aNHp1QqpU+fPhk/fnw6deqUadOmZc6cOamurs7YsWOzxx57tOWoAAAAfMC0aeD+7ne/y5o1a3Ldddfl3nvvzQ9+8IM0NDRkxIgR+fSnP526urrMnj07W221Ve67777ceOONWbZsWYYNG5abb765LUcFAADgA6ZNL1Hefvvt09jYmHK5nFWrVqW6ujpLlixJv379kiQDBw7M/Pnzc//992fAgAEplUrZaqut0tjYmOXLl7flqAAAAHzAtOkZ3K5du+app57KQQcdlBUrVuTSSy/NokWLUiqVkiTdunXLypUrs2rVqvTo0aPp/d64vVevXu/43D17dk11dVWrfwxv1bt39zY/Jm3jrbu1647DrjsOu+447LpjsOeOw647jpbuuk0D96c//WkGDBiQkSNHZtmyZTn++OPT0NDQdH99fX0222yz1NTUpL6+fq3bu3df/we2YsUrrTb3O+ndu3uee25lmx+XtvHm3dp1x2HXHYdddxx23THYc8dh1x3HW3fdnNht00uUN9tss6ZQ3XzzzbNmzZrsuuuuWbhwYZJk7ty56du3b/bee+/Mmzcv5XI5S5cuTblcXu/ZWwAAAGjTM7jf+MY3Mnbs2AwZMiQNDQ0544wzsvvuu2fcuHGZOnVqamtrM2jQoFRVVaVv3745+uijUy6XU1dX15ZjAgAA8AHUpoHbrVu3XHTRRevcPmPGjHVuGzZsWIYNG9YWYwEAAFAAbXqJMgAAALQWgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKoVmB+81vfjO/+tWvsnr16taeBwAAAN6TZgfuPffckwMPPDATJkzIgw8+2NpzAQAAQItUN+dB/fr1S79+/fLaa6/ljjvuyPDhw1NTU5OjjjoqQ4YMSefOnVt7TgAAAFivZgVukixcuDC33npr7r333gwcODBf/vKXM3/+/Jxyyim58sorW3NGAAAAeFfNCtwvfOEL2XrrrfPVr341dXV16dKlS5Lk05/+dL761a+26oAAAADQHM0K3J/97Gfp1q1bPvzhD+e1117L448/nu222y6dOnXKrFmzWntGAAAAeFfNepGpOXPm5KSTTkqSvPDCCzn55JNz/fXXt+pgAAAA0BLNCtwbbrghV199dZLkYx/7WGbOnJkZM2a06mAAAADQEs0K3IaGhrVeKXmTTTZptYEAAADgvWjWz+AecMABOf7443PQQQelVCrl17/+dfbff//Wng0AAACarVmBe9ZZZ+WOO+7IokWLUl1dneOOOy4HHHBAa88GAAAAzdbs34O7ww47ZIsttkilUkmSLFq0KJ/61KdabTDen6Hn393eIwAAALSpZgXuhAkT8tvf/jbbbLNN022lUin//d//3WqDAQAAQEs0K3Dvvffe3HHHHenSpUtrzwMAAADvSbNeRXmbbbZpujQZAAAANkbNOoO7+eab5+CDD84nP/nJtX5d0OTJk1ttMAAAAGiJZgXufvvtl/3226+1Z4GNSlu+UNdVo/3aLQAAeL+aFbhHHHFEnnzyyfz1r3/NgAEDsmzZsrVecAoAAADaW7N+BveXv/xlTjnllEyaNCkvvfRSBg8enFtvvbW1ZwMAAIBma1bgXnHFFbn22mvTrVu3fPjDH86sWbNy+eWXt/ZsAAAA0GzNukS5U6dOqampaXp7yy23TKdOzWrjdVx22WW5++6709DQkGOOOSb9+vXL6NGjUyqV0qdPn4wfPz6dOnXKtGnTMmfOnFRXV2fs2LHZY4893tPxAAAA6BiaVal9+vTJjBkzsmbNmjzyyCMZN25cdtlllxYfbOHChfnjH/+Ya6+9NtOnT8/TTz+dyZMnZ8SIEbnmmmtSqVQye/bsLFmyJPfdd19uvPHGTJ06NRMmTGjxsQAAAOhYmhW4dXV1eeaZZ7Lppptm7Nixqampyfjx41t8sHnz5mWnnXbKqaeempNPPjmf//zns2TJkvTr1y9JMnDgwMyfPz/3339/BgwYkFKplK222iqNjY1Zvnx5i48HAABAx9GsS5S7du2akSNHZuTIke/rYCtWrMjSpUtz6aWX5sknn8wpp5ySSqWSUqmUJOnWrVtWrlyZVatWpUePHk3v98btvXr1esfn7tmza6qrq97XfO9F797d2/yYFI+vo42LfXQcdt1x2HXHYM8dh113HC3ddbMCd5dddmmK0P87UO/MnTu3RQfr0aNHamtr07lz59TW1mbTTTfN008/3XR/fX19Nttss9TU1KS+vn6t27t3X/8HtmLFKy2aZUPo3bt7nntuZZsfl+LxdbTx8H3dcdh1x2HXHYM9dxx23XG8ddfNid1mXaL8pz/9KY888kgeeeSRPPjgg5k6dWoOOuigFg+4zz775J577kmlUskzzzyTV199Nf3798/ChQuTJHPnzk3fvn2z9957Z968eSmXy1m6dGnK5fJ6z94CAABAs87gvtkmm2ySgw46KJdeemmLD/aFL3whixYtylFHHZVKpZK6urpsvfXWGTduXKZOnZra2toMGjQoVVVV6du3b44++uiUy+XU1dW1+FgAAAB0LM0K3FtuuaXpz5VKJY8++miqq1vcxkmS73znO+vcNmPGjHVuGzZsWIYNG/aejgEAAEDH06xKfeMS4jf07NkzP/jBD1plIAAAAHgvmhW4kydPbu05AAAA4H1pVuDuv//+67yKcpKmX/Eze/bsDT4YAAAAtESzAvfQQw/NJptskq9//euprq7Oz3/+8zz00EM544wzWns+AAAAaJZmBe4999yTmTNnNr19/PHH58gjj8zHPvaxVhsMAAAAWqJZvwc3SebPn9/059/+9rfp1q1bqwwEAAAA70WzzuB+73vfy6hRo/L8888nSWprazNlypRWHQwAAABaolmBu/vuu+f222/P8uXL06VLl3Tt2rW15wIAAIAWadYlyk899VROOOGEDB48OPX19TnuuOPy5JNPtvZsAAAA0GzNCty6urqceOKJ6dq1a7bYYosccsghGTVqVGvPBgAAAM3WrMBdsWJFBgwYkCQplUr5+te/nlWrVrXqYAAAANASzQrcLl265Omnn06pVEqS/M///E86d+7cqoMBAABASzTrRabGjBmTf/u3f8sTTzyRww47LC+99FIuuuii1p4NAAAAmq1ZgfvCCy/kpptuyt///vc0NjamtrbWGVwAAAA2Ks26RPmCCy7IJptskj59+mSXXXYRtwAAAGx0mnUGd5tttsmYMWOy5557pkuXLk23H3744a02GAAAALTEegP3mWeeyUc+8pH07NkzSbJ48eK17he4AAAAbCzWG7gnn3xyZs2alcmTJ+eqq67K0KFD22ouAAAAaJH1/gxupVJp+vPPf/7zVh8GAAAA3qv1Bu4bv/c2WTt2AQAAYGPTrFdRTtaOXQAAANjYrPdncB999NF88YtfTPLPF5x648+VSiWlUimzZ89u/QkBAACgGdYbuL/+9a/bag4AAAB4X9YbuB/72Mfaag4AAAB4X5r9M7gAAACwMRO4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKoV0C94UXXsjnPve5PPbYY3n88cdzzDHHZMiQIRk/fnzK5XKSZNq0aTnqqKMyePDgPPjgg+0xJgAAAB8gbR64DQ0NqaurS5cuXZIkkydPzogRI3LNNdekUqlk9uzZWbJkSe67777ceOONmTp1aiZMmNDWYwIAAPAB0+aBO2XKlAwePDhbbrllkmTJkiXp169fkmTgwIGZP39+7r///gwYMCClUilbbbVVGhsbs3z58rYeFQAAgA+Q6rY82MyZM9OrV6/st99+ufzyy5MklUolpVIpSdKtW7esXLkyq1atSo8ePZre743be/Xq9Y7P3bNn11RXV7XuB/A2evfu3ubHpHh8HW1c7KPjsOuOw647BnvuOOy642jprts0cG+++eaUSqUsWLAgjzzySEaNGrXWmdn6+vpsttlmqampSX19/Vq3d+++/g9sxYpXWm3ud9K7d/c899zKNj8uxePraOPh+7rjsOuOw647BnvuOOy643jrrpsTu216ifLVV1+dGTNmZPr06fn4xz+eKVOmZODAgVm4cGGSZO7cuenbt2/23nvvzJs3L+VyOUuXLk25XF7v2VsAAABo0zO4b2fUqFEZN25cpk6dmtra2gwaNChVVVXp27dvjj766JTL5dTV1bX3mAAAAGzk2i1wp0+f3vTnGTNmrHP/sGHDMmzYsLYcCQAAgA+wdvk9uAAAALChCVwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQqhu7wE6kqHn393eIwAAABSWM7gAAAAUgsAFAACgEFyiDBuBtrx8/arR+7fZsQAAoC05gwsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBDa9NcENTQ0ZOzYsXnqqaeyevXqnHLKKdlxxx0zevTolEql9OnTJ+PHj0+nTp0ybdq0zJkzJ9XV1Rk7dmz22GOPthwVAACAD5g2DdzbbrstPXr0yAUXXJAVK1bkiCOOyC677JIRI0bk05/+dOrq6jJ79uxstdVWue+++3LjjTdm2bJlGTZsWG6++ea2HBUAAIAPmDYN3AMPPDCDBg1qeruqqipLlixJv379kiQDBw7Mvffem+233z4DBgxIqVTKVlttlcbGxixfvjy9evVqy3EBAAD4AGnTwO3WrVuSZNWqVRk+fHhGjBiRKVOmpFQqNd2/cuXKrFq1Kj169Fjr/VauXLnewO3Zs2uqq6ta9wN4G717d2/zY8L74Wv23fkcdRx23XHYdcdgzx2HXXccLd11mwZukixbtiynnnpqhgwZkkMPPTQXXHBB03319fXZbLPNUlNTk/r6+rVu7959/R/YihWvtNrM76R37+557rmVbX5ceD98za6f7+uOw647DrvuGOy547DrjuMePR2tAAAIvklEQVStu25O7Lbpqyg///zzGTp0aM4666wcddRRSZJdd901CxcuTJLMnTs3ffv2zd5775158+alXC5n6dKlKZfLLk8GAABgvdr0DO6ll16al19+OZdcckkuueSSJMnZZ5+diRMnZurUqamtrc2gQYNSVVWVvn375uijj065XE5dXV1bjgkAAMAHUKlSqVTae4gNoT0uU2jp5RFDz7+7FaeB5rlq9P7tPcJGzWVPHYdddxx23THYc8dh1x3HRn+JMgAAALQWgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhCFwAAAAKQeACAABQCAIXAACAQhC4AAAAFILABQAAoBAELgAAAIUgcAEAACgEgQsAAEAhVLf3AEDbGnr+3W12rKtG799mxwIAAGdwAQAAKASBCwAAQCEIXAAAAApB4AIAAFAIG+2LTJXL5Xz3u9/Nn//853Tu3DkTJ07Mdttt195jAQAAsJHaaAP3rrvuyurVq3P99dfngQceyPnnn58f//jH7T0WsJHy6tAAAGy0gXv//fdnv/32S5Lstddeefjhh9t5IqCl2jI625KY3jB8HgGADW2jDdxVq1alpqam6e2qqqqsWbMm1dVvP3Lv3t3barT3fNyf/+dhrTgJwAfLxvB3Ynv920Hbs+uOwZ47DrvuOFq66432RaZqampSX1/f9Ha5XH7HuAUAAICNNnD33nvvzJ07N0nywAMPZKeddmrniQAAANiYlSqVSqW9h3g7b7yK8l/+8pdUKpWcd9552WGHHdp7LAAAADZSG23gAgAAQEtstJcoAwAAQEsIXAAAAArByxK30Bs/G/znP/85nTt3zsSJE7Pddtu191hsYIsXL86FF16Y6dOn5/HHH8/o0aNTKpXSp0+fjB8/Pp06+b+hD7qGhoaMHTs2Tz31VFavXp1TTjklO+64o10XUGNjY84555z87W9/S1VVVSZPnpxKpWLXBfbCCy/kyCOPzFVXXZXq6mq7LqjDDz883bv/89eHbL311jn66KMzadKkVFVVZcCAATnttNPaeUI2lMsuuyx33313Ghoacswxx6Rfv36+rwto5syZmTVrVpLk9ddfzyOPPJLp06e3+PvaV0IL3XXXXVm9enWuv/76jBw5Mueff357j8QGdsUVV+Scc87J66+/niSZPHlyRowYkWuuuSaVSiWzZ89u5wnZEG677bb06NEj11xzTa644oqce+65dl1Qv/3tb5Mk1113XYYPH57JkyfbdYE1NDSkrq4uXbp0SeLv8KJ649/o6dOnZ/r06Zk8eXLGjx+f//zP/8y1116bxYsXZ8mSJe08JRvCwoUL88c//jHXXnttpk+fnqefftr3dUEdeeSRTd/Tu+22W84555z39H0tcFvo/vvvz3777Zck2WuvvfLwww+380RsaNtuu20uvvjipreXLFmSfv36JUkGDhyY+fPnt9dobEAHHnhgTj/99Ka3q6qq7LqgDjjggJx77rlJkqVLl2aLLbaw6wKbMmVKBg8enC233DKJv8OL6k9/+lNeffXVDB06NMcdd1wWLVqU1atXZ9ttt02pVMqAAQOyYMGC9h6TDWDevHnZaaedcuqpp+bkk0/O5z//ed/XBffQQw/lr3/9aw4++OD39H0tcFto1apVqampaXq7qqoqa9asaceJ2NAGDRqU6ur/u3q/UqmkVColSbp165aVK1e212hsQN26dUtNTU1WrVqV4cOHZ8SIEXZdYNXV1Rk1alTOPffcDBo0yK4LaubMmenVq1fTf0Qn/g4vqi5duuTEE0/MlVdemQkTJmTMmDH50Ic+1HS/XRfHihUr8vDDD+eiiy7KhAkTcuaZZ/q+LrjLLrssp5566jrd1dxd+xncFqqpqUl9fX3T2+Vyea0Yonje/DMd9fX12WyzzdpxGjakZcuW5dRTT82QIUNy6KGH5oILLmi6z66LZ8qUKTnzzDPz9a9/venyxsSui+Tmm29OqVTKggUL8sgjj2TUqFFZvnx50/12XRzbb799tttuu5RKpWy//fbp3r17Xnzxxab77bo4evTokdra2nTu3Dm1tbXZdNNN8/TTTzfdb9fF8vLLL+f//b//l3333TerVq1aq7uau2tncFto7733zty5c5MkDzzwQHbaaad2nojWtuuuu2bhwoVJkrlz56Zv377tPBEbwvPPP5+hQ4fmrLPOylFHHZXErovqlltuyWWXXZYk+dCHPpRSqZTdd9/drgvo6quvzowZMzJ9+vR8/OMfz5QpUzJw4EC7LqCbbrqp6XVQnnnmmbz66qvp2rVrnnjiiVQqlcybN8+uC2KfffbJPffck0ql0rTr/v37+74uqEWLFuUzn/lMkn+eWNxkk01a/H1dqlQqldYetEjeeBXlv/zlL6lUKjnvvPOyww47tPdYbGBPPvlkvv3tb+eGG27I3/72t4wbNy4NDQ2pra3NxIkTU1VV1d4j8j5NnDgxv/rVr1JbW9t029lnn52JEyfadcG88sorGTNmTJ5//vmsWbMm3/zmN7PDDjv4vi64Y489Nt/97nfTqVMnuy6g1atXZ8yYMVm6dGlKpVLOPPPMdOrUKeedd14aGxszYMCAnHHGGe09JhvIf/zHf2ThwoWpVCo544wzsvXWW/u+Lqif/OQnqa6uzje+8Y0k/zyh2NLva4ELAABAIbhEGQAAgEIQuAAAABSCwAUAAKAQBC4AAACFIHABAAAoBIELAABAIQhcAAAACkHgAgAAUAj/Hz3g9NtjvkOZAAAAAElFTkSuQmCC\n", 415 | "text/plain": [ 416 | "" 417 | ] 418 | }, 419 | "metadata": {}, 420 | "output_type": "display_data" 421 | } 422 | ], 423 | "source": [ 424 | "# Longitud en tokens de cada noticia\n", 425 | "tokens_numbers = df.apply(lambda row: len(row['Headline']), axis = 1)\n", 426 | "\n", 427 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 428 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 30, ax=ax)\n", 429 | "\n", 430 | "print(\"Número medio de tokens por título: {}\".format(int(np.mean(tokens_numbers))))\n", 431 | "print(\"Desviación estándar de tokens por título: {}\".format(int(np.std(tokens_numbers))))\n", 432 | "print(\"Mediana de tokens por título: {}\".format(int(np.median(tokens_numbers))))\n", 433 | "\n", 434 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 435 | "max_tokens = int(max_tokens)\n", 436 | "\n", 437 | "n_tokens = max_tokens\n", 438 | "\n", 439 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 440 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 11, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "df['Headline'] = df.apply(lambda r: pad_array(r['Headline'], MAX_LEN_TITLE) , axis=1)\n", 464 | "df['Body'] = df.apply(lambda r: pad_array(r['Body'], MAX_LEN_CONTENT) , axis=1)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "Convertir label" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 12, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "df.loc[df['Label'] == 1, 'Label'] = pd.Series([LBL_TRUE] * len(df))\n", 481 | "df.loc[df['Label'] == 0, 'Label'] = pd.Series([LBL_FAKE] * len(df))\n", 482 | "#loc[df1['stream'] == 2, 'feat'] = 10" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 13, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "data": { 492 | "text/html": [ 493 | "
\n", 494 | "\n", 507 | "\n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | "
HeadlineBodyLabel
0[2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0...[15680, 8429, 28683, 14257, 312, 281, 565, 611...[0, 0, 0, 1]
1[556, 2168, 3912, 5042, 2360, 508, 115, 948, 0...[4949, 20876, 17535, 3912, 46, 3610, 556, 3230...[0, 0, 0, 1]
2[13034, 12828, 316, 48926, 553, 16854, 23325, ...[7, 14446, 1506, 69, 208, 995, 64, 116, 61, 60...[0, 0, 0, 1]
3[2905, 1469, 1930, 8, 44732, 2, 3538, 19568, 4...[29922, 2905, 6585, 4840, 301, 11, 1080, 1428,...[0, 0, 0, 1]
4[2678, 3466, 8, 11440, 5739, 0, 0, 0, 0, 0, 0,...[4868, 3076, 2678, 31, 10, 3246, 14, 11, 11440...[0, 0, 0, 1]
\n", 549 | "
" 550 | ], 551 | "text/plain": [ 552 | " Headline \\\n", 553 | "0 [2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0... \n", 554 | "1 [556, 2168, 3912, 5042, 2360, 508, 115, 948, 0... \n", 555 | "2 [13034, 12828, 316, 48926, 553, 16854, 23325, ... \n", 556 | "3 [2905, 1469, 1930, 8, 44732, 2, 3538, 19568, 4... \n", 557 | "4 [2678, 3466, 8, 11440, 5739, 0, 0, 0, 0, 0, 0,... \n", 558 | "\n", 559 | " Body Label \n", 560 | "0 [15680, 8429, 28683, 14257, 312, 281, 565, 611... [0, 0, 0, 1] \n", 561 | "1 [4949, 20876, 17535, 3912, 46, 3610, 556, 3230... [0, 0, 0, 1] \n", 562 | "2 [7, 14446, 1506, 69, 208, 995, 64, 116, 61, 60... [0, 0, 0, 1] \n", 563 | "3 [29922, 2905, 6585, 4840, 301, 11, 1080, 1428,... [0, 0, 0, 1] \n", 564 | "4 [4868, 3076, 2678, 31, 10, 3246, 14, 11, 11440... [0, 0, 0, 1] " 565 | ] 566 | }, 567 | "execution_count": 13, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "df.head()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 14, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "df.to_pickle(PATH_PROCESSED)" 583 | ] 584 | } 585 | ], 586 | "metadata": { 587 | "kernelspec": { 588 | "display_name": "Python 3", 589 | "language": "python", 590 | "name": "python3" 591 | }, 592 | "language_info": { 593 | "codemirror_mode": { 594 | "name": "ipython", 595 | "version": 3 596 | }, 597 | "file_extension": ".py", 598 | "mimetype": "text/x-python", 599 | "name": "python", 600 | "nbconvert_exporter": "python", 601 | "pygments_lexer": "ipython3", 602 | "version": "3.6.7" 603 | } 604 | }, 605 | "nbformat": 4, 606 | "nbformat_minor": 2 607 | } 608 | -------------------------------------------------------------------------------- /notebooks/GettingRealAboutFake.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Procesado del dataset: Getting Real About Fake News\n", 8 | "## Fuente: Kaggle. https://www.kaggle.com/mrisdal/fake-news" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd\n", 18 | "import numpy as np\n", 19 | "\n", 20 | "from matplotlib import pyplot as plt\n", 21 | "from matplotlib import style\n", 22 | "\n", 23 | "#One-hot encodding\n", 24 | "from sklearn.preprocessing import LabelBinarizer\n", 25 | "\n", 26 | "#Progress bars\n", 27 | "from tqdm import tqdm\n", 28 | "tqdm.pandas()\n", 29 | "\n", 30 | "#Paralelize pandas apply on multiple cores\n", 31 | "import swifter\n", 32 | "\n", 33 | "#Nicer style\n", 34 | "style.use('seaborn') \n", 35 | "\n", 36 | "import re #regexp\n", 37 | "\n", 38 | "from nltk.tokenize import RegexpTokenizer" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "news = pd.read_csv(\"../data/Other_datasets/GettingRealAboutFake/fake.csv\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Longitudes necesarias de los artículos" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "#Padding number for title and content\n", 64 | "MAX_LEN_TITLE = 13\n", 65 | "MAX_LEN_CONTENT = 1598" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Limpieza del dataset.\n", 73 | "\n", 74 | "Nos quedamos con:\n", 75 | " * title\n", 76 | " * language == english\n", 77 | " * text\n", 78 | " * type\n", 79 | " * type != bs $^*$\n", 80 | " \n", 81 | "$^*$ *Data sources that were missing a label were simply assigned a label of \"bs\". There are (ostensibly) no genuine, reliable, or trustworthy news sources represented in this dataset (so far), so don't trust anything you read.*" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "news = news[news['language'] == 'english']\n", 91 | "news = news[news['type'] != 'bs']\n", 92 | "news = news[['title', 'text', 'type']]" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
titletexttype
0Muslims BUSTED: They Stole Millions In Gov’t B...Print They should pay all the back all the mon...bias
1Re: Why Did Attorney General Loretta Lynch Ple...Why Did Attorney General Loretta Lynch Plead T...bias
2BREAKING: Weiner Cooperating With FBI On Hilla...Red State : \\nFox News Sunday reported this mo...bias
3PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...Email Kayla Mueller was a prisoner and torture...bias
4FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...bias
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " title \\\n", 163 | "0 Muslims BUSTED: They Stole Millions In Gov’t B... \n", 164 | "1 Re: Why Did Attorney General Loretta Lynch Ple... \n", 165 | "2 BREAKING: Weiner Cooperating With FBI On Hilla... \n", 166 | "3 PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe... \n", 167 | "4 FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal... \n", 168 | "\n", 169 | " text type \n", 170 | "0 Print They should pay all the back all the mon... bias \n", 171 | "1 Why Did Attorney General Loretta Lynch Plead T... bias \n", 172 | "2 Red State : \\nFox News Sunday reported this mo... bias \n", 173 | "3 Email Kayla Mueller was a prisoner and torture... bias \n", 174 | "4 Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ... bias " 175 | ] 176 | }, 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "news.head()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "El dataset incluye distintos tipos de noticias falsas categorizadas junto a metadatos. Se pretende entrenar un modelo que distinga también verdaderas, por lo que debemos añadir samples verdaderos de FakeNewsCorpus.\n", 191 | "\n", 192 | "Previamente se han extraido sólamente las verdaderas en `data/only_true.csv`." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "true_news = pd.read_csv(\"../data/Other_datasets/GettingRealAboutFake/news_only_true.csv\", nrows=1000)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "#Poner todos los type como true\n", 211 | "true_news['type'] = 'true'" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 8, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "true_news = true_news[['title', 'content', 'type']]\n", 221 | "true_news = true_news.rename(str, columns={\"content\": \"text\"})" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 9, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/html": [ 232 | "
\n", 233 | "\n", 246 | "\n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | "
titletexttype
0Iranian Christian Convert Leads 1,500 Muslims ...(Screencap: YouTube/Tidningen Dagen) Annahita ...true
1Finding wonder and foreboding in the pathway o...It is rather hard to write with eclipse glasse...true
2#MeToo! Our culture of sexual predation – Bapt...Social media is blowing up about a culture of ...true
3God’s invitation to life – Baptist News GlobalMany preachers took up the Isaiah 55 passage t...true
4What is sown in the heart – Baptist News GlobalA trip to the cardiologist is rarely routine. ...true
\n", 288 | "
" 289 | ], 290 | "text/plain": [ 291 | " title \\\n", 292 | "0 Iranian Christian Convert Leads 1,500 Muslims ... \n", 293 | "1 Finding wonder and foreboding in the pathway o... \n", 294 | "2 #MeToo! Our culture of sexual predation – Bapt... \n", 295 | "3 God’s invitation to life – Baptist News Global \n", 296 | "4 What is sown in the heart – Baptist News Global \n", 297 | "\n", 298 | " text type \n", 299 | "0 (Screencap: YouTube/Tidningen Dagen) Annahita ... true \n", 300 | "1 It is rather hard to write with eclipse glasse... true \n", 301 | "2 Social media is blowing up about a culture of ... true \n", 302 | "3 Many preachers took up the Isaiah 55 passage t... true \n", 303 | "4 A trip to the cardiologist is rarely routine. ... true " 304 | ] 305 | }, 306 | "execution_count": 9, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "true_news.head()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "Concatenar y shufflear Dataframes" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 10, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "news = pd.concat([news, true_news])\n", 329 | "news = news.sample(frac=1).reset_index(drop=True)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Agrupamos los tipos de noticia en bias - fake - true" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 11, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "news.loc[news['type'] =='conspiracy', 'type'] = 'fake'\n", 346 | "news.loc[news['type'] =='hate', 'type'] = 'fake'\n", 347 | "news.loc[news['type'] =='junksci', 'type'] = 'fake'\n", 348 | "news.loc[news['type'] =='satire', 'type'] = 'fake'\n", 349 | "news.loc[news['type'] =='state', 'type'] = 'fake'" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "**Noticias por categoría**" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "" 368 | ] 369 | }, 370 | "execution_count": 12, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | }, 374 | { 375 | "data": { 376 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA6gAAAGKCAYAAAABumw8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAHCRJREFUeJzt3X2453Vd5/HXzBxQsQFnczLYJLTy3WZ5AxaoIKQYi3jT1d2SeRmImcm6stmWGQrbrZo3V26YJhFmWW2kV6tFUVBGiLKmqZS+S8too5sBBxkDyYGzf/y+s5wOZ4bZc2bm9+HM43FdXPzO9/f9/r7vc7iu78XzfG/OhsXFxQAAAMC8bZz3AAAAAJAIVAAAAAYhUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABOGCq6piqWqyqc5Yt/4GqunQf7ufTVfW4ffV597Kvw6vqmqr686r6ljV8zlur6rjp9cVVdeoe1j2mqj632n0t+6yzquo9a9j+iKq6al/MAgAL8x4AgIPOXUleV1V/0t0972H2gcckeUh3f+UaP+epSd6SJN39/DVPdeBsSfIN8x4CgPVBoAJwoN2e5HVJ3lFVj+/uf1365nQm9frufu3yr6vq00nekeTJmYXRa5I8MclxSb6Q5JndfeP0UedW1aOT3C/J67r7kunznpHk/CSHJrktyQ9097VVdWGSxyc5KslHuvs5y+b65iQXZHb10Y4k35/ks0kuSfLvq+rPkjy+u29fss0J04z3S3Jkkt/v7nOq6pgkVyf5eJJdr49K8itV9dwkr07ys919WVU9PcmPT/v9lyQvnPa7dLYfSfKt0zqfTvKi7r5xOqN7fma/FLgzyX/r7j9e4b/JkVX1u9MMf5vke6afz/VJHtrdn62qDUk6ybd390eWbPuLSR4wff8/Pe37idNcRyd5//Q93pbkVUlOT/LAJC/v7ndO652T5EXT/Dcn+c/d/YkV5gRgnXOJLwDz8BNJPpfkJ1ex7f27+4Qkr0zy80l+prsfneTvkpy1ZL3bu/vYzM5M/lRVPbKqvmra59O6+7FJXpDknVX1wGmbL0/y2BXi9KuTvDnJt077emWS30ryD0men+RT3f2YpXE6eUmSV3b38Um+Jskzd13Gm+TLkvxYdz+iu89JcmOS7+ruDyzZ70OS/HKSs7v7UZkF4KuWzfbcJF+X5Bu6+zFJfifJxdPbu4LxcUlekeSU3fxMH5FZFD4qycemn+kNSa5K8l3TOt+Y5OZlcZokZ2f2s35Mkt9I8pVV9cjpvecnedv0S4hNSW7r7uOSfEeSS6pqa1WdnOS7k5w0/Td5TZJ37WZOANY5gQrAAdfddyV5TpKzq+qp/5+b/+b0708l+cclwfSpJP9uyXq7Lpe9MckVSZ6SWawemeTK6Yzfr2R2dnHX5bnv7+6dK+zzyUmu7O6/nj7zqiT/nNmZ2z357iQPqqqXJ3lTkgck+aLpvZ1Jrr2X7Z+Y2dnjD0/7fWd3n75snacnOSHJB6fv6cVJanrv15K8q6ouzt1nnFfyB939yen1L2T2c0qSizI7m5ok35vk5/Y07BSiFyd5flVtyuwXBj+/ZJWfndb7aGYh/KQkZ2T283/fNP9rkmypqqX/LQE4SAhUAOaiu/8us+h5W5IHL3lrMcmGJV8fumzTO5a8/sIednHnktcbp3U3ZRaaj9n1T2Zxd/203u4ePLRpmmupjUkO2cP+k+SPkzwtySeS/GiSv8/d39sdu4nhpXYu3W9VbaiqR60w26uXfD+Pyyxs090/kuTEJB/MLBZXurw3WflnlSR/kOSwqnpKZjH5P+9l3mR2pvk7kzwjs7j+m2Xfz9L93DnN//Yl8x87fQ/b92JfAKwzAhWAuenuy5JcnuS8JYu3ZRYoqaqjkpy8yo8/a/qMo5OcmuTK6Z9vmi7ZTVU9LclHMzuzuSdXJjmtqh4+bffkJA9N8oHdbVBVD0ry9Ul+aLrX8ssyO1O4aTeb7Mw9g/cDSf7Dkktmn5XZJb9L/V5mZywPn77+0SRvr6qF6Z7dw7r7zZnd4/moqrrfCvv+xunnlMzucb08Sbp7MbMzvxcneUd3f343c2+a7lHd9YuHa5O8Ifc84/rcJKmqY5N8dZL3TvN/Z1UduWT/V66wHwAOAgIVgHn7L5k9mGeX/5HZQ3s6swfwrPZPmNy/qj6U2T2ZL+7uv+zuv8jsvtNfq6qPJPmxzB6stMc/2TJt96LM7le9PrP7QJ/R3Z/dwza3JPmpJB+atnlZkmty9+XEy70zyS9X1Tct+Yx/yuwe0LdNl79+f5Izl213cZL3JHl/Vf15kkclOWs6O3teZg+j+lBm94c+r7vvyD19NLN7Qq9PcvS0n13ellmMv2U3c/9DkuuS/HlVffG07BczC/HfWbbuE6dZLknyn7p7e3dfkdlDoX6/qj6a5NlJvmWKYwAOMhsWFx3/AYCVVdWZSb57hXtfd7f+xszuNf3b7n71kuWLSbZ29037Z1IA1gN/ZgYAWFFV/VGSrZldWrw3629OckNmZ4pfuv8mA2C9cgYVAACAIbgHFQAAgCEIVAAAAIYgUAEAABjCcA9J2rZth5ti2ae2bDks27ffNu8xAHbLcQoYmWMU+9rWrZs37O49Z1BZ9xYWNs17BIA9cpwCRuYYxYEkUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABjCwrwHAGB9et6rrpr3CKwzl7zsyfMeAYD9zBlUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGsLA3K1XV8Ule3d2nVNVXJrk0yWKS65Oc2913VdUFSc5IsjPJed193e7W3fffBgAAAPd193oGtap+MMnFSe4/LXp9kvO7+6QkG5I8q6qOTXJykuOTnJnkot2tu2/HBwAAYL3Ym0t8P5XkW5Z8fVyS906vL09yapITk1zR3YvdfUOSharaupt1AQAA4B7u9RLf7v7NqjpmyaIN3b04vd6R5Igkhye5eck6u5avtO4ebdlyWBYWNu3F6LD3tm7dPO8RAFgjx3L2tWe89LfmPQLrzLtf54LRtdqre1CXWXoP6eYktyS5dXq9fPlK6+7R9u23rWIk2L2tWzdn27Yd8x4DgDVyLAdG5zi1d/b0C8fVPMX3w1V1yvT69CRXJ7kmyWlVtbGqjk6ysbtv2s26AAAAcA+rOYP60iRvrapDk3w8yWXdfWdVXZ3k2syi99zdrbsPZgYAAGAd2qtA7e5PJzlhev2XmT2xd/k6Fya5cNmyFdcFAACA5VZziS8AAADscwIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhrCwmo2q6pAkb0tyTJI7k3xPkp1JLk2ymOT6JOd2911VdUGSM6b3z+vu69Y+NgAAAOvNas+gPi3JQnc/IcmPJvmJJK9Pcn53n5RkQ5JnVdWxSU5OcnySM5NctPaRAQAAWI9WG6h/mWShqjYmOTzJF5Icl+S90/uXJzk1yYlJrujuxe6+Ydpm6xpnBgAAYB1a1SW+ST6X2eW9n0jy4CRPT/Kk7l6c3t+R5IjM4vXmJdvtWr5tdx+8ZcthWVjYtMqxYGVbt26e9wgArJFjOTA6x6m1W22g/tckv9fdP1xVD01yVZJDl7y/OcktSW6dXi9fvlvbt9+2ypFgZVu3bs62bTvmPQYAa+RYDozOcWrv7CnkV3uJ7/Ykn51efybJIUk+XFWnTMtOT3J1kmuSnFZVG6vq6CQbu/umVe4TAACAdWy1Z1DfkOSSqro6szOnL0/ywSRvrapDk3w8yWXdfee0zrWZxfC5+2BmAAAA1qFVBWp3fy7Jd6zw1skrrHthkgtXsx8AAAAOHqu9xBcAAAD2KYEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADGFhtRtW1Q8neWaSQ5O8Kcl7k1yaZDHJ9UnO7e67quqCJGck2ZnkvO6+bq1DAwAAsP6s6gxqVZ2S5AlJnpjk5CQPTfL6JOd390lJNiR5VlUdO71/fJIzk1y0D2YGAABgHVrtJb6nJflYkncleXeS9yQ5LrOzqElyeZJTk5yY5IruXuzuG5IsVNXWtY0MAADAerTaS3wfnOTLkzw9ycOS/K8kG7t7cXp/R5Ijkhye5OYl2+1avm13H7xly2FZWNi0yrFgZVu3bp73CACskWM5MDrHqbVbbaDenOQT3f2vSbqqPp/ZZb67bE5yS5Jbp9fLl+/W9u23rXIkWNnWrZuzbduOeY8BwBo5lgOjc5zaO3sK+dVe4vsnSf5jVW2oqqOSPDDJldO9qUlyepKrk1yT5LSq2lhVR2d2lvWmVe4TAACAdWxVZ1C7+z1V9aQk12UWuecm+Zskb62qQ5N8PMll3X1nVV2d5Nol6wEAAMA9rPrPzHT3D66w+OQV1rswyYWr3Q8AAAAHh9Ve4gsAAAD7lEAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIYgUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCAIVAACAIQhUAAAAhrCwlo2r6kuS/GmSpybZmeTSJItJrk9ybnffVVUXJDljev+87r5uTRMDAACwLq36DGpVHZLkLUlunxa9Psn53X1Skg1JnlVVxyY5OcnxSc5MctHaxgUAAGC9Wsslvq9N8uYkN05fH5fkvdPry5OcmuTEJFd092J335Bkoaq2rmGfAAAArFOrusS3qs5Ksq27f6+qfnhavKG7F6fXO5IckeTwJDcv2XTX8m27++wtWw7LwsKm1YwFu7V16+Z5jwDAGjmWA6NznFq71d6D+rwki1V1apLHJPmlJF+y5P3NSW5Jcuv0evny3dq+/bZVjgQr27p1c7Zt2zHvMQBYI8dyYHSOU3tnTyG/qkt8u/tJ3X1yd5+S5M+SPDfJ5VV1yrTK6UmuTnJNktOqamNVHZ1kY3fftJp9AgAAsL6t6Sm+y7w0yVur6tAkH09yWXffWVVXJ7k2sxg+dx/uDwAAgHVkzYE6nUXd5eQV3r8wyYVr3Q8AAADr21qe4gsAAAD7jEAFAABgCAIVAACAIQhUAAAAhiBQAQAAGIJABQAAYAgCFQAAgCEIVAAAAIawMO8BWL3nveqqeY/AOnPJy5487xEAADiIOYMKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAENYWM1GVXVIkkuSHJPkfkl+PMlfJLk0yWKS65Oc2913VdUFSc5IsjPJed193drHBgAAYL1Z7RnU5yS5ubtPSnJ6kp9N8vok50/LNiR5VlUdm+TkJMcnOTPJRWsfGQAAgPVotYH6G0leseTrnUmOS/Le6evLk5ya5MQkV3T3YnffkGShqraudlgAAADWr1Vd4tvdn0uSqtqc5LIk5yd5bXcvTqvsSHJEksOT3Lxk013Lt+3us7dsOSwLC5tWMxawRlu3bp73CAC75RgFjM5xau1WFahJUlUPTfKuJG/q7ndU1WuWvL05yS1Jbp1eL1++W9u337bakYA12rZtx7xHANgtxyhgdI5Te2dPIb+qS3yr6iFJrkjyQ919ybT4w1V1yvT69CRXJ7kmyWlVtbGqjk6ysbtvWs0+AQAAWN9Wewb15Um2JHlFVe26F/UlSd5YVYcm+XiSy7r7zqq6Osm1mcXwuWsdGAAAgPVptfegviSzIF3u5BXWvTDJhavZDwAAAAeP1T7FFwAAAPYpgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMQaACAAAwBIEKAADAEAQqAAAAQxCoAAAADEGgAgAAMASBCgAAwBAEKgAAAEMQqAAAAAxBoAIAADAEgQoAAMAQBCoAAABDEKgAAAAMYWF/76CqNiZ5U5JHJ7kjyfO7+5P7e78AAADctxyIM6jfnOT+3f34JC9L8roDsE8AAADuYw5EoJ6Y5HeTpLvfn+RxB2CfAAAA3MdsWFxc3K87qKqLk/xmd18+fX1Dkod39879umMAAADuUw7EGdRbk2xeuk9xCgAAwHIHIlCvSfK0JKmqE5J87ADsEwAAgPuY/f4U3yTvSvLUqnpfkg1Jzj4A+wQAAOA+Zr/fgwoAAAB740Bc4gsAAAD3SqACAAAwBIEKAADAEAQqAAAAQzgQT/GFuaqqjd1917znANilqjYlOSvJ0Un+MMn13X3TXIcCgAEIVNalqvr2JJuS3C/JT1fVa7r7tXMeC2CXtyS5MclTk3wwyS9l+pvhACOoqq9N8nNJHpTkVzL7Rdp75jsVBwOX+LJe/UCS30/ynCQPTfKM+Y4D8G98RXe/Msnt3f3uJEfMeyCAZX4mydlJbkryC0kunOs0HDQEKuvV56d/7+juO5JsnucwAMssVNWDk6SqNidxGwIwnO7+ZJLF7t6WZMe85+Hg4BJf1qu/yeyyuRdX1QVJPjDneQCWOj/JNUmOTPL+JC+Z7zgA9/CZqvreJA+sqjOT3DLvgTg4OIPKutTdZyX5uuleibd09/fNeSSApd7X3ZXkK5J8bWa/VAMYyTlJHpbZJb6Pm76G/c4ZVNalqjohydlVdUiSDVV1VHefNu+5ACa/muTbunvbdIbipUkeMeeZAJZ6UJI3Lfn6i5J8Zk6zcBARqKxXb0zyhiTfluRjSQ6d7zgA/8YfVNXbM/sfwFuSHD/neQCW+/Uki5ldcfmwJH+V5MS5TsRBwSW+rFe3dPevJrm1uy9M8mVzngcgVXVoVR2a5JIkH8nsF8XnJPmXuQ4GsEx3P767n9DdJySpJH8/75k4ODiDynq1WFWPTHJYVVWSL533QABJOrMzEkmyYdmyh89lIoB799nM7pmH/U6gsl59f5JHZnap7zuSvHm+4wAk3f2wec8AsDeq6trc/Qu1L8ns78vDfidQWVeqaqG7d2Z2n8RfTYufkLsPsABzV1XPTHJukkMyO5P6xd39qPlOBfBvnJ3k9un157v7n+Y5DAcPgcp680tJnp3ZJXMbknxxZo9Hd/kcMJJXJnlxkhcm+cMkT53vOAD3cHF3eygSB5yHJLGudPezp5fnJrkrySeT7Iy/3QWM5ebuvjZJuvvSeJAbMJ5/qao3VNULq+oFVfWCeQ/EwUGgsl5dkOT47n5skpOS/NSc5wFIVR0xvbyjqp6U5JCqOi3JkXMcC2AlpybZntn9p1+a2Z+agf1OoLJe7ejubUnS3f8Yf8IBGMO7p3//c2a32fx4ku9J8oq5TQSwRFWdMz0g6XNJTp/+OSPJaXMdjIOGe1BZV6rqJ6eXC1X1niR/kuQbktwxv6kA/p/bq+p/J/mqJF8zLduQ5CVJfn1uUwHc7ZeTXJnk5Ul+Ylp2V2a/WIP9TqCy3vSyfyfJb81jEIAVnJ7kqCRvSfKiOc8CcA/dfUeSTydxzylzsWFx0V/fAAAAYP7cgwoAAMAQBCoAAABDEKgAsB9U1RFV9a55zwEA9yUCFQD2jy1JHjvvIQDgvsRTfAFg/3hjkqOms6h/0d0/kiRVdWmSyzN7ou/tSb4+yeFJfqy7315VX5TkoiRfm2RTkld396/OYX4AOOA8xRcA9oOqOibJHyV5cmZ/U/DhSR6Q5BNJHpHkzZn9yZkzkjwkyZ8meXSS85Lc2N1vrKrDk7wvyTO7+68P8LcAAAecS3wBYD+awvLTSZ6U5FuT/HZ3f356+xe7+wvd/X+SXJPkxCSnJnlhVf1Zkj9O8sAkjzzggwPAHLjEFwD2v0uSPDvJ0UkuXLJ855LXG6evNyV5Tnd/KEmq6iFJPnNgxgSA+XIGFQD2j525+xfBlyV5SpIv7e4PLFnnO6pqQ1V9eZLjk1yd5Kok35ckVXVkko9mFrYAsO4JVADYP/4pyQ1V9YfdfXuSa5Msf9jRYUk+mOS3k7ygu29O8t+TPKCqrs8sVn+wuz91AOcGgLnxkCQA2I+qakOSzZkF6lO6+x+n5Zcm+aPuvnR+0wHAWJxBBYD96+sze0jSz++KUwBgZc6gAgAAMARnUAEAABiCQAUAAGAIAhUAAIAhCFQAAACGIFABAAAYgkAFAABgCP8XcB/V6gne/nIAAAAASUVORK5CYII=\n", 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "metadata": { 382 | "needs_background": "light" 383 | }, 384 | "output_type": "display_data" 385 | } 386 | ], 387 | "source": [ 388 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 389 | "news.groupby(\"type\").count()['title'].plot.bar(title = \"Number of articles by type\", ax = ax)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "### Vectorización" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 13, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "from gensim.models import KeyedVectors" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 14, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "model = KeyedVectors.load_word2vec_format(\"../data/GoogleNews-vectors-negative300.bin.gz\", binary=True,\n", 415 | " limit=50000)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 15, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stderr", 425 | "output_type": "stream", 426 | "text": [ 427 | "100%|██████████| 2503/2503 [00:00<00:00, 17425.65it/s]\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "news['title'] = news.progress_apply(lambda r: \n", 433 | " [model.vocab[x].index for x in str(r['title']) if x in model.vocab], axis=1)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 16, 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "name": "stderr", 443 | "output_type": "stream", 444 | "text": [ 445 | "100%|██████████| 2503/2503 [00:01<00:00, 2373.28it/s]\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "news['content'] = news.progress_apply(lambda r: \n", 451 | " [model.vocab[x].index for x in str(r['text']) if x in model.vocab], axis=1)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 17, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/html": [ 462 | "
\n", 463 | "\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | "
titletypecontent
0[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...fake[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...
1[5977, 4211, 7726, 11538, 9311, 8469, 4211, 12...fake[5760, 5760]
2[7203, 15775, 15775, 3581, 7726, 17919, 4211, ...fake[3708, 4211, 5760, 7726, 15775, 4211, 6869, 42...
3[11969, 4211, 1280, 4883, 4501, 23199, 8469, 2...fake[3708, 11538, 7726, 11538, 73, 17919, 7726, 45...
4[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...fake[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...
\n", 518 | "
" 519 | ], 520 | "text/plain": [ 521 | " title type \\\n", 522 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... fake \n", 523 | "1 [5977, 4211, 7726, 11538, 9311, 8469, 4211, 12... fake \n", 524 | "2 [7203, 15775, 15775, 3581, 7726, 17919, 4211, ... fake \n", 525 | "3 [11969, 4211, 1280, 4883, 4501, 23199, 8469, 2... fake \n", 526 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... fake \n", 527 | "\n", 528 | " content \n", 529 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... \n", 530 | "1 [5760, 5760] \n", 531 | "2 [3708, 4211, 5760, 7726, 15775, 4211, 6869, 42... \n", 532 | "3 [3708, 11538, 7726, 11538, 73, 17919, 7726, 45... \n", 533 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... " 534 | ] 535 | }, 536 | "execution_count": 17, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "news = news.drop('text', axis=1)\n", 543 | "news.head()" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 18, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "news = news[news['title'].map(len) >= 1]\n", 553 | "#Reset index\n", 554 | "news = news.reset_index().drop(\"index\", axis=1)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 19, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "def pad_array(array, token_len):\n", 564 | " diff_token = token_len - len(array)\n", 565 | " if(diff_token < 0):\n", 566 | " array = array[:token_len] #Truncate\n", 567 | " else:\n", 568 | " #Pad\n", 569 | " array += [0]*diff_token #Pad\n", 570 | " \n", 571 | " return array " 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 20, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stderr", 581 | "output_type": "stream", 582 | "text": [ 583 | "Pandas Apply: 100%|██████████| 2503/2503 [00:00<00:00, 45125.24it/s]\n" 584 | ] 585 | }, 586 | { 587 | "data": { 588 | "text/plain": [ 589 | "" 590 | ] 591 | }, 592 | "execution_count": 20, 593 | "metadata": {}, 594 | "output_type": "execute_result" 595 | }, 596 | { 597 | "data": { 598 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA7AAAAFyCAYAAADMAjZUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3X/c5XVdJ/zXzFwg4j3QmJetFkpqvXdv25UAF1AQljAW0TCrzS1LIXN9RHdRdicWJrY/bn9htxamiwHanXe7YWpqyNxJEiJIutqK4duHaLr3XVsDDTAKijNz3X+cM3q4mGvmDM6Za77XeT4fj3lwvj/P+5z3XMz1Op/P93vWLS0tBQAAAA5261e7AAAAAJiGAAsAAMAgCLAAAAAMggALAADAIAiwAAAADIIACwAAwCAIsADMVFUdXVVLVfUzy9b/SlVduR+f52+q6vj9db69PNcRVXVDVX26qp6zbNvZVfWbU5zjQ1X1o7OrcrfP+cmq+rZ92P/Iqrp2Ynmpqh4xm+oe8NxPrqo3jx8fX1VX7WX/K6vqVw5EbQCsnoXVLgCAubAzySVV9eHu7tUuZj84Jsl3dPcTdrPtyUkefoDrmUp3H7OPh2xK8i9nUcsUnpjku5Kkuz+W5ICGfQAOTgIsAAfCvUkuSfKOqjqpu++b3Dgeib2lu1+3fLmq/ibJO5KcnlGgek2SpyY5LsnXk/xQd//t+FTnV9WTkjwkySXdffn4fM9KclGSQ5Pck+RXuvvGqro4yUlJHp3kr7r7ecvqenaSV2Q0Y2lbkl9OcleSy5N8Z1V9MslJ3X3veP8Tkrw4yYaququ7f72qXp7k3ybZnuSzSX6+u//nxHMsjF/f15M8P8nDkrwhyT9PckiSDyb537t7e1V9NcmrkvxgkkcleU13/25V/ZMkb0+ya3T0/d398uVNqKqlJItJnpnkhzP6YOF7xu/J87v71mWHXJHkoePXedx43Sur6sQk357ktd196fjcP5Pk58bv1R3j1/mZZc+/PslvJTkxycYk65K8sLtvGPf84Uken+SmJE9PcmRVXZHkbUl+p7u/r6r+lyS/ndHfge1J3p3k15c9zz8bv4ffnmRDkjfu+rsAwLCZQgzAgfIfk3w5yX96EMce1t0nJvmNJP85yRu6+0lJ/keSF0zsd293H5tR+Pk/quqJVfU94+d8Rnd/f5IXJfnjqnrY+JjHJvn+3YTXf5rkzUl+ZPxcv5HkPUn+LskLk9zW3cfsCq9J0t0fHR/zX8bh9dwkZyV5cnf/iyS3JLly4mkOTfJHSf4hyfO6e3tGAe/j3X1cku/PKJT+8nj/hyS5vbufktGI5G9V1WFJfjbJ58ev/ZQk31NVR+7lPT01yf/W3d+X5KNJLtzNPueO39NjunvHeN3nx7X9cEaj6odU1akZhe9Txu/xa5K8azfnOyGjDwtO6u7/NaNgOvm8h3f3E7v7ZzJ6v6/v7nOXneM3kxyW5J9lNBL+1PFrSfKNDwSuSnLhuM5Tk/zKOHQDMHBGYAE4ILp7Z1U9L8knq+qafTz8neP/3pbkf3b3X00sT07Xfcv4uf62qjYn+YGMRukeleSDVbVrv51Jdk3/vWkcHJc7PckHu/vz43NeW1X/kNFI5NKUdZ+V5Iru/sp4+Q1Jfr2qDh0vX5LRSOTju3vXOZ+Z5F9OXDP80GXnfM/4v/8to0D7sCQfSPKnVfWYJH+WUXi7ay+1fby7/9+Jcz1nTztPeMf4v58cP/8RSc7O6P38yMR7vKmqHt7d/7hrxXjU+6Ik/66qHp/ktIxGtnf58BTPf0aSXx4H6h0Zh9eqesF4+/dmNIp7+UQtD83ow4CbpnyNABykjMACcMB09/9I8u8yGnmbvBnQUkbTSXc5NPf3tYnHX9/DU+yYeLx+vO+GjILoMbv+ZDSF9Zbxfl9e4Vwb8sCguj6jab3TWn6O9Rl9eLzrtf5+kt9NctmyY35sotYTkvz8xPZ7k2Qi8K7r7r9M8t0ZjU4fneTmqjoue3bvxOPl7/+efH35849r/v2Jmo9NcnySrZMHVtXZSd4/XnxPRqPVk8+7Ui8mbc/Ee1pVR1XVt09s35Dkrt30+4opXx8ABzEBFoADqruvSnJ1kgsmVm/JKPCkqh6diSmh++gF43M8JqORug+O//zgeEpwquoZSf57HjiyudwHk5xZVY8bH3d6kqMymm67J9vzzZD7gSTnTUxX/oUkf9HduwL5zUlenuQJVfWz43XXJPmlqlpXVQ9J8ie5f4B9gKp6VZKXd/e7k/xikk8n+b691DmN7Rldz7u3cHtNkn9bVY8aL784o/dvuacneW93/26SjyV5dkaBc6Xn3t2HBX+W5PlVtX78/lyV+/996ST3jkf7U1VHZfRhxd4CPQADIMACsBp+IckXJ5Z/O8mjqqozGim7drdH7d1hVfXfkvxpRtd3fra7/zqj617/sKr+Ksm/z+jGT3sc7Rsf93MZXS97S0Y3T3rWFFNzr80o+P52kt/LKHDdXFW3ZjQy+ZPLnuerGQXv146n1f5CRtOCP5VR0P5URteU7sn/meSYcZ0fS/KFJH+4l2Om8XcZhexPLxvlvJ/u3pzk1Un+n6r670l+IslzJkZpd3lzktOq6lMZTVu+Lcl3j2/utNxNSR5XVX+8bP0rk9yX5K+SfCLJn3b3N/YZ3yDsnCQvHNeyOaNwf8O0LxqAg9e6paVpL+MBAACA1WMEFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQVhY7QIejC1bth3Ut07etOnwbN16z2qXwQGi3/NFv+eLfs8X/Z4v+j1f9Ht4Fhc37vY7yI3AzsDCwkrfyc5apN/zRb/ni37PF/2eL/o9X/R77RBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGISF1S5gLXrWS96zT/tffuHpM6oEAABg7TACCwAAwCAIsAAAAAyCAAsAAMAgCLAAAAAMggALAADAIAiwAAAADIIACwAAwCAIsAAAAAyCAAsAAMAgCLAAAAAMggALAADAIAiwAAAADMLCrE5cVYckeVuSo5PsSPKzSbYnuTLJUpJbkpzf3Tur6hVJzh5vv6C7b55VXQAAAAzTLEdgn5FkobufkuQ3k/zHJK9PclF3n5JkXZJzqurYJKcmOSHJc5NcOsOaAAAAGKhZBtjPJlmoqvVJjkjy9STHJbluvP3qJGckOTnJ5u5e6u4vjY9ZnGFdAAAADNDMphAn+XJG04c/k+QRSZ6Z5GndvTTevi3JkRmF2zsmjtu1fstKJ9606fAsLGyYQcmrY3Fx42qXwLdID+eLfs8X/Z4v+j1f9Hu+6PfaMMsA+0tJrunul1XVUUmuTXLoxPaNSe5Mcvf48fL1K9q69Z79XOrq2rJl22qXwLdgcXGjHs4R/Z4v+j1f9Hu+6Pd80e/hWekDh1lOId6a5K7x439MckiST1TVaeN1ZyW5PskNSc6sqvVV9Zgk67v79hnWBQAAwADNcgT2t5JcXlXXZzTy+mtJPpbksqo6NMmtSa7q7h3jfW7MKFCfP8OaAAAAGKiZBdju/nKSf7ObTafuZt+Lk1w8q1oAAAAYvllOIQYAAID9RoAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZhYVYnrqoXJHnBePGwJMckOS3JG5JsT7K5u19ZVeuTvCnJk5J8LckLu/tzs6oLAACAYZpZgO3uK5NcmSRVdWmSy5O8OcmPJPl8kvdX1bFJjk5yWHefVFUnJrkkyTmzqgsAAIBhmvkU4qo6PskTk/xhkod0923dvZTkmiQ/kOTkJB9Iku6+Kcnxs64JAACA4ZnZCOyEX0vyyiRHJLl7Yv22JI8br79rYv2Oqlro7u0rnXDTpsOzsLBhFrWuisXFjatdAt8iPZwv+j1f9Hu+6Pd80e/5ot9rw0wDbFV9W5J/2t1/XlVHJJn8W7MxyZ1JDl+2fv2ewmuSbN16z36vdTVt2bJttUvgW7C4uFEP54h+zxf9ni/6PV/0e77o9/Cs9IHDrKcQPy3JnyVJd9+d5L6qenxVrUtyZpLrk9yQ5BlJMr4G9lMzrgkAAIABmvUU4srohk27vDjJHyTZkNFdiD9aVX+Z5OlV9ZEk65KcO+OaAAAAGKCZBtjufu2y5ZuSnLhs3c6Mgi0AAACsaOZ3IQYAAID9QYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZhYZYnr6qXJfmhJIcmeVOS65JcmWQpyS1Jzu/unVX1iiRnJ9me5ILuvnmWdQEAADA8MxuBrarTkjwlyVOTnJrkqCSvT3JRd5+SZF2Sc6rq2PH2E5I8N8mls6oJAACA4ZrlFOIzk3wqybuSvDfJ+5Icl9EobJJcneSMJCcn2dzdS939pSQLVbU4w7oAAAAYoFlOIX5EkscmeWaS707yJ0nWd/fSePu2JEcmOSLJHRPH7Vq/ZaUTb9p0eBYWNsyi5lWxuLhxtUvgW6SH80W/54t+zxf9ni/6PV/0e22YZYC9I8lnuvu+JF1VX81oGvEuG5PcmeTu8ePl61e0des9+7nU1bVly7bVLoFvweLiRj2cI/o9X/R7vuj3fNHv+aLfw7PSBw6znEL84ST/uqrWVdWjkzwsyQfH18YmyVlJrk9yQ5Izq2p9VT0mo1Ha22dYFwAAAAM0sxHY7n5fVT0tyc0ZBeXzk3whyWVVdWiSW5Nc1d07qur6JDdO7AcAAAD3M9Ov0enuX93N6lN3s9/FSS6eZS0AAAAM2yynEAMAAMB+I8ACAAAwCAIsAAAAgyDAAgAAMAgCLAAAAIMw07sQM53zXnXtPu1/+YWnz6gSAACAg5cRWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABmFhmp2q6k+TXJHkPd1937Qnr6pPJLlrvPiFJG9J8oYk25Ns7u5XVtX6JG9K8qQkX0vywu7+3PQvAQAAgHkwVYBN8uokP53ktVX1/iRXdvdf7umAqjosSbr7tIl1n0zyI0k+n+T9VXVskqOTHNbdJ1XViUkuSXLOPr4OAAAA1ripAmx3X5fkuqp6aJIfTfLOqro7yVuT/G53f203hz0pyeFVtXn8PBcneUh335YkVXVNkh9I8qgkHxg/z01Vdfy39pIAAABYi6YdgU1VnZbkp5L8YJKrk/xhkqcn+ZMkZ+7mkHuSvC6jkPs942PunNi+LcnjkhyRb04zTpIdVbXQ3dtXqmXTpsOzsLBh2tLXnMXFjatdAsvoyXzR7/mi3/NFv+eLfs8X/V4bpr0G9osZTfu9IsnPd/e94/UfSvKxFQ77bJLPdfdSks9W1V1JHj6xfWNGgfbw8eNd1u8pvCbJ1q33TFP2mrVly7bVLoEJi4sb9WSO6Pd80e/5ot/zRb/ni34Pz0ofOEx7F+LTk/x4d789SarqCUnS3Tu7+9gVjjkvo+tZU1WPziiofqWqHl9V6zIatb0+yQ1JnjHe78Qkn5qyJgAAAObItAH27IyvU03yyCTvraoX7eWY30vybVX14ST/JaNA+8Ikf5Dk5iSf6O6PJnlXkq9W1UeS/FaSX9q3lwAAAMA8mPYa2BclOSFJuvuLVXVcko8m+c8rHTD+up2f2M2mE5fttzPJi6esAwAAgDk17QjsIRl9R+su9yVZ2v/lAAAAwO5NOwL77iTXVtV/zSi4/khGdx8GAACAA2KqEdjufmmSNyapJI9P8sbuvmiWhQEAAMCkaacQJ8mtSf5rRqOx/1hVT5tNSQAAAPBA034P7KVJnpXktonVSxl9vQ4AAADM3LTXwP5gkurue2dZDAAAAKxk2inEn0+ybpaFAAAAwJ5MOwL7j0n+uqo+kuSru1Z293kzqQoAAACWmTbAfmD8BwAAAFbFVAG2u99WVUcneWKSa5Ic1d1fmGVhAAAAMGmqa2Cr6seTvDfJG5I8PMmNVfW8WRYGAAAAk6a9idNLkzwlybbu/ock35/kZTOrCgAAAJaZNsDu6O5tuxa6+++S7JxNSQAAAPBA097E6dNV9fNJDqmqY5L8XJJPzq4sAAAAuL9pR2DPT/KdSe5NcnmSuzMKsQAAAHBATHsX4q9kdM2r614BAABYFVMF2KramWRp2eq/6+7v2v8lAQAAwANNOwL7janGVXVIkmcnOWlWRQEAAMBy014D+w3d/fXu/qMkp8+gHgAAANitaacQ//TE4rokT0zy9ZlUBAAAALsx7dfo/KuJx0tJbk/y4/u/HAAAANi9aa+BPXfWhQAAAMCeTDuF+At54F2Ik9F04qXuftx+rQoAAACWmXYK8TuSfC3JZRld+/qTSZ6c5NdnVBcAAADcz7QB9szuPn5i+Q1V9fHu/uIsigIAAIDlpv0anXVVdcauhap6ZpK7Z1MSAAAAPNC0I7AvSvL2qvonGV0L+5kkz59ZVQAAALDMtHch/niSJ1bVI5Lc291fmW1ZAAAAcH/T3oX4sUnemuToJKdU1XuTnNfdfzO70ljJea+6dup9L7/w9BlWAgAAcOBMO4X4LUlem+TVSf4+yf+d5O1Jnrang6rqkUk+nuTpSbYnuTKjKci3JDm/u3dW1SuSnD3efkF337zvLwMAAIC1btqbOD2iuzcnSXcvdfdlSY7Y0wFVdUhGwffe8arXJ7mou0/J6Ptjz6mqY5OcmuSEJM9Ncum+vwQAAADmwbQB9t6q+q6MRk9TVSdn9L2we/K6JG9O8rfj5eOSXDd+fHWSM5KcnGTzOBR/KclCVS3uQ/0AAADMiWmnEP9SkvcleXxVfTLJw5P82Eo7V9ULkmzp7muq6mXj1eu6e2n8eFuSIzMaxb1j4tBd67fsqZhNmw7PwsKGKUufb4uLG1e7hLngfZ4v+j1f9Hu+6Pd80e/5ot9rw7QB9juSPDnJ9ybZkOQz3X3fHvY/L8nS+Ltjj8noetlHTmzfmOTOjL5LduNu1u/R1q33TFk2W7ZsW+0S1rzFxY3e5zmi3/NFv+eLfs8X/Z4v+j08K33gMG2AfU13vz/Jp6fZubu/cXOnqvpQkhcneW1VndbdH0pyVpI/T/K5JK+pqtcl+a4k67v79ilrAgAAYI5MG2Bvq6rLk3w037wpU7r77fvwXC9JcllVHZrk1iRXdfeOqro+yY0ZXY97/j6cDwAAgDmyxwBbVd/Z3f9fRteprkty4sTmpYymBu9Rd582sXjqbrZfnOTivZcKAADAPNvbCOx7kxzb3edW1Uu6+5IDURQAAAAst7ev0Vk38fgnZ1kIAAAA7MneAuzSxON1K+4FAAAAM7a3ADtpae+7AAAAwGzs7RrYJ1bV58ePv3Pi8bokS939uNmVBgAAAN+0twD7vQekCgAAANiLPQbY7v7igSoEAAAA9mRfroEFAACAVSPAAgAAMAgCLAAAAIMgwAIAADAIAiwAAACDIMACAAAwCAIsAAAAgyDAAgAAMAgCLAAAAIMgwAIAADAIAiwAAACDIMACAAAwCAIsAAAAgyDAAgAAMAgCLAAAAIMgwAIAADAIAiwAAACDIMACAAAwCAIsAAAAgyDAAgAAMAgCLAAAAIOwMKsTV9WGJJclqSQ7kpybZF2SK5MsJbklyfndvbOqXpHk7CTbk1zQ3TfPqi4AAACGaZYjsM9Kku5+apLfSPL68Z+LuvuUjMLsOVV1bJJTk5yQ5LlJLp1hTQAAAAzUzAJsd787yYvGi49N8vdJjkty3Xjd1UnOSHJyks3dvdTdX0qyUFWLs6oLAACAYZrZFOIk6e7tVfW2JD+c5EeTPLO7l8abtyU5MskRSe6YOGzX+i0rnXfTpsOzsLBhNkWvMYuLG1e7hLngfZ4v+j1f9Hu+6Pd80e/5ot9rw0wDbJJ09/Or6qVJPprkoRObNia5M8nd48fL169o69Z79neZa9aWLdtWu4Q1b3Fxo/d5juj3fNHv+aLf80W/54t+D89KHzjMbApxVf1UVb1svHhPkp1JPlZVp43XnZXk+iQ3JDmzqtZX1WOSrO/u22dVFwAAAMM0yxHYP05yRVX9RZJDklyQ5NYkl1XVoePHV3X3jqq6PsmNGQXq82dYEwAAAAM1swDb3V9J8m92s+nU3ex7cZKLZ1ULAAAAwzfLr9EBAACA/UaABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZhYbULYLbOe9W1+7T/5ReePqNKAAAAvjVGYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABiEhVmctKoOSXJ5kqOTPCTJf0jy10muTLKU5JYk53f3zqp6RZKzk2xPckF33zyLmgAAABi2WY3APi/JHd19SpKzkvxOktcnuWi8bl2Sc6rq2CSnJjkhyXOTXDqjegAAABi4WQXYP0ry8onl7UmOS3LdePnqJGckOTnJ5u5e6u4vJVmoqsUZ1QQAAMCAzWQKcXd/OUmqamOSq5JclOR13b003mVbkiOTHJHkjolDd63fsqfzb9p0eBYWNuzvskmyuLhxtUsYJO/bfNHv+aLf80W/54t+zxf9XhtmEmCTpKqOSvKuJG/q7ndU1WsmNm9McmeSu8ePl6/fo61b79mfpTJhy5Ztq13C4CwubvS+zRH9ni/6PV/0e77o93zR7+FZ6QOHmUwhrqrvSLI5yUu7+/Lx6k9U1Wnjx2cluT7JDUnOrKr1VfWYJOu7+/ZZ1AQAAMCwzWoE9teSbEry8qradS3sLyZ5Y1UdmuTWJFd1946quj7JjRmF6fNnVA8AAAADN6trYH8xo8C63Km72ffiJBfPog4AAADWjlndhRgAAAD2KwEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBWFjtAji4nPeqa6fe9/ILT59hJQAAAPdnBBYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGISFWZ68qk5I8uruPq2qnpDkyiRLSW5Jcn5376yqVyQ5O8n2JBd0982zrAkAAIBhmtkIbFX9apK3JjlsvOr1SS7q7lOSrEtyTlUdm+TUJCckeW6SS2dVDwAAAMM2yynEtyV5zsTycUmuGz++OskZSU5Osrm7l7r7S0kWqmpxhjUBAAAwUDObQtzd76yqoydWrevupfHjbUmOTHJEkjsm9tm1fsuezr1p0+FZWNiwH6vlwTjvVdfu0/7vveScGVWy+hYXN652CRxA+j1f9Hu+6Pd80e/5ot9rw0yvgV1m58TjjUnuTHL3+PHy9Xu0des9+7cyDogtW7atdgkzsbi4cc2+Nh5Iv+eLfs8X/Z4v+j1f9Ht4VvrA4UDehfgTVXXa+PFZSa5PckOSM6tqfVU9Jsn67r79ANYEAADAQBzIEdiXJLmsqg5NcmuSq7p7R1Vdn+TGjML0+QewHgAAAAZkpgG2u/8myYnjx5/N6I7Dy/e5OMnFs6wDAACA4TuQU4gBAADgQRNgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYBAEWAACAQRBgAQAAGAQBFgAAgEEQYAEAABgEARYAAIBBEGABAAAYhIXVLoD5cd6rrt2n/S+/8PQZVQIAAAyREVgAAAAGQYAFAABgEARYAAAABkGABQAAYBAEWAAAAAZBgAUAAGAQBFgAAAAGwffActDal++N9Z2xAACw9hmBBQAAYBAEWAAAAAbBFGLWhH2ZbpyYcgwAAENkBBYAAIBBEGABAAAYBFOImUvucAwAAMNjBBYAAIBBMAIL+9ksbyjlZlUAAMyzgyLAVtX6JG9K8qQkX0vywu7+3OpWBQfGvoZSAACYVwdFgE3y7CSHdfdJVXVikkuSnLPKNQGrxEgzAAC7c7AE2JOTfCBJuvumqjp+leuBbxjyCOmQa58V78nu+RDg4OZDHQD2ZJ7+nVi3tLS02jWkqt6a5J3dffV4+UtJHtfd21e3MgAAAA4WB8tdiO9OsnFieb3wCgAAwKSDJcDekOQZSTK+BvZTq1sOAAAAB5uD5RrYdyV5elV9JMm6JOeucj0AAAAcZA6Ka2ABAABgbw6WKcQAAACwRwIsAAAAg3CwXAO7JlTV+iRvSvKkJF9L8sLu/tzqVsWDUVUnJHl1d59WVU9IcmWSpSS3JDm/u3dW1SuSnJ1ke5ILuvvmfdn3gL8oHqCqDklyeZKjkzwkyX9I8tfR7zWpqjYkuSxJJdmR0f0W1kW/16yqemSSjyd5ekb9uTJ6vSZV1SeS3DVe/EKStyR5Q0aJjTtIAAADuElEQVS92tzdr1zp97TxDUSn2veAvihWVFUvS/JDSQ7NqE/Xxc/33DACu389O8lh3X1SkguTXLLK9fAgVNWvJnlrksPGq16f5KLuPiWjX3bPqapjk5ya5IQkz01y6YPYl9X3vCR3jPt1VpLfiX6vZc9Kku5+apLfyKh/+r1GjT+gekuSe8er9HqNqqrDkqS7Txv/OTfJm5P8RJKTk5ww7t9Kv6fty76ssqo6LclTkjw1o5/Jo+Lne64IsPvXyUk+kCTdfVOS41e3HB6k25I8Z2L5uIw+2UuSq5OckVGvN3f3Und/KclCVS3u476svj9K8vKJ5e3R7zWru9+d5EXjxccm+fvo91r2uoyCyd+Ol/V67XpSksOranNVXVtVT0vykO6+rbuXklyT5Aeym9/TquqIafc94K+KlZyZ0VduvivJe5O8L36+54oAu38dkW9OX0mSHVVlmvbAdPc7k3x9YtW68T9qSbItyZF5YK93rd+XfVll3f3l7t5WVRuTXJXkouj3mtbd26vqbUl+O6Oe6/caVFUvSLKlu6+ZWK3Xa9c9GX1gcWaSFye5Yrxul5V6uGO87u5p9vU73UHjERl9oPBjGfX7D5Ks9/M9PwTY/evuJBsnltd39/bVKob9ZufE441J7swDe71r/b7sy0Ggqo5K8udJfr+73xH9XvO6+/lJvjej62EfOrFJv9eO8zL6fvkPJTkmyduTPHJiu16vLZ9N8n+NR88+m1EQefjE9pV6uH4361bc1+90B407klzT3fd1dyf5au4fNv18r3EC7P51Q5JnJMn4hgCfWt1y2E8+Mb7eIhldJ3l9Rr0+s6rWV9VjMvqH7fZ93JdVVlXfkWRzkpd29+Xj1fq9RlXVT41v/JGMRmd2JvmYfq893f207j61u09L8skkP53kar1es87L+BrVqnp0ksOTfKWqHl9V6zIamd3Vw/v9ntbddye5b5p9D+xLYg8+nORfV9W6cb8fluSDfr7nh6kQ+9e7MvrE9yMZXRR+7irXw/7xkiSXVdWhSW5NclV376iq65PcmNEHQec/iH1Zfb+WZFOSl1fVrmthfzHJG/V7TfrjJFdU1V8kOSTJBRn1zc/3fPD/8rXr95JcWVUfzujOsudl9AHVHyTZkNG1jR+tqr/M7n9Pe/E+7Msq6+73ja9zvjnf/Fn8Qvx8z411S0tLe98LAAAAVpkpxAAAAAyCAAsAAMAgCLAAAAAMggALAADAIAiwAAAADIIACwAAwCAIsAAAAAyCAAsAAMAg/P/Qj6k57rspGwAAAABJRU5ErkJggg==\n", 599 | "text/plain": [ 600 | "" 601 | ] 602 | }, 603 | "metadata": { 604 | "needs_background": "light" 605 | }, 606 | "output_type": "display_data" 607 | } 608 | ], 609 | "source": [ 610 | "# Longitud en tokens de cada noticia\n", 611 | "tokens_numbers = news.swifter.apply(lambda row: len(row['content']), axis = 1)\n", 612 | "\n", 613 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 614 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 70, ax=ax)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 21, 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "Número medio de tokens por artículo: 2260\n", 627 | "Desviación estándar de tokens por artículo: 2892\n", 628 | "Mediana de tokens por artículo: 1592\n", 629 | "Ventana de tokens escogida: 8045 - Cubre el 96.92369157011586% del dataset\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "print(\"Número medio de tokens por artículo: {}\".format(int(np.mean(tokens_numbers))))\n", 635 | "print(\"Desviación estándar de tokens por artículo: {}\".format(int(np.std(tokens_numbers))))\n", 636 | "print(\"Mediana de tokens por artículo: {}\".format(int(np.median(tokens_numbers))))\n", 637 | "\n", 638 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 639 | "max_tokens = int(max_tokens)\n", 640 | "\n", 641 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 642 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 22, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "n_tokens = max_tokens\n", 652 | "news['content'] = news.apply(lambda r: pad_array(r['content'], MAX_LEN_CONTENT) , axis=1) #Use necessary for the model" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 23, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "name": "stderr", 662 | "output_type": "stream", 663 | "text": [ 664 | "\r", 665 | "Pandas Apply: 0%| | 0/2503 [00:00" 690 | ] 691 | }, 692 | "metadata": { 693 | "needs_background": "light" 694 | }, 695 | "output_type": "display_data" 696 | } 697 | ], 698 | "source": [ 699 | "# Longitud en tokens de cada noticia\n", 700 | "tokens_numbers = news.apply(lambda row: len(row['title']), axis = 1)\n", 701 | "\n", 702 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 703 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 30, ax=ax)\n", 704 | "\n", 705 | "print(\"Número medio de tokens por título: {}\".format(int(np.mean(tokens_numbers))))\n", 706 | "print(\"Desviación estándar de tokens por título: {}\".format(int(np.std(tokens_numbers))))\n", 707 | "print(\"Mediana de tokens por título: {}\".format(int(np.median(tokens_numbers))))\n", 708 | "\n", 709 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 710 | "max_tokens = int(max_tokens)\n", 711 | "\n", 712 | "n_tokens = max_tokens\n", 713 | "\n", 714 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 715 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))\n", 716 | "\n", 717 | "#Use necessary for the model\n", 718 | "news['title'] = news.progress_apply(lambda r: pad_array(r['title'], MAX_LEN_TITLE) , axis=1)" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "### Label a categorical" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 24, 731 | "metadata": {}, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/html": [ 736 | "
\n", 737 | "\n", 750 | "\n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | "
titletypecontentone_hot_label
0[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...fake[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...[0, 1, 0]
1[5977, 4211, 7726, 11538, 9311, 8469, 4211, 12...fake[5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 1, 0]
2[7203, 15775, 15775, 3581, 7726, 17919, 4211, ...fake[3708, 4211, 5760, 7726, 15775, 4211, 6869, 42...[0, 1, 0]
3[11969, 4211, 1280, 4883, 4501, 23199, 8469, 2...fake[3708, 11538, 7726, 11538, 73, 17919, 7726, 45...[0, 1, 0]
4[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...fake[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...[0, 1, 0]
\n", 798 | "
" 799 | ], 800 | "text/plain": [ 801 | " title type \\\n", 802 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... fake \n", 803 | "1 [5977, 4211, 7726, 11538, 9311, 8469, 4211, 12... fake \n", 804 | "2 [7203, 15775, 15775, 3581, 7726, 17919, 4211, ... fake \n", 805 | "3 [11969, 4211, 1280, 4883, 4501, 23199, 8469, 2... fake \n", 806 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... fake \n", 807 | "\n", 808 | " content one_hot_label \n", 809 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... [0, 1, 0] \n", 810 | "1 [5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 1, 0] \n", 811 | "2 [3708, 4211, 5760, 7726, 15775, 4211, 6869, 42... [0, 1, 0] \n", 812 | "3 [3708, 11538, 7726, 11538, 73, 17919, 7726, 45... [0, 1, 0] \n", 813 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... [0, 1, 0] " 814 | ] 815 | }, 816 | "execution_count": 24, 817 | "metadata": {}, 818 | "output_type": "execute_result" 819 | } 820 | ], 821 | "source": [ 822 | "encoder = LabelBinarizer().fit(list(news['type']))\n", 823 | "news['one_hot_label'] = news.apply(lambda r: encoder.transform([r['type']])[0], axis=1)\n", 824 | "news.head()" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 25, 830 | "metadata": {}, 831 | "outputs": [ 832 | { 833 | "name": "stdout", 834 | "output_type": "stream", 835 | "text": [ 836 | "LABELS\n", 837 | "bias [1 0 0]\n", 838 | "fake [0 1 0]\n", 839 | "true [0 0 1]\n" 840 | ] 841 | } 842 | ], 843 | "source": [ 844 | "print(\"LABELS\")\n", 845 | "enc = encoder.transform(encoder.classes_)\n", 846 | "for x, y in zip(encoder.classes_, enc):\n", 847 | " print(x, y)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 26, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "### Guardar el dataset\n", 857 | "news.to_pickle('../data/news_getting_real.pickle')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": {}, 864 | "outputs": [], 865 | "source": [] 866 | } 867 | ], 868 | "metadata": { 869 | "kernelspec": { 870 | "display_name": "Python 3", 871 | "language": "python", 872 | "name": "python3" 873 | }, 874 | "language_info": { 875 | "codemirror_mode": { 876 | "name": "ipython", 877 | "version": 3 878 | }, 879 | "file_extension": ".py", 880 | "mimetype": "text/x-python", 881 | "name": "python", 882 | "nbconvert_exporter": "python", 883 | "pygments_lexer": "ipython3", 884 | "version": "3.6.7" 885 | } 886 | }, 887 | "nbformat": 4, 888 | "nbformat_minor": 2 889 | } 890 | --------------------------------------------------------------------------------