├── src ├── data │ ├── subsample.py │ └── extractor.py └── bert_class │ └── bert_class.py ├── README.md ├── .gitignore └── notebooks ├── Test_Colab_Categorical.ipynb ├── Processing_test_dataset.ipynb └── GettingRealAboutFake.ipynb /src/data/subsample.py: -------------------------------------------------------------------------------- 1 | """ 2 | Módulo para procesar un dataset grande (varios GB) con pocos recursos de memoria. 3 | 4 | Autor: Álvaro Ibrain 5 | Fecha: 4 de marzo de 2018 6 | """ 7 | 8 | import sys 9 | import random 10 | import csv 11 | csv.field_size_limit(99999999999) 12 | 13 | 14 | def sample_file(input_data, output_data, percent = 0.2): 15 | """ 16 | Función para extraer muestras aleatorias de un csv grande. Sampleando de esta 17 | manera sólo se carga en memoria una fila cada vez, lo que permite que sin muchos 18 | recursos se pueda procesar un dataser csv muy grande. 19 | 20 | :param (string) input_data: Path al fichero de lectura 21 | :param (string) output_data: Path al fichero de escritura 22 | :param (float) percent: Porcentaje del fichero a analizar. Número entre (0, 1] 23 | """ 24 | with open(input_data) as file: 25 | with open(output_data, 'w+') as out: 26 | header = True 27 | reader = csv.reader(file) 28 | writer = csv.writer(out) 29 | for r in reader: 30 | if header: 31 | #Keep header 32 | writer.writerow(r) 33 | header = False 34 | else: 35 | if random.random() < prob: 36 | writer.writerow(r) 37 | 38 | 39 | def main(): 40 | inp_file = "" 41 | out_file = "" 42 | percentage = -1 43 | 44 | if len(sys.argv) < 3: 45 | sys.stderr.write("Error. Argumentos necesarios: "+ 46 | " [percentage] ") 47 | return 48 | if len(sys.argv) <= 3: 49 | inp_file = sys.argv[1] 50 | out_file = sys.argv[2] 51 | sample_file(inp_file, out_file) 52 | 53 | if len(sys.argv) == 4: 54 | percentage = sys.argv[3] 55 | sample_file(inp_file, out_file, percent = percentage) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /src/data/extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Módulo para extraer un csv con los tipos de noticias infrarrepresentados 3 | 4 | Autor: Álvaro Ibrain 5 | Fecha: 12 de marzo de 2018 6 | """ 7 | 8 | import sys 9 | import random 10 | import csv 11 | csv.field_size_limit(99999999999) 12 | 13 | 14 | def extract(input_data, output_data, types = ['clickbait', 'bias'], offset_col = 3): 15 | """ 16 | Función para extraer solo los articulos de un tipo del dataset Fake News Corpus 17 | 18 | :param (string) input_data: Path al fichero de lectura 19 | :param (string) output_data: Path al fichero de escritura 20 | :param (float) types: Tipos de noticias que se desean seleccionar 21 | """ 22 | with open(input_data) as file: 23 | with open(output_data, 'w+') as out: 24 | header = True 25 | reader = csv.reader(file) 26 | writer = csv.writer(out) 27 | for r in reader: 28 | if header: 29 | #Keep header 30 | writer.writerow(r) 31 | header = False 32 | else: 33 | #Select only if it is in types 34 | try: 35 | if r[offset_col] in types: 36 | writer.writerow(r) 37 | except: 38 | pass #If the column is empty 39 | 40 | def main(): 41 | inp_file = "" 42 | out_file = "" 43 | 44 | if len(sys.argv) < 3: 45 | sys.stderr.write("Error. Argumentos necesarios: "+ 46 | " ") 47 | return 48 | 49 | types = ['clickbait', 'bias'] 50 | if len(sys.argv) == 4: 51 | types = sys.argv[3].split(',') 52 | 53 | inp_file = sys.argv[1] 54 | out_file = sys.argv[2] 55 | print("Processing...") 56 | print("Extracting news of type:") 57 | print(types) 58 | extract(input_data =inp_file, output_data = out_file, types = types) 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fake news detection using deep learning 2 | ## Final master thesis project 3 | 4 | This repository is focused on finding fake news using deep learning 5 | 6 | There are multiple methods focused on achieving this goal, but the objective 7 | of this work is discriminating the fake ones by only looking at the text. No graphs, 8 | no social network analysis neither images. 9 | 10 | In this work three deep learning architectures are proposed and later tested over two datasets (Fake news corpus and TI-CNN), obtaining state of the art results. 11 | 12 | 1. **LSTM Based architecture**: $91\%$ accuracy (TI-CNN) || $76\%$ accuracy (FNC) 13 | 2. **CNN Based architecture**: $97\%$ accuracy (TI-CNN) || $82\%$ accuracy (FNC) 14 | 3. **BERT Based architecture**: $97\%$ accuracy (TI-CNN) || $76\%$ accuracy (FNC) 15 | 16 | This repository contains several Python notebooks with the developed code 17 | 18 | ### Data sources 19 | * Fake News Corpus: https://github.com/several27/FakeNewsCorpus 20 | * Getting Real About Fake News: https://www.kaggle.com/mrisdal/fake-news 21 | * Fake News Detection: https://www.kaggle.com/jruvika/fake-news-detection 22 | * News Dataset from TI-CNN: https://arxiv.org/abs/1806.00749 23 | 24 | ### Folder structure 25 | * **data**: This directory must be created with the necessary data for scripts to work. 26 | (Not uploaded to GH due to filesize restrictions). 27 | - GoogleNews-vectors-negative300.bin.gz: Word2Vec news trained model weights 28 | - Other_datasets 29 | - GettingRealAboutFake/ 30 | - ```fake.csv``` (*Getting Real About Fake News Dataset*) 31 | - ``all_data.csv`` (*TI-CNN dataset*) 32 | - ``real_or_fake.csv`` 33 | - `FakeNewsCorpus.csv` (*Fake News Corpus*) 34 | * **notebooks**: Notebooks for prototyping 35 | * **src**: Code with utils 36 | * **data**: Code to generate datasets / process data 37 | * **bert_class**: Fine-tuned classifier built over Google's BERT to detect fake/true news. 38 | 39 | ### Notebook explanation 40 | * `FakeNewsCorpus.ipynb:` Cleaning and preprocessing the dataset 'Fake News Corpus'. 41 | * `GettingRealAboutFake.ipynb:` Cleaning and preprocessing the dataset 'Getting Real 42 | About Fake News' from Kaggle. 43 | * `TI_CNN-Dataset:` Cleaning and preprocessing the tadaset 'TI-CNN'. 44 | * `Processing_test_dataset.ipynb:` Cleaning and preprocessing the dataset 'True or Fake' from Kaggle. 45 | * `BayesianOpt.pynb:**` Obtaining model hyperparameters using Bayesian Optimization 46 | * `Train-Colab-Categorical.ipynb:` Train DNN to categorize 4 types of news. 47 | * `Train\_Colab_Binary.ipynb:` Train DNN to categorize only **True** or **Fake** 48 | classes. 49 | * `Test\_Colab_Categorical.ipynb:` Testing the previously trained categorical models on TI-CNN. 50 | * `Test\_Colab_Binary.ipynb:` Testing the previously trained binary models on FNC. 51 | * `data\_analysis/Data_analysis-FNC.ipynb:` EDA of the Fake News Corpus. 52 | * `data\_analysis/Data_analysis-TI-CNN.ipynb:` EDA of the TI-CNN Dataset 53 | * `data\_analysis/Data_analysis-Getting-Real.ipynb:` EDA of the Getting Real About 54 | fake news Dataset 55 | 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### JupyterNotebook ### 3 | .ipynb_checkpoints 4 | */.ipynb_checkpoints/* 5 | 6 | # Remove previous ipynb_checkpoints 7 | # git rm -r .ipynb_checkpoints/ 8 | # 9 | 10 | ### LaTeX ### 11 | ## Core latex/pdflatex auxiliary files: 12 | *.aux 13 | *.lof 14 | *.log 15 | *.lot 16 | *.fls 17 | *.out 18 | *.toc 19 | *.fmt 20 | *.fot 21 | *.cb 22 | *.cb2 23 | .*.lb 24 | 25 | ## Intermediate documents: 26 | *.dvi 27 | *.xdv 28 | *-converted-to.* 29 | # these rules might exclude image files for figures etc. 30 | # *.ps 31 | # *.eps 32 | # *.pdf 33 | 34 | ## Generated if empty string is given at "Please type another file name for output:" 35 | .pdf 36 | 37 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 38 | *.bbl 39 | *.bcf 40 | *.blg 41 | *-blx.aux 42 | *-blx.bib 43 | *.run.xml 44 | 45 | ## Build tool auxiliary files: 46 | *.fdb_latexmk 47 | *.synctex 48 | *.synctex(busy) 49 | *.synctex.gz 50 | *.synctex.gz(busy) 51 | *.pdfsync 52 | 53 | ## Build tool directories for auxiliary files 54 | # latexrun 55 | latex.out/ 56 | 57 | ## Auxiliary and intermediate files from other packages: 58 | # algorithms 59 | *.alg 60 | *.loa 61 | 62 | # achemso 63 | acs-*.bib 64 | 65 | # amsthm 66 | *.thm 67 | 68 | # beamer 69 | *.nav 70 | *.pre 71 | *.snm 72 | *.vrb 73 | 74 | # changes 75 | *.soc 76 | 77 | # comment 78 | *.cut 79 | 80 | # cprotect 81 | *.cpt 82 | 83 | # elsarticle (documentclass of Elsevier journals) 84 | *.spl 85 | 86 | # endnotes 87 | *.ent 88 | 89 | # fixme 90 | *.lox 91 | 92 | # feynmf/feynmp 93 | *.mf 94 | *.mp 95 | *.t[1-9] 96 | *.t[1-9][0-9] 97 | *.tfm 98 | 99 | #(r)(e)ledmac/(r)(e)ledpar 100 | *.end 101 | *.?end 102 | *.[1-9] 103 | *.[1-9][0-9] 104 | *.[1-9][0-9][0-9] 105 | *.[1-9]R 106 | *.[1-9][0-9]R 107 | *.[1-9][0-9][0-9]R 108 | *.eledsec[1-9] 109 | *.eledsec[1-9]R 110 | *.eledsec[1-9][0-9] 111 | *.eledsec[1-9][0-9]R 112 | *.eledsec[1-9][0-9][0-9] 113 | *.eledsec[1-9][0-9][0-9]R 114 | 115 | # glossaries 116 | *.acn 117 | *.acr 118 | *.glg 119 | *.glo 120 | *.gls 121 | *.glsdefs 122 | 123 | # gnuplottex 124 | *-gnuplottex-* 125 | 126 | # gregoriotex 127 | *.gaux 128 | *.gtex 129 | 130 | # htlatex 131 | *.4ct 132 | *.4tc 133 | *.idv 134 | *.lg 135 | *.trc 136 | *.xref 137 | 138 | # hyperref 139 | *.brf 140 | 141 | # knitr 142 | *-concordance.tex 143 | # TODO Comment the next line if you want to keep your tikz graphics files 144 | *.tikz 145 | *-tikzDictionary 146 | 147 | # listings 148 | *.lol 149 | 150 | # makeidx 151 | *.idx 152 | *.ilg 153 | *.ind 154 | *.ist 155 | 156 | # minitoc 157 | *.maf 158 | *.mlf 159 | *.mlt 160 | *.mtc[0-9]* 161 | *.slf[0-9]* 162 | *.slt[0-9]* 163 | *.stc[0-9]* 164 | 165 | # minted 166 | _minted* 167 | *.pyg 168 | 169 | # morewrites 170 | *.mw 171 | 172 | # nomencl 173 | *.nlg 174 | *.nlo 175 | *.nls 176 | 177 | # pax 178 | *.pax 179 | 180 | # pdfpcnotes 181 | *.pdfpc 182 | 183 | # sagetex 184 | *.sagetex.sage 185 | *.sagetex.py 186 | *.sagetex.scmd 187 | 188 | # scrwfile 189 | *.wrt 190 | 191 | # sympy 192 | *.sout 193 | *.sympy 194 | sympy-plots-for-*.tex/ 195 | 196 | # pdfcomment 197 | *.upa 198 | *.upb 199 | 200 | # pythontex 201 | *.pytxcode 202 | pythontex-files-*/ 203 | 204 | # tcolorbox 205 | *.listing 206 | 207 | # thmtools 208 | *.loe 209 | 210 | # TikZ & PGF 211 | *.dpth 212 | *.md5 213 | *.auxlock 214 | 215 | # todonotes 216 | *.tdo 217 | 218 | # vhistory 219 | *.hst 220 | *.ver 221 | 222 | # easy-todo 223 | *.lod 224 | 225 | # xcolor 226 | *.xcp 227 | 228 | # xmpincl 229 | *.xmpi 230 | 231 | # xindy 232 | *.xdy 233 | 234 | # xypic precompiled matrices 235 | *.xyc 236 | 237 | # endfloat 238 | *.ttt 239 | *.fff 240 | 241 | # Latexian 242 | TSWLatexianTemp* 243 | 244 | ## Editors: 245 | # WinEdt 246 | *.bak 247 | *.sav 248 | 249 | # Texpad 250 | .texpadtmp 251 | 252 | # LyX 253 | *.lyx~ 254 | 255 | # Kile 256 | *.backup 257 | 258 | # KBibTeX 259 | *~[0-9]* 260 | 261 | # auto folder when using emacs and auctex 262 | ./auto/* 263 | *.el 264 | 265 | # expex forward references with \gathertags 266 | *-tags.tex 267 | 268 | # standalone packages 269 | *.sta 270 | 271 | ### LaTeX Patch ### 272 | # glossaries 273 | *.glstex 274 | 275 | ### macOS ### 276 | # General 277 | .DS_Store 278 | .AppleDouble 279 | .LSOverride 280 | 281 | # Icon must end with two \r 282 | Icon 283 | 284 | # Thumbnails 285 | ._* 286 | 287 | # Files that might appear in the root of a volume 288 | .DocumentRevisions-V100 289 | .fseventsd 290 | .Spotlight-V100 291 | .TemporaryItems 292 | .Trashes 293 | .VolumeIcon.icns 294 | .com.apple.timemachine.donotpresent 295 | 296 | # Directories potentially created on remote AFP share 297 | .AppleDB 298 | .AppleDesktop 299 | Network Trash Folder 300 | Temporary Items 301 | .apdisk 302 | 303 | ### Python ### 304 | # Byte-compiled / optimized / DLL files 305 | __pycache__/ 306 | *.py[cod] 307 | *$py.class 308 | 309 | # C extensions 310 | *.so 311 | 312 | # Distribution / packaging 313 | .Python 314 | build/ 315 | develop-eggs/ 316 | dist/ 317 | downloads/ 318 | eggs/ 319 | .eggs/ 320 | lib/ 321 | lib64/ 322 | parts/ 323 | sdist/ 324 | var/ 325 | wheels/ 326 | pip-wheel-metadata/ 327 | share/python-wheels/ 328 | *.egg-info/ 329 | .installed.cfg 330 | *.egg 331 | MANIFEST 332 | 333 | # PyInstaller 334 | # Usually these files are written by a python script from a template 335 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 336 | *.manifest 337 | *.spec 338 | 339 | # Installer logs 340 | pip-log.txt 341 | pip-delete-this-directory.txt 342 | 343 | # Unit test / coverage reports 344 | htmlcov/ 345 | .tox/ 346 | .nox/ 347 | .coverage 348 | .coverage.* 349 | .cache 350 | nosetests.xml 351 | coverage.xml 352 | *.cover 353 | .hypothesis/ 354 | .pytest_cache/ 355 | 356 | # Translations 357 | *.mo 358 | *.pot 359 | 360 | # Django stuff: 361 | local_settings.py 362 | db.sqlite3 363 | 364 | # Flask stuff: 365 | instance/ 366 | .webassets-cache 367 | 368 | # Scrapy stuff: 369 | .scrapy 370 | 371 | # Sphinx documentation 372 | docs/_build/ 373 | 374 | # PyBuilder 375 | target/ 376 | 377 | # Jupyter Notebook 378 | 379 | # IPython 380 | profile_default/ 381 | ipython_config.py 382 | 383 | # pyenv 384 | .python-version 385 | 386 | # celery beat schedule file 387 | celerybeat-schedule 388 | 389 | # SageMath parsed files 390 | *.sage.py 391 | 392 | # Environments 393 | .env 394 | .venv 395 | env/ 396 | venv/ 397 | ENV/ 398 | env.bak/ 399 | venv.bak/ 400 | 401 | # Spyder project settings 402 | .spyderproject 403 | .spyproject 404 | 405 | # Rope project settings 406 | .ropeproject 407 | 408 | # mkdocs documentation 409 | /site 410 | 411 | # mypy 412 | .mypy_cache/ 413 | .dmypy.json 414 | dmypy.json 415 | 416 | # Pyre type checker 417 | .pyre/ 418 | 419 | ### Python Patch ### 420 | .venv/ 421 | 422 | ### Ignored folder 423 | */ignored/* 424 | 425 | ### Data folder 426 | data/* 427 | 428 | 429 | -------------------------------------------------------------------------------- /src/bert_class/bert_class.py: -------------------------------------------------------------------------------- 1 | from bert.run_classifier import convert_examples_to_features, InputExample, input_fn_builder 2 | from bert.tokenization import FullTokenizer 3 | from bert.optimization import create_optimizer 4 | 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow_hub as hub 9 | 10 | 11 | class BertClassifier(object): 12 | 13 | def __init__(self, ): 14 | self.BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1" 15 | self.tokenizer = None 16 | self.model = None 17 | 18 | self.max_seq_len = 128 19 | self.tokenizer = self.__create_tokenizer_from_hub_module() 20 | 21 | def __create_tokenizer_from_hub_module(self): 22 | if self.tokenizer is not None: 23 | return self.tokenizer 24 | 25 | with tf.Graph().as_default(): 26 | bert_module = hub.Module(self.BERT_MODEL_HUB) 27 | tokenization_info = bert_module(signature="tokenization_info", 28 | as_dict=True) 29 | with tf.Session() as sess: 30 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 31 | tokenization_info["do_lower_case"]]) 32 | 33 | tokenizer = FullTokenizer(vocab_file=vocab_file, 34 | do_lower_case=do_lower_case) 35 | 36 | self.tokenizer = tokenizer 37 | return tokenizer 38 | 39 | def __create_features(self, pd_dataset, label_list, 40 | max_seq_len, tokenizer, 41 | data_column, label_column): 42 | input_examples = pd_dataset.apply(lambda x: InputExample(guid=None, 43 | text_a=x[data_column], 44 | text_b=None, 45 | label=x[label_column]), axis=1) 46 | return convert_examples_to_features(input_examples, label_list, 47 | max_seq_len, tokenizer) 48 | 49 | def __create_model(self, input_ids, input_mask, segment_ids, 50 | labels, num_labels, is_predicting=True): 51 | 52 | bert_module = hub.Module( 53 | self.BERT_MODEL_HUB, 54 | trainable=True) 55 | 56 | bert_inputs = dict( 57 | input_ids=input_ids, 58 | input_mask=input_mask, 59 | segment_ids=segment_ids) 60 | 61 | bert_outputs = bert_module( 62 | inputs=bert_inputs, 63 | signature="tokens", 64 | as_dict=True) 65 | 66 | output_layer = bert_outputs["pooled_output"] 67 | 68 | hidden_size = output_layer.shape[-1].value 69 | 70 | # Create our own layer to tune for politeness data. 71 | output_weights = tf.get_variable( 72 | "output_weights", [num_labels, hidden_size], 73 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 74 | 75 | output_bias = tf.get_variable( 76 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 77 | 78 | with tf.variable_scope("loss"): 79 | # Dropout helps prevent overfitting 80 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 81 | 82 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 83 | logits = tf.nn.bias_add(logits, output_bias) 84 | log_probs = tf.nn.log_softmax(logits, axis=-1) 85 | 86 | # Convert labels into one-hot encoding 87 | one_hot_labels = tf.one_hot(labels, depth=num_labels, 88 | dtype=tf.float32) 89 | 90 | predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, 91 | output_type=tf.int32)) 92 | # If we're predicting, we want predicted labels 93 | # and the probabiltiies. 94 | if is_predicting: 95 | return (predicted_labels, log_probs) 96 | 97 | # If we're train/eval, compute loss between predicted and actual label 98 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 99 | loss = tf.reduce_mean(per_example_loss) 100 | return (loss, predicted_labels, log_probs) 101 | 102 | def __model_fn_builder(self, num_labels, learning_rate, 103 | num_train_steps, 104 | num_warmup_steps): 105 | """Returns `model_fn` closure for TPUEstimator.""" 106 | 107 | def model_fn(features, labels, mode, params): 108 | input_ids = features["input_ids"] 109 | input_mask = features["input_mask"] 110 | segment_ids = features["segment_ids"] 111 | label_ids = features["label_ids"] 112 | 113 | is_predicting = (mode == tf.estimator.ModeKeys.PREDICT) 114 | 115 | # TRAIN and EVAL 116 | if not is_predicting: 117 | 118 | (loss, predicted_labels, log_probs) = self.__create_model( 119 | input_ids, 120 | input_mask, segment_ids, label_ids, num_labels, 121 | is_predicting=is_predicting 122 | ) 123 | 124 | train_op = create_optimizer( 125 | loss, learning_rate, num_train_steps, num_warmup_steps, 126 | use_tpu=False) 127 | 128 | # Calculate evaluation metrics. 129 | def metric_fn(label_ids, predicted_labels): 130 | accuracy = tf.metrics.accuracy(label_ids, predicted_labels) 131 | f1_score = tf.contrib.metrics.f1_score( 132 | label_ids, 133 | predicted_labels) 134 | auc = tf.metrics.auc( 135 | label_ids, 136 | predicted_labels) 137 | recall = tf.metrics.recall( 138 | label_ids, 139 | predicted_labels) 140 | precision = tf.metrics.precision( 141 | label_ids, 142 | predicted_labels) 143 | true_pos = tf.metrics.true_positives( 144 | label_ids, 145 | predicted_labels) 146 | true_neg = tf.metrics.true_negatives( 147 | label_ids, 148 | predicted_labels) 149 | false_pos = tf.metrics.false_positives( 150 | label_ids, 151 | predicted_labels) 152 | false_neg = tf.metrics.false_negatives( 153 | label_ids, 154 | predicted_labels) 155 | return { 156 | "eval_accuracy": accuracy, 157 | "f1_score": f1_score, 158 | "auc": auc, 159 | "precision": precision, 160 | "recall": recall, 161 | "true_positives": true_pos, 162 | "true_negatives": true_neg, 163 | "false_positives": false_pos, 164 | "false_negatives": false_neg 165 | } 166 | 167 | eval_metrics = metric_fn(label_ids, predicted_labels) 168 | 169 | if mode == tf.estimator.ModeKeys.TRAIN: 170 | return tf.estimator.EstimatorSpec(mode=mode, 171 | loss=loss, 172 | train_op=train_op) 173 | else: 174 | return tf.estimator.EstimatorSpec(mode=mode, 175 | loss=loss, 176 | eval_metric_ops=eval_metrics) 177 | else: 178 | (predicted_labels, log_probs) = self.__create_model( 179 | input_ids, 180 | input_mask, segment_ids, label_ids, num_labels, 181 | is_predicting=is_predicting 182 | ) 183 | 184 | predictions = { 185 | 'probabilities': log_probs, 186 | 'labels': predicted_labels 187 | } 188 | return tf.estimator.EstimatorSpec(mode, predictions=predictions) 189 | 190 | # Return the actual model function in the closure 191 | return model_fn 192 | 193 | def __create_estimator(self, label_list, lr, batch_size, n_train, n_warm): 194 | model_fn = self.__model_fn_builder( 195 | num_labels=len(label_list), 196 | learning_rate=lr, 197 | num_train_steps=n_train, 198 | num_warmup_steps=n_warm 199 | ) 200 | 201 | estimator = tf.estimator.Estimator(model_fn=model_fn, 202 | params={"batch_size": batch_size}) 203 | 204 | return estimator, model_fn 205 | 206 | def train(self, train, test, data_col, lbl_col, 207 | batch_size=32, 208 | lr=2e-5, 209 | epochs=3, 210 | warmup=0.1): 211 | """ 212 | Trains a BERT based model to classify fake/true news 213 | 214 | Params: 215 | 216 | train -- Pandas dataframe to train with at least (text, type) columns 217 | test -- Pandas dataframe to evaluate with at least (text, type) columns 218 | data_col -- Name of the Text column 219 | lbl_col -- Name of the Type column 220 | batch_size -- Training batch size (default = 32) 221 | epochs -- Epochs to train (default = 3) 222 | warmup -- Warmup percent to train. Defined in BERT paper (default = 0.1) 223 | 224 | Returns: 225 | 226 | Rictionary with evaluation results 227 | """ 228 | label_list = train[lbl_col].unique().tolist() 229 | tokenizer = self.__create_tokenizer_from_hub_module() 230 | 231 | train_features = self.__create_features( 232 | train, label_list, 233 | self.max_seq_len, tokenizer, data_col, lbl_col 234 | ) 235 | test_features = self.__create_features( 236 | test, label_list, 237 | self.max_seq_len, tokenizer, data_col, lbl_col 238 | ) 239 | 240 | num_train_steps = int(len(train_features) / batch_size * epochs) 241 | num_warmup_steps = int(num_train_steps * warmup) 242 | 243 | estimator, model_fn = self.__create_estimator( 244 | label_list, 245 | lr, 246 | batch_size, 247 | num_train_steps, 248 | num_warmup_steps 249 | ) 250 | 251 | train_input_fn = input_fn_builder( 252 | features=train_features, 253 | seq_length=self.max_seq_len, 254 | is_training=True, 255 | drop_remainder=False) 256 | 257 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 258 | 259 | test_input_fn = input_fn_builder( 260 | features=test_features, 261 | seq_length=self.max_seq_len, 262 | is_training=False, 263 | drop_remainder=False) 264 | 265 | result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None) 266 | 267 | self.model = estimator 268 | 269 | return result_dict 270 | 271 | def predict(self, df): 272 | """ 273 | Predicts over a pandas dataframe. 274 | 275 | Params: 276 | df -- Pandas dataframe to train with at least (text, type) columns 277 | 278 | Returns: 279 | 280 | Dictionary with predicted labels and probabilities. 281 | """ 282 | # TODO: REMOVE type column 283 | 284 | tokenizer = self.__create_tokenizer_from_hub_module() 285 | label_list = test_other[LABEL_COLUMN].unique().tolist() 286 | #label_list = [0, 1] 287 | test_features = self.__create_features( 288 | df, label_list, 289 | self.max_seq_len, tokenizer, 'text', 'type' 290 | ) 291 | 292 | preds = [] 293 | if type(self.model) == tf.estimator.Estimator: 294 | # Is trained 295 | input_fn = input_fn_builder( 296 | features=test_features, 297 | seq_length=self.max_seq_len, 298 | is_training=False, 299 | drop_remainder=False) 300 | pred = self.model.predict(input_fn=input_fn) 301 | for p in pred: 302 | preds.append(p) 303 | else: 304 | # Is loaded from a SavedModel 305 | # Format inputs 306 | inpu = { 307 | 'label_ids': np.array([x.label_id for x in test_features]).reshape(-1,), 308 | 'input_ids': np.array([x.input_ids for x in test_features]).reshape(-1, self.max_seq_len), 309 | 'input_mask': np.array([x.input_mask for x in test_features]).reshape(-1, self.max_seq_len), 310 | 'segment_ids': np.array([x.segment_ids for x in test_features]).reshape(-1, self.max_seq_len) 311 | } 312 | preds = self.model(inpu) 313 | 314 | return preds 315 | 316 | def save_model(self, directory): 317 | """Saves model in the specified path""" 318 | def serving_input_fn(): 319 | label_ids = tf.placeholder(tf.int32, [None], name='label_ids') 320 | input_ids = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_ids') 321 | input_mask = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_mask') 322 | segment_ids = tf.placeholder(tf.int32, [None, self.max_seq_len], name='segment_ids') 323 | input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({ 324 | 'label_ids': label_ids, 325 | 'input_ids': input_ids, 326 | 'input_mask': input_mask, 327 | 'segment_ids': segment_ids, 328 | })() 329 | return input_fn 330 | 331 | self.model._export_to_tpu = False # this is important 332 | self.model.export_savedmodel(directory, serving_input_fn) 333 | 334 | def load_model(self, directory): 335 | """ 336 | Restores a previously saved model. 337 | 338 | Params: 339 | 340 | directory -- Folder in which is the .pb file 341 | """ 342 | with tf.Session() as sess: 343 | tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], directory) 344 | self.model = tf.contrib.predictor.from_saved_model(directory) 345 | -------------------------------------------------------------------------------- /notebooks/Test_Colab_Categorical.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Test-Colab-Categorical.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "8D_Equ4vDsT7", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "# Prueba de los modelos\n", 26 | "\n", 27 | "En este notebook se probarán los modelos entrenados (CNN y LSTM) sobre los datasets de prueba *\"Getting Real About Fake News\"* y *\"Fake news detection\"*.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "cM03nLDQDx50", 34 | "colab_type": "text" 35 | }, 36 | "source": [ 37 | "## Imports" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "yTnhOxpiEMtP", 44 | "colab_type": "code", 45 | "colab": { 46 | "base_uri": "https://localhost:8080/", 47 | "height": 209 48 | }, 49 | "outputId": "e00a7543-4089-4493-9d06-8106969ce2ec" 50 | }, 51 | "source": [ 52 | "from google.colab import drive\n", 53 | "drive.mount('/content/drive')" 54 | ], 55 | "execution_count": 1, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n", 61 | "\n", 62 | "Enter your authorization code:\n", 63 | "··········\n", 64 | "Mounted at /content/drive\n" 65 | ], 66 | "name": "stdout" 67 | } 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "-h0PLOjXEUGh", 74 | "colab_type": "code", 75 | "colab": { 76 | "base_uri": "https://localhost:8080/", 77 | "height": 151 78 | }, 79 | "outputId": "a7a38658-cb94-4750-bfc3-8ae7c3199b12" 80 | }, 81 | "source": [ 82 | "!pip install --upgrade pandas" 83 | ], 84 | "execution_count": 2, 85 | "outputs": [ 86 | { 87 | "output_type": "stream", 88 | "text": [ 89 | "Requirement already up-to-date: pandas in /usr/local/lib/python3.6/dist-packages (0.24.2)\n", 90 | "Requirement already satisfied, skipping upgrade: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas) (2018.9)\n", 91 | "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.0 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.5.3)\n", 92 | "Requirement already satisfied, skipping upgrade: numpy>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.16.3)\n", 93 | "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n" 94 | ], 95 | "name": "stdout" 96 | } 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "metadata": { 102 | "id": "sGymmzMYEb9y", 103 | "colab_type": "code", 104 | "colab": { 105 | "base_uri": "https://localhost:8080/", 106 | "height": 51 107 | }, 108 | "outputId": "9154b81a-b7a4-418b-b6ea-7bd3a0e6bca6" 109 | }, 110 | "source": [ 111 | "!ls \"drive/My Drive/Colab Notebooks/data\"" 112 | ], 113 | "execution_count": 3, 114 | "outputs": [ 115 | { 116 | "output_type": "stream", 117 | "text": [ 118 | "data_kaggle_proc.pickle\t\t news_getting_real.pickle\n", 119 | "GoogleNews-vectors-negative300.bin.gz news_proc_12_3_19.pickle\n" 120 | ], 121 | "name": "stdout" 122 | } 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "obqRzUUdEeRy", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "import pandas as pd\n", 134 | "import numpy as np\n", 135 | "\n", 136 | "#Progress bars\n", 137 | "from tqdm import tqdm\n", 138 | "tqdm.pandas()\n", 139 | "\n", 140 | "#Paralelize pandas apply on multiple cores\n", 141 | "#import swifter\n", 142 | "\n", 143 | "from matplotlib import pyplot as plt\n", 144 | "from matplotlib import style\n", 145 | "#Nicer style\n", 146 | "style.use('seaborn')\n", 147 | "import seaborn as sns\n", 148 | "\n", 149 | "from tensorflow import keras as k\n", 150 | "\n", 151 | "from sklearn.model_selection import train_test_split\n", 152 | "\n", 153 | "from gensim.models import KeyedVectors\n", 154 | "\n", 155 | "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report" 156 | ], 157 | "execution_count": 0, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "Phzn0O7dEkxK", 164 | "colab_type": "code", 165 | "colab": {} 166 | }, 167 | "source": [ 168 | "LSTM_PATH = 'drive/My Drive/Colab Notebooks/weights/lstm.h5'\n", 169 | "CONV_PATH = 'drive/My Drive/Colab Notebooks/weights/conv.h5'" 170 | ], 171 | "execution_count": 0, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "id": "u2VhzVjJEvW4", 178 | "colab_type": "text" 179 | }, 180 | "source": [ 181 | "## Prepare data" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "metadata": { 187 | "id": "r6sw8lCcExJ9", 188 | "colab_type": "code", 189 | "colab": {} 190 | }, 191 | "source": [ 192 | "getting_real = pd.read_pickle('drive/My Drive/Colab Notebooks/data/news_getting_real2.pickle')\n", 193 | "true_or_fake = pd.read_pickle('drive/My Drive/Colab Notebooks/data/data_kaggle_proc.pickle')" 194 | ], 195 | "execution_count": 0, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "metadata": { 201 | "id": "lckZjRSZGMSB", 202 | "colab_type": "code", 203 | "colab": { 204 | "base_uri": "https://localhost:8080/", 205 | "height": 111 206 | }, 207 | "outputId": "c2c3e26b-ce7e-4ff0-e4f6-467cd6fa2bb2" 208 | }, 209 | "source": [ 210 | "getting_real.head(2)" 211 | ], 212 | "execution_count": 127, 213 | "outputs": [ 214 | { 215 | "output_type": "execute_result", 216 | "data": { 217 | "text/html": [ 218 | "
\n", 219 | "\n", 232 | "\n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | "
titletypecontentone_hot_label
0[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...fake[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...[0, 1, 0]
1[5977, 4211, 7726, 11538, 9311, 8469, 4211, 12...fake[5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 1, 0]
\n", 259 | "
" 260 | ], 261 | "text/plain": [ 262 | " title type \\\n", 263 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... fake \n", 264 | "1 [5977, 4211, 7726, 11538, 9311, 8469, 4211, 12... fake \n", 265 | "\n", 266 | " content one_hot_label \n", 267 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... [0, 1, 0] \n", 268 | "1 [5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 1, 0] " 269 | ] 270 | }, 271 | "metadata": { 272 | "tags": [] 273 | }, 274 | "execution_count": 127 275 | } 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "metadata": { 281 | "id": "xtUJOJSwIwOE", 282 | "colab_type": "code", 283 | "colab": { 284 | "base_uri": "https://localhost:8080/", 285 | "height": 111 286 | }, 287 | "outputId": "a958b02e-9b69-4b5f-9e57-e009f56e3117" 288 | }, 289 | "source": [ 290 | "true_or_fake.head(2)" 291 | ], 292 | "execution_count": 128, 293 | "outputs": [ 294 | { 295 | "output_type": "execute_result", 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 312 | "\n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | "
HeadlineBodyLabel
0[2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0...[15680, 8429, 28683, 14257, 312, 281, 565, 611...[0, 0, 0, 1]
1[556, 2168, 3912, 5042, 2360, 508, 115, 948, 0...[4949, 20876, 17535, 3912, 46, 3610, 556, 3230...[0, 0, 0, 1]
\n", 336 | "
" 337 | ], 338 | "text/plain": [ 339 | " Headline \\\n", 340 | "0 [2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0... \n", 341 | "1 [556, 2168, 3912, 5042, 2360, 508, 115, 948, 0... \n", 342 | "\n", 343 | " Body Label \n", 344 | "0 [15680, 8429, 28683, 14257, 312, 281, 565, 611... [0, 0, 0, 1] \n", 345 | "1 [4949, 20876, 17535, 3912, 46, 3610, 556, 3230... [0, 0, 0, 1] " 346 | ] 347 | }, 348 | "metadata": { 349 | "tags": [] 350 | }, 351 | "execution_count": 128 352 | } 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "id": "AuK3kmSMQ4i-", 359 | "colab_type": "text" 360 | }, 361 | "source": [ 362 | "### Reprocesar variables objetivo\n", 363 | "Los modelos están preparados para distinguir 4 tipos de noticia (clickbait, bias, fake y true). Sin embargo, estos datasets no cuentan con las mismas categorias.\n", 364 | " \n", 365 | " * Getting Real About FN: *fake, bias y true*\n", 366 | " * Fake or True: *true y fake*\n", 367 | " \n", 368 | "**Nota:** *(ver datasets de procesado)*" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "metadata": { 374 | "id": "cFcp51Z7Fs3j", 375 | "colab_type": "code", 376 | "colab": {} 377 | }, 378 | "source": [ 379 | "fit_getting_real = [np.asarray(getting_real['title'].tolist()), np.asarray(getting_real['content'].tolist())]\n", 380 | "fit_true_or_fake = [np.asarray(true_or_fake['Headline'].tolist()), np.asarray(true_or_fake['Body'].tolist())]\n", 381 | "\n", 382 | "target_or_fake_tgt = np.asarray(true_or_fake['Label'].tolist())[:,2:]\n", 383 | "target_getting_real = np.asarray(getting_real['one_hot_label'].tolist())" 384 | ], 385 | "execution_count": 0, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "id": "ygFIAbW6F45h", 392 | "colab_type": "text" 393 | }, 394 | "source": [ 395 | "## Predict CNN" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "metadata": { 401 | "id": "y2mJM53VF67K", 402 | "colab_type": "code", 403 | "colab": { 404 | "base_uri": "https://localhost:8080/", 405 | "height": 210 406 | }, 407 | "outputId": "b6f434fd-b9b5-4817-8b8b-c0cdb0fa1e33" 408 | }, 409 | "source": [ 410 | "cnn_model = k.models.load_model(CONV_PATH)" 411 | ], 412 | "execution_count": 11, 413 | "outputs": [ 414 | { 415 | "output_type": "stream", 416 | "text": [ 417 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/resource_variable_ops.py:435: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n", 418 | "Instructions for updating:\n", 419 | "Colocations handled automatically by placer.\n", 420 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/layers/core.py:143: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 421 | "Instructions for updating:\n", 422 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 423 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", 424 | "Instructions for updating:\n", 425 | "Use tf.cast instead.\n" 426 | ], 427 | "name": "stdout" 428 | } 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "id": "HaiJxOz8czgH", 435 | "colab_type": "text" 436 | }, 437 | "source": [ 438 | "#### Predicción sobre *Fake or true*" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "metadata": { 444 | "id": "vRCzMJUaF_mj", 445 | "colab_type": "code", 446 | "colab": {} 447 | }, 448 | "source": [ 449 | "pred = cnn_model.predict(fit_true_or_fake)\n", 450 | "\n", 451 | "pr = pred.round()\n", 452 | "#Set Bias to Fake\n", 453 | "mask = np.all((pred.round() == [1.,0.,0.,0.]), axis=1)\n", 454 | "pr[mask] = np.repeat(np.array([0.,0.,1.,0.]).reshape(-1,4), pr[mask].shape[0], axis=0)\n", 455 | "\n", 456 | "#Set Fake\n", 457 | "mask = np.all((pred.round() == [0.,1.,0.,0.]), axis=1)\n", 458 | "pr[mask] = np.repeat(np.array([0.,0.,1.,0.]).reshape(-1,4), pr[mask].shape[0], axis=0)" 459 | ], 460 | "execution_count": 0, 461 | "outputs": [] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "metadata": { 466 | "id": "RmVc6C71WVmt", 467 | "colab_type": "code", 468 | "colab": { 469 | "base_uri": "https://localhost:8080/", 470 | "height": 34 471 | }, 472 | "outputId": "db82b969-2291-4aef-c332-12f6c5dcf4c4" 473 | }, 474 | "source": [ 475 | "accuracy_score(target_or_fake_tgt, pr[:,2:])" 476 | ], 477 | "execution_count": 132, 478 | "outputs": [ 479 | { 480 | "output_type": "execute_result", 481 | "data": { 482 | "text/plain": [ 483 | "0.5484924623115578" 484 | ] 485 | }, 486 | "metadata": { 487 | "tags": [] 488 | }, 489 | "execution_count": 132 490 | } 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "metadata": { 496 | "id": "Fj2gAUk0YJEJ", 497 | "colab_type": "code", 498 | "colab": { 499 | "base_uri": "https://localhost:8080/", 500 | "height": 261 501 | }, 502 | "outputId": "756f3f0a-4e23-40fa-83ec-6045182c7f39" 503 | }, 504 | "source": [ 505 | "report = classification_report(target_or_fake_tgt, pr[:,2:])\n", 506 | "print(report)" 507 | ], 508 | "execution_count": 133, 509 | "outputs": [ 510 | { 511 | "output_type": "stream", 512 | "text": [ 513 | " precision recall f1-score support\n", 514 | "\n", 515 | " 0 0.61 0.65 0.63 2113\n", 516 | " 1 0.59 0.43 0.50 1867\n", 517 | "\n", 518 | " micro avg 0.61 0.55 0.58 3980\n", 519 | " macro avg 0.60 0.54 0.57 3980\n", 520 | "weighted avg 0.60 0.55 0.57 3980\n", 521 | " samples avg 0.55 0.55 0.55 3980\n", 522 | "\n" 523 | ], 524 | "name": "stdout" 525 | }, 526 | { 527 | "output_type": "stream", 528 | "text": [ 529 | "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.\n", 530 | " 'precision', 'predicted', average, warn_for)\n" 531 | ], 532 | "name": "stderr" 533 | } 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "metadata": { 539 | "id": "VsBjJGRCY8tV", 540 | "colab_type": "code", 541 | "colab": { 542 | "base_uri": "https://localhost:8080/", 543 | "height": 398 544 | }, 545 | "outputId": "2bf1f4ed-bfc6-447c-9a8d-e67d25fa4f76" 546 | }, 547 | "source": [ 548 | "matrix = confusion_matrix(target_or_fake_tgt.argmax(axis=1), pr[:,2:].argmax(axis=1))\n", 549 | "print(matrix)\n", 550 | "\n", 551 | "df_cm = pd.DataFrame(matrix, columns=['true', 'fake'], index=['true', 'fake'])\n", 552 | "sns.heatmap(df_cm, annot=True)" 553 | ], 554 | "execution_count": 134, 555 | "outputs": [ 556 | { 557 | "output_type": "stream", 558 | "text": [ 559 | "[[1566 547]\n", 560 | " [1064 803]]\n" 561 | ], 562 | "name": "stdout" 563 | }, 564 | { 565 | "output_type": "execute_result", 566 | "data": { 567 | "text/plain": [ 568 | "" 569 | ] 570 | }, 571 | "metadata": { 572 | "tags": [] 573 | }, 574 | "execution_count": 134 575 | }, 576 | { 577 | "output_type": "display_data", 578 | "data": { 579 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAccAAAFKCAYAAABo0pS0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XtcVHX+x/H3DDOIFxQxxgKz1Exs\nF9113VVRt9y0Vv3tZpkXWNDKLhZq21JeyIrEVqm0UknNtjIJM602flai9TOrFbONMi3vl1RQLgqi\noDIw8/vDbTZOIopnuIyv5z7m8dg5Z+Z8v2e3R28+38sZi9vtdgsAAHhY67oDAADUN4QjAAAGhCMA\nAAaEIwAABoQjAAAGhCMAAAY2bzfgLNjj7SYAr2sc2reuuwCYorws22vXvph/39sva29iTy6e18MR\nAHCJcFXUdQ9Mw7AqAAAGVI4AAHO4XXXdA9MQjgAAc7gIRwAAKnFTOQIAYEDlCACAAZUjAAAGbOUA\nAMB3UTkCAMzBsCoAAAYsyAEAoDK2cgAAYETlCACAgQ9VjqxWBQDAgMoRAGAOH9rnSDgCAMzhQ8Oq\nhCMAwBwsyAEAwIDKEQAAAypHAAAqc7t9Z0EOWzkAADCgcgQAmIM5RwAADJhzBADAgMoRAAADnpAD\nAIABlSMAAAY+NOfIVg4AAAyoHAEA5mBYFQAAAx8aViUcAQDmIBwBAKjMl56tSjgCAMxB5QgAgIEP\nLchhKwcAAAZUjgAAczCsCgCAgQ8NqxKOAABzUDkCAGBA5QgAgAGVIwAABj4UjmzlAADAgMoRAGAO\nH5pzpHIEAJjD5ar5qxo7duxQ//79lZqaWun4Z599pk6dOnnep6ena+jQoRo2bJiWL18uSXI6nYqP\nj1dUVJRiYmJ04MCBatsjHAEA5nC7av46h9LSUiUlJalXr16Vjp8+fVovvfSSQkJCPJ9LSUnRa6+9\npiVLlmjx4sUqKirSypUr1bx5cy1dulRjx47VrFmzqr0VwhEAYA4vVY7+/v5atGiRHA5HpeMLFixQ\ndHS0/P39JUmbNm1SRESEAgMDFRAQoG7duikrK0uZmZkaMGCAJCkyMlJZWVnV3grhCAAwh5cqR5vN\npoCAgErH9u7dq23btmngwIGeYwUFBQoODva8Dw4OVn5+fqXjVqtVFotFZWVl527zQu8dAICzqsWt\nHDNmzNDUqVPP+Rm3231Bx3+KyhEA0KDk5uZqz549evjhhzV8+HDl5eUpJiZGDodDBQUFns/l5eXJ\n4XDI4XAoPz9f0pnFOW632zMUWxUqRwCAOWqpcmzdurU++ugjz/s//OEPSk1N1alTpzR16lQVFxfL\nz89PWVlZSkhI0IkTJ7Rq1Sr17dtXa9euVY8ePaptg3AEAJjjPIYra2LLli1KTk5Wdna2bDabMjIy\nNHfuXAUFBVX6XEBAgOLj4zVmzBhZLBbFxcUpMDBQgwYN0vr16xUVFSV/f3/NnDmz2jYt7vMZfL0I\nzoI93rw8UCsah/at6y4Apigvy/batU8ufaLG320c9aSJPbl4VI4AAHP40LNVCUcAgDl86PFxhCMA\nwBw+VDmylQMAAAMqRwCAOby7vrNWEY4AAHP40LAq4QgAMAfhCACAAatVAQCozO1izhEAgMp8aFiV\nrRwAABhQOQIAzMGcIwAABsw5AgBg4ENzjoQjAMAchCMAAAY8Pg5m2rlnn8ZPelKjRtyq6Nv/XOnc\nodx8TUycKaezXJ2vvUZPTBx/wdfftnOPkp6dJ4vFoms7XK3HHxkvl8ulp2a/qB2796q8vEK3/3mg\nhv7pZrNuCZeY63/fS28uXajvv98uSdq8ZZv++tBjnvO7dmzQwYM5qqiokCTFjh6vnJzDF9RGly7X\nKWXuDLndbn27eavGjZ8iSRo/boyio26VLBYtXvyWFixcbNJd4YJROcIspSdP6e+z56tn91+d9fyz\n8xZp9Mjb1P/63po+K0WHDufpissdF9RG8gsLNfmv9ymicydNTEzWZ5lfqmmTxrLZbFoyf5ZKS0/q\nj8Pu1K2DB8hqZXcPaubTzzZoxMh7qzw/+E8xKikprfH1Zz/7pB762+P691ebtOT1efrjzf20fcdu\njR49Qj16DpTVatXW7z5T2tJ3VFx8vMbtABL7HOucv92u+bOmKeSyVj8753K59NWmLerXp6ckaWp8\nnK643KGKigo9NuM53TlukmLvj9cXX31T6Xt3jJvo+e9Op1PZhw4ronMnSdINvXtow7+/Ubeuv9SU\nv46VJB0pLFKL5oEEI2qV1WrVSwuf1Uerl2vd2nfV74belc5/vGa557/b7XZdffWV+vdXmyRJK99f\noxv/0Ff79h3Q9TcMUUVFhZxOp0pPnlTz5oG1eh/4CZe75q96ptp/G544cUILFizQU089JUnasGGD\niouLvd6xS4XN5qeARo3Oeu5o0TE1bdJEyXNeUuz98Xpu/quSpPfXfKKQVsF6dV6y5sx4XDNfWFjl\n9QuLitU8sJnnfXDLFso/ctTz/m9Tn1Ls/fF6NP4Bk+4Il6rOnTvq3Xde1bq176r/jX1/dv7FlJla\nt/Zd/f2pM8OhUVG36vDhPPW/aZhuu/0uzZqVWOW1L7ssWIVFxzzv8/OO6PIrHHK73Z5qdED/3+tI\nwVEdPJhj7o3h/LldNX/VM9UOq06ePFmRkZH65JNPJElHjx5VfHy8Fi1a5O2+we1WXn6BYobdorAr\nWuuBR57QuvUb9c3m75W16TtlffudJOn06TI5nU49mDBdpSdPavvOPbpj3EQFNGqkaZP/arxkJbOn\nP6qcw7m676GpevPlF9S0aZPaujv4kJ279ipp+nNavjxd7dtfpY9WL1enzr3ldDolSYnTnlVGxlod\nPVqkd1a8ottuG6xePburT5/fqXfkbyVJjQMay263a8VbL6tZsybq2vUX+njNcp08eUr33PdwpfYs\nFkul9z1+103JyY/pz7eMrp0bxtnVwwqwpqoNx5KSEkVHR+vDDz+UJA0aNEhLly71escgBbVooSsu\nd6htm1BJUo/f/Eq79/4gu92ue0eP1KABN1T6/IvPPCnpzLDqa/OeliQ5y8tV9JP5l7z8AoVcFqw9\nPxyQ2+1Wh6vbKvTy1moTern2/HBAEdd1qp2bg0/JyTms5cvTJUl79vyg3Nw8hYVdrn37DkiSUlNX\neD774ar/U8Qvw1VWVqYZM+do2bL3Kl3rllvPBNzHa5brxgHDJEk2m02tglt6PhMadrkO5eRKOrNQ\nZ+HCZ3TLkNFUjXXM7UMLcqodVnW5XNq/f7/nL7VPP/1ULh/6H6A+s9n81Cb0Cv1wIFuS9P32nbq6\nbRt1ua6T/u+zTEln5gufX/Baldew22xq17aNsjZtkSR9tG69+vTorj379uuFhWe+d/LUKe3df1Bh\nV7T26v3Ad0VF3aq/PXSfJKl16xA5HCHKzj6zGrV580B9sPIN2e12SdLvf99TW77bro1ffq0//2eF\ndEhIK01Pmlzl9cvLy7V9+y5PlXnrkIHKWP2JrFarFr00S8NH3KsffjjozVvE+fChOUeL233ujSm7\ndu3S9OnT9e2336px48YKDw9XQkKCOnTocF4NOAv2mNJRX/Xdtp16Zt4i5RzKlc1mkyOklfr16amw\nK1qr//W9tf9gjh59apZcLreu7XC1Hnt4nFwut6Y9M1e79+2Xy+XSA3f9RX17/bbKNnbv/UFPPj1X\nLrdbXa7rpIkT7pXb7daM5+bru+275HQ6NXzIIN3+54G1eOcNS+PQn8+h4b+aNWuq1CUpCmrRXP7+\ndiVNf04hIZfpWHGx3ntvlcaPG6PY2GE6dfKUvv5mix7861T5+fnpxZSZuq7ztfLzs2pa0mytylhb\nZRudO3fU/JRkWa1Wbdz4tR6e+KQG9P+93kh9UZs3b/V8bvKUp/Tlv7+p8jqXuvKybK9du2R6TI2/\n23Rqqok9uXjVhuPFIhzhCwhH+ArC8fxUO+fYs2dPz5BqeXm5SkpKFBYWpjVr1ni9cwCABqQeDo/W\nVLXhuGHDhkrvt23bpvT0dK91CADQQPnQepQL3vUdHh6ur7/+2ht9AQA0ZD60IKfaynHChAmV9hTl\n5+erSRP2wgEADOrhZv6aqjYco6Ki5OfnJ+nMxttmzZopPDzc6x0DADQw9bACrKlqwzElJUWpqfVr\nFREAoP7xpYcAVBuOYWFhio+PV0REhGcTryT95S9/8WrHAACoK9UuyElPT1e7du104sQJFRYWqrCw\nUAcOHKiNvgEAGpJLYUHO6tWrtXLlSgUFBWnnzp368VkBFRUV2rp1qyZPrvpRTwCAS1A9DLmaqjIc\nb7rpJl133XVKSkqqNIRqtVrVvn37WukcAKABuVRWq7Zp00YLF1b9W4EAAHhcCpUjAAAXwk04AgBg\n4EPheMGPjwMAwNdROQIAzHEpPQQAAIDz4kPDqoQjAMAchCMAAJX9+LAYX0A4AgDMQeUIAICBD4Uj\nWzkAADCgcgQAmIIn5AAAYEQ4AgBg4DvPACAcAQDm8KVhVRbkAADM4XLX/FWNHTt2qH///kpNTZUk\nHTp0SHfccYdiYmJ0xx13KD8/X5KUnp6uoUOHatiwYVq+fLkkyel0Kj4+XlFRUYqJidGBAweqbY9w\nBACYw3URr3MoLS1VUlKSevXq5Tn2/PPPa/jw4UpNTdWAAQP06quvqrS0VCkpKXrttde0ZMkSLV68\nWEVFRVq5cqWaN2+upUuXauzYsZo1a1a1t0I4AgDqNX9/fy1atEgOh8Nz7IknntDNN98sSWrZsqWK\nioq0adMmRUREKDAwUAEBAerWrZuysrKUmZmpAQMGSJIiIyOVlZVVbZuEIwDAFG6Xu8avc7HZbAoI\nCKh0rEmTJvLz81NFRYXS0tL0pz/9SQUFBQoODvZ8Jjg4WPn5+ZWOW61WWSwWlZWVnbNNwhEAYA4v\nDatWpaKiQhMnTlTPnj0rDbn+qKpnvZ7PM2AJRwCAKbxVOVZlypQpuuqqqzRu3DhJksPhUEFBged8\nXl6eHA6HHA6HZ8GO0+mU2+2Wv7//Oa9NOAIAzFGLlWN6errsdrsmTJjgOda1a1dt3rxZxcXFKikp\nUVZWlrp3767evXtr1apVkqS1a9eqR48e1V6ffY4AAFO4vfQQgC1btig5OVnZ2dmy2WzKyMjQkSNH\n1KhRI8XGxkqSOnTooMTERMXHx2vMmDGyWCyKi4tTYGCgBg0apPXr1ysqKkr+/v6aOXNmtW1a3F7+\nAS5nwR5vXh6oFY1D+9Z1FwBTlJdle+3aRwZfX+Pvtnp/nYk9uXgMqwIAYMCwKgDAFN4aVq0LhCMA\nwByEIwAAlVE5AgBgQDgCAGBAOAIAYOS21HUPTMNWDgAADKgcAQCmYFgVAAADt8t3hlUJRwCAKagc\nAQAwcPvQghzCEQBgCipHAAAMfGnOka0cAAAYUDkCAEzh3V8Hrl2EIwDAFL40rEo4AgBMQTgCAGDA\nsCoAAAZUjgAAGPjSQwDYygEAgAGVIwDAFDwhBwAAA5cPDasSjgAAU/jSnCPhCAAwBatVAQAwYJ8j\nAAAGvlQ5spUDAAADKkcAgClYrQoAgAGrVQEAMGBBDgAABgyrAgBgwLAqAAAGvjSsylYOAAAMvF45\nbv71Q95uAvC6UaG96roLQL3HnCMAAAbMOQIAYEDlCACAgQ+txyEcAQDmoHIEAMDAl+Yc2coBAIAB\nlSMAwBSuuu6AiQhHAIAp3PKdYVXCEQBgCpcPLVclHAEApnBROQIAUBnDqgAAGPjSghy2cgAA6r0d\nO3aof//+Sk1NlSQdOnRIsbGxio6O1oMPPqiysjJJUnp6uoYOHaphw4Zp+fLlkiSn06n4+HhFRUUp\nJiZGBw4cqLY9whEAYAq3LDV+nUtpaamSkpLUq9d/fx1nzpw5io6OVlpamq666iqtWLFCpaWlSklJ\n0WuvvaYlS5Zo8eLFKioq0sqVK9W8eXMtXbpUY8eO1axZs6q9F8IRAGAK10W8zsXf31+LFi2Sw+Hw\nHPviiy904403SpL69eunzMxMbdq0SREREQoMDFRAQIC6deumrKwsZWZmasCAAZKkyMhIZWVlVXsv\nzDkCAEzhrTlHm80mm61yXJ08eVL+/v6SpFatWik/P18FBQUKDg72fCY4OPhnx61WqywWi8rKyjzf\nP2ubXrgPAMAlqK5Wq7rdZ99geaHHf4phVQCAKVyWmr8uVJMmTXTq1ClJUm5urhwOhxwOhwoKCjyf\nycvL8xzPz8+XdGZxjtvtPmfVKBGOAACTuGSp8etCRUZGKiMjQ5K0evVq9e3bV127dtXmzZtVXFys\nkpISZWVlqXv37urdu7dWrVolSVq7dq169OhR7fUZVgUA1GtbtmxRcnKysrOzZbPZlJGRoWeffVaT\nJ0/WsmXLFBoaqiFDhshutys+Pl5jxoyRxWJRXFycAgMDNWjQIK1fv15RUVHy9/fXzJkzq23T4j6f\nwdeLkHXlLd68PFAr5vnZ67oLgCle2bfCa9f+5+XRNf7ukMNpJvbk4lE5AgBM4UtPyCEcAQCmcFl4\ntioAAJX40C9WEY4AAHP40rAqWzkAADCgcgQAmKImm/nrK8IRAGCKmmzmr68IRwCAKViQAwCAAcOq\nAAAY+NJqVcIRAGAKXxpWZSsHAAAGVI4AAFMw5wgAgAFzjgAAGBCOAAAYuBlWBQCgMipHAAAMfCkc\n2coBAIABlSMAwBS+9BAAwhEAYAr2OQIAYOBLc46EIwDAFIQjAAAGzDkCAGDgS3OObOUAAMCAyhEA\nYArmHAEAMGDOEQAAA5cPxSPhCAAwBcOqAAAY+E7dSDgCAEziS5UjWzkAADCgcgQAmMKXHgJAOAIA\nTMFqVQAADHwnGglHAIBJfGlBDuEIADAFw6oAABj4TjSylQMAgJ+hcgQAmII5RwAADJhzBADAwHei\nkXAEAJiEYVUAAAzcPlQ7Eo4AAFP4UuXIVg4AAAyoHAEApmC1KkwV0KmtOvwjQXmL0pW/+INK5yyN\n7Go74wEFdGqr7YPja3T9xp2v1pV/v19yu3Vy2z4dSFggWSy6cvq9ahx+tSx2mwreyNCRZR+ZcTuA\nGjUJ0N2zx6tJi6ay+9v13gtv6btPN13QNUY+dofa/7qj5JbSnnxF+77drZZXtNKYZ+LkZ/dThbNC\nLz00R8X5RV66C1wo34lGhlXrnLVxI1057V4d//zbs54Pe/QOnfx+70W10Sbxbh1MXKQdt02WX2AT\nNb+hm5p2D5fbWaEdQ6do58jHFDo5VrL40I+xoU71vv0GHd6TrWeiEvXi/c8q+om7Luj71/a4Tq2v\nvkJ/v+1RvTrxRUUnnvn+bQ9Had3SNUoe8YSyMjbq5jH/443uo4Zcctf4Vd9QOdYxV5lTu0ZP0+X3\n33bW8znJqbK1DFTLW6//70GrVW2TH1CjtpfLYvNTzqw0nVi/2XO641vTtXP4VEmSxW6T/5UOlW7a\nJUk69tGXCuzbVdlJr6rky62SJNtlLVRRdEJy179/QNEwnSg8rivDr5IkNWnRVCeOFqvjbztr6CPR\nqigv19FDR/Ta5AWqcJZLOhOmkvSvFZ9Ikq6LjFDW6o2SpEO7s9W0RTMFNGusJVMXyXnaKUk6fvSY\nrvplu9q9MZyTtxbklJSUaNKkSTp27JicTqfi4uIUEhKixMRESVKnTp305JNPSpJefvllrVq1ShaL\nRePGjdP1119/jitX7bzC8euvv1ZOTo4GDx6svLw8ORyOGjWGs6hwyV1RVuVpV8lJqWVgpWPBQ34v\nZ16h9j8yT34tA3XtsunaetODZ/2+Lbi5Ko6VeN6XFxyT3dHS877d/Ilq9tvO2vfgcxd5I8B/bfzf\nf6n37Tdoxidz1bRFMz1/198VO/1ePRv9pEqOndCwyTH67aBe2vDeZ2f9fouQIO3bssfz/viRYrUI\nCVLu3kOSJIvVqj+M+qPSX1hRK/eD8+OtrRzvvvuu2rVrp/j4eOXm5mr06NEKCQlRQkKCunTpovj4\neK1bt07t27fXBx98oDfffFMnTpxQdHS0+vTpIz8/vwtus9pwTE5O1qFDh7R//34NHjxYy5Yt07Fj\nxzR16tQa3SQuXtPu4Wr2u+vU7LedJUmWAH9Z7Da1f2myrE0D1Pi6dur41nS5TpVp/yPzKn/ZMHS6\n9/6n5R8WomtSE7Xtfx4+E8bAReo5pK+OZhfoudFP6crOV2ncSxMVGNxccQsfkSQ1atxIxwuP6zcD\ne+rG0QPVIiRIktT79n5am5rxs+v99B9bi9Wqe54br63rt2jrT0ZMUPe8VTm2bNlS27dvlyQVFxcr\nKChI2dnZ6tKliySpX79+yszMVH5+vvr27St/f38FBwcrLCxMu3btUqdOnS64zWrDccuWLVqyZIli\nY2MlSePHj1d0dPQFNwTzuMvKdXjuchUa/urefed0SZWHVWXzk+0nlaf98mA5c4+qUYcwWSwWndp1\nUGXZ+Tq9/7ACOrZR6Tc7a+0+4Ls6dg/Xlk+/kSQd2PqDWoQEqfDQUT098omfffarDzf8bFg19Jo2\nnsCUpKDWwTqWVyhJGvNMnHL3HVL6C8u9exOoNwYPHqx33nlHAwYMUHFxsebPn69p06Z5zrdq1Ur5\n+fkKCgpScHCw53hwcLDy8/NrFI7VLsgpLy+X0+mU5T9/uh09elSnT5++4IZgnpJvdqjFTT0kSbZW\nLRQ6KabqD5dX6NTug2r6nyozaGAvFX/ytQI6XqnQSWf+4LEE+CugQ5jK9ud6ve+4NOTtO6z2v+oo\nSWoVdpmO5hyR2+VS6DVtJEk3jh6oNv+ZkzybLZ9uUveBPSVJbX/RTkW5R3Wq5JR63tJX5U6n3nvu\nLe/fBC6Y+yL+cy7vvfeeQkNDtWbNGi1evFiPPPJI5XarWC9R1fHzUW3leOedd2rEiBHKycnR3Xff\nrT179ighIaHGDaKyxhEd1OaxO+XfxiF3eYWCBkfq2JqNOn0gT8dWbVC7+RPlH3qZAtqHqeNb01WQ\ntlqF//u5AiO76Np3k2Xxs+rQ7KWVrumpGv/jYOI/1Hbm/ZLFqpJvduj452eW1AdGRujad5Nl9bfp\ncMrbKj9aXGv3Dd/2Sdoa3fX0A5q07ElZ/fz0+qMvqcJZrruejVN5WbmK8o5q3dI1ns//WDH+aHfW\ndv2wZY8S3n5KbpdLqY+/LEn6w6g/yt7Irolvnll8kbPzgFIfe7nW7gvn5q1h1aysLPXp00eSFB4e\nrtOnT6u8vNxzPjc3Vw6HQw6HQ3v37v3Z8ZqwuKuJ1pMnT8rtdmvXrl2y2+1q166djh07ptatW59X\nA1lX3lKjjgH1yTw/e113ATDFK/u8t4gp9qqzr7o/H0t+eKfKc6+88ooKCgo0ceJEZWdn66677lJY\nWJgeeOABde/eXffff79iY2N19dVX67777tPbb7+twsJCjRo1Sh9++KGs1gvftVht5XjXXXfp+eef\n90x8Ll++XK+++qo++OCDar4JALiUeGsz2IgRI5SQkKCYmBiVl5crMTFRISEhevzxx+VyudS1a1dF\nRkZKkoYPH66YmBhZLBYlJibWKBil86gct27dqsTERN1zzz1aunSpHA6HpkyZoubNm59XA1SO8AVU\njvAV3qwco6+6tcbfTfvhXRN7cvGqjdTOnTtrwYIFeuONN9SxY0fNmDHjvIMRAHDp8NaCnLpQ5bBq\nz549PStUJcnlcmnjxo365z//KYvFoszMzFrpIAAAta3KcNywYUOVX/rXv/7llc4AABouX/o9x2oX\n5Bw4cEBpaWkqKjrz5Hun06kvv/xS69at83rnAAANR318gHhNVTvnOHnyZF1zzTX67rvvdMMNN8hq\ntVZ6MgEAAJJvzTlWG442m01Dhw5V8+bNdfPNN+vpp59WampqbfQNANCAuC7iVd9UO6zqdru1ceNG\nBQUFadmyZWrbtq0OHjxYG30DADQgF/O4tvqmyspxypQpkqSwsDA1adJEU6dO1TfffKPXX39dkydP\nrrUOAgAahkvix453796tW2+9Vfv379fOnWd+qeHHvwrmzp1b4x+QBACgvqsyHNPS0pSXl6eZM2dq\n0qRJtdknAEADVB/nDmuqynC02WwKDQ3VnDlzarM/AIAGqj6uOq2pahfkAABwPurj3GFNEY4AAFP4\n0mpVwhEAYIpLYs4RAIAL4UtzjjX7FUgAAHwYlSMAwBQsyAEAwIAFOQAAGFA5AgBg4EsLcghHAIAp\nXAyrAgBQme9EI1s5AAD4GSpHAIApWJADAIAB4QgAgAH7HAEAMKByBADAgH2OAAAY+NKwKls5AAAw\noHIEAJiCOUcAAAx8aViVcAQAmILKEQAAA1arAgBgwK9yAABg4EuVI1s5AAAwoHIEAJiCYVUAAAx8\naViVcAQAmILKEQAAAypHAAAMqBwBADDwpcqRrRwAABhQOQIATOF2u+q6C6YhHAEApuDB4wAAGPCT\nVQAAGFA5AgBgQOUIAIAB+xwBAKhF6enpevnll2Wz2TRhwgR16tRJEydOVEVFhUJCQvTMM8/I399f\n6enpWrx4saxWq4YPH65hw4bVqD3CEQBgCm89BKCwsFApKSl6++23VVpaqrlz5yojI0PR0dEaOHCg\nZs+erRUrVmjIkCFKSUnRihUrZLfbdfvtt2vAgAEKCgq64DZ5CAAAwBRut7vGr3PJzMxUr1691KxZ\nMzkcDiUlJemLL77QjTfeKEnq16+fMjMztWnTJkVERCgwMFABAQHq1q2bsrKyanQvVI4AAFN4a7Xq\nwYMHderUKY0dO1bFxcUaP368Tp48KX9/f0lSq1atlJ+fr4KCAgUHB3u+FxwcrPz8/Bq1STgCAEzh\nzdWqRUVFmjdvnnJycjRq1KhKbVXV7sX0h2FVAIApXG53jV/n0qpVK/3617+WzWZT27Zt1bRpUzVt\n2lSnTp2SJOXm5srhcMjhcKigoMDzvby8PDkcjhrdC+EIADCFt+Yc+/Tpow0bNsjlcqmwsFClpaWK\njIxURkaGJGn16tXq27evunbtqs2bN6u4uFglJSXKyspS9+7da3QvDKsCAOq11q1b6+abb9bw4cMl\nSVOnTlVERIQmTZqkZcuWKTQ0VEOGDJHdbld8fLzGjBkji8WiuLg4BQYG1qhNi9vLjzTIuvIWb14e\nqBXz/Ox13QXAFK/sW+G1a7cQCwa9AAADYklEQVRo1qHG3z12YreJPbl4VI4AAFPw+DgAAAx4fBwA\nAAbeekJOXSAcAQCmoHIEAMDAl+Yc2ecIAIABlSMAwBTMOQIAYOBLw6qEIwDAFIQjAAAGvhONtfD4\nOAAAGhpWqwIAYEA4AgBgQDgCAGBAOAIAYEA4AgBgQDgCAGBAODYQGRkZdd0F4KI4nU4NGzZMkyZN\nOuv5Hj161HKPgKoRjg3AwYMH9f7779d1N4CLkp+fr7KyMiUnJ9d1V4BqEY4NwLRp07Rx40aFh4dr\n4sSJio6OVmZmpiZMmOD5zI9/de/atUujRo3S6NGj9cADD6i4uLiuug1UMmPGDO3fv19TpkxRbGys\nYmNjFRUVpf3791f63NatWzVy5EiVlJRo9erVGjlypGJiYjRz5sw66jkuRYRjAzBmzBj97ne/U1xc\nnJxOp9LS0mS1nv3/uqSkJE2bNk2LFy9W79699cYbb9Ryb4GzmzRpktq1a6eoqCjFxcVpyZIlGjp0\nqNLS0jyfOXr0qJ544gnNnj1bkjR//ny9/vrrSk1N1aFDh/TVV1/VVfdxieHZqg1Mly5dznn+22+/\n1WOPPSZJKisrU0RERG10CzhvISEhmj59uubOnavi4mL94he/kHTmodUPPfSQ7r77boWGhmrTpk3K\nycnRmDFjJEnHjx9XTk6OfvOb39Rl93GJIBwbGLvdLkmyWCyVjpeXl0uSGjdurNdff/1n54H6Ys6c\nOerTp4+ioqK0atUqffLJJ5KkEydOqFOnTnrzzTd10003yW6365e//KX+8Y9/1G2HcUliWLUBsFqt\nnvD7UbNmzZSXlydJ2rZtm0pKSiRJ4eHh+vTTTyVJ77//vjIzM2u3s0A1CgsL1bZtW7ndbn388cdy\nOp2SpMDAQCUkJCgkJERvvfWW2rVrp927d+vIkSOSzoRqbm5uXXYdlxDCsQHo0KGDvv/+ex0/ftxz\nLDw8XE2aNNHIkSP13nvvKSwsTJL06KOPauHChYqJidE777yjzp0711W3gbMaMWKEkpKSdPfdd2vw\n4MHauHGjPv/8c8/5hIQEvfLKKyoqKlJCQoLuuecejRw5UkVFRXI4HHXYc1xK+MkqAAAMqBwBADAg\nHAEAMCAcAQAwIBwBADAgHAEAMCAcAQAwIBwBADAgHAEAMPh/i8hNYbN28u8AAAAASUVORK5CYII=\n", 580 | "text/plain": [ 581 | "
" 582 | ] 583 | }, 584 | "metadata": { 585 | "tags": [] 586 | } 587 | } 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": { 593 | "id": "Xam8w_ELZcdZ", 594 | "colab_type": "text" 595 | }, 596 | "source": [ 597 | "#### Predicción sobre Getting Real\n" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "metadata": { 603 | "id": "_699fuOldBa0", 604 | "colab_type": "code", 605 | "colab": {} 606 | }, 607 | "source": [ 608 | "pred = cnn_model.predict(fit_getting_real)\n", 609 | "\n", 610 | "pr = pred.round()\n", 611 | "#Set Bias to Fake\n", 612 | "mask = np.all((pred.round() == [1.,0.,0.,0.]), axis=1)\n", 613 | "pr[mask] = np.repeat(np.array([0.,0.,1.,0.]).reshape(-1,4), pr[mask].shape[0], axis=0)\n", 614 | "\n", 615 | "\n", 616 | "#Set target bias to fake\n", 617 | "mask = np.all((target_getting_real == [1.,0.,0.]), axis=1)\n", 618 | "target_getting_real[mask] = np.repeat(np.array([0.,1.,0.]).reshape(-1,3), target_getting_real[mask].shape[0], axis=0)" 619 | ], 620 | "execution_count": 0, 621 | "outputs": [] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "metadata": { 626 | "id": "H-yjOsv6e7Vd", 627 | "colab_type": "code", 628 | "colab": { 629 | "base_uri": "https://localhost:8080/", 630 | "height": 34 631 | }, 632 | "outputId": "bc3e502f-f07a-4c61-bb44-f8414570b9c1" 633 | }, 634 | "source": [ 635 | "accuracy_score(target_getting_real[:,1:], pr[:,2:])" 636 | ], 637 | "execution_count": 157, 638 | "outputs": [ 639 | { 640 | "output_type": "execute_result", 641 | "data": { 642 | "text/plain": [ 643 | "0.4734318817419097" 644 | ] 645 | }, 646 | "metadata": { 647 | "tags": [] 648 | }, 649 | "execution_count": 157 650 | } 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "metadata": { 656 | "id": "_-7lOOV0fjia", 657 | "colab_type": "code", 658 | "colab": { 659 | "base_uri": "https://localhost:8080/", 660 | "height": 261 661 | }, 662 | "outputId": "ca296e58-a222-4dcc-9dd8-6bc7da04fb32" 663 | }, 664 | "source": [ 665 | "report = classification_report(target_getting_real[:,1:], pr[:,2:])\n", 666 | "print(report)" 667 | ], 668 | "execution_count": 158, 669 | "outputs": [ 670 | { 671 | "output_type": "stream", 672 | "text": [ 673 | " precision recall f1-score support\n", 674 | "\n", 675 | " 0 0.57 0.66 0.61 1503\n", 676 | " 1 0.30 0.19 0.23 1000\n", 677 | "\n", 678 | " micro avg 0.49 0.47 0.48 2503\n", 679 | " macro avg 0.43 0.43 0.42 2503\n", 680 | "weighted avg 0.46 0.47 0.46 2503\n", 681 | " samples avg 0.47 0.47 0.47 2503\n", 682 | "\n" 683 | ], 684 | "name": "stdout" 685 | }, 686 | { 687 | "output_type": "stream", 688 | "text": [ 689 | "/usr/local/lib/python3.6/dist-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.\n", 690 | " 'precision', 'predicted', average, warn_for)\n" 691 | ], 692 | "name": "stderr" 693 | } 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "metadata": { 699 | "id": "c5_wJ3I3fmma", 700 | "colab_type": "code", 701 | "colab": { 702 | "base_uri": "https://localhost:8080/", 703 | "height": 398 704 | }, 705 | "outputId": "203639a3-605e-4e68-99d3-2b3f47468aa5" 706 | }, 707 | "source": [ 708 | "matrix = confusion_matrix(target_getting_real[:,1:].argmax(axis=1), pr[:,2:].argmax(axis=1))\n", 709 | "print(matrix)\n", 710 | "\n", 711 | "df_cm = pd.DataFrame(matrix, columns=['true', 'fake'], index=['true', 'fake'])\n", 712 | "sns.heatmap(df_cm, annot=True)" 713 | ], 714 | "execution_count": 160, 715 | "outputs": [ 716 | { 717 | "output_type": "stream", 718 | "text": [ 719 | "[[1047 456]\n", 720 | " [ 808 192]]\n" 721 | ], 722 | "name": "stdout" 723 | }, 724 | { 725 | "output_type": "execute_result", 726 | "data": { 727 | "text/plain": [ 728 | "" 729 | ] 730 | }, 731 | "metadata": { 732 | "tags": [] 733 | }, 734 | "execution_count": 160 735 | }, 736 | { 737 | "output_type": "display_data", 738 | "data": { 739 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcEAAAFKCAYAAABlzOTzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3XtcVHX+x/H3DAwCiiLGWJKUeUFM\nNM0tM81b1JptYiopqZVkF13ddd0VU1PzsqaW22qkdrHVvK50o9zS1tXcCqUW85bmLUPFgFEUBVFg\n5veHv6VYA7ycgQ7n9ewxjwczc853vvNIefv5Xs6xeTwejwAAsCB7VXcAAICqQggCACyLEAQAWBYh\nCACwLEIQAGBZhCAAwLJ8vf0Bha6D3v4IwOsmt5tQ1V0ADDH90HKvtX01v+8d19xkYE8unddDEABg\nEe7iqu7BZWM4FABgWVSCAABjeNxV3YPLRggCAIzhJgQBABbloRIEAFgWlSAAwLKoBAEAlsUWCQAA\nzINKEABgDIZDAQCWxcIYAIBVsUUCAGBdVIIAAMsyYSXI6lAAgGVRCQIAjGHCfYKEIADAGCYcDiUE\nAQDGYGEMAMCyqAQBAJZFJQgAsCqPx3wLY9giAQCwLCpBAIAxmBMEAFgWc4IAAMuiEgQAWBZXjAEA\nWBaVIADAskw4J8gWCQCAZVEJAgCMwXAoAMCyTDgcSggCAIxBCAIArMqM1w4lBAEAxqASBABYlgkX\nxrBFAgBgWVSCAABjMBwKALAsEw6HEoIAAGNQCQIALItKEABgWVSCAADLMmEIskUCAGBZVIIAAGMw\nJwgAsCwTDocSggAAY1AJAgAsi0oQAGBZXqoE3W63Jk2apH379snhcGjy5MkKDAzUmDFjVFxcrNDQ\nUM2ePVt+fn5KTk7W4sWLZbfbFRsbq379+pXbNiEIADCGlyrB9evX6/Tp01q5cqXS09M1ffp0hYSE\nKC4uTj169NCcOXOUlJSkmJgYJSYmKikpSQ6HQ3379lV0dLSCg4PLbJstEgCAX7RDhw6pVatWkqTw\n8HBlZGRoy5Yt6t69uySpa9euSklJ0bZt2xQVFaWgoCD5+/urbdu2SktLK7dtQhAAYAy3+8of5WjW\nrJk+++wzFRcX6+DBgzp8+LCOHj0qPz8/SVK9evWUnZ0tl8ulkJCQkvNCQkKUnZ1dbtsMhwIAjOHx\neKXZzp07Ky0tTQ8//LAiIiJ00003ae/evT/52J//3LJe/ylCEABgDC+uDh01alTJz3fffbfq16+v\ngoIC+fv7KzMzU06nU06nUy6Xq+S4rKws3XLLLeW2y3AoAMAYXhoO3bNnj5555hlJ0qZNm9SiRQt1\n6NBBa9eulSStW7dOnTp1UuvWrbVjxw7l5uYqLy9PaWlpateuXbltUwkCAIzhpS0SzZo1k8fjUd++\nfVWjRg298MIL8vHxUUJCglatWqUGDRooJiZGDodDo0ePVnx8vGw2m4YPH66goKBy27Z5LmXQ9CoU\nug56s3mgUkxuN6GquwAYYvqh5V5r++ySZ6743IDBMwzsyaVjOBQAYFkMhwIAjOHdgUWvIAQBAMbg\n2qEAAMsiBAEAlsWtlAAAVuVxMycIALAqEw6HskUCAGBZVIIAAGMwJwgAsCzmBAEAlmXCOUFCEABg\nDEIQAGBZXDYNRth38JBGJDynwQ/1VlzfBy77/EXLkrRuw79ls9n09GNxuqvDbTqUfkTPzZonSfLI\no+cSfqcbGoYZ3XVYnG8Nh0aum6UN897V1qRNJa/XuS5EsXNHyNfhq4xd3+n98Ysuu+1rI8P1wLQh\nksejH/YcVvKEC23c8divdUuvOyWblLb6U21Z+k/Dvg8ukwkrQbZI/MLkny3Qn+fMV/t25d8NuSxH\nMn7Qx+s/1VvzX1DirMmaPe81FRcXa9W7azT88YF68+WZ6n1ftN5cnmRwzwGp64jeOnvyzEWv9xg/\nUJ+/vkbzY56Vu9itOg3qXXbbPScO0prnlujVvs/JPyhAzbq0Vt2GTt3a7y4t7DNJr/adrE5P/UY1\nggKM+CqwCELwF8bP4dD8F6co9Joff0kc+O57DRkxVvEjx2rk2CnKPf3jL5nUtO1KfGPpT55vU8f2\n7eRwOBRSN1jXXevUgUPpSvjdk2p3S5Qk6Ycsl+qHXlN5XwqWcE3jBnI2DdO3G74u9brNZtONt0Vo\n9yf/kSR9MPFvOpVxXDa7Tb1nDlX8ivEaunqSbrqjRanz4lf+eA9HH4eP6jZ06uj2C/cn3bM+TY3v\nbKmTR7L1at/n5C52q7iwWIVnz8m/FiFYZdyeK39UkQpD8MyZM1qwYIGmT58uSdq8ebNyc3O93jGr\n8vX1kX+NGqVe+/Nf5mvSmBF6Y+7z6nBbW61454Myz3cdz1Hd4Dolz0PqBivbdUKStGfvAfUe/LQ+\n/XyLHhnQxztfAJZ13/iH9Y+pSy96PbBekM6dKdB9zw7S0NWTdM+YhyRJrXvdqdNZJ/XGgOla9sQc\n9Zw4uMy2A+sG6eypvJLnZ1y5CnIGy+Px6Hz+OUlSk05RyjtxWqeOnTD4m+GSedxX/qgiFc4Jjh07\nVh06dNDGjRslSSdOnNDo0aP12muvebtv+H87vtmrSTP/KkkqPF+omyObKW3bTs19bYlOn87T6TNn\n9OXW7ep+V4eLzvX8ZKK6ebPGenfJfK1850PNmrtQkxN+V2nfAdXbLQ92UnraPuUcyb7oPZvNptrX\n1lXKmx8r50i2Br85RhFdb1H4rc10468idMOvIiRJvv5+8nH4KG7BKPnV9Nd1LW5Q/MoJKio4r3fG\nvHpRmz/VsE0T9Rj3sJYMmeW9L4mKVcd9gnl5eYqLi9NHH30kSbrvvvu0YsUKr3cMP/L3r6E35828\n6C/+316epdS07fpy63YNjx8oSXpvzSf6Lv1IyTFZ2cflDK2nT79IVYfb2srh66vorh21/O3kSv0O\nqN4iut2ikIZORXRvozrXhqjofJFyj53Qgc93Kv/EaZ086tKJ9CxJ0oHPd8nZ7HoVFxZpY+J72p6c\nUqqtt+JfkHRhOPSN/tMkSXZfHwUG1yo5pva1dZWbmSPpwoKZ3s8P1ZL42VSBVcxTHRfGuN1upaen\nl/wC3rRpk9wm/KJmFtGkkT7b/JUk6R//3KjNX20t89jbb22tTSmpKiwsVFb2cWW5jqvxjeFKev8j\nbfoiVZK0Y9ce3Rh+faX0Hdaw6rfzNL/Xs1rYe5K+WrVRG+a9qwOf75QkuYvdOpGepXo3XitJCotq\nJNfBYzq8db8io9tJkmrWq63oPz1UZvvuomJlH8jQDe0uVI03//pX2vfpdtnsNj0460ktf/olnTzi\n8vK3RIVMOCdo83jK39ixf/9+TZs2Tdu3b1dAQICaN2+ucePGqXHjxpf0AYWug4Z01Cp27dmn2S+/\npoxjmfL19ZUztJ5GPvGIXpr/pux2m2rUqKFZkxNUp3ZQmW0sW/2+1qzbIJvNphFPDFb7dm2UfiRD\nk57/qzwetzwe6bmxvyMIL8PkdhMqPgiSpG6/71MyLHrudL6+WfuVQm6orz4vPCWb3abMbw8refwi\n2ew29Zoer9CmYbL72PWvl97W3o3bymw3tEmYYv4cL5vdpsNfH9BH05aqSacoPTRvhH7Yk15y3NoZ\nK3Rk2wGvf0+zmn5oudfazps28IrPrTnh4vnkylBhCF4tQhDVASGI6oIQLK3COcH27duXDIUWFRUp\nLy9PYWFh+uSTT7zeOQCAiVTHhTGbN28u9XzPnj1KTmZRBQDgf5hwvchlb5Zv3ry5tm4te2EGAMCi\nTLgwpsJKcOTIkaWW5mdnZyswMNCrnQIAmFB1vKnugAED5OPjI+nCBtVatWqpefPmXu8YAMBkquOc\nYGJiopYurZpVOwAA8zDjZvkKQzAsLEyjR49WVFSUHA5HyesPP/ywVzsGAIC3VbgwJjk5WY0aNdKZ\nM2eUk5OjnJwcHT58uDL6BgAwk+q0MGbdunX68MMPFRwcrH379pVciLm4uFi7d+/W2LFjK62TAAAT\nqE5zgvfcc49atGihqVOnlhr6tNvtuummmyqlcwAAE6luq0Ovv/56LVy4sLL6AgAws+pUCQIAcDk8\nhCAAwLJMGIKXfdk0AACqCypBAIAxquNmeQAALokJh0MJQQCAMQhBAIBV/feiKmZCCAIAjEElCACw\nLBOGIFskAACWRSUIADCEt64Ys3r1aiUnJ5c837lzp1q2bKn8/HwFBgZKkhISEtSyZUu9/vrr+vjj\nj2Wz2fTb3/5WnTt3LrdtQhAAYAwvhWC/fv3Ur18/SVJqaqo++ugj7d+/XzNmzFCzZs1Kjjt8+LD+\n8Y9/aOXKlTpz5ozi4uLUsWNH+fj4lNk2w6EAAGO4r+JxiRITEzVs2LCffW/Lli3q1KmT/Pz8FBIS\norCwMO3fv7/c9qgEAQCG8PYFtLdv367rrrtOoaGhkqS5c+cqJydHjRs31rhx4+RyuRQSElJyfEhI\niLKzsxUREVFmm4QgAMAYXg7BpKQk9e7dW5I0ePBgRUREKDw8XJMmTdKyZcsuOv5S9i0yHAoAMIaX\nh0O3bNmiNm3aSJKio6MVHh4uSerWrZv27t0rp9Mpl8tVcnxmZqacTme5bRKCAIBfvMzMTNWsWVN+\nfn7yeDx69NFHlZubK+lCODZt2lTt27fXxo0bdf78eWVmZiorK0tNmjQpt12GQwEAhvDmnGB2dnbJ\nfJ/NZlNsbKweffRRBQQEqH79+hoxYoQCAgIUGxurgQMHymazafLkybLby6/1bB4vX+yt0HXQm80D\nlWJyuwlV3QXAENMPLfda2zl9ulzxuXXf3mhYPy4HlSAAwBDeXh3qDYQgAMAY5runLiEIADCGhxAE\nAFiWCUOQLRIAAMuiEgQAGILhUACAdRGCAACrohIEAFgWIQgAsCxCEABgXR5bVffgsrFFAgBgWVSC\nAABDMBwKALAsj9t8w6GEIADAEFSCAADL8phwYQwhCAAwBJUgAMCyzDgnyBYJAIBlUQkCAAzh8VR1\nDy4fIQgAMIQZh0MJQQCAIQhBAIBlMRwKALAsKkEAgGWZcbM8WyQAAJZFJQgAMARXjAEAWJbbhMOh\nhCAAwBBmnBMkBAEAhmB1KADAstgnCACwLDNWgmyRAABYFpUgAMAQrA4FAFgWq0MBAJbFwhgAgGUx\nHAoAsCyGQwEAlmXG4VC2SAAALMvrleD5vyR4+yMAr5uZ8UVVdwEwxHQvts2cIADAspgTBABYFpUg\nAMCyTLguhhAEABjDm5VgcnKyXn/9dfn6+mrkyJGKiIjQmDFjVFxcrNDQUM2ePVt+fn5KTk7W4sWL\nZbfbFRsbq379+pXbLiEIADCEt+YEc3JylJiYqLffflv5+fmaN2+e1q5dq7i4OPXo0UNz5sxRUlKS\nYmJilJiYqKSkJDkcDvXt21fR0dEKDg4us222SAAAftFSUlJ0xx13qFatWnI6nZo6daq2bNmi7t27\nS5K6du2qlJQUbdu2TVFRUQoKCpK/v7/atm2rtLS0ctumEgQAGMLtpXaPHDmigoICPfXUU8rNzdWI\nESN09uxZ+fn5SZLq1aun7OxsuVwuhYSElJwXEhKi7OzsctsmBAEAhvDIe3OCJ0+e1Msvv6yMjAwN\nHjxYnp9cnsZTxqVqynr9pxgOBQAYwu258kd56tWrpzZt2sjX11fh4eGqWbOmatasqYKCAklSZmam\nnE6nnE6nXC5XyXlZWVlyOp3ltk0IAgAM4Zbtih/l6dixozZv3iy3262cnBzl5+erQ4cOWrt2rSRp\n3bp16tSpk1q3bq0dO3YoNzdXeXl5SktLU7t27cptm+FQAIAhvDUcWr9+fd17772KjY2VJE2YMEFR\nUVFKSEjQqlWr1KBBA8XExMjhcGj06NGKj4+XzWbT8OHDFRQUVG7bNs+lDJpehbzx5e/RAMygzmyu\nHYrqoej8Ua+1/Un9h6743OjMVQb25NIxHAoAsCyGQwEAhvDm6lBvIQQBAIbw1j5BbyIEAQCGIAQB\nAJbFcCgAwLLc5stAQhAAYIyKNr3/ErFFAgBgWVSCAABDcGd5AIBlsToUAGBZbpv55gQJQQCAIRgO\nBQBYlhmHQ1kdCgCwLCpBAIAh2CwPALAsM26WJwQBAIZgYQwAwLIYDgUAWJYZV4cSggAAQ5hxOJQt\nEgAAy6ISBAAYgjlBAIBlMScIALAsQhAAYFkehkMBAFZFJQgAsCwzhiBbJAAAlkUlCAAwhBk3yxOC\nAABDsE8QAGBZZpwTJAQBAIYgBAEAlsWcIADAssw4J8gWCQCAZVEJAgAMwZwgAMCymBMEAFiW24Qx\nSAgCAAzBcCgAwLLMVwcSggAAg5ixEmSLBADAsqgEAQCGMONmeUIQAGAIb68OLSgo0P33369hw4Yp\nNTVVu3btUnBwsCQpPj5eXbp0UXJyshYvXiy73a7Y2Fj169ev3DYJQQCAIby9MGb+/PmqU6dOyfM/\n/OEP6tq1a8nz/Px8JSYmKikpSQ6HQ3379lV0dHRJUP4c5gQBAIZwX8WjIgcOHND+/fvVpUuXMo/Z\ntm2boqKiFBQUJH9/f7Vt21ZpaWnltksIAgAM4Zbnih8VmTlzpsaOHVvqtaVLl2rw4MEaNWqUTpw4\nIZfLpZCQkJL3Q0JClJ2dXW67DIcCAAzhreHQ9957T7fccosaNmxY8lqvXr0UHBysyMhIvfrqq3r5\n5ZfVpk2b0v3xVNwjKkEAwC/axo0btX79esXGxmr16tV65ZVX5PF4FBkZKUnq1q2b9u7dK6fTKZfL\nVXJeVlaWnE5nuW1TCQIADOGtzfIvvfRSyc/z5s1TWFiYVqxYoYYNG6phw4basmWLmjZtqtatW2vC\nhAnKzc2Vj4+P0tLSNG7cuHLbJgQBAIaozAtoP/zww/r973+vgIAABQYGasaMGfL399fo0aMVHx8v\nm82m4cOHKygoqNx2bJ5LGTS9Cnnjy9+jAZhBndlfVHUXAEMUnT/qtbZH3dj/is/9y6GVBvbk0lEJ\nAgAMYcZrhxKCAABDeEx4HwlCEABgCDNWgmyRAABYFpUgAMAQlbk61ChUglXNz1814v4o//hJ8n9i\nmnyatC79vn9N1XhkvGoMGH3FH2FvHCX/p2fI/8npcnTtU/K6496B8n9yuvyfniGfFrddcfuAJN18\nc4S+3f25hj396EXv/eY39yjlizX6dMO7P/v+pWjVqoX+/en72rTxPb08b0bJ6yN+G6+Uzz9Uyhdr\n9NSTj1xh72EEz1U8qgohWMV823aR25Whgjee07kVL8rv/sdKvV+j1xNyf7/7qj6jxv1DdG75Cyp4\ndYJ8mrSWLfR62RvdLHv9hipYOF4Ff5suv56PVdwQUIbAwAD99S/T9K8Nn130ns1m09yXpus3DwxS\nl24P6v6e0QoLu+6yP2POC89p1B8m6q4uMapTJ0i/vrerGjUK1yOPPKSOd/XSXZ1j9MfRT6t27fL3\nhcF7vHntUG8hBKuYJy9XtsD//0sbUFOevNOl3j/37nwVf7+n9El+/qoxYLT8h0yU/+PPyVY/vNTb\n/vGTS3621XXKk39GnlPHJY9Hxd+myadxS7kP7da5FXMuHFSQL5tfDcnGHwdcmXPnzuv+Bwbp2LHM\ni9675poQnTx1Si7XCXk8Hv1rw2fq3q2T7Ha7Xl34gv65brU+3fCuuna5s9R56z9ZXfKzw+HQjTc2\n1Ff/2SZJ+nDNJ+rerZMOHTqszl1iVFxcrMLCQuWfPUsIViFv3kXCWy7pt97WrVu1Zs0aSReuxQbj\nFO/4QrY61yjgD/MUMHSKzn+8pPQB5wsuOsdxZ08V79uqgkVTdO7911TjvrKHgGxBwfLk55Y89+Tl\nyhZUV/K4pcJzkiTfdt1U/O3WC68BV6C4uFgFBRf/WZWk7OzjCqpVS02aNJKvr6+6dO6g+vWv0YAB\nvfXDD1m6+55+erDvEL344uQy27/mmhDlnDz1Y5tZx3XtdU55PB7l5eVLkqLvvkvHXSd05EiGod8N\nl85zFf9VlQoXxsycOVPHjh1Tenq6evbsqVWrVunUqVOaMGFCZfSv2vNp3UmeUy6dXTxd9mtvkN+D\nT6vglbHlnmMPj5CtZm35tr7rwgt+NSQ/f/kPunCe/bob5R8/We6cLBV99c/SJ9v+5/Mj28n31m4q\neHOaUV8JuMiQ+N/r9Vdf1KlTp/XdocOy2Wy6o307dex4m+7s8CtJUoB/gBwOh5L+/rpq1QpU69Y3\na/0nq3X2bIGGPvnHUu3ZbKX/IN9+W1vNnPmsHujFnGBVMuM/oysMwZ07d+qtt97SoEGDJEkjRoxQ\nXFyc1ztmFT43NFfxvq8lSe4fvr9Qpdns5VdlxUU6/8EiuQ/vLfVywRuTJV0YDv3vz7bgUNlq/XhX\nZVvtEHlO51z47Cat5ejSRwV/my6dyzfuSwH/Y9O/N6tLtwclSdOnjdWh7w/r2mudmvH8XK1a9X6p\nY3v1vhBk6z9Zre7RFy676Ovrq3ohdUuOaRB2rY5lXBh6bdWqhRYunK1eMY9QBeKyVTgcWlRUpMLC\nwpJ/eZ04cULnzp3zeseswn38mOzXN5Uk2YKvuTD8WcGwpPvwPvm0uPCvZ1vo9fK98/4yj/WczJat\nRoBswaGS3S6fiFtVvG+bVCNQfj0GqWDJDOnsGeO+EPAzPkx+S6Gh9RQYGKCePaO1fv2/lfrlVj3w\nm3slSaGh9TRtatkjIEVFRfr22/0lVWPvmB5au26j7Ha7Xnv1RcU+9IS+//5IpXwXlM2Mw6EVXkB7\n3bp1WrBggTIyMtSyZUsdPHhQ48aN0913331JH8AFtCvg568aDz59oVqz23X+n6vkc30TFX/3jdxH\n9st/yEQpoKbstUPkzjys8xuS5D6yXzX6DJetVh3JZtf5NYvkPnqwzI+w3xgpv3sHSpKKdm1W0Wcf\nyPdXd8vRrZ88rmMlx51LelmeU66ymrE0LqBdvrZtojR71kTdcENDFRYWKSPjmD748BN9dyhd77//\nsWJiemjC+FHyeDya85cFWrHiXfn4+OiVxOfVIrKZfHzsmjJ1jj5eu6HMz4iMbKr5iTNlt9uVmrpV\nfxzznKLvvkvLlr6iHTt+XEE99pnp+vKrryvja5uSNy+g/ciNfSo+qAyLD71tYE8uXYUhePbsWXk8\nHu3fv18Oh0ONGjXSqVOnVL9+/Uv6AEIQ1QEhiOrCmyE46IYHr/jct75/x8CeXLoKh0OHDBmi06dP\nq1WrVoqMjNQHH3ygxx5jTxkAoDQzbpavcGHMxIkTNXLkSA0dOlQrVqyQ0+nUypVVc98nAMAvV7W8\nbFpkZKQWLFigZcuWqWnTppoxY4Zq165dGX0DAJiIGRfGlFkJtm/fvtReHLfbrdTUVL333nuy2WxK\nSUmplA4CAOAtZYbg5s2byzzp888/90pnAADmVS03yx8+fFjLly/XyZMnJUmFhYX68ssv9emnn3q9\ncwAA86iWc4Jjx45VkyZNtGvXLnXp0kV2u11TpkypjL4BAEzEjHOCFYagr6+v+vTpo9q1a+vee+/V\nrFmztHTp0sroGwDARMx4F4kKh0M9Ho9SU1MVHBysVatWKTw8XEeOcHkiAEBpFVx75RepzErwmWee\nkSSFhYUpMDBQEyZM0Ndff60lS5Zo7Njy73IAALAeM95Ut8xK8MCBA+rdu7fS09O1b98+ST+m/Lx5\n89S5c+fK6SEAAF5SZgguX75cWVlZev7555WQkFCZfQIAmFC12iLh6+urBg0aaO7cuZXZHwCASVXl\nKs8rVeHCGAAALoUZ9wkSggAAQ5hxdSghCAAwRLWaEwQA4HKYcU6wwivGAABQXVEJAgAMwcIYAIBl\nsTAGAGBZVIIAAMsy48IYQhAAYAg3w6EAAKsyXwSyRQIAYGFUggAAQ7AwBgBgWYQgAMCy2CcIALAs\nKkEAgGV5a5/g2bNnNXbsWB0/flznzp3TsGHD1Lx5c40ZM0bFxcUKDQ3V7Nmz5efnp+TkZC1evFh2\nu12xsbHq169fuW0TggAAQ3hrOHTDhg1q2bKlhg4dqqNHj2rIkCFq27at4uLi1KNHD82ZM0dJSUmK\niYlRYmKikpKS5HA41LdvX0VHRys4OLjMttkiAQD4Rbvvvvs0dOhQSdKxY8dUv359bdmyRd27d5ck\nde3aVSkpKdq2bZuioqIUFBQkf39/tW3bVmlpaeW2TSUIADCEt+cE+/fvrx9++EELFizQY489Jj8/\nP0lSvXr1lJ2dLZfLpZCQkJLjQ0JClJ2dXW6bhCAAwBDeXh26cuVK7d69W3/6059KfVZZn3sp/WE4\nFABgCLc8V/woz86dO3Xs2DFJUmRkpIqLi1WzZk0VFBRIkjIzM+V0OuV0OuVyuUrOy8rKktPpLLdt\nQhAAYAjPVfxXnq+++kqLFi2SJLlcLuXn56tDhw5au3atJGndunXq1KmTWrdurR07dig3N1d5eXlK\nS0tTu3btym2b4VAAgCG8dReJ/v37a/z48YqLi1NBQYEmTpyoli1bKiEhQatWrVKDBg0UExMjh8Oh\n0aNHKz4+XjabTcOHD1dQUFC5bds8Xh7EzRtf/h4NwAzqzP6iqrsAGKLo/FGvtX1z/duv+NxdmVsM\n7MmlYzgUAGBZDIcCAAzBTXUBAJblrcumeRMhCAAwBJUgAMCyqAQBAJZFJQgAsCwzVoJskQAAWBaV\nIADAEB6Pu6q7cNkIQQCAIbx9KyVvIAQBAIbw9q2UvIEQBAAYgkoQAGBZVIIAAMsy4z5BtkgAACyL\nShAAYAgzbpYnBAEAhmBOEABgWawOBQBYFpUgAMCyzLg6lBAEABjCjJUgWyQAAJZFJQgAMAQLYwAA\nlmXG4VBCEABgCBbGAAAsiyvGAAAsi0oQAGBZZpwTZIsEAMCyqAQBAIZgThAAYFlmHA4lBAEAhiAE\nAQCWZb4IlGweM0Y3AAAGYHUoAMCyCEEAgGURggAAyyIEAQCWRQgCACyLEAQAWBYhaBJr166t6i4A\nV6WwsFD9+vVTQkLCz75/++23V3KPAELQFI4cOaI1a9ZUdTeAq5Kdna3z589r5syZVd0VoAQhaAJT\npkxRamqqmjdvrjFjxiguLk4pKSkaOXJkyTH//Vf0/v37NXjwYD3yyCMaNmyYcnNzq6rbQCkzZsxQ\nenq6nnnmGQ0aNEiDBg3SgAGEfC1HAAACXUlEQVQDlJ6eXuq43bt3q3///srLy9O6devUv39/DRw4\nUM8//3wV9RzVGSFoAvHx8brttts0fPhwFRYWavny5bLbf/5/3dSpUzVlyhQtXrxYd955p5YtW1bJ\nvQV+XkJCgho1aqQBAwZo+PDheuutt9SnTx8tX7685JgTJ05o0qRJmjNnjiRp/vz5WrJkiZYuXapj\nx47pP//5T1V1H9UU1w41mVatWpX7/vbt2/Xss89Kks6fP6+oqKjK6BZwyUJDQzVt2jTNmzdPubm5\nuvnmmyVduPjyqFGj9Pjjj6tBgwbatm2bMjIyFB8fL0k6ffq0MjIydOutt1Zl91HNEIIm43A4JEk2\nm63U60VFRZKkgIAALVmy5KL3gV+KuXPnqmPHjhowYIA+/vhjbdy4UZJ05swZRUREaOXKlbrnnnvk\ncDjUsmVLvfHGG1XbYVRrDIeagN1uLwm5/6pVq5aysrIkSXv27FFeXp4kqXnz5tq0aZMkac2aNUpJ\nSanczgIVyMnJUXh4uDwej9avX6/CwkJJUlBQkMaNG6fQ0FD9/e9/V6NGjXTgwAEdP35c0oXwzMzM\nrMquoxoiBE2gcePG+uabb3T69OmS15o3b67AwED1799f77//vsLCwiRJ48eP18KFCzVw4EC98847\nioyMrKpuAz/roYce0tSpU/X444+rZ8+eSk1N1WeffVby/rhx47Ro0SKdPHlS48aN09ChQ9W/f3+d\nPHlSTqezCnuO6ohbKQEALItKEABgWYQgAMCyCEEAgGURggAAyyIEAQCWRQgCACyLEAQAWBYhCACw\nrP8Dh28eVk3t6v0AAAAASUVORK5CYII=\n", 740 | "text/plain": [ 741 | "
" 742 | ] 743 | }, 744 | "metadata": { 745 | "tags": [] 746 | } 747 | } 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "metadata": { 753 | "id": "kjKxqHSMgJJb", 754 | "colab_type": "code", 755 | "colab": {} 756 | }, 757 | "source": [ 758 | "" 759 | ], 760 | "execution_count": 0, 761 | "outputs": [] 762 | } 763 | ] 764 | } -------------------------------------------------------------------------------- /notebooks/Processing_test_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Procesar el dataset de pruebas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Este dataset está obtenido de [Kaggle](https://www.kaggle.com/jruvika/fake-news-detection). Contiene los titulares y cuerpos de las noticias, que son de dos posibles tipos de categorías. \"Falsa (0)\" y \"True (1)\". " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 16, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "\n", 25 | "#Progress bars\n", 26 | "from tqdm import tqdm\n", 27 | "tqdm.pandas()\n", 28 | "\n", 29 | "#Paralelize pandas apply on multiple cores\n", 30 | "import swifter\n", 31 | "\n", 32 | "from matplotlib import pyplot as plt\n", 33 | "from matplotlib import style\n", 34 | "\n", 35 | "#Nicer style\n", 36 | "style.use('seaborn') \n", 37 | "\n", 38 | "import re #regexp\n", 39 | "from nltk.tokenize import RegexpTokenizer\n", 40 | "\n", 41 | "#word2vec\n", 42 | "from gensim.models import KeyedVectors" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Constantes necesarias" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 17, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "PATH_DATASET = \"../data/Other_datasets/fake-news-detection/data.csv\"\n", 59 | "PATH_PROCESSED = \"../data/Other_datasets/fake-news-detection/data_kaggle_proc.pickle\"\n", 60 | "\n", 61 | "#Padding number for title and content\n", 62 | "MAX_LEN_TITLE = 13\n", 63 | "MAX_LEN_CONTENT = 1598\n", 64 | "\n", 65 | "#Convert labels\n", 66 | "LBL_TRUE = [0,0,0,1]\n", 67 | "LBL_FAKE = [0,0,1,0]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 18, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "df = pd.read_csv(PATH_DATASET).dropna()[['Headline', 'Body', 'Label']]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 19, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
\n", 88 | "\n", 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | "
HeadlineBodyLabel
0Four ways Bob Corker skewered Donald TrumpImage copyright Getty Images\\nOn Sunday mornin...1
1Linklater's war veteran comedy speaks to moder...LONDON (Reuters) - “Last Flag Flying”, a comed...1
2Trump’s Fight With Corker Jeopardizes His Legi...The feud broke into public view last week when...1
3Egypt's Cheiron wins tie-up with Pemex for Mex...MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...1
4Jason Aldean opens 'SNL' with Vegas tributeCountry singer Jason Aldean, who was performin...1
\n", 143 | "
" 144 | ], 145 | "text/plain": [ 146 | " Headline \\\n", 147 | "0 Four ways Bob Corker skewered Donald Trump \n", 148 | "1 Linklater's war veteran comedy speaks to moder... \n", 149 | "2 Trump’s Fight With Corker Jeopardizes His Legi... \n", 150 | "3 Egypt's Cheiron wins tie-up with Pemex for Mex... \n", 151 | "4 Jason Aldean opens 'SNL' with Vegas tribute \n", 152 | "\n", 153 | " Body Label \n", 154 | "0 Image copyright Getty Images\\nOn Sunday mornin... 1 \n", 155 | "1 LONDON (Reuters) - “Last Flag Flying”, a comed... 1 \n", 156 | "2 The feud broke into public view last week when... 1 \n", 157 | "3 MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin... 1 \n", 158 | "4 Country singer Jason Aldean, who was performin... 1 " 159 | ] 160 | }, 161 | "execution_count": 19, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "df.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "Número de noticias. por tipo en el dataset." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 20, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "" 186 | ] 187 | }, 188 | "execution_count": 20, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | }, 192 | { 193 | "data": { 194 | "image/png": "\n", 195 | "text/plain": [ 196 | "" 197 | ] 198 | }, 199 | "metadata": {}, 200 | "output_type": "display_data" 201 | } 202 | ], 203 | "source": [ 204 | "plt.figure(figsize=(15,7))\n", 205 | "df.groupby('Label').count()['Headline'].plot.bar(title=\"Number of news per category\")" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Procesamos el dataset" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 21, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "def clean_text(text):\n", 222 | " #Remove URLs\n", 223 | " text = re.sub(r\"http\\S+\", \"\", text)\n", 224 | " #Tokenize\n", 225 | " tokenizer = RegexpTokenizer('\\w+|\\$[\\d\\.]+|\\S+')\n", 226 | " tokens = tokenizer.tokenize(text)\n", 227 | " #Remove non alphanumerica characters\n", 228 | " words = [word for word in tokens if word.isalpha()] \n", 229 | " return words" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 22, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "def pad_array(array, token_len):\n", 239 | " diff_token = token_len - len(array)\n", 240 | " if(diff_token < 0):\n", 241 | " array = array[:token_len] #Truncate\n", 242 | " else:\n", 243 | " #Pad\n", 244 | " array += [0]*diff_token #Pad\n", 245 | " \n", 246 | " return array " 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 23, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stderr", 256 | "output_type": "stream", 257 | "text": [ 258 | "Pandas Apply: 100%|██████████| 3988/3988 [00:01<00:00, 3963.97it/s]\n", 259 | "Pandas Apply: 100%|██████████| 3988/3988 [00:00<00:00, 34533.49it/s]\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "#Clean content\n", 265 | "df['Body'] = df.swifter.apply(lambda row: clean_text(row['Body']), axis=1)\n", 266 | "#Clean title\n", 267 | "df['Headline'] = df.swifter.apply(lambda row: clean_text(row['Headline']), axis=1)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "Convertir a los indices de **word2vec**" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 24, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "model = KeyedVectors.load_word2vec_format(\"../data/GoogleNews-vectors-negative300.bin.gz\", binary=True,\n", 284 | " limit=50000)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 25, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stderr", 294 | "output_type": "stream", 295 | "text": [ 296 | "Pandas Apply: 100%|██████████| 3988/3988 [00:00<00:00, 37513.90it/s]\n", 297 | "Pandas Apply: 100%|██████████| 3988/3988 [00:00<00:00, 6349.86it/s]\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "df['Headline'] = df.swifter.apply(lambda r: \n", 303 | " [model.vocab[x].index for x in r['Headline'] if x in model.vocab], axis=1)\n", 304 | "df['Body'] = df.swifter.apply(lambda r: \n", 305 | " [model.vocab[x].index for x in r['Body'] if x in model.vocab], axis=1)\n", 306 | "\n", 307 | "\n", 308 | "#Drop news with short or no title\n", 309 | "df = df[df['Headline'].map(len) >= 1]\n", 310 | "#Reset index\n", 311 | "df = df.reset_index().drop(\"index\", axis=1)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "Distribución para el cuerpo de los artículos." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 26, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stderr", 328 | "output_type": "stream", 329 | "text": [ 330 | "Pandas Apply: 100%|██████████| 3980/3980 [00:00<00:00, 49281.68it/s]\n" 331 | ] 332 | }, 333 | { 334 | "data": { 335 | "text/plain": [ 336 | "" 337 | ] 338 | }, 339 | "execution_count": 26, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | }, 343 | { 344 | "data": { 345 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA7YAAAFyCAYAAADBHEYTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3X2Y1XWdP/7nYUZkuRNIbCO1S5Q0MzUkkgLazCssNdQoiL3QJGtjU8TIuFGGDBC59EuWZKabW6viDQmoWZZihICCSysqa2VehSnkHagwpgxzzu+Pfs5KIMyIc/PRx+Ov+dy/zpnXKM/zfn8+p1SpVCoBAACAgmrX2gUAAADA7hBsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWgBb3xBNP5OCDD868efO2Wf+jH/0oEydOfNOuc8wxx+Shhx560863M5s3b86IESNy/PHH51e/+tU22xYvXpzvfve7uzzHqFGjcscddzRXiTs0dOjQvPjii43ef9OmTTn11FMblg8++OBs2LChOUrbzoMPPpiampokyUMPPZSxY8fudP+JEyfmRz/6UUuUBkArq27tAgB4e2rXrl1mzZqVo446Kr17927tcnbbI488kueeey533nnndtseeuihvPDCC61Q1a7dcsstTdr/hRdeaLEPC/7RH//4xzz11FNJkg984AP53ve+1yp1AND2CLYAtIoOHTrk9NNPzze+8Y3ccMMNad++/TbbJ06cmD59+uRLX/rSdsvHHHNMTjjhhNx333154YUXcsYZZ+S3v/1t1qxZk+rq6vzgBz/IO9/5ziTJ3Llz87vf/S5btmzJ6aefnmHDhiVJ7r777vzgBz9IXV1dOnTokAkTJuSDH/xgLrvssjzwwAN5+umnc/DBB+eSSy7Zpq677rorc+bMSblcTqdOnTJp0qR07tw5kydPzlNPPZWhQ4fmxhtvTIcOHZIkq1evzg033JD6+vp06dIl55xzTr7//e/n9ttvT1VVVQ444IBMmTIlPXv2bLjG1q1bM378+FRXV2fWrFn529/+lhkzZuQPf/hD6urqMmDAgHzzm99MdXV1PvCBD+QrX/lKli1blqeffjpnnHFGRo4cmWeeeSYTJkzIxo0bkyQf+9jHMm7cuO1+DwcffHDuvffeLF68OHfeeWfatWuXtWvXpkOHDpk1a1YOPPDAbfafNGlSXn755QwdOjTz589Pklx22WVZvXp1nn/++XzpS1/Kv/7rvyZJ5s2bl+uvvz7lcjndunXLlClTtjtfuVzOhRdemNWrV6e2tjaVSiXTp0/PUUcdlYkTJ+b555/PX/7ylxxxxBFZvnx5Nm3alEmTJuWkk07KtGnT8rOf/Sy1tbWZPn16fvvb36aqqirHHntszjnnnG2u89hjj2XGjBl5/vnnU19fn1GjRjX0AgDFZyoyAK1mzJgx6dixY77zne80+dhXXnklN910U84+++zU1NTktNNOy6233pp3vetdWbBgQcN+e+65ZxYsWJCrr746s2fPzqOPPpo///nP+c53vpMrr7wyCxcuzLRp03LWWWflpZdeSpI8+eSTWbBgwXah9rHHHsvUqVNz2WWX5dZbb83YsWPz7//+79lnn30yffr07L///rnlllsaQm2SHHHEERkxYkQ+/elP55xzzsnNN9+ce+65Jz/96U9z2223pU+fPttMv66rq8vZZ5+dd7zjHbnkkktSXV2dCy+8MO9///szf/78LFy4MBs3bsx//ud/Jkm2bNmS7t2754Ybbsj3vve9zJw5s+G92XfffbNgwYJcd911Wbt2bTZt2rTT9/T+++/PlClT8rOf/SxHHHFErrzyyu32mTlzZjp06JBbbrklVVVVSZL99tsv8+fPz5w5c3LRRRelrq4uK1euzMKFC3Pddddl4cKFOeOMM3LmmWdud77Vq1fn6aefzo033pif//znOfnkk3PVVVc1bH/55Zdz++2358ILL8zYsWPTr1+/zJw5c5tzfO9738srr7ySn//851m4cGF++9vfZuXKlQ3bt27dmrFjx2b8+PGZP39+rr322lx99dV54IEHdvp+AFAcRmwBaDXt2rXLxRdfnJNOOikDBw5s0rGf/OQnk/w9VO2999455JBDkiT777//NtN+R4wYkSR55zvfmY9+9KO59957U1VVlaeffjpf/OIXG/YrlUp5/PHHkyRHHnlkqqu3/1/kfffdl6OPPjr77bdfkmTAgAHp0aNHHn744ZRKpUbVvWTJkpxyyinp2LFjkuTUU0/NFVdckS1btiRJZs2aldra2tx5550N51y8eHEeeuih/PSnP03y97D3Wp/4xCeSJO9///uzZcuWvPTSSxk0aFC+8pWvZP369fnIRz6S8ePHp0uXLjut7f3vf3/++Z//OUly6KGH7nBa9Y6ccMIJSZL3ve992bJlSzZv3pzFixdn7dq1De9/krz44ot5/vnn061bt4Z1H/zgB7PXXnvlhhtuyF/+8pesWLEinTp1ath+1FFH7fL6y5cvz6RJk1JVVZWqqqpce+21SdLwAcef//znPP7445k8eXLDMS+//HL+93//N0ceeWSjXiMAbZtgC0Crete73pULLrggEyZMyEknndSwvlQqpVKpNCzX1dVtc9xrpy7vsccer3v+du3+b3JSuVxOdXV16uvrM2DAgFx66aUN29avX5999tknd955Z0Po/Eflcnm7AFupVLJ169ad1rCzc5TL5WzdurVh+TOf+UwqlUrOP//8XHHFFQ37fPe7322Yxvviiy9uc44999wzSRrWVSqVHH744Vm0aFHuvffe3Hffffnc5z6Xq666Kocddtjr1vbakeZ/fP935tUPAV57/XK5nKFDh+bcc89teA1PP/109tprr22OXbx4cWbMmJHTTz89n/jEJ9K7d+/ceuutDdtf73fxj9d/7fuxfv36bV7Lq9PAX3s/8bPPPrvLoA9AcZiKDECrO+644zJ48OD85Cc/aVjXvXv3PPzww0mSp556apuppU3x6qjdunXrcu+992bAgAEZMGBAli1blsceeyxJ8pvf/Caf+cxnthsJ/UcDBgzI0qVL85e//CVJcu+992b9+vU54ogjdnpcVVVVQ3gdNGhQbr755oZpz9dcc00+9KEPNQT1ww8/POPGjcvjjz+em266KUkycODA/PjHP06lUsmWLVsyZsyYhlHJ13PJJZfk8ssvz7HHHpvzzjsvBx10UB599NGdHtMYr34wsKvQO3DgwNx+++15+umnkyTXX399TjvttO32W7ZsWT7+8Y9n5MiROeyww3LXXXelvr5+h+d87fv4WgMGDMiCBQtSLpezZcuWjB07Nvfff3/D9gMOOKBh+nTy9+B7wgknNPQXAMVnxBaANuH888/PqlWrGpZHjRqVb3zjGxkyZEj23XffHH300W/ovK+88kpOPvnk1NXV5fzzz88BBxyQJPn2t7+dr3/966lUKg0PnHrtFNgdOeiggzJ16tSceeaZqa+vT4cOHXLFFVfscuTv6KOPzje+8Y1MmzYt5513XtavX5/Pfe5zKZfLec973rPdvbx77rlnLrrooowePTpHH310zjvvvMyYMSMnnnhi6urq8pGPfCRnnHHGTq952mmnZeLEiTnhhBPSvn37HHzwwTn++OMb8Y7tXM+ePXP44Yfn+OOPz3XXXfe6+w0cODBf/vKXM3r06JRKpXTu3Dlz5szZbsR7xIgRGT9+fE488cRs3bo1H/3oR/OrX/0q5XJ5u3MeeeSR+f73v58zzzwzo0aNalh/5plnZsaMGRk6dGjq6+vz6U9/Op/85Cdz9913J/n76P7ll1+eGTNm5D/+4z+ydevWnH322Y2a5gxAMZQqjZ1nBAAAAG2QqcgAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABTaW+brfp55ZlNrl/C6unfvmI0bX2rtMmC36GOKTg9TdHqYotPD7K6ePV//6/WM2LaA6uqq1i4Bdps+puj0MEWnhyk6PUxzEmwBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAotOrWLuDtZPRFdzdp/6snHtNMlQAAALx1NOuI7erVqzNq1Kgkydq1a/OFL3whI0eOzNSpU1Mul5Mkc+bMybBhwzJixIg8+OCDO90XAAAA/lGzBdurrroq559/fl555ZUkycyZMzNu3LjMnTs3lUolixYtypo1a7Jy5crMmzcvs2fPzgUXXPC6+wIAAMCONFuw3X///XPZZZc1LK9Zsyb9+/dPkgwePDjLly/PqlWrMnDgwJRKpfTq1Sv19fXZsGHDDvcFAACAHWm2e2yHDBmSJ554omG5UqmkVColSTp16pRNmzZl8+bN6datW8M+r67f0b670r17x1RXV73Jr+LN07NnlxY5BpqTnqTo9DBFp4cpOj1Mc2mxh0e1a/d/g8O1tbXp2rVrOnfunNra2m3Wd+nSZYf77srGjS+9uQW/iXr27JJnntl1OP9Hb+QYaC5vtI+hrdDDFJ0epuj0MLtrZx+MtNjX/Rx66KFZsWJFkmTJkiXp169f+vbtm6VLl6ZcLmfdunUpl8vp0aPHDvcFAACAHWmxEdsJEyZkypQpmT17dnr37p0hQ4akqqoq/fr1y/Dhw1Mul1NTU/O6+wIAAMCOlCqVSqW1i3gztOVpDa9Ou/A9thSZ6UMUnR6m6PQwRaeH2V1tYioyAAAANAfBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAotOqWvFhdXV0mTpyYJ598Mu3atcu0adNSXV2diRMnplQqpU+fPpk6dWratWuXOXPmZPHixamurs7kyZNz+OGHt2SpAAAAFESLBtvf/OY32bp1a2644YYsW7Ysl156aerq6jJu3Lh8+MMfTk1NTRYtWpRevXpl5cqVmTdvXtavX5+zzjorN998c0uWCgAAQEG06FTkAw44IPX19SmXy9m8eXOqq6uzZs2a9O/fP0kyePDgLF++PKtWrcrAgQNTKpXSq1ev1NfXZ8OGDS1ZKgAAAAXRoiO2HTt2zJNPPplPfepT2bhxY6644orcf//9KZVKSZJOnTpl06ZN2bx5c7p169Zw3Kvre/To8brn7t69Y6qrq5r9NbxRPXt2aZFjoDnpSYpOD1N0epii08M0lxYNtj/+8Y8zcODAjB8/PuvXr89pp52Wurq6hu21tbXp2rVrOnfunNra2m3Wd+my8z+CjRtfara6d1fPnl3yzDObmnzcGzkGmssb7WNoK/QwRaeHKTo9zO7a2QcjLToVuWvXrg0Bda+99srWrVtz6KGHZsWKFUmSJUuWpF+/funbt2+WLl2acrmcdevWpVwu73S0FgAAgLevFh2x/eIXv5jJkydn5MiRqauryznnnJPDDjssU6ZMyezZs9O7d+8MGTIkVVVV6devX4YPH55yuZyampqWLBMAAIACKVUqlUprF/FmaMvTGl6ddjH6orubdNzVE49ppoqg6Uwfouj0MEWnhyk6PczuajNTkQEAAODNJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBojQq2X/7yl/OLX/wiW7Zs2e0L/vCHP8zw4cNzyimnZN68eVm7dm2+8IUvZOTIkZk6dWrK5XKSZM6cORk2bFhGjBiRBx98cLevCwAAwFtTo4PtPffck+OOOy4XXHDBGw6aK1asyP/8z//k+uuvzzXXXJO//vWvmTlzZsaNG5e5c+emUqlk0aJFWbNmTVauXJl58+Zl9uzZueCCC97Q9QAAAHjrq27MTv3790///v3z8ssv54477sjYsWPTuXPnDBs2LCNHjkz79u0bdbGlS5fmve99b772ta9l8+bN+eY3v5mbbrop/fv3T5IMHjw4y5YtywEHHJCBAwemVCqlV69eqa+vz4YNG9KjR4/XPXf37h1TXV3VqDpaQ8+eXVrkGGhOepKi08MUnR6m6PQwzaVRwTb5+2jrLbfckmXLlmXw4MH59Kc/neXLl2fMmDH50Y9+1KhzbNy4MevWrcsVV1yRJ554ImPGjEmlUkmpVEqSdOrUKZs2bcrmzZvTrVu3huNeXb+zYLtx40uNfSktrmfPLnnmmU1NPu6NHAPN5Y32MbQVepii08MUnR5md+3sg5FGBduPf/zj2XffffPZz342NTU16dChQ5Lkwx/+cD772c82upBu3bqld+/ead++fXr37p0999wzf/3rXxu219bWpmvXruncuXNqa2u3Wd+li093AAAA2F6j7rH9yU9+kksvvTQnnXRSkmTt2rV/P7hduyxYsKDRFzvqqKNyzz33pFKp5Kmnnsrf/va3DBgwICtWrEiSLFmyJP369Uvfvn2zdOnSlMvlrFu3LuVyeaejtQAAALx9NWrEdvHixVmwYEEWLFiQ5557Ll/96lfzxS9+McOHD2/SxT7+8Y/n/vvvz7Bhw1KpVFJTU5N99903U6ZMyezZs9O7d+8MGTIkVVVV6devX4YPH55yuZyampo39OIAAAB46ytVKpXKrnY64YQTctNNN6Vjx45Jkr/97W/5/Oc/n9tuu63ZC2ystjxf/9X7CUZfdHeTjrt64jHNVBE0nftiKDo9TNHpYYpOD7O7dnaPbaOmItfV1W3z5OM99thj96sCAACAN0GjpiIfe+yxOe200/KpT30qpVIpv/zlL3PMMUYTAQAAaH2NCrbnnntu7rjjjtx///2prq7OqaeemmOPPba5awMAAIBdavT32B544IHZe++98+otuffff38+9KEPNVthAAAA0BiNCrYXXHBBfv3rX2e//fZrWFcqlfJf//VfzVYYAAAANEajgu2yZctyxx13pEOHDs1dDwAAADRJo56KvN9++6UR3woEAAAALa5RI7Z77bVXjj/++Hzwgx/c5mt/Zs6c2WyFAQAAQGM0KtgOGjQogwYNau5aAAAAoMkaFWxPPvnkPPHEE/njH/+YgQMHZv369ds8SAoAAABaS6Pusf35z3+eMWPGZMaMGXnhhRcyYsSI3HLLLc1dGwAAAOxSo4LtVVddleuvvz6dOnXKO97xjixYsCBXXnllc9cGAAAAu9SoYNuuXbt07ty5YXmfffZJu3aNOhQAAACaVaPuse3Tp0+uvfbabN26NY888kjmzp2bQw45pLlrAwAAgF1q1LBrTU1Nnnrqqey5556ZPHlyOnfunKlTpzZ3bQAAALBLjRqx7dixY8aPH5/x48c3dz0AAADQJI0KtoccckhKpdI263r27JklS5Y0S1EAAADQWI0Ktr/73e8afq6rq8tdd92VBx54oNmKAgAAgMZq8qON99hjj3zqU5/Kfffd1xz1AAAAQJM0asR24cKFDT9XKpU8+uijqa5u1KEAAADQrBqVTlesWLHNcvfu3XPppZc2S0EAAADQFI0KtjNnzmzuOgAAAOANaVSwPeaYY7Z7KnLy92nJpVIpixYtetMLAwAAgMZoVLA98cQTs8cee+Tzn/98qqurc9ttt+Whhx7KOeec09z1AQAAwE41Ktjec889mT9/fsPyaaedllNOOSXvfve7m60wAAAAaIxGf93P8uXLG37+9a9/nU6dOjVLQQAAANAUjRqx/fa3v50JEybk2WefTZL07t07s2bNatbCAAAAoDEaFWwPO+yw3H777dmwYUM6dOiQjh07NnddAAAA0CiNmor85JNP5vTTT8+IESNSW1ubU089NU888URz1wYAAAC71KhgW1NTky996Uvp2LFj9t5775xwwgmZMGFCc9cGAAAAu9SoYLtx48YMHDgwSVIqlfL5z38+mzdvbtbCAAAAoDEaFWw7dOiQv/71rymVSkmS//7v/0779u2btTAAAABojEY9PGrSpEn5t3/7tzz++OMZOnRoXnjhhXz3u99t7toAAABglxoVbJ977rn89Kc/zZ///OfU19end+/eRmwBAABoExo1Ffniiy/OHnvskT59+uSQQw4RagEAAGgzGjViu99++2XSpEk54ogj0qFDh4b1J510UrMVBgAAAI2x02D71FNP5Z3vfGe6d++eJFm9evU22wVbAAAAWttOg+1Xv/rVLFiwIDNnzszVV1+d0aNHt1RdAAAA0Cg7vce2Uqk0/Hzbbbc1ezEAAADQVDsNtq9+b22ybcgFAACAtqJRT0VOtg25AAAA0Fbs9B7bRx99NJ/4xCeS/P1BUq/+XKlUUiqVsmjRouavEAAAAHZip8H2l7/8ZUvVAQAAAG/IToPtu9/97paqAwAAAN6QRt9jCwAAAG2RYAsAAEChCbYAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAEChCbYAAAAUWqsE2+eeey4f+9jH8thjj2Xt2rX5whe+kJEjR2bq1Kkpl8tJkjlz5mTYsGEZMWJEHnzwwdYoEwAAgAJo8WBbV1eXmpqadOjQIUkyc+bMjBs3LnPnzk2lUsmiRYuyZs2arFy5MvPmzcvs2bNzwQUXtHSZAAAAFESLB9tZs2ZlxIgR2WeffZIka9asSf/+/ZMkgwcPzvLly7Nq1aoMHDgwpVIpvXr1Sn19fTZs2NDSpQIAAFAA1S15sfnz56dHjx4ZNGhQrrzyyiRJpVJJqVRKknTq1CmbNm3K5s2b061bt4bjXl3fo0eP1z139+4dU11d1bwvYDf07NmlRY6B5qQnKTo9TNHpYYpOD9NcWjTY3nzzzSmVSrn33nvzyCOPZMKECduMxNbW1qZr167p3Llzamtrt1nfpcvO/wg2bnyp2ereXT17dskzz2xq8nFv5BhoLm+0j6Gt0MMUnR6m6PQwu2tnH4y06FTk6667Ltdee22uueaavO9978usWbMyePDgrFixIkmyZMmS9OvXL3379s3SpUtTLpezbt26lMvlnY7WAgAA8PbVoiO2OzJhwoRMmTIls2fPTu/evTNkyJBUVVWlX79+GT58eMrlcmpqalq7TAAAANqoVgu211xzTcPP11577XbbzzrrrJx11lktWRIAAAAF1CrfYwsAAABvFsEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEKrbu0CeHOMvujuJu1/9cRjmqkSAACAlmXEFgAAgEITbAEAACg0U5Hfppoyddm0ZQAAoC0zYgsAAEChCbYAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAEChCbYAAAAUmmALAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAEChVbd2Abz1jL7o7kbve/XEY5qxEgAA4O3AiC0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBovu6HXWrK1/cAAAC0NCO2AAAAFJpgCwAAQKGZityGmQIMAACwa0ZsAQAAKDTBFgAAgEITbAEAACg0wRYAAIBCE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEITbAEAACi06pa8WF1dXSZPnpwnn3wyW7ZsyZgxY3LQQQdl4sSJKZVK6dOnT6ZOnZp27dplzpw5Wbx4caqrqzN58uQcfvjhLVkqAAAABdGiwfbWW29Nt27dcvHFF2fjxo05+eSTc8ghh2TcuHH58Ic/nJqamixatCi9evXKypUrM2/evKxfvz5nnXVWbr755pYsFQAAgIJo0WB73HHHZciQIQ3LVVVVWbNmTfr3758kGTx4cJYtW5YDDjggAwcOTKlUSq9evVJfX58NGzakR48eLVkuAAAABdCiwbZTp05Jks2bN2fs2LEZN25cZs2alVKp1LB906ZN2bx5c7p167bNcZs2bdppsO3evWOqq6ua9wXshp49u7R2CW3S6IvubtL+t/2/oc1UCY2hjyk6PUzR6WGKTg/TXFo02CbJ+vXr87WvfS0jR47MiSeemIsvvrhhW21tbbp27ZrOnTuntrZ2m/Vduuz8j2Djxpearebd1bNnlzzzzKbWLuMtwfvYevQxRaeHKTo9TNHpYXbXzj4YadGnIj/77LMZPXp0zj333AwbNixJcuihh2bFihVJkiVLlqRfv37p27dvli5dmnK5nHXr1qVcLpuGDAAAwA616IjtFVdckRdffDGXX355Lr/88iTJeeedl+nTp2f27Nnp3bt3hgwZkqqqqvTr1y/Dhw9PuVxOTU1KnTOFAAAHMElEQVRNS5YJAABAgZQqlUqltYt4M7TlaQ2vTrto6v2kbO/qice0dglvW6YPUXR6mKLTwxSdHmZ3tZmpyAAAAPBmE2wBAAAoNMEWAACAQhNsAQAAKDTBFgAAgEJr0a/7gd3V1CdLe4oyAAC89RmxBQAAoNAEWwAAAApNsAUAAKDQ3GPLW1pT7sl1Py4AABSTYAv/Pw+mAgCAYjIVGQAAgEITbAEAACg0wRYAAIBCE2wBAAAoNA+PgjfIE5cBAKBtMGILAABAoQm2AAAAFJpgCwAAQKEJtgAAABSaYAsAAECheSoytICmPEE58RRlAABoCiO2AAAAFJoRW2iDjPACAEDjGbEFAACg0ARbAAAACk2wBQAAoNAEWwAAAApNsAUAAKDQPBUZ3mY8cRkAgLcaI7YAAAAUmhFbeAto6igsAAC8lRixBQAAoNCM2AI71ZTRYPfjAgDQGozYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFJtgCAABQaIItAAAAhSbYAgAAUGiCLQAAAIUm2AIAAFBogi0AAACFVt3aBQBvHaMvurtJ+1898ZhmqgQAgLcTI7YAAAAUmmALAABAoZmKDLQaU5cBAHgzGLEFAACg0IzYAoXR1BHepjAaDABQXIItQDMz5RoAoHkJtgBN1JwjxwAANJ17bAEAACg0wRYAAIBCa7NTkcvlcr71rW/l97//fdq3b5/p06fnPe95T2uXBbxFmV68PfcGAwBF0WaD7V133ZUtW7bkxhtvzAMPPJCLLrooP/jBD1q7LIBmV9SQ7anVO9aU96XIrxMAWlObDbarVq3KoEGDkiRHHnlkHn744VauCIDW0pZGj5szwBf1Q42maurvpzk/HHi7fPDQVt7Dpp6/Lf3ts2NN+R3d9v+GNmMlTfN2+TD27fQ3VKpUKpXWLmJHzjvvvHzyk5/Mxz72sSTJv/zLv+Suu+5KdXWbzeIAAAC0gjb78KjOnTuntra2YblcLgu1AAAAbKfNBtu+fftmyZIlSZIHHngg733ve1u5IgAAANqiNjsV+dWnIv/hD39IpVLJhRdemAMPPLC1ywIAAKCNabPBFgAAABqjzU5FBgAAgMYQbAEAACg0jxluRq/eJ/z73/8+7du3z/Tp0/Oe97yntcuC7axevTqXXHJJrrnmmqxduzYTJ05MqVRKnz59MnXq1LRr1y5z5szJ4sWLU11dncmTJ+fwww9/3X2hpdTV1WXy5Ml58skns2XLlowZMyYHHXSQHqZQ6uvrc/755+dPf/pTqqqqMnPmzFQqFX1MoTz33HM55ZRTcvXVV6e6ulr/0uJ0TTO66667smXLltx4440ZP358LrrootYuCbZz1VVX5fzzz88rr7ySJJk5c2bGjRuXuXPnplKpZNGiRVmzZk1WrlyZefPmZfbs2bngggted19oSbfeemu6deuWuXPn5qqrrsq0adP0MIXz61//Oklyww03ZOzYsZk5c6Y+plDq6upSU1OTDh06JPFvCVqHYNuMVq1alUGDBiVJjjzyyDz88MOtXBFsb//9989ll13WsLxmzZr0798/STJ48OAsX748q1atysCBA1MqldKrV6/U19dnw4YNO9wXWtJxxx2Xs88+u2G5qqpKD1M4xx57bKZNm5YkWbduXfbee299TKHMmjUrI0aMyD777JPEvyVoHYJtM9q8eXM6d+7csFxVVZWtW7e2YkWwvSFDhqS6+v/uSqhUKimVSkmSTp06ZdOmTdv18qvrd7QvtKROnTqlc+fO2bx5c8aOHZtx48bpYQqpuro6EyZMyLRp0zJkyBB9TGHMnz8/PXr0aBjMSfxbgtYh2Dajzp07p7a2tmG5XC5vEyCgLXrtfS21tbXp2rXrdr1cW1ubLl267HBfaGnr16/PqaeemqFDh+bEE0/UwxTWrFmz8stf/jJTpkxpuD0k0ce0bTfffHOWL1+eUaNG5ZFHHsmECROyYcOGhu36l5Yi2Dajvn37ZsmSJUmSBx54IO9973tbuSLYtUMPPTQrVqxIkixZsiT9+vVL3759s3Tp0pTL5axbty7lcjk9evTY4b7Qkp599tmMHj065557boYNG5ZED1M8CxcuzA9/+MMkyT/90z+lVCrlsMMO08cUwnXXXZdrr70211xzTd73vvdl1qxZGTx4sP6lxZUqlUqltYt4q3r1qch/+MMfUqlUcuGFF+bAAw9s7bJgO0888US+/vWv56abbsqf/vSnTJkyJXV1dendu3emT5+eqqqqXHbZZVmyZEnK5XImTZqUfv36ve6+0FKmT5+eX/ziF+ndu3fDuvPOOy/Tp0/XwxTGSy+9lEmTJuXZZ5/N1q1b8+UvfzkHHnig/xZTOKNGjcq3vvWttGvXTv/S4gRbAAAACs1UZAAAAApNsAUAAKDQBFsAAAAKTbAFAACg0ARbAAAACk2wBQAAoNAEWwAAAApNsAUAAKDQ/j9hRrA6KsVZQgAAAABJRU5ErkJggg==\n", 346 | "text/plain": [ 347 | "" 348 | ] 349 | }, 350 | "metadata": {}, 351 | "output_type": "display_data" 352 | } 353 | ], 354 | "source": [ 355 | "# Longitud en tokens de cada noticia\n", 356 | "tokens_numbers = df.swifter.apply(lambda row: len(row['Body']), axis = 1)\n", 357 | "\n", 358 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 359 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 70, ax=ax)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 15, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "Número medio de tokens por artículo: 404\n", 372 | "Desviación estándar de tokens por artículo: 424\n", 373 | "Mediana de tokens por artículo: 289\n", 374 | "Ventana de tokens escogida: 1254 - Cubre el 96.20603015075378% del dataset\n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "print(\"Número medio de tokens por artículo: {}\".format(int(np.mean(tokens_numbers))))\n", 380 | "print(\"Desviación estándar de tokens por artículo: {}\".format(int(np.std(tokens_numbers))))\n", 381 | "print(\"Mediana de tokens por artículo: {}\".format(int(np.median(tokens_numbers))))\n", 382 | "\n", 383 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 384 | "max_tokens = int(max_tokens)\n", 385 | "\n", 386 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 387 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "Para los títulos" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 28, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "Número medio de tokens por título: 7\n", 407 | "Desviación estándar de tokens por título: 3\n", 408 | "Mediana de tokens por título: 8\n", 409 | "Ventana de tokens escogida: 13 - Cubre el 97.9145728643216% del dataset\n" 410 | ] 411 | }, 412 | { 413 | "data": { 414 | "image/png": "\n", 415 | "text/plain": [ 416 | "" 417 | ] 418 | }, 419 | "metadata": {}, 420 | "output_type": "display_data" 421 | } 422 | ], 423 | "source": [ 424 | "# Longitud en tokens de cada noticia\n", 425 | "tokens_numbers = df.apply(lambda row: len(row['Headline']), axis = 1)\n", 426 | "\n", 427 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 428 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 30, ax=ax)\n", 429 | "\n", 430 | "print(\"Número medio de tokens por título: {}\".format(int(np.mean(tokens_numbers))))\n", 431 | "print(\"Desviación estándar de tokens por título: {}\".format(int(np.std(tokens_numbers))))\n", 432 | "print(\"Mediana de tokens por título: {}\".format(int(np.median(tokens_numbers))))\n", 433 | "\n", 434 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 435 | "max_tokens = int(max_tokens)\n", 436 | "\n", 437 | "n_tokens = max_tokens\n", 438 | "\n", 439 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 440 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 11, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "df['Headline'] = df.apply(lambda r: pad_array(r['Headline'], MAX_LEN_TITLE) , axis=1)\n", 464 | "df['Body'] = df.apply(lambda r: pad_array(r['Body'], MAX_LEN_CONTENT) , axis=1)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "Convertir label" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 12, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "df.loc[df['Label'] == 1, 'Label'] = pd.Series([LBL_TRUE] * len(df))\n", 481 | "df.loc[df['Label'] == 0, 'Label'] = pd.Series([LBL_FAKE] * len(df))\n", 482 | "#loc[df1['stream'] == 2, 'feat'] = 10" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 13, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "data": { 492 | "text/html": [ 493 | "
\n", 494 | "\n", 507 | "\n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | "
HeadlineBodyLabel
0[2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0...[15680, 8429, 28683, 14257, 312, 281, 565, 611...[0, 0, 0, 1]
1[556, 2168, 3912, 5042, 2360, 508, 115, 948, 0...[4949, 20876, 17535, 3912, 46, 3610, 556, 3230...[0, 0, 0, 1]
2[13034, 12828, 316, 48926, 553, 16854, 23325, ...[7, 14446, 1506, 69, 208, 995, 64, 116, 61, 60...[0, 0, 0, 1]
3[2905, 1469, 1930, 8, 44732, 2, 3538, 19568, 4...[29922, 2905, 6585, 4840, 301, 11, 1080, 1428,...[0, 0, 0, 1]
4[2678, 3466, 8, 11440, 5739, 0, 0, 0, 0, 0, 0,...[4868, 3076, 2678, 31, 10, 3246, 14, 11, 11440...[0, 0, 0, 1]
\n", 549 | "
" 550 | ], 551 | "text/plain": [ 552 | " Headline \\\n", 553 | "0 [2675, 1143, 2205, 48926, 6117, 13034, 0, 0, 0... \n", 554 | "1 [556, 2168, 3912, 5042, 2360, 508, 115, 948, 0... \n", 555 | "2 [13034, 12828, 316, 48926, 553, 16854, 23325, ... \n", 556 | "3 [2905, 1469, 1930, 8, 44732, 2, 3538, 19568, 4... \n", 557 | "4 [2678, 3466, 8, 11440, 5739, 0, 0, 0, 0, 0, 0,... \n", 558 | "\n", 559 | " Body Label \n", 560 | "0 [15680, 8429, 28683, 14257, 312, 281, 565, 611... [0, 0, 0, 1] \n", 561 | "1 [4949, 20876, 17535, 3912, 46, 3610, 556, 3230... [0, 0, 0, 1] \n", 562 | "2 [7, 14446, 1506, 69, 208, 995, 64, 116, 61, 60... [0, 0, 0, 1] \n", 563 | "3 [29922, 2905, 6585, 4840, 301, 11, 1080, 1428,... [0, 0, 0, 1] \n", 564 | "4 [4868, 3076, 2678, 31, 10, 3246, 14, 11, 11440... [0, 0, 0, 1] " 565 | ] 566 | }, 567 | "execution_count": 13, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "df.head()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 14, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "df.to_pickle(PATH_PROCESSED)" 583 | ] 584 | } 585 | ], 586 | "metadata": { 587 | "kernelspec": { 588 | "display_name": "Python 3", 589 | "language": "python", 590 | "name": "python3" 591 | }, 592 | "language_info": { 593 | "codemirror_mode": { 594 | "name": "ipython", 595 | "version": 3 596 | }, 597 | "file_extension": ".py", 598 | "mimetype": "text/x-python", 599 | "name": "python", 600 | "nbconvert_exporter": "python", 601 | "pygments_lexer": "ipython3", 602 | "version": "3.6.7" 603 | } 604 | }, 605 | "nbformat": 4, 606 | "nbformat_minor": 2 607 | } 608 | -------------------------------------------------------------------------------- /notebooks/GettingRealAboutFake.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Procesado del dataset: Getting Real About Fake News\n", 8 | "## Fuente: Kaggle. https://www.kaggle.com/mrisdal/fake-news" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import pandas as pd\n", 18 | "import numpy as np\n", 19 | "\n", 20 | "from matplotlib import pyplot as plt\n", 21 | "from matplotlib import style\n", 22 | "\n", 23 | "#One-hot encodding\n", 24 | "from sklearn.preprocessing import LabelBinarizer\n", 25 | "\n", 26 | "#Progress bars\n", 27 | "from tqdm import tqdm\n", 28 | "tqdm.pandas()\n", 29 | "\n", 30 | "#Paralelize pandas apply on multiple cores\n", 31 | "import swifter\n", 32 | "\n", 33 | "#Nicer style\n", 34 | "style.use('seaborn') \n", 35 | "\n", 36 | "import re #regexp\n", 37 | "\n", 38 | "from nltk.tokenize import RegexpTokenizer" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "news = pd.read_csv(\"../data/Other_datasets/GettingRealAboutFake/fake.csv\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Longitudes necesarias de los artículos" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "#Padding number for title and content\n", 64 | "MAX_LEN_TITLE = 13\n", 65 | "MAX_LEN_CONTENT = 1598" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Limpieza del dataset.\n", 73 | "\n", 74 | "Nos quedamos con:\n", 75 | " * title\n", 76 | " * language == english\n", 77 | " * text\n", 78 | " * type\n", 79 | " * type != bs $^*$\n", 80 | " \n", 81 | "$^*$ *Data sources that were missing a label were simply assigned a label of \"bs\". There are (ostensibly) no genuine, reliable, or trustworthy news sources represented in this dataset (so far), so don't trust anything you read.*" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "news = news[news['language'] == 'english']\n", 91 | "news = news[news['type'] != 'bs']\n", 92 | "news = news[['title', 'text', 'type']]" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | "
titletexttype
0Muslims BUSTED: They Stole Millions In Gov’t B...Print They should pay all the back all the mon...bias
1Re: Why Did Attorney General Loretta Lynch Ple...Why Did Attorney General Loretta Lynch Plead T...bias
2BREAKING: Weiner Cooperating With FBI On Hilla...Red State : \\nFox News Sunday reported this mo...bias
3PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...Email Kayla Mueller was a prisoner and torture...bias
4FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...bias
\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " title \\\n", 163 | "0 Muslims BUSTED: They Stole Millions In Gov’t B... \n", 164 | "1 Re: Why Did Attorney General Loretta Lynch Ple... \n", 165 | "2 BREAKING: Weiner Cooperating With FBI On Hilla... \n", 166 | "3 PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe... \n", 167 | "4 FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal... \n", 168 | "\n", 169 | " text type \n", 170 | "0 Print They should pay all the back all the mon... bias \n", 171 | "1 Why Did Attorney General Loretta Lynch Plead T... bias \n", 172 | "2 Red State : \\nFox News Sunday reported this mo... bias \n", 173 | "3 Email Kayla Mueller was a prisoner and torture... bias \n", 174 | "4 Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ... bias " 175 | ] 176 | }, 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "news.head()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "El dataset incluye distintos tipos de noticias falsas categorizadas junto a metadatos. Se pretende entrenar un modelo que distinga también verdaderas, por lo que debemos añadir samples verdaderos de FakeNewsCorpus.\n", 191 | "\n", 192 | "Previamente se han extraido sólamente las verdaderas en `data/only_true.csv`." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "true_news = pd.read_csv(\"../data/Other_datasets/GettingRealAboutFake/news_only_true.csv\", nrows=1000)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "#Poner todos los type como true\n", 211 | "true_news['type'] = 'true'" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 8, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "true_news = true_news[['title', 'content', 'type']]\n", 221 | "true_news = true_news.rename(str, columns={\"content\": \"text\"})" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 9, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/html": [ 232 | "
\n", 233 | "\n", 246 | "\n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | "
titletexttype
0Iranian Christian Convert Leads 1,500 Muslims ...(Screencap: YouTube/Tidningen Dagen) Annahita ...true
1Finding wonder and foreboding in the pathway o...It is rather hard to write with eclipse glasse...true
2#MeToo! Our culture of sexual predation – Bapt...Social media is blowing up about a culture of ...true
3God’s invitation to life – Baptist News GlobalMany preachers took up the Isaiah 55 passage t...true
4What is sown in the heart – Baptist News GlobalA trip to the cardiologist is rarely routine. ...true
\n", 288 | "
" 289 | ], 290 | "text/plain": [ 291 | " title \\\n", 292 | "0 Iranian Christian Convert Leads 1,500 Muslims ... \n", 293 | "1 Finding wonder and foreboding in the pathway o... \n", 294 | "2 #MeToo! Our culture of sexual predation – Bapt... \n", 295 | "3 God’s invitation to life – Baptist News Global \n", 296 | "4 What is sown in the heart – Baptist News Global \n", 297 | "\n", 298 | " text type \n", 299 | "0 (Screencap: YouTube/Tidningen Dagen) Annahita ... true \n", 300 | "1 It is rather hard to write with eclipse glasse... true \n", 301 | "2 Social media is blowing up about a culture of ... true \n", 302 | "3 Many preachers took up the Isaiah 55 passage t... true \n", 303 | "4 A trip to the cardiologist is rarely routine. ... true " 304 | ] 305 | }, 306 | "execution_count": 9, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "true_news.head()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "Concatenar y shufflear Dataframes" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 10, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "news = pd.concat([news, true_news])\n", 329 | "news = news.sample(frac=1).reset_index(drop=True)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Agrupamos los tipos de noticia en bias - fake - true" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 11, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "news.loc[news['type'] =='conspiracy', 'type'] = 'fake'\n", 346 | "news.loc[news['type'] =='hate', 'type'] = 'fake'\n", 347 | "news.loc[news['type'] =='junksci', 'type'] = 'fake'\n", 348 | "news.loc[news['type'] =='satire', 'type'] = 'fake'\n", 349 | "news.loc[news['type'] =='state', 'type'] = 'fake'" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "**Noticias por categoría**" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 12, 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "" 368 | ] 369 | }, 370 | "execution_count": 12, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | }, 374 | { 375 | "data": { 376 | "image/png": "\n", 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "metadata": { 382 | "needs_background": "light" 383 | }, 384 | "output_type": "display_data" 385 | } 386 | ], 387 | "source": [ 388 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 389 | "news.groupby(\"type\").count()['title'].plot.bar(title = \"Number of articles by type\", ax = ax)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "### Vectorización" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 13, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "from gensim.models import KeyedVectors" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 14, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "model = KeyedVectors.load_word2vec_format(\"../data/GoogleNews-vectors-negative300.bin.gz\", binary=True,\n", 415 | " limit=50000)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 15, 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stderr", 425 | "output_type": "stream", 426 | "text": [ 427 | "100%|██████████| 2503/2503 [00:00<00:00, 17425.65it/s]\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "news['title'] = news.progress_apply(lambda r: \n", 433 | " [model.vocab[x].index for x in str(r['title']) if x in model.vocab], axis=1)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 16, 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "name": "stderr", 443 | "output_type": "stream", 444 | "text": [ 445 | "100%|██████████| 2503/2503 [00:01<00:00, 2373.28it/s]\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "news['content'] = news.progress_apply(lambda r: \n", 451 | " [model.vocab[x].index for x in str(r['text']) if x in model.vocab], axis=1)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 17, 457 | "metadata": {}, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/html": [ 462 | "
\n", 463 | "\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | "
titletypecontent
0[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...fake[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...
1[5977, 4211, 7726, 11538, 9311, 8469, 4211, 12...fake[5760, 5760]
2[7203, 15775, 15775, 3581, 7726, 17919, 4211, ...fake[3708, 4211, 5760, 7726, 15775, 4211, 6869, 42...
3[11969, 4211, 1280, 4883, 4501, 23199, 8469, 2...fake[3708, 11538, 7726, 11538, 73, 17919, 7726, 45...
4[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...fake[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...
\n", 518 | "
" 519 | ], 520 | "text/plain": [ 521 | " title type \\\n", 522 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... fake \n", 523 | "1 [5977, 4211, 7726, 11538, 9311, 8469, 4211, 12... fake \n", 524 | "2 [7203, 15775, 15775, 3581, 7726, 17919, 4211, ... fake \n", 525 | "3 [11969, 4211, 1280, 4883, 4501, 23199, 8469, 2... fake \n", 526 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... fake \n", 527 | "\n", 528 | " content \n", 529 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... \n", 530 | "1 [5760, 5760] \n", 531 | "2 [3708, 4211, 5760, 7726, 15775, 4211, 6869, 42... \n", 532 | "3 [3708, 11538, 7726, 11538, 73, 17919, 7726, 45... \n", 533 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... " 534 | ] 535 | }, 536 | "execution_count": 17, 537 | "metadata": {}, 538 | "output_type": "execute_result" 539 | } 540 | ], 541 | "source": [ 542 | "news = news.drop('text', axis=1)\n", 543 | "news.head()" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 18, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "news = news[news['title'].map(len) >= 1]\n", 553 | "#Reset index\n", 554 | "news = news.reset_index().drop(\"index\", axis=1)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 19, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "def pad_array(array, token_len):\n", 564 | " diff_token = token_len - len(array)\n", 565 | " if(diff_token < 0):\n", 566 | " array = array[:token_len] #Truncate\n", 567 | " else:\n", 568 | " #Pad\n", 569 | " array += [0]*diff_token #Pad\n", 570 | " \n", 571 | " return array " 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 20, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stderr", 581 | "output_type": "stream", 582 | "text": [ 583 | "Pandas Apply: 100%|██████████| 2503/2503 [00:00<00:00, 45125.24it/s]\n" 584 | ] 585 | }, 586 | { 587 | "data": { 588 | "text/plain": [ 589 | "" 590 | ] 591 | }, 592 | "execution_count": 20, 593 | "metadata": {}, 594 | "output_type": "execute_result" 595 | }, 596 | { 597 | "data": { 598 | "image/png": "\n", 599 | "text/plain": [ 600 | "" 601 | ] 602 | }, 603 | "metadata": { 604 | "needs_background": "light" 605 | }, 606 | "output_type": "display_data" 607 | } 608 | ], 609 | "source": [ 610 | "# Longitud en tokens de cada noticia\n", 611 | "tokens_numbers = news.swifter.apply(lambda row: len(row['content']), axis = 1)\n", 612 | "\n", 613 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 614 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 70, ax=ax)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 21, 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "Número medio de tokens por artículo: 2260\n", 627 | "Desviación estándar de tokens por artículo: 2892\n", 628 | "Mediana de tokens por artículo: 1592\n", 629 | "Ventana de tokens escogida: 8045 - Cubre el 96.92369157011586% del dataset\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "print(\"Número medio de tokens por artículo: {}\".format(int(np.mean(tokens_numbers))))\n", 635 | "print(\"Desviación estándar de tokens por artículo: {}\".format(int(np.std(tokens_numbers))))\n", 636 | "print(\"Mediana de tokens por artículo: {}\".format(int(np.median(tokens_numbers))))\n", 637 | "\n", 638 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 639 | "max_tokens = int(max_tokens)\n", 640 | "\n", 641 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 642 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 22, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "n_tokens = max_tokens\n", 652 | "news['content'] = news.apply(lambda r: pad_array(r['content'], MAX_LEN_CONTENT) , axis=1) #Use necessary for the model" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 23, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "name": "stderr", 662 | "output_type": "stream", 663 | "text": [ 664 | "\r", 665 | "Pandas Apply: 0%| | 0/2503 [00:00" 690 | ] 691 | }, 692 | "metadata": { 693 | "needs_background": "light" 694 | }, 695 | "output_type": "display_data" 696 | } 697 | ], 698 | "source": [ 699 | "# Longitud en tokens de cada noticia\n", 700 | "tokens_numbers = news.apply(lambda row: len(row['title']), axis = 1)\n", 701 | "\n", 702 | "fig, ax = plt.subplots(1,1, figsize=(16,6))\n", 703 | "tokens_numbers.plot.hist(title=\"Number of tokens in the article\", bins = 30, ax=ax)\n", 704 | "\n", 705 | "print(\"Número medio de tokens por título: {}\".format(int(np.mean(tokens_numbers))))\n", 706 | "print(\"Desviación estándar de tokens por título: {}\".format(int(np.std(tokens_numbers))))\n", 707 | "print(\"Mediana de tokens por título: {}\".format(int(np.median(tokens_numbers))))\n", 708 | "\n", 709 | "max_tokens = np.mean(tokens_numbers) + 2 * np.std(tokens_numbers)\n", 710 | "max_tokens = int(max_tokens)\n", 711 | "\n", 712 | "n_tokens = max_tokens\n", 713 | "\n", 714 | "percent_tokens = tokens_numbers[tokens_numbers <= max_tokens].count() / tokens_numbers.count()\n", 715 | "print(\"Ventana de tokens escogida: {} - Cubre el {}% del dataset\".format(max_tokens, percent_tokens*100))\n", 716 | "\n", 717 | "#Use necessary for the model\n", 718 | "news['title'] = news.progress_apply(lambda r: pad_array(r['title'], MAX_LEN_TITLE) , axis=1)" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "### Label a categorical" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 24, 731 | "metadata": {}, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/html": [ 736 | "
\n", 737 | "\n", 750 | "\n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | "
titletypecontentone_hot_label
0[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...fake[4024, 2370, 1280, 11538, 17251, 20, 17919, 57...[0, 1, 0]
1[5977, 4211, 7726, 11538, 9311, 8469, 4211, 12...fake[5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 1, 0]
2[7203, 15775, 15775, 3581, 7726, 17919, 4211, ...fake[3708, 4211, 5760, 7726, 15775, 4211, 6869, 42...[0, 1, 0]
3[11969, 4211, 1280, 4883, 4501, 23199, 8469, 2...fake[3708, 11538, 7726, 11538, 73, 17919, 7726, 45...[0, 1, 0]
4[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...fake[3581, 4501, 11538, 16205, 8303, 20, 5760, 846...[0, 1, 0]
\n", 798 | "
" 799 | ], 800 | "text/plain": [ 801 | " title type \\\n", 802 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... fake \n", 803 | "1 [5977, 4211, 7726, 11538, 9311, 8469, 4211, 12... fake \n", 804 | "2 [7203, 15775, 15775, 3581, 7726, 17919, 4211, ... fake \n", 805 | "3 [11969, 4211, 1280, 4883, 4501, 23199, 8469, 2... fake \n", 806 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... fake \n", 807 | "\n", 808 | " content one_hot_label \n", 809 | "0 [4024, 2370, 1280, 11538, 17251, 20, 17919, 57... [0, 1, 0] \n", 810 | "1 [5760, 5760, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... [0, 1, 0] \n", 811 | "2 [3708, 4211, 5760, 7726, 15775, 4211, 6869, 42... [0, 1, 0] \n", 812 | "3 [3708, 11538, 7726, 11538, 73, 17919, 7726, 45... [0, 1, 0] \n", 813 | "4 [3581, 4501, 11538, 16205, 8303, 20, 5760, 846... [0, 1, 0] " 814 | ] 815 | }, 816 | "execution_count": 24, 817 | "metadata": {}, 818 | "output_type": "execute_result" 819 | } 820 | ], 821 | "source": [ 822 | "encoder = LabelBinarizer().fit(list(news['type']))\n", 823 | "news['one_hot_label'] = news.apply(lambda r: encoder.transform([r['type']])[0], axis=1)\n", 824 | "news.head()" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 25, 830 | "metadata": {}, 831 | "outputs": [ 832 | { 833 | "name": "stdout", 834 | "output_type": "stream", 835 | "text": [ 836 | "LABELS\n", 837 | "bias [1 0 0]\n", 838 | "fake [0 1 0]\n", 839 | "true [0 0 1]\n" 840 | ] 841 | } 842 | ], 843 | "source": [ 844 | "print(\"LABELS\")\n", 845 | "enc = encoder.transform(encoder.classes_)\n", 846 | "for x, y in zip(encoder.classes_, enc):\n", 847 | " print(x, y)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 26, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "### Guardar el dataset\n", 857 | "news.to_pickle('../data/news_getting_real.pickle')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": {}, 864 | "outputs": [], 865 | "source": [] 866 | } 867 | ], 868 | "metadata": { 869 | "kernelspec": { 870 | "display_name": "Python 3", 871 | "language": "python", 872 | "name": "python3" 873 | }, 874 | "language_info": { 875 | "codemirror_mode": { 876 | "name": "ipython", 877 | "version": 3 878 | }, 879 | "file_extension": ".py", 880 | "mimetype": "text/x-python", 881 | "name": "python", 882 | "nbconvert_exporter": "python", 883 | "pygments_lexer": "ipython3", 884 | "version": "3.6.7" 885 | } 886 | }, 887 | "nbformat": 4, 888 | "nbformat_minor": 2 889 | } 890 | --------------------------------------------------------------------------------