├── span_identification ├── __init__.py ├── ner │ ├── __init__.py │ ├── bert_lstm_crf.py │ ├── utils_ner.py │ └── run_ner.py ├── submission.py ├── dataset.py └── __main__.py ├── .gitattributes ├── technique_classification ├── __init__.py ├── transformers_classifier │ ├── __init__.py │ └── utils.py ├── dataset.py ├── submission.py └── __main__.py ├── tools ├── ._README.md ├── data │ ├── submission-task-SI.tsv │ ├── article736757214.task-SI.labels │ ├── submission-task-TC.tsv │ ├── article736757214.labels-task-TC │ ├── propaganda-techniques-names-semeval2020task11.txt │ ├── propaganda-techniques-names.txt │ └── article736757214.txt ├── src │ ├── annotation_task_si.py │ ├── propaganda_techniques.py │ ├── annotation.py │ ├── annotation_w_o_label.py │ └── annotations.py ├── print_spans.py ├── task-TC_scorer.py └── README.md ├── visualization_example └── visualization │ ├── highlight.js │ ├── __init__.py │ ├── html_template.py │ └── highlight.css ├── requirements.txt ├── configs ├── si_config.yml └── tc_config.yml ├── .gitignore ├── README.md └── results └── SI_output.txt /span_identification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py linguist-detectable=true 2 | *.ipynb linguist-detectable=false 3 | -------------------------------------------------------------------------------- /technique_classification/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformers_classifier import transformers_clf -------------------------------------------------------------------------------- /tools/._README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aschern/semeval2020_task11/HEAD/tools/._README.md 
-------------------------------------------------------------------------------- /tools/data/submission-task-SI.tsv: -------------------------------------------------------------------------------- 1 | 736757214 0 50 2 | 736757214 161 172 3 | 736757214 0 10 4 | 736757214 115 167 5 | -------------------------------------------------------------------------------- /tools/data/article736757214.task-SI.labels: -------------------------------------------------------------------------------- 1 | 736757214 0 59 2 | 736757214 171 181 3 | 736757214 0 9 4 | 736757214 115 167 5 | 736757214 740 759 6 | -------------------------------------------------------------------------------- /tools/data/submission-task-TC.tsv: -------------------------------------------------------------------------------- 1 | 736757214 Exaggeration,Minimisation 0 59 2 | 736757214 Doubt 171 181 3 | 736757214 Name_Calling,Labeling 0 9 4 | 736757214 Name_Calling,Labeling 115 167 5 | 736757214 Loaded_Language 740 759 6 | -------------------------------------------------------------------------------- /technique_classification/transformers_classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_glue import transformers_clf 2 | from .modeling_roberta import RobertaForSequenceClassification 3 | from .utils import glue_processors, glue_output_modes, glue_compute_metrics 4 | -------------------------------------------------------------------------------- /tools/data/article736757214.labels-task-TC: -------------------------------------------------------------------------------- 1 | 736757214 Exaggeration,Minimisation 0 59 2 | 736757214 Whataboutism,Straw_Men,Red_Herring 171 181 3 | 736757214 Name_Calling,Labeling 0 9 4 | 736757214 Loaded_Language 115 167 5 | 736757214 Loaded_Language 740 759 6 | -------------------------------------------------------------------------------- /visualization_example/visualization/highlight.js: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.2.0 2 | transformers==2.3.0 3 | scipy==1.4.1 4 | numpy==1.16.4 5 | joblib==0.13.2 6 | nltk==3.4.5 7 | ConfigArgParse==1.0 8 | sklearn_crfsuite==0.3.6 9 | apex==0.1 10 | seqeval==0.0.5 11 | spacy==2.2.3 12 | Unidecode==1.1.1 13 | tqdm==4.43.0 14 | pandas==1.0.1 15 | ipython==7.13.0 16 | ptvsd==4.3.2 17 | scikit_learn==0.22.2.post1 18 | tensorboardX==2.0 19 | -------------------------------------------------------------------------------- /span_identification/ner/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_ner import transformers_ner 2 | from .modeling_roberta import RobertaForTokenClassification 3 | from .utils_ner import convert_examples_to_features, get_labels, read_examples_from_file 4 | from .run_ner_crf import transformers_ner_crf 5 | from .bert_lstm_crf import BertLstmCrf 6 | from .conditional_random_field import ConditionalRandomField, allowed_transitions 7 | -------------------------------------------------------------------------------- /tools/data/propaganda-techniques-names-semeval2020task11.txt: -------------------------------------------------------------------------------- 1 | Appeal_to_Authority 2 | Appeal_to_fear-prejudice 3 | Bandwagon,Reductio_ad_hitlerum 4 | Black-and-White_Fallacy 5 | Causal_Oversimplification 6 | Doubt 7 | Exaggeration,Minimisation 8 | Flag-Waving 9 | Loaded_Language 10 | Name_Calling,Labeling 11 | Repetition 12 | Slogans 13 | Thought-terminating_Cliches 14 | Whataboutism,Straw_Men,Red_Herring 15 | -------------------------------------------------------------------------------- /tools/data/propaganda-techniques-names.txt: 
class AnnotationTaskSI(an.Annotation):
    """Span annotation read from a file in the SI (span identification) format.

    Only the character offsets of a span are meaningful in this format, so
    asking for a technique label is treated as a fatal usage error.
    """

    def __init__(self, label=None, start_offset=None, end_offset=None):
        """Store the label and the span boundaries.

        Args:
            label: technique label, kept as given.
            start_offset: start of the span; coerced to ``int`` when provided.
            end_offset: end of the span; coerced to ``int`` when provided.
        """
        # NOTE(review): the base-class initialiser is not invoked here, exactly
        # as in the original code - confirm that an.Annotation does not need it.
        self.label = label
        # The defaults are None; int(None) would raise TypeError, so keep None.
        self.start_offset = None if start_offset is None else int(start_offset)
        self.end_offset = None if end_offset is None else int(end_offset)

    def get_label(self):
        """Abort the program: SI-format annotations carry no technique label.

        Raises:
            SystemExit: always, with an explanatory message.
        """
        # sys.error does not exist; sys.exit prints the message to stderr and
        # terminates with a non-zero status.
        sys.exit("ERROR: trying to access technique label from file in SI task format")
"\n".join(self.techniques) 31 | 32 | 33 | def __getitem__(self, index): 34 | return self.techniques[index] 35 | 36 | 37 | def get_technique(self, index): 38 | return self.techniques[index] 39 | 40 | 41 | def indexOf(self, technique_name): 42 | return self.techniques.index(technique_name) 43 | -------------------------------------------------------------------------------- /configs/si_config.yml: -------------------------------------------------------------------------------- 1 | ---------------dataset params--------------- 2 | 3 | train_data_folder: datasets/train-articles 4 | test_data_folder: datasets/dev-articles 5 | labels_path: datasets/train-task1-SI.labels 6 | gold_annot_file: results/dev-task-SI.labels 7 | propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt 8 | data_dir: cached_datasets/SI/ 9 | train_file: train.tsv 10 | dev_file: dev.tsv 11 | test_file: test.tsv 12 | split_by_ids: True 13 | dev_size: 0.18 14 | overwrite_cache: False 15 | 16 | 17 | ----------------model params---------------- 18 | 19 | use_crf: True 20 | output_file: SI_output_dev.txt 21 | predicted_labels_files: [model_checkpoints/si_roberta_crf/test_predictions.txt] 22 | 23 | 24 | -------------transformers params------------ 25 | 26 | model_type: roberta 27 | config_name: roberta-large 28 | model_name_or_path: model_checkpoints/ner_roberta_large_uncased_crf_7700 29 | max_seq_length: 256 30 | per_gpu_train_batch_size: 8 31 | per_gpu_eval_batch_size: 1 32 | learning_rate: 2e-5 33 | save_steps: 700 34 | warmup_steps: 500 35 | num_train_epochs: 27 36 | output_dir: model_checkpoints/ner_roberta_large_uncased_crf_7700/ 37 | do_lower_case: True 38 | -------------------------------------------------------------------------------- /tools/data/article736757214.txt: -------------------------------------------------------------------------------- 1 | Sanctuary City Mayor Protected Illegal Alien Mexican Rapist 2 | 3 | Oakland Mayor Libby Schaaf claims to fight for 
women. 4 | Except when she's fighting for their rapists instead. 5 | A Democratic mayor’s warning to illegal immigrants of an incoming ICE raid in northern California may have led to a number of illegal immigrants with violent and sex-related convictions evading capture and deportation. 6 | Oakland Mayor Libby Schaaf tweeted out an impending warning of the four-day raid last week, alerting targeted individuals to the imminent arrests, and infuriating Immigrations and Customs Enforcement (ICE) officials, who say that many more could have been caught if they hadn't been warned. 7 | A spokesperson for ICE gave Fox News examples of some of the unsavory characters who evaded officals during the raid. 8 | One Mexican citizen had convictions for unlawful sexual intercourse with a minor and a conviction for driving under the influence (DUI), and had been deported in 2003. 9 | Another who evaded capture had a conviction for sodomizing a drugged victim in 2012, as well as a DUI from this year -- that Mexican citizen had also been previously deported in 2013. 10 | Another illegal immigrant from Mexico, previously deported in 2014 for a conviction for armed robbery, also evaded capture. 
11 | -------------------------------------------------------------------------------- /configs/tc_config.yml: -------------------------------------------------------------------------------- 1 | ---------------dataset params--------------- 2 | 3 | propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt 4 | train_data_folder: datasets/train-articles 5 | #test_data_folder: datasets/train-articles 6 | test_data_folder: datasets/dev-articles 7 | #test_data_folder: datasets/test/test-articles 8 | labels_path: datasets/train-task2-TC.labels 9 | #test_template_labels_path: results/mydev-task-TC.labelss 10 | test_template_labels_path: datasets/dev-task-TC-template.out 11 | #test_template_labels_path: datasets/test/test-task-TC-template.out 12 | data_dir: cached_datasets/TC/ 13 | train_file: train.tsv 14 | dev_file: dev.tsv 15 | #test_file: dev.tsv 16 | #test_file: eval_tc_new.tsv 17 | test_file: test.tsv 18 | split_by_ids: True 19 | dev_size: 0.18 20 | balance: False 21 | shuffle: True 22 | overwrite_cache: False 23 | 24 | 25 | ----------------model params---------------- 26 | 27 | output_file: TC_output_dev_sc.txt 28 | #weights: [1, 0] 29 | predicted_logits_files: [model_checkpoints/tc_roberta_joineds/predicted_logits] 30 | 31 | 32 | -------------transformers params------------ 33 | 34 | task_name: prop 35 | model_type: roberta 36 | #model_name_or_path: model_checkpoints/tc_roberta_large_cased_transfer_joined 37 | model_name_or_path: model_checkpoints/tc_roberta_large_cased_transfer_3500 38 | max_seq_length: 256 39 | per_gpu_train_batch_size: 8 40 | per_gpu_eval_batch_size: 8 41 | learning_rate: 2e-5 42 | save_steps: 700 43 | warmup_steps: 500 44 | num_train_epochs: 10 45 | #output_dir: model_checkpoints/tc_roberta_large_cased_transfer_joined 46 | output_dir: model_checkpoints/tc_roberta_large_cased_transfer_3500 47 | do_lower_case: False 48 | -------------------------------------------------------------------------------- 
def main(args):
    """Print an article with its annotated spans marked, then the footnotes.

    Args:
        args: parsed command-line arguments providing ``spans_file``,
            ``article_file`` and ``propaganda_techniques_list_file``.
    """
    spans_path, article_path, techniques_path = (
        args.spans_file,
        args.article_file,
        args.propaganda_techniques_list_file,
    )

    techniques = pt.Propaganda_Techniques(techniques_path)
    article_annotations = aa.Articles_annotations()
    # The technique list is attached to the class, not to this one instance.
    aa.Articles_annotations.techniques = techniques

    article_annotations.load_article_annotations_from_csv_file(spans_path)

    with codecs.open(article_path, "r", encoding="utf8") as article:
        text = article.read()

    # mark_text also returns a legend, which this script does not print.
    marked_text, footnotes, _legend = article_annotations.mark_text(text)

    print(marked_text)
    print(footnotes)
One line of the span file") 47 | parser.add_argument('-p', '--propaganda-techniques-list-file', dest='propaganda_techniques_list_file', required=False, 48 | default="data/propaganda-techniques-names.txt", 49 | help="file with list of propaganda techniques (one per line).") 50 | 51 | main(parser.parse_args()) 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
def unify_data_format(fn):
    """Decorate ``fn(tokens, clusters, **kwargs)`` so it accepts raw tool output.

    The returned wrapper takes ``data`` plus keyword flags and converts
    ``data`` with the adapter selected by a truthy flag: ``stanford``,
    ``allen``, ``huggingface`` or ``proref``. If several flags are truthy,
    every matching adapter runs and the last one in that order wins.

    Args:
        fn: callable invoked as ``fn(tokens, clusters, **kwargs)``.

    Returns:
        The wrapping callable ``unified_data(data, **kwargs)``.

    Raises:
        ValueError: (from the wrapper) when no adapter flag is truthy. This
            used to surface as an UnboundLocalError.
    """
    import functools  # local import keeps the file's import block untouched

    @functools.wraps(fn)
    def unified_data(data, **kwargs):
        adapted = None
        if kwargs.get('stanford', False):
            adapted = stanford_data_adapter(data)
        if kwargs.get('allen', False):
            adapted = allen_data_adapter(data)
        if kwargs.get('huggingface', False):
            adapted = huggingface_data_adapter(data)
        if kwargs.get('proref', False):
            adapted = labelled_pronoun(data)

        if adapted is None:
            raise ValueError(
                "no data format selected: pass one of stanford=True, "
                "allen=True, huggingface=True or proref=True")

        tokens, clusters = adapted
        return fn(tokens, clusters, **kwargs)

    return unified_data
def stanford_data_adapter(data):
    """Convert Stanford-format coreference output to ``(tokens, clusters)``.

    Args:
        data: mapping with a ``'sentences'`` list (each item holding a
            ``'tokens'`` list of dicts with ``'originalText'``) and a
            ``'corefs'`` mapping of mention lists, or ``None``.

    Returns:
        A tuple ``(tokens, clusters)``: the flat token list of the document and
        one list of ``[start, end]`` document-level token spans per coref chain.
    """
    sents = [[token['originalText'] for token in sent['tokens']]
             for sent in data['sentences']]

    # Offset of each sentence's first token in the flattened document. It is
    # the same for every mention, so compute it once instead of per mention.
    sent_offsets = np.cumsum([0] + [len(sent) for sent in sents])

    clusters = []
    if data['corefs'] is not None:
        for mentions in data['corefs'].values():
            cluster = []
            for mention in mentions:
                base = sent_offsets[mention['sentNum'] - 1]
                # The -1 / -2 adjustments presumably turn 1-based, end-exclusive
                # indices into 0-based inclusive ones - TODO confirm.
                cluster.append([base + mention['startIndex'] - 1,
                                base + mention['endIndex'] - 2])
            clusters.append(cluster)

    tokens = [token for sent in sents for token in sent]
    return tokens, clusters
95 | 96 | def to_html(tokens, clusters, task): 97 | tree = transform_to_tree(tokens, clusters) 98 | html = ''.join(span_wrapper(tree, 0, task)) 99 | html = '
{}
def generate_misspelling(phrase, p=0.5):
    """Return ``phrase`` with random single-character typos injected.

    The phrase is split on single spaces. Each word is, with probability
    ``p``, rewritten so that one randomly chosen position holds a random
    ASCII letter. Empty words (from consecutive spaces or an empty phrase)
    are passed through unchanged; they used to raise ``IndexError``.

    Args:
        phrase: text to corrupt.
        p: per-word probability of corruption.

    Returns:
        The corrupted phrase, with the original spacing preserved.
    """
    noisy_words = []
    for word in phrase.split(' '):
        outcome = random.random()
        # An empty word has no position to corrupt.
        if outcome <= p and word:
            ix = random.choice(range(len(word)))
            replacement = random.choice(string.ascii_letters)
            noisy_words.append(word[:ix] + replacement + word[ix + 1:])
        else:
            noisy_words.append(word)
    return ' '.join(noisy_words)
60 | """See base class.""" 61 | return ['Appeal_to_Authority', 'Doubt', 'Repetition', 62 | 'Appeal_to_fear-prejudice', 'Slogans', 'Black-and-White_Fallacy', 63 | 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling', 64 | 'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification', 65 | 'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum', 66 | 'Thought-terminating_Cliches'] 67 | 68 | def _create_examples(self, lines, set_type): 69 | """Creates examples for the training and dev sets.""" 70 | examples = [] 71 | spell = Speller(lang='en') 72 | for (i, line) in enumerate(lines): 73 | if i == 0 or line == []: 74 | continue 75 | guid = "%s-%s" % (set_type, i) 76 | text_a = line[3] # generate_misspelling(line[3]) 77 | #try: 78 | # text_a = spell(text_a) 79 | #except: 80 | # pass 81 | 82 | text_b = line[4] 83 | 84 | #pos = text_b.find(text_a) 85 | #text_a = text_b[:pos] + " " + text_b[pos:pos + len(text_a)] + " " + text_b[pos + len(text_a):] 86 | #text_b = None 87 | 88 | if len(line) < 6 or line[5] == '?': 89 | label = self.get_labels()[0] 90 | else: 91 | label = line[5] 92 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 93 | return examples 94 | 95 | 96 | glue_tasks_num_labels = { 97 | "prop": 14 98 | } 99 | 100 | 101 | glue_processors = { 102 | "prop": PropProcessor, 103 | } 104 | 105 | 106 | glue_output_modes = { 107 | "prop": "classification" 108 | } 109 | -------------------------------------------------------------------------------- /tools/task-TC_scorer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import logging.handlers 4 | from sklearn.metrics import f1_score 5 | from sklearn.metrics import precision_score 6 | from sklearn.metrics import recall_score 7 | import src.annotation as an 8 | import src.annotations as ans 9 | import src.propaganda_techniques as pt 10 | 11 | logger = logging.getLogger("propaganda_scorer") 12 | ch = 
logging.StreamHandler(sys.stdout) 13 | ch.setLevel(logging.INFO) 14 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 15 | ch.setFormatter(formatter) 16 | logger.setLevel(logging.INFO) 17 | 18 | 19 | def main(args): 20 | 21 | user_submission_file = args.submission 22 | gold_file = args.gold 23 | output_log_file = args.log_file 24 | propaganda_techniques_list_file = args.propaganda_techniques_list_file 25 | output_for_script = bool(args.output_for_script) 26 | 27 | if not output_for_script: 28 | logger.addHandler(ch) 29 | 30 | if args.debug_on_std: 31 | ch.setLevel(logging.DEBUG) 32 | 33 | if output_log_file is not None: 34 | logger.info("Logging execution to file " + output_log_file) 35 | fileLogger = logging.FileHandler(output_log_file) 36 | fileLogger.setLevel(logging.DEBUG) 37 | fileLogger.setFormatter(formatter) 38 | logger.addHandler(fileLogger) 39 | 40 | propaganda_techniques = pt.Propaganda_Techniques(propaganda_techniques_list_file) 41 | an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques) 42 | 43 | user_annotations = ans.Annotations() 44 | user_annotations.load_annotation_list_from_file(user_submission_file) 45 | for article in user_annotations.get_article_id_list(): 46 | user_annotations.get_article_annotations_obj(article).sort_spans() 47 | 48 | gold_annotations = ans.Annotations() 49 | gold_annotations.load_annotation_list_from_file(gold_file) 50 | for article in gold_annotations.get_article_id_list(): 51 | gold_annotations.get_article_annotations_obj(article).sort_spans() 52 | 53 | logger.info("Checking format: User Predictions -- Gold Annotations") 54 | if not user_annotations.compare_annotations_identical_article_lists(gold_annotations) or not user_annotations.compare_annotations_identical(gold_annotations): 55 | logger.error("wrong format, no scoring will be performed") 56 | sys.exit() 57 | logger.info("OK: submission file format appears to be correct") 58 | res_for_output, res_for_script = 
HIGHLIGHT_COLORS = [
    "blue",
    "green",
    "pink",
    "orange",
    "purple",
    "teal",
    "tan",
    "red",
    "cobalt",
    "brown",
    "slate",
    "fuchsia",
    "gray",
    "blue"
]


def get_highlight_color(index):
    """Return the highlight colour for ``index``, cycling through the palette.

    The original compared with ``index <= len(HIGHLIGHT_COLORS)``, so an index
    equal to the palette size raised ``IndexError``. Larger indices hit a
    branch that used the never-imported ``math`` module. A modulo covers both
    cases and gives the same colour for every index that worked before.

    Args:
        index: integer cluster index.

    Returns:
        The colour name at position ``index`` modulo the palette size.
    """
    return HIGHLIGHT_COLORS[index % len(HIGHLIGHT_COLORS)]
def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
    """Read every matching file of a folder into a dict keyed by article id.

    The id is the file's base name up to its first dot, with the first seven
    characters removed (``article736757214.txt`` -> ``736757214``).

    Args:
        folder_name: directory that holds the article files.
        file_pattern: glob pattern selecting the files inside that directory.

    Returns:
        A dict mapping article id to the file's full UTF-8 decoded text. Files
        are read in sorted path order.
    """
    articles = {}
    for filename in sorted(glob.glob(os.path.join(folder_name, file_pattern))):
        article_id = os.path.basename(filename).split(".")[0][7:]
        with codecs.open(filename, "r", encoding="utf8") as f:
            articles[article_id] = f.read()
    return articles
def sents_token_bounds(text):
    """Return the start offsets of the sentences of ``text`` plus a closing sentinel.

    Sentence boundaries come from ``PunktSentenceTokenizer().span_tokenize``;
    only the start offset of each sentence is kept.

    :param text: the article text to split into sentences.
    :return: NumPy array with one start offset per sentence, followed by a
        sentinel entry that is never smaller than ``len(text)``.
    """
    sents_starts = [start for start, _ in PunktSentenceTokenizer().span_tokenize(text)]
    # The sentinel closes the last sentence. It used to be the bare constant
    # 100000, which is smaller than valid offsets inside longer articles; a
    # lookup for the first bound >= span_end (as get_context does) then finds
    # nothing for spans in the last sentence. Keeping 100000 as the minimum
    # leaves the result unchanged for every text up to that length.
    sents_starts.append(max(len(text), 100000))
    return np.array(sents_starts)
def get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, train_file, dev_file,
                        split_by_ids=False, dev_size=0.3, random_state=40, balance=False, shuffle=True):
    """Split the labelled spans into a train and a dev set and save both as TSV files.

    :param articles: dict mapping article id to article text.
    :param ref_articles_id: article id of every labelled span.
    :param ref_span_starts: start offset of every labelled span.
    :param ref_span_ends: end offset of every labelled span.
    :param labels: technique label of every span.
    :param train_file: path the train split is written to.
    :param dev_file: path the dev split is written to.
    :param split_by_ids: if True, split on article ids so that all spans of
        one article land in the same split; otherwise split individual spans.
    :param dev_size: ``test_size`` handed to ``train_test_split``.
    :param random_state: seed for the split and for the final shuffle.
    :param balance: if True, oversample the train split with ``balance_pandas``.
    :param shuffle: if True, shuffle the train split before saving it.
    """
    data = dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels)
    if split_by_ids:
        train_ids, dev_ids = train_test_split(data.article_id.unique(), test_size=dev_size,
                                              random_state=random_state)
        train = data[data.article_id.isin(train_ids)]
        dev = data[data.article_id.isin(dev_ids)]
    else:
        train, dev = train_test_split(data, test_size=dev_size, random_state=random_state)

    if balance:
        # NOTE(review): balance_pandas draws its extra rows without a seed,
        # so a balanced train set still varies between runs.
        train = balance_pandas(train)
    if shuffle:
        # Seed the shuffle as well; otherwise the row order of the train
        # file changes on every run even though random_state is fixed.
        train = train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    save_dataset(train, train_file)
    save_dataset(dev, dev_file)
def get_propaganda_techniques(self)->list:
    """
    Returns the list of technique names held by the propaganda techniques
    object attached to the class.

    If that object has not been set yet (see
    set_propaganda_technique_list_obj), the error is logged and the program
    terminates with exit status 1.
    """
    if self.propaganda_techniques is None:
        logger.error("trying to access propaganda techniques list before initialising the corresponding object")
        # A bare sys.exit() would report success (status 0) to the caller
        # of the script although this is an error path.
        sys.exit(1)
    return self.propaganda_techniques.get_propaganda_techniques_list()
@staticmethod
def load_annotation_from_string(annotation_string:str, row_num:int=None, filename:str=None)->(Annotation, str):
    """
    Read annotations from a csv-like string, with fields separated
    by the class variable `separator`:

    article_id<separator>technique_name<separator>starting_position<separator>ending_position

    Fields order is determined by the class variables ARTICLE_ID_COL,
    TECHNIQUE_NAME_COL, FRAGMENT_START_COL, FRAGMENT_END_COL

    Besides reading the data, it performs basic checks: the row must have
    exactly 4 columns and both offsets must be integers. A failed check is
    logged and terminates the program with exit status 1.

    :param annotation_string: one row of an annotation file.
    :param row_num: row number, only used in error messages.
    :param filename: name of the source file, only used in error messages.
    :return a tuple (Annotation object, id of the article)
    """
    # Location suffixes shared by all error messages below.
    row_info = " " + str(row_num) if row_num is not None else ""
    file_info = " in file " + filename if filename is not None else ""

    row = annotation_string.rstrip().split(Annotation.separator)
    if len(row) != 4:
        logger.error("Row%s%s is supposed to have 4 columns. Found %d: -%s-."
                     % (row_info, file_info, len(row), annotation_string))
        sys.exit(1)

    def parse_offset(column):
        # Stop right after reporting a malformed offset: carrying on would
        # leave the offset variable unassigned and crash on the return
        # statement with an unrelated UnboundLocalError.
        try:
            return int(row[column])
        except ValueError:
            logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
                         % (column, row_info, file_info, annotation_string))
            sys.exit(1)

    article_id = row[Annotation.ARTICLE_ID_COL]
    label = row[Annotation.TECHNIQUE_NAME_COL]
    start_offset = parse_offset(Annotation.FRAGMENT_START_COL)
    end_offset = parse_offset(Annotation.FRAGMENT_END_COL)

    return Annotation(label, start_offset, end_offset), article_id
def check_format_of_annotation_in_file(self):
    """
    Performs some checks on the fields of the annotation.

    The technique name is checked first, then the span. The program
    terminates with exit status 1 as soon as one of the checks fails.
    """
    # A bare sys.exit() would end the process with status 0, i.e. report
    # success for an annotation that failed validation.
    if not self.is_technique_name_valid():
        sys.exit(1)
    if not self.is_span_valid():
        sys.exit(1)
def clear_subtokens(self, logits, labels, mask):
    """
    Drop the positions labelled -100 and left-align what remains.

    For every row, the entries whose label is not -100 are moved to the
    front of freshly zero-filled tensors; the returned mask is 1 on
    exactly those leading positions.

    NOTE(review): presumably logits is (batch, seq_len, num_labels) and
    labels / mask are (batch, seq_len) -- confirm against the caller.

    return:
        (logits, labels, mask) with the same shapes as the inputs
    """
    packed_logits = torch.zeros_like(logits)
    packed_labels = torch.zeros_like(labels)
    packed_mask = torch.zeros_like(mask)

    for row in range(len(labels)):
        keep = labels[row] != -100
        # every kept position has to be visible in the incoming mask
        assert (mask[row][keep] == 1).all()
        kept_labels = labels[row][keep]
        n_kept = len(kept_labels)
        packed_labels[row][:n_kept] = kept_labels
        packed_logits[row][:n_kept] = logits[row][keep]
        packed_mask[row][:n_kept] = 1

    return packed_logits, packed_labels, packed_mask
def merge_spans(spans, articles_id, articles_content):
    """Merge overlapping and adjacent character spans of every article.

    :param spans: dict mapping article id to a list of ``(start, end)``
        character spans with exclusive ``end``.
    :param articles_id: ids of the articles, aligned with ``articles_content``.
    :param articles_content: texts of the articles; only their length is used.
    :return: dict mapping every article id of ``spans`` to the sorted list of
        merged ``(start, end)`` spans.
    """
    articles_content_dict = dict(zip(articles_id, articles_content))
    res = dict()
    for article_id, article_spans in spans.items():
        article = articles_content_dict[article_id]

        # Mark every character covered by at least one span.
        covered = np.zeros(len(article), dtype=bool)
        for span in article_spans:
            covered[span[0]: span[1]] = True

        # Pad with an uncovered cell on both sides so that every run has a
        # rising and a falling edge. Without the right-hand pad, a run that
        # reaches the last character of the article has no closing edge and
        # is lost.
        padded = np.concatenate(([False], covered, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        # Edges alternate between run start and (exclusive) run end.
        res[article_id] = [(int(begin), int(end)) for begin, end in zip(edges[::2], edges[1::2])]
    return res
def get_spans_from_file(file, articles_id, articles_content, nlp):
    """Rebuild character spans from a file with one ``token<TAB>label`` line per token.

    The articles are tokenized again with ``nlp`` and walked in step with the
    file: every non-empty token consumes one line, and the first empty token
    after a run of non-empty ones consumes one (separator) line.

    :param file: path of the prediction file; labels are ``B-PROP``,
        ``I-PROP`` or ``O``.
    :param articles_id: ids of the articles, in the order they appear in ``file``.
    :param articles_content: texts of the articles, aligned with ``articles_id``.
    :param nlp: callable that tokenizes a text into tokens exposing ``idx``
        (character offset) and ``text``.
    :return: the spans per article id after ``correct_spans`` post-processing.
    :raises AssertionError: if a token read from the file does not match the
        re-tokenized article.
    """
    pred_spans = dict()
    with open(file, 'r') as f:
        for article_id, text in zip(articles_id, articles_content):
            pred_spans.setdefault(article_id, [])
            doc = nlp(text)
            idx = [int(token.idx) for token in doc]
            tokens = [token.text.strip().replace('\n', ' ').replace('\t', ' ') for token in doc]

            start = -1
            # Reset per article, exactly as create_BIO_labeled and
            # create_BIO_unlabeled do when writing such files. Carrying the
            # value over from the previous article makes the reader swallow
            # a token line when an article starts with an empty token, and
            # leaves the name undefined for the very first article.
            prev_tok = '\n'
            for i, tok in enumerate(tokens):
                if tok and tok not in ('\ufeff', '\u200f'):
                    token, label = f.readline().split('\t')
                    label = label.strip()
                    # An I-PROP without an open span is treated like B-PROP.
                    if label == 'B-PROP' or (label == 'I-PROP' and start == -1):
                        if start != -1:
                            # Close the running span at the end of the previous token.
                            pred_spans[article_id].append((start, idx[i - 1] + len(tokens[i - 1])))
                        start = idx[i]
                    if label == 'O':
                        if start != -1:
                            pred_spans[article_id].append((start, idx[i - 1] + len(tokens[i - 1])))
                        start = -1
                    assert token == tok
                    assert tok == text[idx[i]: idx[i] + len(tok)]
                    prev_tok = tok
                else:
                    # Only the first empty token of a run has a line in the file.
                    if prev_tok != '\n':
                        f.readline()
                        prev_tok = '\n'
            # NOTE(review): a span that is still open after the last token
            # of an article is not emitted.

    return correct_spans(pred_spans, articles_id, articles_content)
def get_train_dev_files(articles_id, articles_content, nlp, labels_path, train_file, dev_file, split_by_ids=True,
                        dev_size=0.3, random_state=42):
    """Split the gold spans into train and dev parts and write both as BIO files.

    :param articles_id: ids of the articles, aligned with ``articles_content``.
    :param articles_content: texts of the articles.
    :param nlp: tokenizer handed on to ``create_BIO_labeled``.
    :param labels_path: file with the gold spans, read by ``read_predictions_from_file``.
    :param train_file: path of the BIO file for the train part.
    :param dev_file: path of the BIO file for the dev part.
    :param split_by_ids: if True, split on article ids so that all spans of
        an article stay together; otherwise split the individual spans.
    :param dev_size: ``test_size`` handed to ``train_test_split``.
    :param random_state: seed handed to ``train_test_split``.
    :return: tuple ``(train_ids, dev_ids)`` with the article ids of both parts.
    """
    articles_content_dict = dict(zip(articles_id, articles_content))
    # From here on `articles_id` holds one entry per gold span.
    articles_id, gold_spans = read_predictions_from_file(labels_path)
    span_list = list(zip(articles_id, gold_spans))

    if split_by_ids:
        data = group_spans_by_article_ids(span_list)
        train_ids, dev_ids = train_test_split(np.unique(articles_id), test_size=dev_size, random_state=random_state)
        train_data = sorted([(key, value) for (key, value) in data.items() if key in train_ids])
        dev_data = sorted([(key, value) for (key, value) in data.items() if key in dev_ids])
    else:
        span_list_train, span_list_dev = train_test_split(span_list, test_size=dev_size, random_state=random_state)
        train_data = sorted(group_spans_by_article_ids(span_list_train).items())
        # The dev part must come from the held-out spans; building it from
        # the train spans again made the dev file a copy of the train file.
        dev_data = sorted(group_spans_by_article_ids(span_list_dev).items())
        train_ids = [example[0] for example in train_data]
        dev_ids = [example[0] for example in dev_data]

    create_BIO_labeled(train_file, train_data, articles_content_dict, nlp)
    create_BIO_labeled(dev_file, dev_data, articles_content_dict, nlp)

    return train_ids, dev_ids
def create_subfolder(subfolder, source_folder, articles_id):
    """Recreate ``subfolder`` and copy the selected article files into it.

    An existing ``subfolder`` is deleted with everything in it first. For
    every id, the file ``article<id>.txt`` is copied from ``source_folder``.

    :param subfolder: directory to (re)create.
    :param source_folder: directory that holds the article files.
    :param articles_id: ids of the articles to copy.
    """
    # Start from an empty directory so no stale files survive.
    if os.path.exists(subfolder):
        rmtree(subfolder)
    os.makedirs(subfolder)

    names = [''.join(('article', str(article_id), '.txt')) for article_id in articles_id]
    for name in names:
        origin = os.path.join(source_folder, name)
        target = os.path.join(subfolder, name)
        copyfile(origin, target)
Changes from version 1 12 | 13 | 14 | Tasks 15 | -------------------------------------------- 16 | The Propaganda Techniques Corpus (PTC) is a corpus of articles annotated 17 | with propaganda techniques at a fine-grained level. The list of 18 | techniques is in file data/propaganda-techniques-names-semeval2020task11.txt. 19 | Among the different tasks that the corpus enables SemEval 2020 Task 11 focuses on the following ones: 20 | 21 | Subtask 1 (SI). Propaganda Identification. 22 | Given a plain-text document, identify those specific fragments that contain one propaganda technique. This is a binary sequence tagging task. 23 | 24 | Subtask 2 (TC). Propaganda Technique Labeling. 25 | Given a text fragment identified as propaganda and its document context, identify the applied propaganda technique at hand. This is a multi-class classification problem. 26 | 27 | See the paper in the section "Citation" for further details. 28 | 29 | 30 | Evaluation scripts 31 | -------------------------------------------- 32 | 33 | -Task SI (task-SI_scorer.py) 34 | 35 | The evaluation script computes a variant of precision, recall, and F-measure 36 | that takes into account partial overlaps between fragments (see 37 | http://propaganda.qcri.org/semeval2020-task11/data/propaganda_tasks_evaluation.pdf 38 | for more details). 
39 | 40 | The script can be run as follows: 41 | 42 | python3 task-SI_scorer.py -s [prediction_file] -r [gold_folder] -m 43 | 44 | Note that all files *.labels in [gold_folder] will be considered containing gold labels 45 | As an example, we provide a "prediction_file" data/submission-task-SI.tsv 46 | and you can run it as follows: 47 | 48 | === 49 | 50 | $ python3 task-SI_scorer.py -s data/submission-task-SI.tsv -r data -m 51 | 2019-09-20 19:47:26,427 - INFO - Checking user submitted file 52 | 2019-09-20 19:47:26,429 - INFO - Scoring the submission with precision and recall method 53 | 2019-09-20 19:47:26,430 - INFO - Precision=1.929825/2=0.964912 Recall=1.947458/4=0.486864 54 | 2019-09-20 19:47:26,430 - INFO - F1=0.647181 55 | 56 | 57 | === 58 | 59 | The scorer for the TC task is task-TC_scorer.py. 60 | The scorer requires file data/propaganda-techniques-names-semeval2020task11.txt. 61 | Such file contains the list of techniques used for scoring. 62 | Adding and removing items from the list will affect the outcome of the scorer. 
63 | It can be run as follows: 64 | 65 | python3 task-TC_scorer.py -s [prediction_file] -r [gold_file] -p data/propaganda-techniques-names-semeval2020task11.txt 66 | 67 | For example: 68 | 69 | $ python3 task-TC_scorer.py -s data/submission-task-TC.tsv -r data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt 2>/dev/null 70 | 2019-09-20 19:39:21,286 - INFO - Checking format: User Predictions -- Gold Annotations 71 | 2019-09-20 19:39:21,287 - INFO - OK: submission file format appears to be correct 72 | 2019-09-20 19:39:21,293 - INFO - Scoring submission 73 | F1=0.600000 74 | Precision=0.600000 75 | Recall=0.600000 76 | F1_Appeal_to_Authority=0.0 77 | F1_Appeal_to_fear-prejudice=0.0 78 | F1_Bandwagon,Reductio_ad_hitlerum=0.0 79 | F1_Black-and-White_Fallacy=0.0 80 | F1_Causal_Oversimplification=0.0 81 | F1_Doubt=0.0 82 | F1_Exaggeration,Minimisation=1.0 83 | F1_Flag-Waving=0.0 84 | F1_Loaded_Language=0.6666666666666666 85 | F1_Name_Calling,Labeling=0.6666666666666666 86 | F1_Repetition=0.0 87 | F1_Slogans=0.0 88 | F1_Thought-terminating_Cliches=0.0 89 | F1_Whataboutism,Straw_Men,Red_Herring=0.0 90 | 91 | 92 | Data format 93 | -------------------------------------------- 94 | 95 | -Task SI 96 | 97 | The corpus includes one tab-separated file per article in the following 98 | format: 99 | 100 | id begin_offset end_offset 101 | 102 | where 103 | id is the identifier of the article 104 | begin_offset is the character where the covered span begins (inclusive) 105 | end_offset is the character where the covered span ends (exclusive) 106 | 107 | An example of such a file is data/article736757214.task-SI.labels. 108 | 109 | -Task TC 110 | 111 | The corpus includes one tab-separated file per article in the following format: 112 | 113 | id technique begin_offset end_offset 114 | 115 | The fields are the same as for task SI, but it now also includes "technique", i.e., the propaganda technique applied in the instance.
116 | 117 | 118 | Tools 119 | -------------------------------------------- 120 | 121 | - The script print_spans.py highlights the annotations in an article. 122 | 123 | python3 print_spans.py -s [annotations_file] -t [article_file] -p [propaganda_techniques_file] 124 | 125 | For example: 126 | 127 | python3 print_spans.py -t data/article736757214.txt -s data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt 128 | 129 | 130 | Citation 131 | -------------------------------------------- 132 | 133 | Please cite the following publication when using the PTC corpus: 134 | 135 | G. Da San Martino, S. Yu, A. Barrón-Cedeño, R. Petrov and P. Nakov, "Fine-Grained Analysis of Propaganda in News Articles", to appear at the Conference on Empirical Methods in Natural Language Processing (EMNLP 2019), Hong Kong, China, November 3-7, 2019. 136 | 137 | @InProceedings{EMNLP19DaSanMartino, 138 | author = {Da San Martino, Giovanni and 139 | Yu, Seunghak and 140 | Barr\'{o}n-Cede\~no, Alberto and 141 | Petrov, Rostislav and 142 | Nakov, Preslav}, 143 | title = {Fine-Grained Analysis of Propaganda in News Articles}, 144 | booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, November 3-7, 2019}, 145 | series = {EMNLP-IJCNLP 2019}, 146 | year = {2019}, 147 | address = {Hong Kong, China}, 148 | month = {November}, 149 | } 150 | 151 | 152 | Changes from version 1 153 | -------------------------------------------- 154 | 155 | Fixed a bug in the evaluation function for task TC that prevented to find the best alignment between the labels of identical spans in certain cases. 
156 | 157 | Now print_spans.py has a parameter -p specifying the file with the list of propaganda techniques 158 | 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Semeval 2020, Task 11 2 | 3 | ## Overview 4 | This repository provides code for the SemEval-2020 Task 11 competition (Detection of Propaganda Techniques in News Articles). 5 | 6 | The competition webpage: https://propaganda.qcri.org/semeval2020-task11/ 7 | 8 | The description of the architecture of models can be found in our paper [Aschern at SemEval-2020 Task 11: It Takes Three to Tango: RoBERTa, CRF, and Transfer Learning](https://www.aclweb.org/anthology/2020.semeval-1.191/). 9 | 10 | ## Requirements 11 | ``` 12 | pip install -r ./requirements.txt 13 | ``` 14 | 15 | ## Project structure 16 | 17 | - `configs`: yaml configs for the system 18 | - `datasets`: contains the task datasets, which can be downloaded from the team competition webpage 19 | - `results`: the folder for submissions 20 | - `span_identification`: code for the task SI 21 | - `ner`: pytorch-transformers RoBERTa model with CRF (end-to-end) 22 | - `dataset`: the scripts for loading and preprocessing source dataset 23 | - `submission`: the scripts for obtaining and evaluating results 24 | - `technique_classification`: code for the task TC (the folder has the same structure as `span_identification`) 25 | - `tools`: tools provided by the competition organizers; contain useful functions for reading datasets and evaluating submissions 26 | - `visualization_example`: example of visualization of results for both tasks 27 | 28 | ## Running the models 29 | 30 | All commands are run from the root directory of the repository. 31 | 32 | ### Span Identification 33 | 34 | 1. Configure `configs/si_config.yml` file, if it is needed. 
`data_dir` is the path to the cache of original train/eval sub-datasets and their BIO versions. In addition to using the config, it is also possible to specify arguments through the command line. 35 | 36 | 2. Split the dataset for local evaluation (if `--overwrite_cache` is set, previous files will be replaced). It will produce files with the BIO-format tagging for spans (B-PROP, I-PROP, O) in your `--data_dir`. 37 | ```bash 38 | python -m span_identification --config configs/si_config.yml --split_dataset --overwrite_cache 39 | ``` 40 | 3. Train and evaluate the model (the model parameters are specified in the config, you need to change the paths). The use of CRF is regulated by the flag `--use_crf`. For the first run you can use `--model_name_or_path roberta-large`. 41 | ```bash 42 | python -m span_identification --config configs/si_config.yml --do_train --do_eval 43 | ``` 44 | 4. Apply the trained model to the `test_file` (in BIO-format) specified in the config. It will be created from the `test_data_folder` folder if it is missing or if the flag `--overwrite_cache` is specified. 45 | ```bash 46 | python -m span_identification --config configs/si_config.yml --do_predict 47 | ``` 48 | 5. Create the submission file `output_file` in the `results` folder. It will obtain spans from the result files with the token labeling specified in `predicted_labels_files`. At the aggregation stage, the span prediction results are simply joined. 49 | ```bash 50 | python -m span_identification --config configs/si_config.yml --create_submission_file 51 | ``` 52 | 6. If you have the correct markup in the `test_file` or a gold `--gold_annot_file` (source competition format), you can run the competition evaluation script. 53 | ```bash 54 | python -m span_identification --config configs/si_config.yml --do_eval_spans 55 | ``` 56 | 7. Use `visualization_example/visualization.ipynb` if you want to visualize labels.
57 | 58 | ### Technique Classification 59 | 60 | Here you need almost the same commands and settings as in the SI task. 61 | 62 | 1. Configure the `configs/tc_config.yml` file, if needed. 63 | 64 | 2. Split the dataset for local evaluation. 65 | ```bash 66 | python -m technique_classification --config configs/tc_config.yml --split_dataset --overwrite_cache 67 | ``` 68 | 3. Train and evaluate the model. We used two setups: with and without the flags `--join_embeddings --use_length` (to get our RoBERTa-Joined). For the first run you can use `--model_name_or_path roberta-large`. 69 | ```bash 70 | python -m technique_classification --config configs/tc_config.yml --do_train --do_eval 71 | ``` 72 | or distributed 73 | ``` 74 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node 4 technique_classification --config configs/tc_config.yml --do_train --do_eval 75 | ``` 76 | 4. Apply the trained model to the `test_file` specified in the config. It will be created from the `test_data_folder` folder and the `test_template_labels_path` file if it is missing or if the flag `--overwrite_cache` is specified. 77 | ```bash 78 | python -m technique_classification --config configs/tc_config.yml --do_predict --join_embeddings --use_length 79 | ``` 80 | 5. Create the submission file `output_file`. It will combine predictions from the list `predicted_logits_files` with coefficients specified in `--weights` (optional) and apply some post-processing. 81 | ```bash 82 | python -m technique_classification --config configs/tc_config.yml --create_submission_file 83 | ``` 84 | 6. If you have the correct markup in the `test_file` or a gold `--test_labels_path` (source competition format), you can check your accuracy (micro f1-score) and the per-class f1-scores. 85 | ```bash 86 | python -m technique_classification --config configs/tc_config.yml --eval_submission 87 | ``` 88 | 7. Use `visualization_example/visualization.ipynb` if you want to visualize labels.
__author__ = "Giovanni Da San Martino"
__copyright__ = "Copyright 2019"
__credits__ = ["Giovanni Da San Martino"]
__license__ = "GPL"
__version__ = "0.1"
__maintainer__ = "Giovanni Da San Martino"
__email__ = "gmartino@hbku.edu.qa"
__status__ = "Beta"

logger = logging.getLogger("propaganda_scorer")


class AnnotationWithOutLabel(object):
    """
    One annotation is represented by a span (two integer indices indicating the
    starting and ending position of the span).
    The class provides basic manipulation functions for one annotation.

    Instances define ``__eq__`` without ``__hash__`` and are mutable, so they
    are intentionally not hashable.
    """

    # input file format variables
    separator = "\t"
    ARTICLE_ID_COL = 0
    FRAGMENT_START_COL = 1
    FRAGMENT_END_COL = 2

    def __init__(self, start_offset: str = None, end_offset: str = None):
        """
        Store both offsets as integers.

        :param start_offset: start of the span; an int or a string accepted by ``int()``
        :param end_offset: end of the span (exclusive, see ``get_span``); same accepted types
        :raises TypeError: if an offset is left at its ``None`` default
        :raises ValueError: if an offset is a string that is not an integer
        """
        self.start_offset = int(start_offset)
        self.end_offset = int(end_offset)

    def __str__(self):
        return "[%d, %d]" % (self.start_offset, self.end_offset)

    def is_span_equal_to(self, second_annotation: "AnnotationWithOutLabel") -> bool:
        """
        Checks whether two annotations are identical, i.e. whether the two spans are identical.
        """
        return (self.get_start_offset() == second_annotation.get_start_offset()
                and self.get_end_offset() == second_annotation.get_end_offset())

    def __eq__(self, second_annotation: "AnnotationWithOutLabel"):
        # Comparing with an unrelated object (e.g. None) used to raise
        # AttributeError; defer to Python's default handling instead.
        if not isinstance(second_annotation, AnnotationWithOutLabel):
            return NotImplemented
        return self.is_span_equal_to(second_annotation)

    def get_start_offset(self) -> int:
        return self.start_offset

    def get_end_offset(self) -> int:
        return self.end_offset

    def get_span(self) -> set:
        """
        Returns a set of positions of all characters in the span
        (start offset included, end offset excluded).
        """
        return set(range(self.get_start_offset(), self.get_end_offset()))

    @staticmethod
    def load_annotation_from_string(annotation_string: str, row_num: int = None,
                                    filename: str = None) -> "tuple[AnnotationWithOutLabel, str]":
        """
        Read annotations from a csv-like string, with fields separated
        by the class variable `separator`:

        article_id<separator>starting_position<separator>ending_position

        Fields order is determined by the class variables ARTICLE_ID_COL,
        FRAGMENT_START_COL, FRAGMENT_END_COL

        Besides reading the data, it performs basic checks. Any malformed row
        (wrong number of columns or a non-integer offset) is logged and the
        process exits via ``sys.exit()``.

        :param annotation_string: one row of an annotation file
        :param row_num: row number, only used in error messages
        :param filename: file name, only used in error messages
        :return a tuple (AnnotationWithOutLabel object, id of the article)
        """
        row_desc = " " + str(row_num) if row_num is not None else ""
        file_desc = " in file " + filename if filename is not None else ""

        row = annotation_string.rstrip().split(AnnotationWithOutLabel.separator)
        if len(row) != 3:
            logger.error("Row%s%s is supposed to have 3 columns. Found %d: -%s-.",
                         row_desc, file_desc, len(row), annotation_string)
            sys.exit()

        article_id = row[AnnotationWithOutLabel.ARTICLE_ID_COL]
        offsets = []
        for column in (AnnotationWithOutLabel.FRAGMENT_START_COL, AnnotationWithOutLabel.FRAGMENT_END_COL):
            try:
                offsets.append(int(row[column]))
            except ValueError:
                logger.error("The column %d in row%s%s is supposed to be an integer: -%s-",
                             column, row_desc, file_desc, annotation_string)
                # Without this exit the offset stays undefined and the return
                # statement below fails with an unrelated UnboundLocalError.
                sys.exit()

        return AnnotationWithOutLabel(offsets[0], offsets[1]), article_id

    def merge_spans(self, second_annotation: "AnnotationWithOutLabel") -> None:
        """
        Merge the spans of two annotations. The function does not check whether the spans overlap.

        :param second_annotation: the AnnotationWithOutLabel object whose span is being merged
        :return:
        """
        self.set_start_offset(min(self.get_start_offset(), second_annotation.get_start_offset()))
        self.set_end_offset(max(self.get_end_offset(), second_annotation.get_end_offset()))

    def set_start_offset(self, new_start_offset: int) -> None:
        self.start_offset = new_start_offset

    def set_end_offset(self, new_end_offset: int) -> None:
        self.end_offset = new_end_offset

    def shift_annotation(self, offset: int) -> None:
        """Move both ends of the span by `offset` characters."""
        self.set_start_offset(self.get_start_offset() + offset)
        self.set_end_offset(self.get_end_offset() + offset)

    def span_overlapping(self, second_annotation: "AnnotationWithOutLabel") -> bool:
        """
        Return True if the two spans share at least one character position.

        Two half-open intervals intersect exactly when the later start lies
        before the earlier end; this gives the same answer as intersecting the
        two ``get_span()`` sets (including for empty spans) without building them.
        """
        latest_start = max(self.get_start_offset(), second_annotation.get_start_offset())
        earliest_end = min(self.get_end_offset(), second_annotation.get_end_offset())
        return latest_start < earliest_end

    def is_span_valid(self) -> bool:
        """
        Checks whether the span is valid, i.e. if the following conditions are met:
        1) start and end offsets >= 0
        2) start offset < end offset
        """
        if self.get_start_offset() < 0 or self.get_end_offset() < 0:
            logger.error("Start and end of position of the fragment must be non-negative: %d, %d"
                         % (self.get_start_offset(), self.get_end_offset()))
            return False
        if self.get_start_offset() >= self.get_end_offset():
            logger.error("End position of the fragment must be greater than the starting one: start=%d, end=%d"
                         % (self.get_start_offset(), self.get_end_offset()))
            return False
        return True

    def check_format_of_annotation_in_file(self):
        """
        Performs some checks on the fields of the annotation
        and exits the process if the span is not valid.
        """
        if not self.is_span_valid():
            sys.exit()
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words, labels):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.words = words
        self.labels = labels


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids


def read_examples_from_file(file_path, mode):
    """Read a tab-separated, one-token-per-line file into `InputExample`s.

    Sequences are separated by blank lines or ``-DOCSTART-`` lines. The first
    column of a line is the word and the last column is its label; a line with
    a single column gets the label ``"O"``.

    Args:
        file_path: path of the UTF-8 encoded file to read.
        mode: prefix used for the example ids, which are ``"<mode>-<n>"`` with n starting at 1.

    Returns:
        list of `InputExample`, in file order.
    """
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
                                                 words=words,
                                                 labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                # Drop the line terminator before splitting so that a line
                # without a label column does not keep "\n" inside the word.
                splits = line.rstrip("\n").split('\t')
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1])
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            # The trailing example used "%s-%d".format(...), which produced the
            # literal guid "%s-%d"; use the same "{}-{}" template as above.
            examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
                                         words=words,
                                         labels=labels))
    return examples


def convert_examples_to_features(examples,
                                 label_list,
                                 max_seq_length,
                                 tokenizer,
                                 cls_token_at_end=False,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=1,
                                 sep_token="[SEP]",
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-1,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True):
    """ Convert `InputExample`s into fixed-length `InputFeatures`.
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)

        `tokenizer` must provide `tokenize(word)` and `convert_tokens_to_ids(tokens)`.
        Every returned feature has exactly `max_seq_length` entries in each of
        its four lists; sequences that are too long are truncated.
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            # NOTE(review): a word that tokenizes to zero tokens still adds one
            # label id here, leaving tokens and label_ids out of step; confirm
            # whether the tokenizers used can return an empty list.
            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[:(max_seq_length - special_tokens_count)]
            label_ids = label_ids[:(max_seq_length - special_tokens_count)]

        # Special tokens never carry a real label: they get pad_token_label_id.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += ([pad_token] * padding_length)
            input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids += ([pad_token_segment_id] * padding_length)
            label_ids += ([pad_token_label_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_ids=label_ids))
    return features


def get_labels(path):
    """Return the list of tag names.

    With a truthy `path`, the labels are the lines of that file, with "O"
    prepended if the file does not list it. Otherwise the default BIO tag set
    for propaganda spans is returned.
    """
    if path:
        # Read with the same explicit encoding as read_examples_from_file.
        with open(path, "r", encoding="utf-8") as f:
            labels = f.read().splitlines()
        if "O" not in labels:
            labels = ["O"] + labels
        return labels
    else:
        return ["O", "B-PROP", "I-PROP"]
def get_insides(data):
    """Count, per label pair, how often a span lies strictly inside another span.

    `data` needs the columns `span_start`, `span_end`, `label` and `article_id`.
    Only spans of the same article are compared; identical spans are ignored.

    Returns a mapping ``insides[inner_label][outer_label] -> count``.
    """
    insides = defaultdict(dict)
    spans_coords = list(zip(data['span_start'].values, data['span_end'].values))
    labels = data['label'].values
    article_ids = data['article_id'].values
    for i in range(len(spans_coords)):
        for j in range(i):
            # NOTE(review): nesting reconstructed from whitespace-collapsed text;
            # both containment checks are assumed to sit under the article check.
            if article_ids[i] == article_ids[j]:
                if spans_coords[i][0] >= spans_coords[j][0] and spans_coords[i][1] <= spans_coords[j][1]:
                    if spans_coords[i][0] != spans_coords[j][0] or spans_coords[i][1] != spans_coords[j][1]:
                        insides[labels[i]][labels[j]] = insides[labels[i]].get(labels[j], 0) + 1
                if spans_coords[j][0] >= spans_coords[i][0] and spans_coords[j][1] <= spans_coords[i][1]:
                    if spans_coords[j][0] != spans_coords[i][0] or spans_coords[j][1] != spans_coords[i][1]:
                        insides[labels[j]][labels[i]] = insides[labels[j]].get(labels[i], 0) + 1
    return insides


def correct_preds_for_insides(preds, spans_coords, logits, insides, mapping, inverse_mapping):
    """Adjust predictions of nested spans until the label pair was seen nested in training.

    For every span j lying strictly inside span i, the less confident of the two
    predictions is replaced by its next best class, repeatedly, until the pair
    appears in `insides`. If the span being changed has a clear winner (best
    probability more than 1.4 times the runner-up), both original predictions
    are restored instead. `preds` is modified in place and returned.
    """
    for i in range(len(preds)):
        for j in range(len(preds)):
            if spans_coords[j][0] >= spans_coords[i][0] and spans_coords[j][1] <= spans_coords[i][1]:
                if spans_coords[j][0] != spans_coords[i][0] or spans_coords[j][1] != spans_coords[i][1]:
                    def_i = preds[i]
                    def_j = preds[j]
                    log = softmax([logits[i]])[0]
                    login = softmax([logits[j]])[0]
                    # NOTE(review): here i is the outer span and the lookup is
                    # insides[outer][inner], while get_insides fills
                    # insides[inner][outer]; confirm which orientation is intended.
                    while preds[j] not in insides.get(preds[i], []):
                        if log[inverse_mapping[preds[i]]] > login[inverse_mapping[preds[j]]]:
                            values = np.sort(login)[-2:]
                            if values[1] / (values[0] + 1e-6) > 1.4:
                                preds[i] = def_i
                                preds[j] = def_j
                                break
                            login[inverse_mapping[preds[j]]] = 0
                            preds[j] = mapping[np.argmax(login)]
                        else:
                            values = np.sort(log)[-2:]
                            if values[1] / (values[0] + 1e-6) > 1.4:
                                preds[i] = def_i
                                preds[j] = def_j
                                break
                            log[inverse_mapping[preds[i]]] = 0
                            preds[i] = mapping[np.argmax(log)]
    return preds


def stem_spans(spans):
    """Lower-case, tokenize and stem each span; spans that stem to nothing are dropped.

    Because empty results are dropped, the output is not index-aligned with `spans`.
    """
    ps = PorterStemmer()
    res = []
    for el in spans:
        result = " ".join(ps.stem(word) for word in word_tokenize(el.lower()))
        if len(result) > 0:
            res.append(result)
    return res


def get_train_instances(data, data_dir, save=True):
    """Map every stemmed training span to the set of labels it was annotated with.

    Spans labelled 'Repetition' are skipped. If `save` is true and `data_dir`
    is given, the mapping is also pickled to ``<data_dir>/train_instances_train``.
    """
    ps = PorterStemmer()
    train_instances = dict()
    # Stem span by span rather than through stem_spans(): that helper drops
    # empty results, which would shift every later span onto the wrong label.
    for span, label in zip(data.span.values, data.label.values):
        if label == 'Repetition':
            continue
        stemmed = " ".join(ps.stem(word) for word in word_tokenize(span.lower()))
        if stemmed:
            train_instances.setdefault(stemmed, set()).add(label)
    # data_dir is optional for create_submission_file; skip saving rather than
    # fail in os.path.join(None, ...).
    if save and data_dir is not None:
        with open(os.path.join(data_dir, 'train_instances_train'), 'wb') as f:
            pickle.dump(train_instances, f)
    return train_instances


def postprocess(x, mapping, inverse_mapping, insides, stop_words, ps, train_instances):
    """Apply the rule-based corrections to the predictions of one article.

    `x` is the group of rows of one article with the columns `span_start`,
    `span_end`, `span`, `pred` and `logits` (space-separated scores). The
    scores are adjusted for repeated spans, spans seen in training, hashtags
    and duplicate spans, then `pred` is recomputed and passed through
    `correct_preds_for_insides`. Returns `x` with the updated `pred` column.
    """
    spans_coords = list(zip(x['span_start'].values, x['span_end'].values))
    spans_source = x['span'].values
    spans_text = [' '.join([ps.stem(word) for word in word_tokenize(span.lower())]) for span in spans_source]
    spans = [' '.join([ps.stem(word) for word in word_tokenize(unidecode(span.lower()))
                       if word not in stop_words and word not in string.punctuation]) for span in spans_source]

    # Number of distinct start positions at which each normalized span occurs.
    counts = dict()
    for i in range(len(spans)):
        counts.setdefault(spans[i], set())
        counts[spans[i]].add(spans_coords[i][0])
    for el in counts:
        counts[el] = len(counts[el])

    preds = x['pred'].values
    logits = [np.array(log.split(), dtype=np.float32) for log in x['logits']]
    for i in range(len(preds)):
        log = logits[i]

        if counts[spans[i]] >= 3 or (counts[spans[i]] >= 2 and logits[i][inverse_mapping["Repetition"]] > 0.001):
            log[inverse_mapping["Repetition"]] = 100

        if counts[spans[i]] == 1 and (logits[i][inverse_mapping["Repetition"]] < 0.99 or len(spans[i].split()) <= 1):
            log[inverse_mapping["Repetition"]] = 0

        for prediction in train_instances.get(spans_text[i], set()):
            log[inverse_mapping[prediction]] += 0.5
        # NOTE(review): nesting reconstructed from collapsed text; the hashtag
        # rule is assumed to apply to every span, not only spans seen in training.
        if spans_source[i].startswith('#'):
            log[inverse_mapping['Slogans']] = 20

        # An identical earlier span must not receive the same label twice.
        prev_same = []
        for j in range(i):
            if spans_coords[j][0] == spans_coords[i][0] and spans_coords[j][1] == spans_coords[i][1]:
                prev_same.append(j)
        if len(prev_same) > 0:
            for prediction in preds[prev_same]:
                log[inverse_mapping[prediction]] = 0

        logits[i] = log
        preds[i] = mapping[np.argmax(log)]

    x["pred"] = correct_preds_for_insides(preds, spans_coords, logits, insides, mapping, inverse_mapping)
    return x


def postprocess_predictions(predictions_logits, data, insides, train_instances):
    """Turn class scores into label names and post-process them article by article.

    Adds the columns `pred` and `logits` to `data` and returns the final
    predictions as a numpy array of label names.
    """
    mapping = {i: el for i, el in enumerate(
        ['Appeal_to_Authority', 'Doubt', 'Repetition', 'Appeal_to_fear-prejudice', 'Slogans', 'Black-and-White_Fallacy',
         'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling', 'Whataboutism,Straw_Men,Red_Herring',
         'Causal_Oversimplification', 'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
         'Thought-terminating_Cliches']
    )}
    inverse_mapping = {b: a for (a, b) in mapping.items()}

    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()

    predictions = np.argmax(predictions_logits, axis=1)
    data['pred'] = [mapping[p] for p in predictions]
    data['logits'] = [' '.join(np.array(log, dtype=str)) for log in predictions_logits]
    data = data.groupby('article_id', as_index=False).apply(postprocess, mapping, inverse_mapping, insides,
                                                            stop_words, ps, train_instances)
    return np.array(data["pred"].values)


def softmax_with_temperature(z, T):
    """Row-wise softmax of the 2-D array `z` after dividing it by the temperature `T`."""
    z = z / T
    # Subtract the row maximum before exponentiating for numerical stability.
    max_z = np.max(z, axis=1).reshape(-1, 1)
    exp_z = np.exp(z - max_z)
    return exp_z / np.sum(exp_z, axis=1).reshape(-1, 1)


def create_submission_file(predicted_logits_files, train_file_path, dev_file_path, test_file_path,
                           article_ids, span_starts, span_ends, output_file, weights=None, data_dir=None, agg_model=None):
    """Combine model outputs, post-process them and write the task-TC submission file.

    The pickled logits in `predicted_logits_files` are converted to
    probabilities and summed with `weights` (uniform when omitted). The result
    is post-processed with statistics from the training file. When `agg_model`
    (a joblib file) is given, its predictions on the concatenated logits are
    used instead, except that 'Repetition' decisions follow the post-processed
    predictions.

    `dev_file_path` is not used; it is kept for interface compatibility.

    Raises:
        ValueError: if no logits file is given or `weights` has a different length.
    """
    if not predicted_logits_files:
        raise ValueError("predicted_logits_files must contain at least one file")
    if weights is None:
        weights = [1. / len(predicted_logits_files) for _ in range(len(predicted_logits_files))]
    if len(weights) != len(predicted_logits_files):
        raise ValueError("weights and predicted_logits_files must have the same length")

    data_train = pd.read_csv(train_file_path, sep='\t')
    insides = get_insides(data_train)
    train_instances = get_train_instances(data_train, data_dir)

    data = pd.read_csv(test_file_path, sep='\t')

    predictions_logits = None
    predictions_logits_list = []
    for file, weight in zip(predicted_logits_files, weights):
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # logits files produced by your own runs.
        with open(file, 'rb') as f:
            logits = pickle.load(f)
        weighted = float(weight) * softmax_with_temperature(logits, 1)
        predictions_logits = weighted if predictions_logits is None else predictions_logits + weighted
        if agg_model is not None:
            predictions_logits_list.append(logits)

    predictions = postprocess_predictions(predictions_logits, data, insides, train_instances)

    if agg_model is not None:
        clf = load(agg_model)
        predictions_sklearn_agg = clf.predict(np.concatenate(predictions_logits_list, axis=1))
        predictions_sklearn_agg[predictions_sklearn_agg == 'Repetition'] = predictions[predictions_sklearn_agg == 'Repetition']
        predictions_sklearn_agg[predictions == 'Repetition'] = 'Repetition'
        predictions = predictions_sklearn_agg

    with open(output_file, "w") as fout:
        for article_id, prediction, span_start, span_end in zip(article_ids, predictions, span_starts, span_ends):
            fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))


def load_result(file):
    """Read a submission file into ``result[article_id][label] -> [[start, end], ...]``.

    Each line must hold exactly four tab-separated fields:
    article id, label, span start, span end.
    """
    result = defaultdict(dict)
    with open(file, "r") as f:
        for line in f:
            article_id, prediction, spl, spr = line.split('\t')
            result[article_id].setdefault(prediction, [])
            result[article_id][prediction].append([int(spl), int(spr)])
    return result


def read_ground_truth(gt_file_path, label_names):
    """Return the last tab-separated field of every line that is one of `label_names`."""
    ground_truth = []
    with open(gt_file_path, "r") as f:
        for line in f:
            gold_label = line.split('\t')[-1].strip()
            if gold_label in label_names:
                ground_truth.append(gold_label)
    return ground_truth
def Main(args):
    """Run the span-identification stages selected by the flags in ``args``.

    Stages, executed in this order when their flags are set:

    1. ``do_train`` / ``do_eval`` / ``split_dataset``: load the train articles and
       (re)create the cached train/dev files; with ``split_dataset`` the articles
       are additionally copied into ``train-train-articles`` / ``train-dev-articles``.
    2. ``do_predict`` / ``create_submission_file`` / ``do_eval_spans``: load the test
       articles and (re)create the cached test file.
    3. ``do_train`` / ``do_eval`` / ``do_predict``: run the NER model (with or
       without CRF, depending on ``use_crf``).
    4. ``do_eval_spans``: convert predictions to the submission format and call
       ``tools/task-SI_scorer.py`` on them.
    5. ``create_submission_file``: write the submission file into ``results/``.

    :param args: namespace produced by the argument parser in ``main()``.
    """
    # Local import: this module does not import ``sys`` at file level.
    import sys

    nlp = spacy.load("en")
    os.makedirs(args.data_dir, exist_ok=True)

    if args.do_train or args.do_eval or args.split_dataset:
        articles_content, articles_id, _ = load_data(args.train_data_folder,
                                                     args.propaganda_techniques_file)
        train_file_path = os.path.join(args.data_dir, args.train_file)
        dev_file_path = os.path.join(args.data_dir, args.dev_file)
        cache_is_complete = os.path.exists(train_file_path) and os.path.exists(dev_file_path)
        if not cache_is_complete or args.overwrite_cache:
            logger.info("Creating 'ner' train/dev files: %s, %s", train_file_path, dev_file_path)
            train_ids, dev_ids = get_train_dev_files(articles_id, articles_content, nlp, args.labels_path,
                                                     train_file_path, dev_file_path, args.split_by_ids,
                                                     args.dev_size, args.random_state)
            # The article ids of each part are only known right after a fresh split,
            # so the sub-folders can only be produced inside this branch.
            if args.split_dataset:
                create_subfolder(os.path.join(args.data_dir, 'train-train-articles'),
                                 args.train_data_folder, train_ids)
                create_subfolder(os.path.join(args.data_dir, 'train-dev-articles'),
                                 args.train_data_folder, dev_ids)
        elif args.split_dataset:
            logger.warning("Cached train/dev files already exist, the dataset was not split again; "
                           "use --overwrite_cache to regenerate the split.")

    if args.do_predict or args.create_submission_file or args.do_eval_spans:
        test_articles_content, test_articles_id, _ = load_data(args.test_data_folder,
                                                               args.propaganda_techniques_file)
        test_file_path = os.path.join(args.data_dir, args.test_file)
        # When only scoring spans, an existing/absent test file is left untouched.
        if (not os.path.exists(test_file_path) or args.overwrite_cache) and not args.do_eval_spans:
            logger.info("Creating 'ner' test file: %s", test_file_path)
            get_test_file(test_file_path, test_articles_id, test_articles_content, nlp)

    if args.do_train or args.do_eval or args.do_predict:
        if args.use_crf:
            transformers_ner_crf(args)
        else:
            transformers_ner(args)

    if args.do_eval_spans:
        logger.info("Evaluating file %s with competition metrics", args.output_file)
        # The submission stage below may not have run yet, so make sure the folder exists.
        os.makedirs('results', exist_ok=True)
        output_file = os.path.join('results', args.output_file)
        get_submission_format(args.predicted_labels_files, test_articles_id, test_articles_content, nlp,
                              output_file)
        temp_gold_file = None
        try:
            if args.gold_annot_file is None:
                # No gold file given: derive one from the cached test file into a
                # temporary file created through the public tempfile API.
                fd, temp_gold_file = tempfile.mkstemp(suffix='.tsv')
                os.close(fd)
                get_submission_format([test_file_path], test_articles_id, test_articles_content, nlp,
                                      temp_gold_file)
                gold_annot_file = temp_gold_file
            else:
                gold_annot_file = args.gold_annot_file
            # Argument list instead of a shell string: paths with spaces or shell
            # metacharacters are passed through verbatim.
            subprocess.run([sys.executable, "tools/task-SI_scorer.py", "-s", output_file, "-r", gold_annot_file])
        finally:
            # Remove the temporary gold file even if the scorer step raised.
            if temp_gold_file is not None and os.path.exists(temp_gold_file):
                os.remove(temp_gold_file)

    if args.create_submission_file:
        os.makedirs('results', exist_ok=True)
        output_file = os.path.join('results', args.output_file)
        logger.info("Creating a submission file: %s", output_file)
        get_submission_format(args.predicted_labels_files, test_articles_id, test_articles_content, nlp,
                              output_file)
be used to form the final result") 96 | parser.add_argument("--output_file", default=None, type=str, required=True, 97 | help="The submission filename") 98 | parser.add_argument("--dev_size", default=0.3, type=float, help="Dev data size.") 99 | parser.add_argument("--split_dataset", action="store_true", 100 | help="Split the dataset into the train/dev parts") 101 | parser.add_argument("--split_by_ids", action="store_true", 102 | help="Use articles ids while splitting the dataset into the train/dev parts.") 103 | parser.add_argument("--create_submission_file", action="store_true", 104 | help="Creats file in the submission (source) format") 105 | parser.add_argument("--random_state", default=42, type=int, help='Random state for the dataset splitting.') 106 | parser.add_argument("--do_eval_spans", action="store_true", 107 | help="Whether to run eval on the dev set with the competition metrics.") 108 | parser.add_argument("--gold_annot_file", default=None, type=str, help="Gold annotation file.") 109 | 110 | parser.add_argument("--use_crf", action="store_true", help="Use Conditional Random Field over the model") 111 | parser.add_argument("--use_quotes", action="store_true") 112 | 113 | MODEL_CLASSES = ["bert", "roberta", "distilbert", "camembert"] 114 | parser.add_argument("--model_type", default=None, type=str, required=True, 115 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES)) 116 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 117 | help="Path to pre-trained model or shortcut name") 118 | parser.add_argument("--output_dir", default=None, type=str, required=True, 119 | help="The output directory where the model predictions and checkpoints will be written.") 120 | 121 | parser.add_argument("--labels", default="", type=str, 122 | help="Path to a file containing all labels. 
If not specified, CoNLL-2003 labels are used.") 123 | parser.add_argument("--config_name", default="", type=str, 124 | help="Pretrained config name or path if not the same as model_name") 125 | parser.add_argument("--tokenizer_name", default="", type=str, 126 | help="Pretrained tokenizer name or path if not the same as model_name") 127 | parser.add_argument("--cache_dir", default="", type=str, 128 | help="Where do you want to store the pre-trained models downloaded from s3") 129 | parser.add_argument("--max_seq_length", default=128, type=int, 130 | help="The maximum total input sequence length after tokenization. Sequences longer " 131 | "than this will be truncated, sequences shorter will be padded.") 132 | parser.add_argument("--do_train", action="store_true", 133 | help="Whether to run training.") 134 | parser.add_argument("--do_eval", action="store_true", 135 | help="Whether to run eval on the dev set.") 136 | parser.add_argument("--do_predict", action="store_true", 137 | help="Whether to run predictions on the test set.") 138 | parser.add_argument("--evaluate_during_training", action="store_true", 139 | help="Whether to run evaluation during training at each logging step.") 140 | parser.add_argument("--do_lower_case", action="store_true", 141 | help="Set this flag if you are using an uncased model.") 142 | 143 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, 144 | help="Batch size per GPU/CPU for training.") 145 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, 146 | help="Batch size per GPU/CPU for evaluation.") 147 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1, 148 | help="Number of updates steps to accumulate before performing a backward/update pass.") 149 | parser.add_argument("--learning_rate", default=5e-5, type=float, 150 | help="The initial learning rate for Adam.") 151 | parser.add_argument("--weight_decay", default=0.0, type=float, 152 | help="Weight decay if we apply some.") 
153 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 154 | help="Epsilon for Adam optimizer.") 155 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 156 | help="Max gradient norm.") 157 | parser.add_argument("--num_train_epochs", default=3.0, type=float, 158 | help="Total number of training epochs to perform.") 159 | parser.add_argument("--max_steps", default=-1, type=int, 160 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 161 | parser.add_argument("--warmup_steps", default=0, type=int, 162 | help="Linear warmup over warmup_steps.") 163 | 164 | parser.add_argument("--logging_steps", type=int, default=50, 165 | help="Log every X updates steps.") 166 | parser.add_argument("--save_steps", type=int, default=50, 167 | help="Save checkpoint every X updates steps.") 168 | parser.add_argument("--eval_all_checkpoints", action="store_true", 169 | help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") 170 | parser.add_argument("--no_cuda", action="store_true", 171 | help="Avoid using CUDA when available") 172 | parser.add_argument("--overwrite_output_dir", action="store_true", 173 | help="Overwrite the content of the output directory") 174 | parser.add_argument("--overwrite_cache", action="store_true", 175 | help="Overwrite the cached training and evaluation sets") 176 | parser.add_argument("--seed", type=int, default=42, 177 | help="random seed for initialization") 178 | 179 | parser.add_argument("--fp16", action="store_true", 180 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 181 | parser.add_argument("--fp16_opt_level", type=str, default="O1", 182 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
def Main(args):
    """Run the technique-classification stages selected by the flags in ``args``.

    Stages, executed in this order when their flags are set:

    1. ``do_train`` / ``do_eval`` / ``split_dataset`` / ``create_submission_file``:
       load the train articles and (re)create the cached train/dev files.
    2. ``do_predict`` / ``create_submission_file`` / ``eval_submission``: load the
       test articles and (re)create the cached test file.
    3. ``do_train`` / ``do_eval`` / ``do_predict``: run the transformers classifier.
    4. ``create_submission_file``: aggregate the predicted logits into a
       submission file inside ``results/``.
    5. ``eval_submission``: score the submission file, either with
       ``eval_submission()`` against the cached test file or, when
       ``test_labels_path`` is given, with ``tools/task-TC_scorer.py``.

    :param args: namespace produced by the argument parser in ``main()``.
    """
    # Local import: this module does not import ``sys`` at file level.
    import sys

    os.makedirs(args.data_dir, exist_ok=True)

    if args.do_train or args.do_eval or args.split_dataset or args.create_submission_file:
        articles, ref_articles_id, ref_span_starts, ref_span_ends, labels = load_data(args.train_data_folder,
                                                                                      args.labels_path)
        train_file_path = os.path.join(args.data_dir, args.train_file)
        dev_file_path = os.path.join(args.data_dir, args.dev_file)
        cache_is_complete = os.path.exists(train_file_path) and os.path.exists(dev_file_path)
        if not cache_is_complete or args.overwrite_cache:
            logger.info("Creating train/dev files: %s, %s", train_file_path, dev_file_path)
            get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels,
                                train_file_path, dev_file_path, args.split_by_ids, args.dev_size,
                                args.random_state, args.balance, args.shuffle)

    if args.do_predict or args.create_submission_file or args.eval_submission:
        test_file_path = os.path.join(args.data_dir, args.test_file)
        test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels = load_data(
            args.test_data_folder, args.test_template_labels_path)
        if not os.path.exists(test_file_path) or args.overwrite_cache:
            logger.info("Creating roberta-type test file: %s", test_file_path)
            get_test_file(test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels,
                          test_file_path)

    if args.do_train or args.do_eval or args.do_predict:
        transformers_clf(args)

    if args.create_submission_file:
        os.makedirs('results', exist_ok=True)
        output_file = os.path.join('results', args.output_file)
        logger.info("Creating the submission file: %s", output_file)
        create_submission_file(args.predicted_logits_files, train_file_path, dev_file_path, test_file_path,
                               test_articles_id, test_span_starts, test_span_ends, output_file,
                               args.weights, args.data_dir)

    if args.eval_submission:
        output_file = os.path.join('results', args.output_file)
        logger.info("Evaluating the submission file: %s", output_file)
        if args.test_labels_path is None:
            acc, f1 = eval_submission(output_file, test_file_path)
            logger.info('accuracy: %f', acc)
            # NOTE(review): the eval_submission shown in this dump returns a list of
            # (label, F1) pairs rather than a macro average; the printed label is
            # kept unchanged so existing output consumers are not affected.
            print('f1-macro:', f1)
        else:
            # Argument list instead of a shell string: paths with spaces or shell
            # metacharacters are passed through verbatim.
            subprocess.run([sys.executable, "tools/task-TC_scorer.py",
                            "-s", output_file,
                            "-r", args.test_labels_path,
                            "-p", args.propaganda_techniques_file])
configargparse.ArgumentParser() 68 | 69 | parser.add_argument('--config', required=True, is_config_file=True, help='Config file path.') 70 | parser.add_argument("--train_data_folder", default=None, type=str, required=True, 71 | help="Source directory with the train articles.") 72 | parser.add_argument("--test_data_folder", default=None, type=str, required=True, 73 | help="Source directory with the test articles.") 74 | parser.add_argument("--propaganda_techniques_file", default=None, type=str, required=True, 75 | help="The file with propaganda techniques.") 76 | parser.add_argument("--labels_path", default=None, type=str, required=True, 77 | help="The file with train labels.") 78 | parser.add_argument("--test_template_labels_path", default=None, type=str, required=True, 79 | help="The file with test template labels.") 80 | parser.add_argument("--data_dir", default=None, type=str, required=True, 81 | help="The directory for cached preprocessed data.") 82 | parser.add_argument("--train_file", default=None, type=str, required=True, 83 | help="The filename for cached preprocessed train data.") 84 | parser.add_argument("--dev_file", default=None, type=str, required=True, 85 | help="The filename for cached preprocessed dev data.") 86 | parser.add_argument("--test_file", default=None, type=str, required=True, 87 | help="The filename for cached preprocessed test data.") 88 | parser.add_argument("--predicted_logits_files", default=None, nargs='*', required=True, 89 | help="The predicted filenames of logits that will be used to obtain the final result") 90 | parser.add_argument("--weights", default=None, nargs='*', required=False, 91 | help="The list of weights for predicted logits at the aggregation stage") 92 | parser.add_argument("--output_file", default=None, type=str, required=True, 93 | help="The submission filename") 94 | parser.add_argument("--dev_size", default=0.3, type=float, help="Dev data size.") 95 | parser.add_argument("--split_dataset", action="store_true", 
96 | help="Split the dataset into the train/dev parts.") 97 | parser.add_argument("--split_by_ids", action="store_true", 98 | help="Use articles ids while splitting the dataset into the train/dev parts.") 99 | parser.add_argument("--random_state", default=42, type=int, help='Random state for the dataset splitting.') 100 | parser.add_argument("--shuffle", action="store_true", help="Shuffle the train dataset.") 101 | parser.add_argument("--balance", action="store_true", help="Balance the train dataset with oversampling.") 102 | parser.add_argument("--create_submission_file", action="store_true", 103 | help="Creats file in the submission (source) format") 104 | parser.add_argument("--eval_submission", action="store_true", help="Do evaluating for the dev subset.") 105 | 106 | parser.add_argument('--use_length', action='store_true') 107 | parser.add_argument('--join_embeddings', action='store_true') 108 | parser.add_argument('--use_matchings', action='store_true') 109 | 110 | MODEL_CLASSES = ["bert", "roberta", "distilbert", "camembert"] 111 | parser.add_argument("--model_type", default=None, type=str, required=True, 112 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES)) 113 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 114 | help="Path to pre-trained model or shortcut name.") 115 | parser.add_argument("--task_name", default=None, type=str, required=True, 116 | help="The name of the task to train.") 117 | parser.add_argument("--output_dir", default=None, type=str, required=True, 118 | help="The output directory where the model predictions and checkpoints will be written.") 119 | parser.add_argument("--test_labels_path", default=None, type=str, required=False) 120 | 121 | ## Other parameters 122 | parser.add_argument("--config_name", default="", type=str, 123 | help="Pretrained config name or path if not the same as model_name") 124 | parser.add_argument("--tokenizer_name", default="", type=str, 125 | 
help="Pretrained tokenizer name or path if not the same as model_name") 126 | parser.add_argument("--cache_dir", default="", type=str, 127 | help="Where do you want to store the pre-trained models downloaded from s3") 128 | parser.add_argument("--max_seq_length", default=128, type=int, 129 | help="The maximum total input sequence length after tokenization. Sequences longer " 130 | "than this will be truncated, sequences shorter will be padded.") 131 | parser.add_argument("--do_train", action='store_true', 132 | help="Whether to run training.") 133 | parser.add_argument("--do_eval", action='store_true', 134 | help="Whether to run eval on the dev set.") 135 | parser.add_argument("--do_predict", action='store_true', 136 | help="Whether to run prediction") 137 | parser.add_argument("--evaluate_during_training", action='store_true', 138 | help="Rul evaluation during training at each logging step.") 139 | parser.add_argument("--do_lower_case", action='store_true', 140 | help="Set this flag if you are using an uncased model.") 141 | 142 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, 143 | help="Batch size per GPU/CPU for training.") 144 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, 145 | help="Batch size per GPU/CPU for evaluation.") 146 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 147 | help="Number of updates steps to accumulate before performing a backward/update pass.") 148 | parser.add_argument("--learning_rate", default=5e-5, type=float, 149 | help="The initial learning rate for Adam.") 150 | parser.add_argument("--weight_decay", default=0.0, type=float, 151 | help="Weight deay if we apply some.") 152 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 153 | help="Epsilon for Adam optimizer.") 154 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 155 | help="Max gradient norm.") 156 | parser.add_argument("--num_train_epochs", default=3.0, type=float, 157 
| help="Total number of training epochs to perform.") 158 | parser.add_argument("--max_steps", default=-1, type=int, 159 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 160 | parser.add_argument("--warmup_steps", default=0, type=int, 161 | help="Linear warmup over warmup_steps.") 162 | 163 | parser.add_argument('--logging_steps', type=int, default=50, 164 | help="Log every X updates steps.") 165 | parser.add_argument('--save_steps', type=int, default=50, 166 | help="Save checkpoint every X updates steps.") 167 | parser.add_argument("--eval_all_checkpoints", action='store_true', 168 | help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") 169 | parser.add_argument("--no_cuda", action='store_true', 170 | help="Avoid using CUDA when available") 171 | parser.add_argument('--overwrite_output_dir', action='store_true', 172 | help="Overwrite the content of the output directory") 173 | parser.add_argument('--overwrite_cache', action='store_true', 174 | help="Overwrite the cached training and evaluation sets") 175 | parser.add_argument('--seed', type=int, default=42, 176 | help="random seed for initialization") 177 | 178 | parser.add_argument('--fp16', action='store_true', 179 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 180 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 181 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
class Annotations(object):
    """
    Dictionary of Articles_annotations objects, keyed by article id
    (basically a dataset of article_annotations objects).
    """

    def __init__(self, annotations:"Dict[str, aa.Articles_annotations]"=None):
        """
        :param annotations: optional dictionary article id -> Articles_annotations to wrap;
            an empty dictionary is created when it is omitted.
        """
        if annotations is None:
            self.annotations = {}
        else:
            self.annotations = annotations


    def __len__(self):
        """
        Returns the number of articles in the object
        """
        return len(self.get_article_id_list())


    def add_annotation(self, annotation:"an.Annotation", article_id:str):
        """
        Add a single annotation to the article with id article_id.
        If such article does not exist, its annotations object is created first.
        """
        if not self.has_article(article_id):
            self.create_article_annotations_object(article_id)
        self.annotations[article_id].add_annotation(annotation)


    def check_annotation_spans_with_category_matching(self, merge_overlapping_spans:bool=False):
        """
        Check the spans annotated with the same technique in the same article.
        If merge_overlapping_spans==True, merge_spans() is invoked on each annotation
        and its predecessor within every technique group.

        :param merge_overlapping_spans: if True merges the overlapping spans
        :return: False as soon as an article's annotations object evaluates to False, True otherwise.
        """
        for article_id in self.get_article_id_list():

            annotation_list = self.get_article_annotations_obj(article_id).groupby_technique()
            if merge_overlapping_spans:
                for technique in annotation_list.keys():
                    for i in range(1, len(annotation_list[technique])):
                        annotation_list[technique][i].merge_spans(annotation_list[technique], i-1)
            # NOTE(review): the nesting of this check was reconstructed from a dump
            # without indentation - confirm it belongs at the article-loop level.
            if not self.get_article_annotations_obj(article_id):
                return False
        return True


    def compare_annotations_identical_article_lists(self, second_annotations:"Annotations"):
        """
        Compare if self and second_annotations have identical article id lists
        :return: True if the lists are identical and False otherwise.
        """
        own_ids = self.get_article_id_list()
        other_ids = second_annotations.get_article_id_list()
        # checking that the number of articles in self and second_annotations is the same
        if len(own_ids) != len(other_ids):
            logger.error("The number of articles in the annotations is different: %d, %d"
                         % (len(own_ids), len(other_ids)))
            return False
        diff = set(own_ids).difference(set(other_ids))
        if len(diff) > 0:
            logger.error("The two lists of article ids differ: %s"%(diff))
            return False

        logger.debug("OK: the list of article ids in the two sets of annotations is identical")
        return True


    def compare_annotations_identical(self, second_annotations:"Annotations")->bool:
        """
        Compare if self and second_annotations have identical annotations
        (without considering the technique labels)
        :return: True if the lists are identical and False otherwise.
        """
        for article_id in self.get_article_id_list():
            an1_article_annotations = self.get_article_annotations_list(article_id)
            an2_article_annotations = second_annotations.get_article_annotations_list(article_id)
            if len(an1_article_annotations) != len(an2_article_annotations):
                logger.error("The number of annotations for article %s differs: %d vs %d"%(article_id, len(an1_article_annotations), len(an2_article_annotations)))
                return False
            for an1, an2 in zip(an1_article_annotations, an2_article_annotations):
                if not an1.is_span_equal_to(an2):
                    logger.error("The spans of the annotations of article %s do not match: [%s, %s] vs [%s, %s]"%(article_id, an1.get_start_offset(), an1.get_end_offset(), an2.get_start_offset(), an2.get_end_offset()))
                    return False
        return True


    def align_annotations(self, second_annotations:"Annotations")->None:
        """
        Reorder all annotations such that the matching between annotations' labels
        and the ones from second_annotations is maximised.
        """
        for article_id in second_annotations.get_article_id_list():
            self.get_article_annotations_obj(article_id).align_annotations(second_annotations.get_article_annotations_obj(article_id))


    def compute_TC_score(self, second_annotations:"Annotations"):
        """
        Score self (the submission) against second_annotations (the gold labels).

        :param second_annotations: gold labels
        :return: (precision, recall, f1), micro-averaged; when a list of propaganda
            techniques is loaded in an.Annotation, a fourth element with the per-technique
            F1 (ordered as get_propaganda_techniques_list_sorted()) is appended.
        """
        self.align_annotations(second_annotations)
        gold_labels = [ x.get_label() for x in second_annotations.get_full_list_of_annotations() ]
        submission_labels = [ x.get_label() for x in self.get_full_list_of_annotations() ]

        precision = precision_score(gold_labels, submission_labels, pos_label=None, average='micro')
        recall = recall_score(gold_labels, submission_labels, pos_label=None, average='micro')
        f1 = f1_score(gold_labels, submission_labels, pos_label=None, average='micro')
        if an.Annotation.propaganda_techniques is not None:
            propaganda_techniques_list = an.Annotation.propaganda_techniques.get_propaganda_techniques_list_sorted()
            f1_per_class = f1_score(gold_labels, submission_labels, average=None, labels=propaganda_techniques_list)
            return precision, recall, f1, f1_per_class
        return precision, recall, f1


    def create_article_annotations_object(self, article_id:str)->None:
        """
        Create (or reset) the empty annotations object of article article_id.
        """
        self.annotations[article_id] = aa.Articles_annotations(article_id=article_id)


    def TC_score_to_string(self, second_annotation:"Annotations", output_for_script=False):
        """
        Compute the TC score against second_annotation (gold labels) and format it.

        :param second_annotation: gold labels
        :param output_for_script: if True a tab-separated version of the scores is produced as well
        :return: (res_for_screen, res_for_script); res_for_script is "" unless output_for_script is True.
        """
        if an.Annotation.propaganda_techniques is None:
            # No technique list loaded: only the global scores are available.
            precision, recall, f1 = self.compute_TC_score(second_annotation)
            res_for_screen = "\nPrecision=%f\nRecall=%f\nF1=%f\n"%(precision, recall, f1)
            f1_per_class = []
        else:
            precision, recall, f1, f1_per_class = self.compute_TC_score(second_annotation)
            # Same (sorted) list that compute_TC_score() passes as `labels`, so that
            # each name is paired with its own score.
            technique_names = an.Annotation.propaganda_techniques.get_propaganda_techniques_list_sorted()
            res_for_screen = "\nF1=%f\nPrecision=%f\nRecall=%f\n%s\n" % (
                f1, precision, recall,
                "\n".join([ "F1_"+pr+"="+str(f) for pr, f in zip(technique_names, f1_per_class)]))
        if output_for_script:
            res_for_script = "%f\t%f\t%f\t"%(f1, precision, recall)
            res_for_script += "\t".join([ str(x) for x in f1_per_class])
        else:
            res_for_script = ""
        return res_for_screen, res_for_script


    def get_full_list_of_annotations(self):
        """
        Returns the annotations of all articles as a single flat list,
        following the order of get_article_id_list().
        """
        return [annotation
                for article_id in self.get_article_id_list()
                for annotation in self.get_article_annotations_list(article_id)]


    def has_article(self, article_id:str)->bool:
        """
        Check whether article_id is in the list of articles whose annotations are in the object.
        """
        return article_id in self.get_article_id_list()


    def get_article_id_list(self):
        """
        All ids of the article in the object
        """
        return self.annotations.keys()


    def get_article_annotations_obj(self, article_id:str):
        """
        Returns all annotations of an article as an Article_annotations object.
        """
        return self.annotations[article_id]


    def get_article_annotations_list(self, article_id:str):
        """
        Returns all annotations of an article as a list of Annotation objects.
        """
        return self.annotations[article_id].get_article_annotations()


    def _guess_article_id_from_file_name(self, filename:str)->str:
        """
        Extract the article id from a file whose base name looks like article<digits>...

        :raises ValueError: if the base name of filename does not follow that pattern.
        """
        match = re.match(r"article([0-9]+).*", os.path.basename(filename))
        if match is None:
            raise ValueError("Cannot guess the article id from file name %s" % (filename))
        return match.group(1)


    def load_annotation_list_from_file(self, filename):
        """
        Loads all annotations in file filename. The file is supposed to contain annotations
        for multiple articles. To load annotations for a single article use the function
        with the same name from module src.article_annotations.
        Each annotation is checked according to check_format_of_annotation_in_file()
        """
        with open(filename, "r") as f:
            for i, line in enumerate(f, 1):
                ann, article_id = an.Annotation.load_annotation_from_string(line.rstrip(), i, filename)
                ann.check_format_of_annotation_in_file()
                self.add_annotation(ann, article_id)


    def load_annotation_list_from_folder(self, folder_name, pattern="*.labels"):
        """
        Loads all annotations from all files in folder folder_name.
        Files in the folder are selected according to pattern.

        :return: True on success, False if folder_name does not exist or is not a folder.
            The process is terminated with a non-zero status if no file matches pattern.
        """
        if not os.path.exists(folder_name):
            logger.error("trying to load annotations from folder %s, which does not exists"%(folder_name))
            return False
        if not os.path.isdir(folder_name):
            logger.error("trying to load annotations from folder %s, which does not appear to be a valid folder"%(folder_name))
            return False
        file_list = glob.glob(os.path.join(folder_name, pattern))
        if len(file_list) == 0:
            logger.error("Cannot load file list %s/%s"%(folder_name, pattern))
            # Exit with a failure status: a bare sys.exit() would report success.
            sys.exit(1)
        for filename in file_list:
            self.create_article_annotations_object(self._guess_article_id_from_file_name(filename))
            self.load_annotation_list_from_file(filename)
        return True
111111117 671 753 3 | 111111131 84 97 4 | 111111131 102 109 5 | 111111131 180 190 6 | 111111131 207 214 7 | 111111131 326 336 8 | 111111131 352 365 9 | 111111131 382 395 10 | 111111131 398 413 11 | 111111131 723 731 12 | 111111131 804 811 13 | 111111131 823 865 14 | 111111131 1030 1068 15 | 111111131 1977 1992 16 | 111111131 2660 2671 17 | 111111131 2728 2739 18 | 111111131 2897 2908 19 | 111111131 2912 2924 20 | 111111131 2952 2997 21 | 111111131 3396 3416 22 | 111111131 3437 3455 23 | 111111131 3738 3748 24 | 111111131 4008 4014 25 | 111111131 4030 4038 26 | 111111131 4152 4169 27 | 111111131 4224 4230 28 | 111111131 4256 4264 29 | 111111131 4339 4352 30 | 111111131 4531 4546 31 | 111111131 4635 4643 32 | 111111131 4752 4768 33 | 111111131 4882 4899 34 | 111111131 5174 5186 35 | 111111131 5244 5262 36 | 111111131 5281 5289 37 | 111111131 5368 5402 38 | 111111131 5904 5916 39 | 111111131 5938 5950 40 | 111111131 6920 6937 41 | 111111131 6957 6971 42 | 111111131 7314 7324 43 | 111111131 7700 7708 44 | 111111137 143 183 45 | 111111137 2058 2064 46 | 111111137 2320 2333 47 | 696694316 603 661 48 | 696694316 1020 1094 49 | 696694316 3276 3379 50 | 696694316 3471 3608 51 | 696694316 3610 4009 52 | 696694316 4376 4395 53 | 696694316 4423 4440 54 | 696694316 4478 4500 55 | 696694316 7026 7097 56 | 696694316 7631 7780 57 | 696694316 7971 8295 58 | 696694316 8298 8640 59 | 696694316 9373 9584 60 | 696694316 9586 9819 61 | 696694316 10661 10866 62 | 696694316 11102 11126 63 | 696694316 12634 12736 64 | 696694316 12738 12848 65 | 696694316 13262 13316 66 | 696694316 13456 13555 67 | 697444415 512 539 68 | 697444415 1471 1596 69 | 697444415 2815 2860 70 | 697444415 2959 3022 71 | 698018235 305 335 72 | 698018235 555 581 73 | 698018235 641 652 74 | 698018235 657 687 75 | 698018235 975 1047 76 | 698018235 1482 1493 77 | 698018235 1658 1881 78 | 698018235 1910 1919 79 | 698018235 2132 2410 80 | 698018235 2431 2460 81 | 698018235 2723 2730 82 | 698018235 2917 3000 83 | 698018235 
3283 3380 84 | 698018235 3514 3649 85 | 698719689 131 144 86 | 698719689 190 321 87 | 698719689 1440 1449 88 | 698719689 1542 1611 89 | 698719689 2324 2335 90 | 700461600 717 760 91 | 700461600 1547 1745 92 | 700461600 2318 2496 93 | 700461600 2726 2787 94 | 700461600 2805 2868 95 | 700461600 3073 3255 96 | 700461600 3548 3559 97 | 700461600 3963 4080 98 | 700461600 4209 4426 99 | 700461600 4506 4530 100 | 700461600 4657 4709 101 | 701225819 77 99 102 | 701225819 111 143 103 | 701225819 177 187 104 | 701225819 305 313 105 | 701225819 996 1017 106 | 701225819 1201 1299 107 | 701225819 1493 1603 108 | 701225819 1767 1771 109 | 701225819 1872 1889 110 | 701225819 1967 1986 111 | 701225819 2174 2182 112 | 701225819 2616 2621 113 | 701225819 2689 2694 114 | 701225819 2747 2752 115 | 701225819 2787 2792 116 | 701225819 2982 2991 117 | 701225819 3309 3315 118 | 701225819 3523 3541 119 | 701225819 3787 3803 120 | 701225819 3837 3860 121 | 701225819 4268 4313 122 | 701225819 4606 4636 123 | 701225819 4720 4737 124 | 701225819 5914 5927 125 | 701225819 6102 6112 126 | 701553469 31 41 127 | 701553469 77 143 128 | 701553469 205 209 129 | 701553469 288 302 130 | 701553469 351 361 131 | 701553469 1716 1744 132 | 701553469 1864 1887 133 | 701553469 1983 2143 134 | 701553469 2486 2651 135 | 701553469 2666 2781 136 | 701553469 3139 3159 137 | 701553469 3455 3476 138 | 701553469 3695 3715 139 | 701553469 3898 3916 140 | 701553469 4175 4280 141 | 701837665 761 801 142 | 701837665 803 833 143 | 701837665 1364 1449 144 | 701837665 1806 2008 145 | 701837665 2456 2470 146 | 701837665 2534 2575 147 | 701837665 2618 2726 148 | 701837665 2740 2762 149 | 701837665 2792 2940 150 | 701837665 2942 2991 151 | 701837665 3251 3423 152 | 701837665 3510 3534 153 | 701837665 3824 3883 154 | 701837665 3958 4067 155 | 701837665 5040 5084 156 | 701837665 5821 5840 157 | 701837665 6697 6873 158 | 701837665 7193 7357 159 | 701837665 7401 7455 160 | 701837665 7529 7666 161 | 701837665 7727 7746 162 | 
701837665 7785 7841 163 | 701837665 8115 8155 164 | 701837665 8276 8312 165 | 701837665 8453 8556 166 | 701837665 9299 9329 167 | 701837665 9331 9371 168 | 701837665 9425 9433 169 | 701837665 9471 9500 170 | 701837665 9741 9756 171 | 701837665 9780 9814 172 | 701837665 10090 10206 173 | 701837665 10208 10227 174 | 701837665 10802 10972 175 | 701837665 11844 11915 176 | 701837665 11964 12007 177 | 701837665 12031 12047 178 | 701837665 12178 12197 179 | 701837665 12381 12476 180 | 701837665 12594 12665 181 | 703821117 114 174 182 | 703821117 179 236 183 | 703821117 472 532 184 | 703821117 833 880 185 | 703821117 2350 2366 186 | 703821117 2475 2523 187 | 703821117 3368 3383 188 | 703821117 3483 3500 189 | 703821117 3519 3571 190 | 703821117 3893 3974 191 | 703821117 5022 5095 192 | 703821117 5281 5373 193 | 703821117 6046 6098 194 | 703821117 6445 6453 195 | 703821117 6520 6578 196 | 703821117 6580 6609 197 | 703821117 6620 6637 198 | 703821117 7264 7314 199 | 703821117 7674 7693 200 | 703821117 10469 10570 201 | 703821117 10572 10680 202 | 703821117 10697 10805 203 | 703821117 10820 10845 204 | 703821117 10860 10995 205 | 703821117 11087 11097 206 | 703821117 11099 11105 207 | 703821117 11221 11328 208 | 703821117 11655 11671 209 | 703821117 12003 12039 210 | 703821117 12149 12242 211 | 703821117 12349 12363 212 | 703821117 13152 13166 213 | 703821117 13316 13369 214 | 703821117 13682 13691 215 | 703821117 13902 13958 216 | 703821117 13960 14030 217 | 703821117 14142 14158 218 | 703821117 14419 14458 219 | 703821117 14511 14640 220 | 703821117 14653 14658 221 | 703821117 14752 14920 222 | 703821117 15047 15069 223 | 703821117 15216 15279 224 | 703821117 15785 15851 225 | 703821117 16195 16231 226 | 703821117 16335 16411 227 | 703821117 16781 16835 228 | 703821117 16976 17013 229 | 703821117 17016 17047 230 | 703821117 17150 17244 231 | 703821117 17552 17689 232 | 703821117 18269 18323 233 | 703821117 18378 18396 234 | 703821117 18447 18550 235 | 703821117 18604 18639 
236 | 704591553 71 103 237 | 704591553 238 265 238 | 704591553 278 285 239 | 704591553 697 822 240 | 704591553 933 954 241 | 704591553 1016 1029 242 | 704591553 1125 1141 243 | 704591553 1280 1292 244 | 704591553 1696 1718 245 | 704591553 1805 1832 246 | 704591553 1868 1883 247 | 704591553 1999 2059 248 | 704591553 2180 2228 249 | 704591553 2765 2780 250 | 704591553 2802 2816 251 | 704591553 3153 3163 252 | 704591553 3221 3359 253 | 704591553 3827 3881 254 | 704591553 3883 3911 255 | 704591553 4058 4124 256 | 704591553 4398 4405 257 | 704591553 4454 4480 258 | 704591553 4493 4508 259 | 704591553 4965 5082 260 | 704856340 4007 4185 261 | 704856340 4187 4324 262 | 706636401 992 1001 263 | 706636401 2911 2939 264 | 706636401 3353 3362 265 | 706636401 3724 3747 266 | 706636401 3802 3966 267 | 709732928 12 21 268 | 709732928 160 172 269 | 709732928 251 259 270 | 709732928 1428 1432 271 | 709732928 1811 1820 272 | 709732928 1957 1966 273 | 709732928 2169 2173 274 | 709732928 2177 2184 275 | 709732928 2583 2587 276 | 709732928 3682 3689 277 | 709732928 3732 3742 278 | 709732928 6464 6474 279 | 709732928 7346 7359 280 | 709732928 7579 7590 281 | 709732928 8158 8168 282 | 709732928 8491 8498 283 | 709732928 8521 8530 284 | 709732928 8532 8546 285 | 709732928 8613 8616 286 | 709732928 9458 9465 287 | 709732928 10416 10440 288 | 709732928 10525 10552 289 | 709732928 10660 10675 290 | 709732928 10768 10775 291 | 709732928 10826 10833 292 | 709732928 10865 10882 293 | 709732928 11982 11997 294 | 710100700 1203 1375 295 | 711596363 13 32 296 | 711596363 258 277 297 | 711596363 1194 1252 298 | 711596363 1408 1421 299 | 711596363 1944 1954 300 | 711596363 3065 3083 301 | 711596363 3136 3154 302 | 711596363 3173 3189 303 | 711596363 3277 3285 304 | 711596363 3700 3870 305 | 711596363 3894 4008 306 | 711596363 4274 4281 307 | 711596363 4373 4389 308 | 711596363 4573 4624 309 | 711596363 4626 4664 310 | 711596363 4738 4767 311 | 711596363 4985 5098 312 | 711596363 5391 5412 313 | 
711596363 5627 5645 314 | 711596363 5647 5678 315 | 711596363 5745 5779 316 | 711622457 457 573 317 | 711622457 575 597 318 | 711622457 616 732 319 | 711622457 734 756 320 | 711622457 813 847 321 | 711622457 934 997 322 | 711622457 1095 1250 323 | 711622457 1329 1355 324 | 711622457 1882 1967 325 | 711622457 2008 2120 326 | 711622457 2409 2697 327 | 711622457 2754 2812 328 | 711622457 3303 3458 329 | 711716996 30 85 330 | 711716996 298 304 331 | 711716996 724 862 332 | 711716996 957 992 333 | 711716996 1444 1453 334 | 711716996 1600 1632 335 | 711716996 1936 2064 336 | 711716996 2231 2268 337 | 711716996 2308 2314 338 | 711716996 2320 2345 339 | 711716996 2760 2843 340 | 711716996 2910 2941 341 | 711716996 3978 3992 342 | 711716996 4002 4025 343 | 711716996 4068 4082 344 | 711716996 4563 4569 345 | 711716996 4959 4973 346 | 711716996 5284 5298 347 | 715588833 0 17 348 | 715588833 412 498 349 | 715588833 624 679 350 | 715588833 1046 1074 351 | 715588833 1753 1770 352 | 715588833 2061 2160 353 | 715588833 2437 2464 354 | 715588833 2622 2773 355 | 715588833 7098 7129 356 | 715588833 7155 7185 357 | 715588833 7638 7736 358 | 715588833 7829 7855 359 | 715588833 7857 7909 360 | 715588833 8479 8546 361 | 715588833 8548 8641 362 | 715588833 8643 8940 363 | 715588833 9232 9268 364 | 715588833 9456 9504 365 | 715588833 9836 10047 366 | 715588833 10678 10918 367 | 715588833 11273 11388 368 | 715588833 11390 11426 369 | 715588833 11575 11832 370 | 715588833 11839 11843 371 | 715588833 11846 11970 372 | 715588833 11972 12085 373 | 722507879 1369 1391 374 | 722507879 2307 2350 375 | 722507879 2356 2413 376 | 722507879 2432 2516 377 | 722507879 3019 3045 378 | 722507879 3937 3948 379 | 722507879 4053 4079 380 | 723793978 1106 1256 381 | 723793978 1418 1430 382 | 727493378 493 502 383 | 727493378 563 822 384 | 727493378 1272 1285 385 | 727493378 1768 1866 386 | 727493378 1943 1995 387 | 727493378 1997 2265 388 | 727493378 3072 3149 389 | 727736557 85 94 390 | 727736557 186 212 391 
| 727736557 305 328 392 | 727736557 650 669 393 | 727736557 983 1001 394 | 727736557 1203 1347 395 | 727736557 1761 1770 396 | 727736557 1819 1858 397 | 727736557 2226 2239 398 | 727736557 2351 2382 399 | 727736557 2429 2447 400 | 727736557 2840 2884 401 | 727736557 4017 4068 402 | 727736557 4511 4544 403 | 727736557 4574 4602 404 | 727736557 4715 4742 405 | 727736557 5073 5132 406 | 728169864 0 8 407 | 728169864 423 440 408 | 728169864 1628 1641 409 | 728169864 2632 2640 410 | 728169864 2644 2651 411 | 728169864 2655 2662 412 | 728169864 2666 2674 413 | 728169864 2678 2684 414 | 728169864 2688 2694 415 | 728169864 2698 2712 416 | 728169864 2720 2744 417 | 728169864 2747 2878 418 | 728169864 3161 3184 419 | 728169864 5313 5330 420 | 728169864 5517 5525 421 | 728169864 5753 5772 422 | 728169864 6198 6259 423 | 728758697 31 49 424 | 728758697 51 89 425 | 728758697 819 1034 426 | 728758697 1232 1454 427 | 728758697 1462 1509 428 | 728758697 1512 1599 429 | 728758697 1697 1744 430 | 728758697 1746 1788 431 | 728758697 1790 1836 432 | 729410793 29 76 433 | 729410793 316 395 434 | 729410793 657 705 435 | 729410793 708 752 436 | 729410793 754 1015 437 | 729410793 1018 1341 438 | 729410793 1563 1601 439 | 729410793 3356 3480 440 | 729410793 3510 3676 441 | 729410793 4126 4166 442 | 729410793 4237 4363 443 | 729410793 4586 4624 444 | 729410793 4626 4676 445 | 729561658 39 89 446 | 729561658 251 312 447 | 729561658 754 778 448 | 729561658 809 833 449 | 729561658 1006 1052 450 | 729561658 1494 1537 451 | 729561658 1539 1573 452 | 729561658 1575 1643 453 | 729561658 1645 1738 454 | 730559808 955 992 455 | 730559808 998 1088 456 | 730559808 1251 1340 457 | 730559808 2483 2640 458 | 730559808 3006 3186 459 | 730559808 3215 3490 460 | 730559808 3492 3611 461 | 730559808 3953 4017 462 | 730559808 4279 4361 463 | 730559808 4404 4441 464 | 730559808 4447 4537 465 | 730559808 5353 5419 466 | 730559808 5659 5889 467 | 730559808 5891 5969 468 | 730573740 45 66 469 | 730573740 834 846 
470 | 730573740 983 1125 471 | 730573740 1302 1322 472 | 730573740 1338 1505 473 | 730573740 2177 2227 474 | 730573740 2240 2345 475 | 730573740 2479 2572 476 | 730573740 2682 2751 477 | 731927633 962 1016 478 | 731927633 1018 1053 479 | 731927633 1189 1224 480 | 731927633 1226 1280 481 | 731927633 1973 2005 482 | 731927633 2063 2092 483 | 731927633 2237 2323 484 | 731927633 2434 2444 485 | 731927633 2784 2832 486 | 731927633 3450 3514 487 | 732154721 27 61 488 | 732154721 281 316 489 | 732154721 397 528 490 | 732154721 987 1153 491 | 732154721 1172 1283 492 | 732154721 1930 1965 493 | 732154721 2384 2427 494 | 732154721 2607 2666 495 | 732154721 2755 2819 496 | 735855251 2216 2268 497 | 735855251 2357 2453 498 | 735855251 2455 2563 499 | 755814432 166 168 500 | 755814432 1728 1735 501 | 755814432 1978 2004 502 | 755814432 2418 2564 503 | 755814432 3178 3201 504 | 755814432 3304 3321 505 | 755814432 3846 3960 506 | 757243988 339 350 507 | 757243988 461 467 508 | 757243988 1447 1457 509 | 757243988 1658 1671 510 | 757243988 2267 2280 511 | 757243988 2473 2483 512 | 757243988 2635 2644 513 | 761969038 119 141 514 | 761969038 305 321 515 | 761969038 974 988 516 | 761969038 1755 1783 517 | 761969038 1971 2046 518 | 761969038 4048 4053 519 | 761969038 4149 4170 520 | 761969038 4271 4292 521 | 761969038 4391 4412 522 | 761969038 4521 4541 523 | 761969038 5552 5571 524 | 761969038 5614 5663 525 | 761969692 393 404 526 | 761969692 482 492 527 | 761969692 601 725 528 | 761969692 1557 1574 529 | 761969692 1894 1907 530 | 761969692 2005 2043 531 | 761969692 2057 2089 532 | 761969692 2276 2289 533 | 761969692 2291 2489 534 | 761969692 2858 2885 535 | 761969692 2993 3013 536 | 761969692 3017 3033 537 | 761969692 4010 4090 538 | 761969692 5796 5816 539 | 761969692 5915 5935 540 | 761969692 6064 6084 541 | 761969692 6222 6242 542 | 761969692 6522 6541 543 | 761969692 6636 6662 544 | 763260610 0 17 545 | 763260610 254 290 546 | 763260610 705 719 547 | 763260610 970 981 548 | 
763260610 1053 1105 549 | 763260610 1284 1363 550 | 763260610 1365 1375 551 | 763260610 1411 1424 552 | 763260610 1470 1483 553 | 763260610 1516 1529 554 | 763260610 1593 1619 555 | 763260610 1621 1675 556 | 763260610 2132 2150 557 | 763260610 2205 2238 558 | 763260610 2311 2359 559 | 763260610 2430 2443 560 | 763260610 2812 2868 561 | 763260610 3031 3086 562 | 763260610 3134 3156 563 | 763260610 3559 3589 564 | 763260610 3782 3801 565 | 763260610 3811 3868 566 | 763260610 3911 3931 567 | 763260610 3956 3996 568 | 763260610 4015 4037 569 | 763260610 4166 4190 570 | 763260610 4192 4309 571 | 763260610 4369 4390 572 | 763260610 4500 4518 573 | 763260610 4639 4664 574 | 763260610 4724 4742 575 | 763260610 4819 4851 576 | 763260610 5066 5120 577 | 763260610 5391 5423 578 | 763260610 5495 5554 579 | 763260610 5881 5892 580 | 763260610 6160 6182 581 | 763260610 6230 6271 582 | 763260610 6409 6450 583 | 763260610 6660 6740 584 | 763260610 6845 6907 585 | 763260610 7459 7495 586 | 763260610 7606 7639 587 | 763260610 7661 7680 588 | 763260610 7890 7931 589 | 763260610 8032 8048 590 | 763260610 8129 8166 591 | 763260610 8247 8265 592 | 763260610 8777 8790 593 | 763260610 8957 8999 594 | 763260610 9160 9180 595 | 763260610 9186 9203 596 | 763260610 9302 9329 597 | 763260610 9362 9378 598 | 763260610 9405 9471 599 | 763260610 9689 9714 600 | 763260610 9818 9846 601 | 763260610 9916 9946 602 | 763260610 9967 10009 603 | 763260610 10201 10274 604 | 763260610 10457 10469 605 | 763260610 10568 10629 606 | 763260610 11055 11168 607 | 763260610 11280 11303 608 | 763260610 12230 12261 609 | 763260610 12529 12547 610 | 763260610 13164 13180 611 | 763260610 13644 13664 612 | 763260610 13771 13837 613 | 763260610 14075 14095 614 | 763260610 14289 14314 615 | 763260610 14328 14403 616 | 763260610 14424 14441 617 | 763761219 116 140 618 | 763761219 1452 1465 619 | 763761219 1645 1669 620 | 763761219 1875 1890 621 | 763761219 2068 2129 622 | 763761219 2295 2298 623 | 763761219 2343 2352 
624 | 764609985 288 296 625 | 764609985 500 507 626 | 764609985 675 694 627 | 764609985 1681 1685 628 | 764609985 2352 2362 629 | 764609985 2463 2471 630 | 764609985 2550 2764 631 | 764609985 2992 3003 632 | 764609985 3574 3584 633 | 764609985 4675 4681 634 | 764609985 4924 4933 635 | 764609985 5496 5541 636 | 764609985 5891 6090 637 | 764609985 6332 6366 638 | 764609985 7112 7118 639 | 764609985 7224 7235 640 | 764609985 7247 7251 641 | 764609985 8497 8514 642 | 764609985 8701 8718 643 | 764609985 8747 8765 644 | 764609985 10188 10203 645 | 764609985 10403 10413 646 | 764609985 11038 11048 647 | 764715911 132 142 648 | 764715911 254 348 649 | 764715911 573 584 650 | 764715911 937 1001 651 | 764715911 1667 1713 652 | 764715911 1728 1734 653 | 764715911 2558 2561 654 | 764715911 2859 2916 655 | 764715911 3722 3734 656 | 764715911 4015 4028 657 | 764715911 4380 4391 658 | 764715911 5779 5788 659 | 764715911 6379 6427 660 | 764715911 6646 6649 661 | 764715911 6832 6852 662 | 764715911 6958 6981 663 | 764715911 7265 7459 664 | 764715911 7479 7596 665 | 764715911 7652 7700 666 | 765953146 787 816 667 | 765953146 982 1011 668 | 765953146 1099 1110 669 | 765953146 2168 2186 670 | 765953146 2193 2209 671 | 765953146 2497 2507 672 | 765953146 5320 5335 673 | 767129999 59 78 674 | 767129999 1498 1513 675 | 767129999 1739 1803 676 | 767129999 1943 1951 677 | 767129999 2058 2092 678 | 767129999 2478 2499 679 | 767129999 2522 2531 680 | 770156173 0 18 681 | 770156173 34 47 682 | 770156173 1021 1029 683 | 770156173 1106 1184 684 | 770156173 1556 1738 685 | 770156173 1740 1836 686 | 770156173 1919 1938 687 | 770156173 2094 2117 688 | 770156173 2158 2164 689 | 770156173 2330 2348 690 | 770156173 2469 2498 691 | 770156173 2819 2827 692 | 770156173 2934 2989 693 | 770156173 3012 3018 694 | 770156173 3266 3280 695 | 770156173 3692 3737 696 | 770156173 3924 3946 697 | 770156173 3970 3985 698 | 770156173 3991 4004 699 | 770156173 4523 4531 700 | 770156173 4682 4692 701 | 770156173 4758 
4781 702 | 770156173 4814 4929 703 | 770156173 5254 5261 704 | 770156173 5330 5340 705 | 770156173 5645 5652 706 | 770156173 5920 5936 707 | 770156173 6056 6072 708 | 770877978 900 905 709 | 770877978 1000 1018 710 | 770877978 1020 1067 711 | 770877978 1252 1270 712 | 770877978 3117 3146 713 | 770877978 3244 3292 714 | 770956434 482 487 715 | 770956434 1360 1380 716 | 770956434 1390 1430 717 | 770956434 1560 1571 718 | 770956434 1659 1665 719 | 770956434 1925 1943 720 | 770956434 1945 1992 721 | 770956434 2176 2194 722 | 770956434 2583 2592 723 | 770956434 2884 2891 724 | 776368676 45 70 725 | 776368676 276 297 726 | 776368676 314 343 727 | 776368676 378 408 728 | 776368676 424 457 729 | 776368676 542 559 730 | 776368676 571 623 731 | 776368676 640 658 732 | 776368676 664 738 733 | 776368676 871 893 734 | 776368676 905 957 735 | 776368676 974 992 736 | 776368676 998 1078 737 | 776368676 1164 1176 738 | 776368676 1461 1494 739 | 776368676 1596 1633 740 | 776368676 2017 2033 741 | 776368676 2050 2074 742 | 776368676 2085 2118 743 | 776368676 2132 2147 744 | 776368676 2164 2174 745 | 776368676 2178 2210 746 | 776368676 3803 3823 747 | 780619695 44 62 748 | 780619695 120 160 749 | 780619695 162 168 750 | 780619695 174 183 751 | 780619695 1245 1262 752 | 780619695 1321 1337 753 | 780619695 1538 1554 754 | 780619695 1728 1744 755 | 780619695 1770 1794 756 | 780619695 1935 1959 757 | 780619695 2018 2034 758 | 780619695 2245 2261 759 | 780619695 2329 2338 760 | 780619695 2343 2359 761 | 780619695 2838 2854 762 | 780619695 3066 3082 763 | 780619695 3207 3224 764 | 780619695 3735 3740 765 | 780619695 3933 3941 766 | 780619695 4123 4129 767 | 780619695 4212 4229 768 | 780619695 4298 4309 769 | 780619695 4504 4513 770 | 780619695 4696 4736 771 | 780619695 4856 4874 772 | 780619695 5547 5553 773 | 780619695 5985 6015 774 | 780619695 6232 6246 775 | 780619695 6301 6314 776 | 781577820 14 25 777 | 781577820 163 190 778 | 781577820 215 246 779 | 781577820 274 302 780 | 781577820 
470 494 781 | 781577820 554 561 782 | 781577820 589 661 783 | 781577820 731 741 784 | 781577820 968 1028 785 | 781577820 1031 1218 786 | 781577820 1476 1608 787 | 781577820 1642 1679 788 | 786527921 1 16 789 | 786527921 259 300 790 | 786527921 729 736 791 | 786527921 827 898 792 | 786527921 1281 1329 793 | 786527921 1465 1480 794 | 786527921 1881 1904 795 | 786527921 1955 2170 796 | 786527921 2176 2238 797 | 786527921 2415 2440 798 | 786527921 2753 2765 799 | 786527921 3033 3065 800 | 786527921 5134 5376 801 | 786527921 5622 5643 802 | 786527921 5787 5810 803 | 786527921 6132 6149 804 | 786527921 6683 6695 805 | 786527921 6880 6908 806 | 786527921 7032 7051 807 | 786527921 7482 7583 808 | 786527921 7598 7633 809 | 786527921 7747 7771 810 | 786527921 8099 8114 811 | 786527921 8403 8449 812 | 786527921 8548 8577 813 | 786527921 8838 8856 814 | 786527921 8903 8923 815 | 786527921 9063 9078 816 | 786527921 9859 9869 817 | 786527921 9942 10004 818 | 786527921 10254 10267 819 | 786527921 10496 10512 820 | 786527921 10665 10684 821 | 786527921 10713 10737 822 | 786527921 10920 10973 823 | 786527921 11260 11284 824 | 786527921 11294 11311 825 | 786527921 11506 11530 826 | 786527921 12164 12267 827 | 786527921 12385 12404 828 | 786527921 12856 12878 829 | 786527921 12974 13022 830 | 786527921 13029 13062 831 | 786527921 13191 13235 832 | 786527921 13589 13733 833 | 786527921 13931 13982 834 | 786527921 14062 14097 835 | 786527921 14149 14328 836 | 786527921 14488 14500 837 | 786527921 14505 14550 838 | 786527921 14918 14942 839 | 786527921 15077 15112 840 | 786527921 16017 16053 841 | 786527921 16104 16121 842 | 786527921 16130 16177 843 | 786527921 16221 16253 844 | 786527921 16530 16544 845 | 786527921 16804 16827 846 | 786527921 16846 16868 847 | 786527921 16919 16949 848 | 787142429 1106 1123 849 | 787142429 1150 1204 850 | 787142429 2684 2815 851 | 787529309 16 39 852 | 787529309 70 87 853 | 787529309 288 319 854 | 787529309 323 350 855 | 787529309 921 960 856 | 
787529309 2338 2356 857 | 787529309 2405 2430 858 | 787529309 2694 2708 859 | 787529309 3158 3173 860 | 787529309 5159 5195 861 | 787529309 5394 5414 862 | 787529309 6056 6347 863 | 787529309 6678 6784 864 | 787529309 7626 7677 865 | 787529309 7814 7853 866 | 787529309 8200 8310 867 | 787529309 8337 8365 868 | 787529309 8567 8585 869 | 787529309 8686 8710 870 | 787529309 9173 9282 871 | 787529309 9303 9384 872 | 787529309 9661 9697 873 | 787759779 331 343 874 | 787759779 495 507 875 | 787759779 590 623 876 | 787759779 670 679 877 | 787759779 681 696 878 | 787759779 1011 1032 879 | 788900262 0 91 880 | 788900262 93 269 881 | 788900262 1484 1576 882 | 788900262 1722 1749 883 | 788900262 1817 1830 884 | 788900262 1858 1891 885 | 788900262 2218 2270 886 | 788900262 2272 2383 887 | 788900262 2401 2428 888 | 788900262 2696 2711 889 | 788900262 4190 4224 890 | 788900262 4478 4561 891 | 788900262 4877 5017 892 | 788900262 5064 5083 893 | 788900262 5112 5137 894 | 788900262 5263 5313 895 | 788900262 6020 6109 896 | 788900262 6331 6408 897 | 788900262 6490 6583 898 | 788900262 6601 6675 899 | 789370909 53 78 900 | 789370909 459 491 901 | 789370909 1894 1919 902 | 789370909 2009 2030 903 | 789370909 2148 2171 904 | 789370909 3212 3231 905 | 789370909 4056 4071 906 | 789370909 4416 4439 907 | 789370909 4736 4756 908 | 789370909 5762 5843 909 | 789370909 5939 6056 910 | 789370909 6861 6881 911 | 789370909 8063 8083 912 | 789370909 8517 8597 913 | 789370909 8727 8772 914 | 789370909 8829 8884 915 | 789370909 8985 9015 916 | 789370909 9134 9210 917 | 789370909 9255 9271 918 | 789370909 9306 9433 919 | 789370909 9840 9870 920 | 789370909 10444 10485 921 | 789370909 10867 10940 922 | 789370909 11058 11070 923 | 789370909 11625 11705 924 | 789370909 11722 11746 925 | 789370909 11837 11843 926 | 789370909 11848 11871 927 | 795703371 647 654 928 | 795703371 813 829 929 | 795703371 1202 1264 930 | 795703371 1268 1271 931 | 795703371 1774 1811 932 | 795703371 2464 2475 933 | 795703371 
2479 2490 934 | 795703371 2572 2575 935 | 795703371 2889 2951 936 | 795703371 3759 3901 937 | 795703371 3904 4050 938 | 795703371 4103 4111 939 | 795703371 4126 4135 940 | 795703371 4268 4327 941 | 795703371 4398 4442 942 | 999000149 113 127 943 | 999000149 243 295 944 | 999000149 300 326 945 | 999000149 813 830 946 | 999000149 1303 1374 947 | 999000149 1602 1689 948 | 999000149 2952 2969 949 | 999000149 2974 2987 950 | 999000149 2989 3077 951 | 999000149 3119 3140 952 | 999000159 422 433 953 | 999000159 499 510 954 | 999000159 949 966 955 | 999000159 1287 1303 956 | 999000159 1701 1735 957 | 999000159 1905 1993 958 | 999000565 25 50 959 | 999000565 127 177 960 | 999000565 276 296 961 | 999000565 344 378 962 | 999000565 419 449 963 | 999000565 575 607 964 | 999000894 74 82 965 | 999000894 148 151 966 | 999000894 411 436 967 | 999000894 501 512 968 | 999000894 1393 1409 969 | 999000894 3391 3409 970 | 999000894 4245 4276 971 | 999000894 4927 4940 972 | 999000894 5185 5200 973 | 999000894 5314 5317 974 | 999001033 38 55 975 | 999001033 134 151 976 | 999001033 400 406 977 | 999001033 687 739 978 | 999001033 826 841 979 | 999001033 875 926 980 | 999001033 941 968 981 | 999001033 1160 1225 982 | 999001033 1245 1274 983 | 999001033 1417 1426 984 | 999001033 1445 1466 985 | 999001033 1468 1488 986 | 999001033 1519 1551 987 | 999001033 1564 1575 988 | 999001033 1879 1901 989 | 999001033 1935 1957 990 | 999001033 1985 2025 991 | 999001033 2382 2408 992 | 999001033 2532 2563 993 | 999001033 2595 2603 994 | 999001033 3206 3219 995 | 999001033 3224 3245 996 | 999001033 3362 3422 997 | 999001033 3560 3581 998 | 999001033 3595 3607 999 | 999001033 3713 3724 1000 | 999001297 773 789 1001 | 999001297 1496 1515 1002 | 999001297 2633 2670 1003 | 999001297 2676 2715 1004 | 999001297 2883 2895 1005 | 999001621 0 68 1006 | 999001621 88 156 1007 | 999001621 325 338 1008 | 999001621 382 401 1009 | 999001621 467 473 1010 | 999001621 612 621 1011 | 999001621 769 783 1012 | 999001621 1628 
1655 1013 | 999001621 3471 3495 1014 | 999001621 3976 4062 1015 | 999001621 4085 4137 1016 | 999001621 4208 4225 1017 | 999001621 4485 4527 1018 | 999001621 5047 5061 1019 | 999001621 5088 5132 1020 | 999001621 5159 5170 1021 | 999001621 5890 5902 1022 | 999001621 7085 7091 1023 | 999001621 7988 7999 1024 | 999001621 8473 8481 1025 | 999001621 8493 8500 1026 | 999001621 8550 8570 1027 | 999001621 8600 8611 1028 | 999001621 8627 8694 1029 | 999001621 8892 8897 1030 | 999001621 9108 9121 1031 | 999001621 9801 9859 1032 | 999001621 10035 10062 1033 | 999001621 10151 10169 1034 | 999001621 10180 10185 1035 | 999001621 10536 10541 1036 | 999001621 10779 10800 1037 | 999001621 10931 11012 1038 | 999001621 11213 11537 1039 | 999001621 12106 12112 1040 | 999001621 12143 12184 1041 | -------------------------------------------------------------------------------- /span_identification/ner/run_ner.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). 
""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import argparse 21 | import glob 22 | import logging 23 | import os 24 | import random 25 | 26 | from unidecode import unidecode 27 | 28 | import pickle 29 | import numpy as np 30 | import torch 31 | from seqeval.metrics import precision_score, recall_score, f1_score 32 | from sklearn_crfsuite import metrics 33 | from tensorboardX import SummaryWriter 34 | from torch.nn import CrossEntropyLoss 35 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset 36 | from torch.utils.data.distributed import DistributedSampler 37 | from tqdm import tqdm, trange 38 | from .utils_ner import convert_examples_to_features, get_labels, read_examples_from_file 39 | 40 | from transformers import AdamW, get_linear_schedule_with_warmup 41 | from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer 42 | from transformers import RobertaConfig, RobertaTokenizer 43 | from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer 44 | from transformers import XLNetConfig, XLNetForTokenClassification, XLNetTokenizer 45 | from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer 46 | from scipy.special import softmax 47 | 48 | from .modeling_roberta import RobertaForTokenClassification 49 | 50 | logger = logging.getLogger(__name__) 51 | 52 | ALL_MODELS = sum( 53 | (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), 54 | ()) 55 | 56 | MODEL_CLASSES = { 57 | "bert": (BertConfig, BertForTokenClassification, BertTokenizer), 58 | "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), 59 | "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer), 60 | "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer), 61 | "xlnet": (XLNetConfig, 
XLNetForTokenClassification, XLNetTokenizer) 62 | } 63 | 64 | 65 | def set_seed(args): 66 | random.seed(args.seed) 67 | np.random.seed(args.seed) 68 | torch.manual_seed(args.seed) 69 | if args.n_gpu > 0: 70 | torch.cuda.manual_seed_all(args.seed) 71 | 72 | 73 | def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): 74 | """ Train the model """ 75 | if args.local_rank in [-1, 0]: 76 | tb_writer = SummaryWriter() 77 | 78 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 79 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) 80 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 81 | 82 | if args.max_steps > 0: 83 | t_total = args.max_steps 84 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 85 | else: 86 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs 87 | 88 | # Prepare optimizer and schedule (linear warmup and decay) 89 | no_decay = ["bias", "LayerNorm.weight"] 90 | optimizer_grouped_parameters = [ 91 | {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 92 | "weight_decay": args.weight_decay}, 93 | {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} 94 | ] 95 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 96 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) 97 | if args.fp16: 98 | try: 99 | from apex import amp 100 | except ImportError: 101 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 102 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 103 | 104 | # multi-gpu training (should be after apex fp16 
initialization) 105 | if args.n_gpu > 1: 106 | model = torch.nn.DataParallel(model) 107 | 108 | # Distributed training (should be after apex fp16 initialization) 109 | if args.local_rank != -1: 110 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], 111 | output_device=args.local_rank, 112 | find_unused_parameters=True) 113 | 114 | # Train! 115 | logger.info("***** Running training *****") 116 | logger.info(" Num examples = %d", len(train_dataset)) 117 | logger.info(" Num Epochs = %d", args.num_train_epochs) 118 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 119 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 120 | args.train_batch_size * args.gradient_accumulation_steps * ( 121 | torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 122 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 123 | logger.info(" Total optimization steps = %d", t_total) 124 | 125 | global_step = 0 126 | tr_loss, logging_loss = 0.0, 0.0 127 | model.zero_grad() 128 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) 129 | set_seed(args) # Added here for reproductibility (even between python 2 and 3) 130 | for _ in train_iterator: 131 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0], position=0, leave=True) 132 | for step, batch in enumerate(epoch_iterator): 133 | model.train() 134 | batch = tuple(t.to(args.device) for t in batch) 135 | inputs = {"input_ids": batch[0], 136 | "attention_mask": batch[1], 137 | "labels": batch[3]} 138 | if args.model_type != "distilbert": 139 | inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids 140 | if args.use_quotes: 141 | inputs['quotes'] = batch[4] 142 | outputs = model(**inputs) 143 | loss = outputs[0] # model 
outputs are always tuple in pytorch-transformers (see doc) 144 | 145 | if args.n_gpu > 1: 146 | loss = loss.mean() # mean() to average on multi-gpu parallel training 147 | if args.gradient_accumulation_steps > 1: 148 | loss = loss / args.gradient_accumulation_steps 149 | 150 | if args.fp16: 151 | with amp.scale_loss(loss, optimizer) as scaled_loss: 152 | scaled_loss.backward() 153 | else: 154 | loss.backward() 155 | 156 | tr_loss += loss.item() 157 | if (step + 1) % args.gradient_accumulation_steps == 0: 158 | if args.fp16: 159 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 160 | else: 161 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 162 | 163 | scheduler.step() # Update learning rate schedule 164 | optimizer.step() 165 | model.zero_grad() 166 | global_step += 1 167 | 168 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 169 | # Log metrics 170 | if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well 171 | results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") 172 | for key, value in results.items(): 173 | tb_writer.add_scalar("eval_{}".format(key), value, global_step) 174 | tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) 175 | tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) 176 | logging_loss = tr_loss 177 | 178 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: 179 | # Save model checkpoint 180 | output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) 181 | if not os.path.exists(output_dir): 182 | os.makedirs(output_dir) 183 | model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training 184 | model_to_save.save_pretrained(output_dir) 185 | torch.save(args, 
os.path.join(output_dir, "training_args.bin")) 186 | logger.info("Saving model checkpoint to %s", output_dir) 187 | 188 | if args.max_steps > 0 and global_step > args.max_steps: 189 | epoch_iterator.close() 190 | break 191 | if args.max_steps > 0 and global_step > args.max_steps: 192 | train_iterator.close() 193 | break 194 | 195 | if args.local_rank in [-1, 0]: 196 | tb_writer.close() 197 | 198 | return global_step, tr_loss / global_step 199 | 200 | 201 | def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""): 202 | eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode) 203 | 204 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 205 | # Note that DistributedSampler samples randomly 206 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) 207 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) 208 | 209 | # multi-gpu evaluate 210 | if args.n_gpu > 1: 211 | model = torch.nn.DataParallel(model) 212 | 213 | # Eval! 
214 | logger.info("***** Running evaluation %s *****", prefix) 215 | logger.info(" Num examples = %d", len(eval_dataset)) 216 | logger.info(" Batch size = %d", args.eval_batch_size) 217 | eval_loss = 0.0 218 | nb_eval_steps = 0 219 | preds = None 220 | out_label_ids = None 221 | model.eval() 222 | for batch in tqdm(eval_dataloader, desc="Evaluating"): 223 | batch = tuple(t.to(args.device) for t in batch) 224 | 225 | with torch.no_grad(): 226 | inputs = {"input_ids": batch[0], 227 | "attention_mask": batch[1], 228 | "labels": batch[3]} 229 | if args.model_type != "distilbert": 230 | inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids 231 | if args.use_quotes: 232 | inputs['quotes'] = batch[4] 233 | outputs = model(**inputs) 234 | tmp_eval_loss, logits = outputs[:2] 235 | 236 | if args.n_gpu > 1: 237 | tmp_eval_loss = tmp_eval_loss.mean() # mean() to average on multi-gpu parallel evaluating 238 | 239 | eval_loss += tmp_eval_loss.item() 240 | nb_eval_steps += 1 241 | if preds is None: 242 | preds = logits.detach().cpu().numpy() 243 | out_label_ids = inputs["labels"].detach().cpu().numpy() 244 | else: 245 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) 246 | out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) 247 | 248 | eval_loss = eval_loss / nb_eval_steps 249 | preds_logits = softmax(preds, axis=2) 250 | preds = np.argmax(preds, axis=2) 251 | 252 | label_map = {i: label for i, label in enumerate(labels)} 253 | 254 | out_label_list = [[] for _ in range(out_label_ids.shape[0])] 255 | preds_list = [[] for _ in range(out_label_ids.shape[0])] 256 | 257 | for i in range(out_label_ids.shape[0]): 258 | for j in range(out_label_ids.shape[1]): 259 | if out_label_ids[i, j] != pad_token_label_id: 260 | out_label_list[i].append(label_map[out_label_ids[i][j]]) 261 | if np.max(preds_logits[i][j]) > 0: 262 | preds_list[i].append(label_map[preds[i][j]]) 
263 | else: 264 | preds_list[i].append('O') 265 | 266 | results = { 267 | "loss": eval_loss, 268 | "precision": precision_score(out_label_list, preds_list), 269 | "recall": recall_score(out_label_list, preds_list), 270 | "f1": f1_score(out_label_list, preds_list), 271 | "flat_f1": metrics.flat_f1_score(out_label_list, preds_list, average='micro', labels=["B-PROP", "I-PROP"]) 272 | } 273 | 274 | logger.info("***** Eval results %s *****", prefix) 275 | for key in sorted(results.keys()): 276 | logger.info(" %s = %s", key, str(results[key])) 277 | 278 | return results, preds_list 279 | 280 | 281 | def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): 282 | if args.local_rank not in [-1, 0] and not evaluate: 283 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 284 | 285 | # Load data features from cache or dataset file 286 | cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode, 287 | list(filter(None, args.model_name_or_path.split("/"))).pop(), 288 | str(args.max_seq_length))) 289 | if False and os.path.exists(cached_features_file) and not args.overwrite_cache: 290 | logger.info("Loading features from cached file %s", cached_features_file) 291 | features = torch.load(cached_features_file) 292 | else: 293 | logger.info("Creating features from dataset file at %s", args.data_dir) 294 | files = {'train': args.train_file, 'dev': args.dev_file, 'test': args.test_file} 295 | examples = read_examples_from_file(os.path.join(args.data_dir, files[mode]), mode) 296 | features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer, 297 | cls_token_at_end=bool(args.model_type in ["xlnet"]), 298 | # xlnet has a cls token at the end 299 | cls_token=tokenizer.cls_token, 300 | cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, 301 | sep_token=tokenizer.sep_token, 302 | sep_token_extra=bool(args.model_type 
in ["roberta"]), 303 | # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 304 | pad_on_left=bool(args.model_type in ["xlnet"]), 305 | # pad on the left for xlnet 306 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], 307 | pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, 308 | pad_token_label_id=pad_token_label_id 309 | ) 310 | if args.use_quotes: 311 | assert len(features) == len(examples) 312 | for i in range(len(features)): 313 | tokens = [] 314 | for word in examples[i].words: 315 | word_tokens = tokenizer.tokenize(word) 316 | tokens.extend(word_tokens) 317 | tokens = ['cls_token'] + tokens 318 | quotes = np.zeros(args.max_seq_length, dtype=np.float32) 319 | for j in range(1, min(len(tokens), args.max_seq_length)): 320 | if unidecode(tokens[j]) == '"': 321 | quotes[j] = 1 322 | features[i].quotes = quotes[:, None] 323 | 324 | if args.local_rank in [-1, 0]: 325 | logger.info("Saving features into cached file %s", cached_features_file) 326 | torch.save(features, cached_features_file) 327 | 328 | if args.local_rank == 0 and not evaluate: 329 | torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache 330 | 331 | # Convert to Tensors and build dataset 332 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) 333 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) 334 | all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) 335 | all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long) 336 | if args.use_quotes: 337 | all_quotes = torch.tensor([f.quotes for f in features], dtype=torch.long) 338 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_quotes) 339 | else: 340 | dataset = TensorDataset(all_input_ids, 
all_input_mask, all_segment_ids, all_label_ids) 341 | return dataset 342 | 343 | 344 | def transformers_ner(args): 345 | if os.path.exists(args.output_dir) and os.listdir( 346 | args.output_dir) and args.do_train and not args.overwrite_output_dir: 347 | raise ValueError( 348 | "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( 349 | args.output_dir)) 350 | 351 | # Setup distant debugging if needed 352 | if args.server_ip and args.server_port: 353 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script 354 | import ptvsd 355 | print("Waiting for debugger attach") 356 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) 357 | ptvsd.wait_for_attach() 358 | 359 | # Setup CUDA, GPU & distributed training 360 | if args.local_rank == -1 or args.no_cuda: 361 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 362 | args.n_gpu = torch.cuda.device_count() 363 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 364 | torch.cuda.set_device(args.local_rank) 365 | device = torch.device("cuda", args.local_rank) 366 | torch.distributed.init_process_group(backend="nccl") 367 | args.n_gpu = 1 368 | args.device = device 369 | 370 | # Setup logging 371 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 372 | datefmt="%m/%d/%Y %H:%M:%S", 373 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 374 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 375 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 376 | 377 | # Set seed 378 | set_seed(args) 379 | 380 | # Prepare CONLL-2003 task 381 | labels = get_labels(args.labels) 382 | num_labels = len(labels) 383 | # Use cross entropy ignore index as padding label id so that only real label 
ids contribute to the loss later 384 | pad_token_label_id = CrossEntropyLoss().ignore_index 385 | 386 | # Load pretrained model and tokenizer 387 | if args.local_rank not in [-1, 0]: 388 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 389 | 390 | args.model_type = args.model_type.lower() 391 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 392 | 393 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 394 | num_labels=num_labels, 395 | cache_dir=args.cache_dir if args.cache_dir else None) 396 | config.use_quotes = args.use_quotes 397 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 398 | do_lower_case=args.do_lower_case, 399 | cache_dir=args.cache_dir if args.cache_dir else None) 400 | model = model_class.from_pretrained(args.model_name_or_path, 401 | from_tf=bool(".ckpt" in args.model_name_or_path), 402 | config=config, 403 | cache_dir=args.cache_dir if args.cache_dir else None) 404 | 405 | if args.local_rank == 0: 406 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab 407 | 408 | model.to(args.device) 409 | 410 | logger.info("Training/evaluation parameters %s", args) 411 | 412 | # Training 413 | if args.do_train: 414 | train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") 415 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) 416 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) 417 | 418 | # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() 419 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): 420 | # Create output directory if needed 421 | if not 
os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: 422 | os.makedirs(args.output_dir) 423 | 424 | logger.info("Saving model checkpoint to %s", args.output_dir) 425 | # Save a trained model, configuration and tokenizer using `save_pretrained()`. 426 | # They can then be reloaded using `from_pretrained()` 427 | model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training 428 | #model_to_save.save_pretrained(args.output_dir) 429 | model_save_path_ = os.path.join(args.output_dir, "pytorch_model.bin") 430 | torch.save(model_to_save.state_dict(), model_save_path_) 431 | tokenizer.save_pretrained(args.output_dir) 432 | 433 | # Good practice: save your training arguments together with the trained model 434 | torch.save(args, os.path.join(args.output_dir, "training_args.bin")) 435 | 436 | # Evaluation 437 | results = {} 438 | if args.do_eval and args.local_rank in [-1, 0]: 439 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 440 | checkpoints = [args.output_dir] 441 | if args.eval_all_checkpoints: 442 | checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) 443 | logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging 444 | logger.info("Evaluate the following checkpoints: %s", checkpoints) 445 | for checkpoint in checkpoints: 446 | global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" 447 | 448 | model = model_class.from_pretrained(checkpoint) 449 | model.to(args.device) 450 | result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) 451 | if global_step: 452 | result = {"{}_{}".format(global_step, k): v for k, v in result.items()} 453 | results.update(result) 454 | output_eval_file = os.path.join(args.output_dir, "eval_results.txt") 455 | with open(output_eval_file, "w") as writer: 456 | for 
key in sorted(results.keys()): 457 | writer.write("{} = {}\n".format(key, str(results[key]))) 458 | 459 | if args.do_predict and args.local_rank in [-1, 0]: 460 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) 461 | checkpoints = [args.output_dir] 462 | if args.eval_all_checkpoints: 463 | checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) 464 | logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging 465 | logger.info("Evaluate the following checkpoints: %s", checkpoints) 466 | for checkpoint in checkpoints: 467 | global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" 468 | 469 | model = model_class.from_pretrained(checkpoint) 470 | model.to(args.device) 471 | result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") 472 | if global_step: 473 | result = {"{}_{}".format(global_step, k): v for k, v in result.items()} 474 | results.update(result) 475 | # Save results 476 | output_test_results_file = os.path.join(checkpoint, "test_results.txt") 477 | with open(output_test_results_file, "w") as writer: 478 | for key in sorted(result.keys()): 479 | writer.write("{} = {}\n".format(key, str(result[key]))) 480 | # Save predictions 481 | output_test_predictions_file = os.path.join(checkpoint, "test_predictions.txt") 482 | with open(output_test_predictions_file, "w") as writer: 483 | with open(os.path.join(args.data_dir, args.test_file), "r") as f: 484 | example_id = 0 485 | for line in f: 486 | if line.startswith("-DOCSTART-") or line == "" or line == "\n": 487 | writer.write(line) 488 | if not predictions[example_id]: 489 | example_id += 1 490 | elif predictions[example_id]: 491 | output_line = line.split('\t')[0] + "\t" + predictions[example_id].pop(0) + "\n" 492 | writer.write(output_line) 493 | else: 494 | logger.warning("Maximum sequence length exceeded: No 
prediction for '%s'.", line.split()[0]) 495 | 496 | return results 497 | --------------------------------------------------------------------------------