├── span_identification
│   ├── __init__.py
│   ├── ner
│   │   ├── __init__.py
│   │   ├── bert_lstm_crf.py
│   │   ├── utils_ner.py
│   │   └── run_ner.py
│   ├── submission.py
│   ├── dataset.py
│   └── __main__.py
├── .gitattributes
├── technique_classification
│   ├── __init__.py
│   ├── transformers_classifier
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── dataset.py
│   ├── submission.py
│   └── __main__.py
├── tools
│   ├── data
│   │   ├── submission-task-SI.tsv
│   │   ├── article736757214.task-SI.labels
│   │   ├── submission-task-TC.tsv
│   │   ├── article736757214.labels-task-TC
│   │   ├── propaganda-techniques-names-semeval2020task11.txt
│   │   ├── propaganda-techniques-names.txt
│   │   └── article736757214.txt
│   ├── src
│   │   ├── annotation_task_si.py
│   │   ├── propaganda_techniques.py
│   │   ├── annotation.py
│   │   ├── annotation_w_o_label.py
│   │   └── annotations.py
│   ├── print_spans.py
│   ├── task-TC_scorer.py
│   └── README.md
├── visualization_example
│   └── visualization
│       ├── highlight.js
│       ├── __init__.py
│       ├── html_template.py
│       └── highlight.css
├── requirements.txt
├── configs
│   ├── si_config.yml
│   └── tc_config.yml
├── .gitignore
├── README.md
└── results
    └── SI_output.txt
/span_identification/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py linguist-detectable=true
2 | *.ipynb linguist-detectable=false
3 |
--------------------------------------------------------------------------------
/technique_classification/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformers_classifier import transformers_clf
--------------------------------------------------------------------------------
/tools/data/submission-task-SI.tsv:
--------------------------------------------------------------------------------
1 | 736757214 0 50
2 | 736757214 161 172
3 | 736757214 0 10
4 | 736757214 115 167
5 |
--------------------------------------------------------------------------------
/tools/data/article736757214.task-SI.labels:
--------------------------------------------------------------------------------
1 | 736757214 0 59
2 | 736757214 171 181
3 | 736757214 0 9
4 | 736757214 115 167
5 | 736757214 740 759
6 |
--------------------------------------------------------------------------------
/tools/data/submission-task-TC.tsv:
--------------------------------------------------------------------------------
1 | 736757214 Exaggeration,Minimisation 0 59
2 | 736757214 Doubt 171 181
3 | 736757214 Name_Calling,Labeling 0 9
4 | 736757214 Name_Calling,Labeling 115 167
5 | 736757214 Loaded_Language 740 759
6 |
--------------------------------------------------------------------------------
/technique_classification/transformers_classifier/__init__.py:
--------------------------------------------------------------------------------
1 | from .run_glue import transformers_clf
2 | from .modeling_roberta import RobertaForSequenceClassification
3 | from .utils import glue_processors, glue_output_modes, glue_compute_metrics
4 |
--------------------------------------------------------------------------------
/tools/data/article736757214.labels-task-TC:
--------------------------------------------------------------------------------
1 | 736757214 Exaggeration,Minimisation 0 59
2 | 736757214 Whataboutism,Straw_Men,Red_Herring 171 181
3 | 736757214 Name_Calling,Labeling 0 9
4 | 736757214 Loaded_Language 115 167
5 | 736757214 Loaded_Language 740 759
6 |
--------------------------------------------------------------------------------
/visualization_example/visualization/highlight.js:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.2.0
2 | transformers==2.3.0
3 | scipy==1.4.1
4 | numpy==1.16.4
5 | joblib==0.13.2
6 | nltk==3.4.5
7 | ConfigArgParse==1.0
8 | sklearn_crfsuite==0.3.6
9 | apex==0.1
10 | seqeval==0.0.5
11 | spacy==2.2.3
12 | Unidecode==1.1.1
13 | tqdm==4.43.0
14 | pandas==1.0.1
15 | ipython==7.13.0
16 | ptvsd==4.3.2
17 | scikit_learn==0.22.2.post1
18 | tensorboardX==2.0
19 |
--------------------------------------------------------------------------------
/span_identification/ner/__init__.py:
--------------------------------------------------------------------------------
1 | from .run_ner import transformers_ner
2 | from .modeling_roberta import RobertaForTokenClassification
3 | from .utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
4 | from .run_ner_crf import transformers_ner_crf
5 | from .bert_lstm_crf import BertLstmCrf
6 | from .conditional_random_field import ConditionalRandomField, allowed_transitions
7 |
--------------------------------------------------------------------------------
/tools/data/propaganda-techniques-names-semeval2020task11.txt:
--------------------------------------------------------------------------------
1 | Appeal_to_Authority
2 | Appeal_to_fear-prejudice
3 | Bandwagon,Reductio_ad_hitlerum
4 | Black-and-White_Fallacy
5 | Causal_Oversimplification
6 | Doubt
7 | Exaggeration,Minimisation
8 | Flag-Waving
9 | Loaded_Language
10 | Name_Calling,Labeling
11 | Repetition
12 | Slogans
13 | Thought-terminating_Cliches
14 | Whataboutism,Straw_Men,Red_Herring
15 |
--------------------------------------------------------------------------------
/tools/data/propaganda-techniques-names.txt:
--------------------------------------------------------------------------------
1 | Appeal_to_Authority
2 | Appeal_to_fear-prejudice
3 | Bandwagon
4 | Black-and-White_Fallacy
5 | Causal_Oversimplification
6 | Doubt
7 | Exaggeration,Minimisation
8 | Flag-Waving
9 | Loaded_Language
10 | Name_Calling,Labeling
11 | Obfuscation,Intentional_Vagueness,Confusion
12 | Red_Herring
13 | Reductio_ad_hitlerum
14 | Repetition
15 | Slogans
16 | Straw_Men
17 | Thought-terminating_Cliches
18 | Whataboutism
19 |
--------------------------------------------------------------------------------
/tools/src/annotation_task_si.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import src.annotation as an
3 |
4 | class AnnotationTaskSI(an.Annotation):
5 |
6 | def __init__(self, label=None, start_offset = None, end_offset=None): #, article_id=None):
7 |
8 | self.label = label
9 | self.start_offset = int(start_offset)
10 | self.end_offset = int(end_offset)
11 |
12 |
13 | def get_label(self):
14 |
15 |         sys.exit("ERROR: trying to access a technique label from a file in SI task format")
16 |
17 |
18 |
--------------------------------------------------------------------------------
/tools/src/propaganda_techniques.py:
--------------------------------------------------------------------------------
1 |
2 | class Propaganda_Techniques():
3 |
4 |
5 | TECHNIQUE_NAMES_FILE="data/propaganda-techniques-names.txt"
6 |
7 | def __init__(self, filename=TECHNIQUE_NAMES_FILE):
8 |
9 | with open(filename, "r") as f:
10 | self.techniques = [ line.rstrip() for line in f.readlines() ]
11 |
12 |
13 | def get_propaganda_techniques_list(self)->list:
14 |
15 | return self.techniques
16 |
17 |
18 | def get_propaganda_techniques_list_sorted(self)->list:
19 |
20 | return sorted(self.techniques)
21 |
22 |
23 | def is_valid_technique(self, technique_name):
24 |
25 | return technique_name in self.techniques
26 |
27 |
28 | def __str__(self):
29 |
30 | return "\n".join(self.techniques)
31 |
32 |
33 | def __getitem__(self, index):
34 | return self.techniques[index]
35 |
36 |
37 | def get_technique(self, index):
38 | return self.techniques[index]
39 |
40 |
41 | def indexOf(self, technique_name):
42 | return self.techniques.index(technique_name)
43 |
--------------------------------------------------------------------------------
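A minimal usage sketch for the class above (hypothetical snippet, not a file in the repository; assumes it is run from the tools/ directory so the relative data path resolves):

import src.propaganda_techniques as pt

techniques = pt.Propaganda_Techniques("data/propaganda-techniques-names.txt")
print(techniques.is_valid_technique("Loaded_Language"))   # True
print(techniques.indexOf("Doubt"))                        # position of "Doubt" in the file
print(techniques[0])                                      # first technique name
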
/configs/si_config.yml:
--------------------------------------------------------------------------------
1 | # --------------- dataset params ---------------
2 |
3 | train_data_folder: datasets/train-articles
4 | test_data_folder: datasets/dev-articles
5 | labels_path: datasets/train-task1-SI.labels
6 | gold_annot_file: results/dev-task-SI.labels
7 | propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt
8 | data_dir: cached_datasets/SI/
9 | train_file: train.tsv
10 | dev_file: dev.tsv
11 | test_file: test.tsv
12 | split_by_ids: True
13 | dev_size: 0.18
14 | overwrite_cache: False
15 |
16 |
17 | # ---------------- model params ----------------
18 |
19 | use_crf: True
20 | output_file: SI_output_dev.txt
21 | predicted_labels_files: [model_checkpoints/si_roberta_crf/test_predictions.txt]
22 |
23 |
24 | # ------------- transformers params ------------
25 |
26 | model_type: roberta
27 | config_name: roberta-large
28 | model_name_or_path: model_checkpoints/ner_roberta_large_uncased_crf_7700
29 | max_seq_length: 256
30 | per_gpu_train_batch_size: 8
31 | per_gpu_eval_batch_size: 1
32 | learning_rate: 2e-5
33 | save_steps: 700
34 | warmup_steps: 500
35 | num_train_epochs: 27
36 | output_dir: model_checkpoints/ner_roberta_large_uncased_crf_7700/
37 | do_lower_case: True
38 |
--------------------------------------------------------------------------------
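A sketch of how a config like the one above can be consumed. This is assumed wiring (the repo's __main__.py entry points are not part of this dump), but ConfigArgParse is pinned in requirements.txt and reads YAML configs like this one:

import configargparse

parser = configargparse.ArgParser(
    config_file_parser_class=configargparse.YAMLConfigFileParser,
    default_config_files=["configs/si_config.yml"])
parser.add("--train_data_folder", type=str)
parser.add("--use_crf", action="store_true")
parser.add("--max_seq_length", type=int, default=256)
parser.add("--learning_rate", type=float, default=2e-5)
options, _ = parser.parse_known_args()
print(options.train_data_folder, options.use_crf, options.max_seq_length)
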
/tools/data/article736757214.txt:
--------------------------------------------------------------------------------
1 | Sanctuary City Mayor Protected Illegal Alien Mexican Rapist
2 |
3 | Oakland Mayor Libby Schaaf claims to fight for women.
4 | Except when she's fighting for their rapists instead.
5 | A Democratic mayor’s warning to illegal immigrants of an incoming ICE raid in northern California may have led to a number of illegal immigrants with violent and sex-related convictions evading capture and deportation.
6 | Oakland Mayor Libby Schaaf tweeted out an impending warning of the four-day raid last week, alerting targeted individuals to the imminent arrests, and infuriating Immigrations and Customs Enforcement (ICE) officials, who say that many more could have been caught if they hadn't been warned.
7 | A spokesperson for ICE gave Fox News examples of some of the unsavory characters who evaded officals during the raid.
8 | One Mexican citizen had convictions for unlawful sexual intercourse with a minor and a conviction for driving under the influence (DUI), and had been deported in 2003.
9 | Another who evaded capture had a conviction for sodomizing a drugged victim in 2012, as well as a DUI from this year -- that Mexican citizen had also been previously deported in 2013.
10 | Another illegal immigrant from Mexico, previously deported in 2014 for a conviction for armed robbery, also evaded capture.
11 |
--------------------------------------------------------------------------------
/configs/tc_config.yml:
--------------------------------------------------------------------------------
1 | # --------------- dataset params ---------------
2 |
3 | propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt
4 | train_data_folder: datasets/train-articles
5 | #test_data_folder: datasets/train-articles
6 | test_data_folder: datasets/dev-articles
7 | #test_data_folder: datasets/test/test-articles
8 | labels_path: datasets/train-task2-TC.labels
9 | #test_template_labels_path: results/mydev-task-TC.labelss
10 | test_template_labels_path: datasets/dev-task-TC-template.out
11 | #test_template_labels_path: datasets/test/test-task-TC-template.out
12 | data_dir: cached_datasets/TC/
13 | train_file: train.tsv
14 | dev_file: dev.tsv
15 | #test_file: dev.tsv
16 | #test_file: eval_tc_new.tsv
17 | test_file: test.tsv
18 | split_by_ids: True
19 | dev_size: 0.18
20 | balance: False
21 | shuffle: True
22 | overwrite_cache: False
23 |
24 |
25 | # ---------------- model params ----------------
26 |
27 | output_file: TC_output_dev_sc.txt
28 | #weights: [1, 0]
29 | predicted_logits_files: [model_checkpoints/tc_roberta_joineds/predicted_logits]
30 |
31 |
32 | # ------------- transformers params ------------
33 |
34 | task_name: prop
35 | model_type: roberta
36 | #model_name_or_path: model_checkpoints/tc_roberta_large_cased_transfer_joined
37 | model_name_or_path: model_checkpoints/tc_roberta_large_cased_transfer_3500
38 | max_seq_length: 256
39 | per_gpu_train_batch_size: 8
40 | per_gpu_eval_batch_size: 8
41 | learning_rate: 2e-5
42 | save_steps: 700
43 | warmup_steps: 500
44 | num_train_epochs: 10
45 | #output_dir: model_checkpoints/tc_roberta_large_cased_transfer_joined
46 | output_dir: model_checkpoints/tc_roberta_large_cased_transfer_3500
47 | do_lower_case: False
48 |
--------------------------------------------------------------------------------
/tools/print_spans.py:
--------------------------------------------------------------------------------
1 | __author__ = "Giovanni Da San Martino"
2 | __copyright__ = "Copyright 2019"
3 | __credits__ = ["Giovanni Da San Martino"]
4 | __license__ = "GPL"
5 | __version__ = "0.1"
6 | __maintainer__ = "Giovanni Da San Martino"
7 | __email__ = "gmartino@hbku.edu.qa"
8 | __status__ = "Beta"
9 |
10 | import codecs
11 | import argparse
12 | import src.annotation as an
13 | import src.article_annotations as aa
14 | import src.propaganda_techniques as pt
15 |
16 |
17 | def main(args):
18 |
19 | span_file = args.spans_file
20 | article_file = args.article_file
21 | propaganda_techniques_list_file = args.propaganda_techniques_list_file
22 |
23 | propaganda_techniques = pt.Propaganda_Techniques(propaganda_techniques_list_file)
24 | annotations = aa.Articles_annotations()
25 | aa.Articles_annotations.techniques = propaganda_techniques
26 |
27 | annotations.load_article_annotations_from_csv_file(span_file)
28 |
29 | with codecs.open(article_file, "r", encoding="utf8") as f:
30 | article_content = f.read()
31 |
32 | #print("\n".join([str(i)+") "+x for i,x in enumerate(str(aa.techniques).split("\n"))]))
33 | #output_text, footnotes = annotations.tag_text_with_annotations(article_content)
34 | output_text, footnotes, legend = annotations.mark_text(article_content)
35 |
36 | print(output_text)
37 | print(footnotes)
38 |
39 |
40 | if __name__ == "__main__":
41 |
42 |     parser = argparse.ArgumentParser(description="Add tags to mark spans in a text file. \n" +
43 |                                      "Example: print_spans.py -s data/article736757214.labels-task-TC -t data/article736757214.txt")
44 | parser.add_argument('-t', '--text-file', dest='article_file', required=True, help="file with text document")
45 | parser.add_argument('-s', '--spans-file', dest='spans_file', required=True,
46 | help="file with spans to be highlighted. One line of the span file")
47 | parser.add_argument('-p', '--propaganda-techniques-list-file', dest='propaganda_techniques_list_file', required=False,
48 | default="data/propaganda-techniques-names.txt",
49 | help="file with list of propaganda techniques (one per line).")
50 |
51 | main(parser.parse_args())
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | # pytype static type analyzer
136 | .pytype/
137 |
138 | # Cython debug symbols
139 | cython_debug/
140 |
141 | # static files generated from Django application using `collectstatic`
142 | media
143 | static
--------------------------------------------------------------------------------
/visualization_example/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | from IPython.core.display import display, HTML
2 | from .html_template import transform_to_tree, span_wrapper
3 |
4 | import pandas as pd
5 | import numpy as np
6 |
7 | def unify_data_format(fn):
8 | def unified_data(data, **kwargs):
9 |         if kwargs.get('stanford', False):
10 |             tokens, clusters = stanford_data_adapter(data)
11 |         elif kwargs.get('allen', False):
12 |             tokens, clusters = allen_data_adapter(data)
13 |         elif kwargs.get('huggingface', False):
14 |             tokens, clusters = huggingface_data_adapter(data)
15 |         elif kwargs.get('proref', False):
16 |             tokens, clusters = labelled_pronoun(data)
17 |
18 | return fn(tokens, clusters, **kwargs)
19 |
20 | return unified_data
21 |
22 | # Either return the html string or render in a Jupyter notebook output
23 | # Function signature based on displacy render functionality
24 |
25 | def render(tokens,
26 | clusters,
27 | style='coref',
28 | stanford=False,
29 | allen=False,
30 | huggingface=False,
31 | proref=False,
32 | jupyter=True,
33 | task=None):
34 |
35 | html = to_html(tokens, clusters, task)
36 |
37 | if jupyter:
38 | display(HTML(html))
39 | else:
40 | return html
41 |
42 | def stanford_data_adapter(data):
43 | sents = []
44 | for sent in data['sentences']:
45 | sents.append([])
46 | for token in sent['tokens']:
47 | sents[-1].append(token['originalText'])
48 |
49 | clusters = []
50 | if data['corefs'] is not None:
51 | for num, mentions in data['corefs'].items():
52 | clusters.append([])
53 | for mention in mentions:
54 | start = np.cumsum([0]+list(map(len, sents)))[mention['sentNum']-1] + mention['startIndex']-1
55 | end = np.cumsum([0]+list(map(len, sents)))[mention['sentNum']-1] + mention['endIndex']-2
56 | clusters[-1].append([start, end])
57 |
58 | return sum(sents, []), clusters
59 |
60 | def allen_data_adapter(data):
61 | return data['document'], data['clusters']
62 |
63 | def huggingface_data_adapter(doc):
64 | tokens = [token.text for token in doc]
65 |
66 | clusters = []
67 | if doc._.coref_clusters is not None:
68 | for cluster in doc._.coref_clusters:
69 | clusters.append([])
70 | for mention in cluster.mentions:
71 | clusters[-1].append([mention.start, mention.end-1])
72 |
73 | return tokens, clusters
74 |
75 | def labelled_pronoun(row):
76 | txt = row.text
77 |
78 | # map char indices to token indices
79 | tokens = txt.split(' ')
80 | start_a = len(txt[:row.a_offset].split(' '))-1
81 | start_b = len(txt[:row.b_offset].split(' '))-1
82 |
83 | clusters = [[[start_a, start_a+len(row.a.split(' '))-1]], [[start_b, start_b+len(row.b.split(' '))-1]]]
84 |
85 | # add pronoun token to the labelled cluster
86 | start_p = len(txt[:row.pronoun_offset].split(' '))-1
87 | if row.a_coref:
88 | clusters[0].append([start_p, start_p+len(row.pronoun.split(' '))-1])
89 | elif row.b_coref:
90 | clusters[1].append([start_p, start_p+len(row.pronoun.split(' '))-1])
91 | else:
92 | clusters.append([[start_p, start_p+len(row.pronoun.split(' '))-1]])
93 |
94 | return tokens, clusters
95 |
96 | def to_html(tokens, clusters, task):
97 | tree = transform_to_tree(tokens, clusters)
98 | html = ''.join(span_wrapper(tree, 0, task))
99 |     # wrapper markup reconstructed: the original <div> attributes were stripped from this dump
100 |     html = '<div>{}</div>'.format(html)
101 |     return html
--------------------------------------------------------------------------------
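A hypothetical notebook usage of the adapters and render() above, with AllenNLP-style input (not repository code; assumes IPython is available, as pinned in requirements.txt):

data = {"document": ["a", "loaded", "phrase", "here"], "clusters": [[[1, 2]]]}
tokens, clusters = allen_data_adapter(data)
render(tokens, clusters, task="SI")   # displays highlighted HTML inline; jupyter=False returns the string
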
/technique_classification/transformers_classifier/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from transformers import DataProcessor, InputExample
3 | from sklearn.metrics import f1_score
4 | from unidecode import unidecode
5 | import string
6 | import random
7 | from autocorrect import Speller
8 |
9 |
10 | def generate_misspelling(phrase, p=0.5):
11 | new_phrase = []
12 | words = phrase.split(' ')
13 | for word in words:
14 | outcome = random.random()
15 | if outcome <= p:
16 | ix = random.choice(range(len(word)))
17 | new_word = ''.join([word[w] if w != ix else random.choice(string.ascii_letters) for w in range(len(word))])
18 | new_phrase.append(new_word)
19 | else:
20 | new_phrase.append(word)
21 | return ' '.join(new_phrase)
22 |
23 |
24 | def simple_accuracy(preds, labels):
25 | return (preds == labels).mean()
26 |
27 |
28 | def acc_and_f1_macro(preds, labels):
29 | acc = simple_accuracy(preds, labels)
30 | f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
31 | return {
32 | "acc": acc,
33 | "f1": f1,
34 | "acc_and_f1": (acc + f1) / 2,
35 | }
36 |
37 |
38 | def glue_compute_metrics(task_name, preds, labels):
39 | assert len(preds) == len(labels)
40 | if task_name == "prop":
41 | return acc_and_f1_macro(preds, labels)
42 | else:
43 | raise KeyError(task_name)
44 |
45 |
46 | class PropProcessor(DataProcessor):
47 | def get_train_examples(self, file_path):
48 | """See base class."""
49 | return self._create_examples(self._read_tsv(file_path), "train")
50 |
51 | def get_dev_examples(self, file_path):
52 | """See base class."""
53 | return self._create_examples(self._read_tsv(file_path), "dev_matched")
54 |
55 | def get_test_examples(self, file_path):
56 | """See base class."""
57 | return self._create_examples(self._read_tsv(file_path), "test")
58 |
59 | def get_labels(self):
60 | """See base class."""
61 | return ['Appeal_to_Authority', 'Doubt', 'Repetition',
62 | 'Appeal_to_fear-prejudice', 'Slogans', 'Black-and-White_Fallacy',
63 | 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling',
64 | 'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification',
65 | 'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
66 | 'Thought-terminating_Cliches']
67 |
68 | def _create_examples(self, lines, set_type):
69 | """Creates examples for the training and dev sets."""
70 | examples = []
71 | spell = Speller(lang='en')
72 | for (i, line) in enumerate(lines):
73 | if i == 0 or line == []:
74 | continue
75 | guid = "%s-%s" % (set_type, i)
76 | text_a = line[3] # generate_misspelling(line[3])
77 | #try:
78 | # text_a = spell(text_a)
79 | #except:
80 | # pass
81 |
82 | text_b = line[4]
83 |
84 | #pos = text_b.find(text_a)
85 | #text_a = text_b[:pos] + " " + text_b[pos:pos + len(text_a)] + " " + text_b[pos + len(text_a):]
86 | #text_b = None
87 |
88 | if len(line) < 6 or line[5] == '?':
89 | label = self.get_labels()[0]
90 | else:
91 | label = line[5]
92 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
93 | return examples
94 |
95 |
96 | glue_tasks_num_labels = {
97 | "prop": 14
98 | }
99 |
100 |
101 | glue_processors = {
102 | "prop": PropProcessor,
103 | }
104 |
105 |
106 | glue_output_modes = {
107 | "prop": "classification"
108 | }
109 |
--------------------------------------------------------------------------------
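The registries at the bottom of the file are consumed glue-style. A sketch of the assumed call sites (run_glue.py itself is not included in this dump):

processor = glue_processors["prop"]()
output_mode = glue_output_modes["prop"]        # "classification"
label_list = processor.get_labels()            # the 14 technique names
examples = processor.get_train_examples("cached_datasets/TC/train.tsv")
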
/tools/task-TC_scorer.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import logging.handlers
4 | from sklearn.metrics import f1_score
5 | from sklearn.metrics import precision_score
6 | from sklearn.metrics import recall_score
7 | import src.annotation as an
8 | import src.annotations as ans
9 | import src.propaganda_techniques as pt
10 |
11 | logger = logging.getLogger("propaganda_scorer")
12 | ch = logging.StreamHandler(sys.stdout)
13 | ch.setLevel(logging.INFO)
14 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
15 | ch.setFormatter(formatter)
16 | logger.setLevel(logging.INFO)
17 |
18 |
19 | def main(args):
20 |
21 | user_submission_file = args.submission
22 | gold_file = args.gold
23 | output_log_file = args.log_file
24 | propaganda_techniques_list_file = args.propaganda_techniques_list_file
25 | output_for_script = bool(args.output_for_script)
26 |
27 | if not output_for_script:
28 | logger.addHandler(ch)
29 |
30 | if args.debug_on_std:
31 | ch.setLevel(logging.DEBUG)
32 |
33 | if output_log_file is not None:
34 | logger.info("Logging execution to file " + output_log_file)
35 | fileLogger = logging.FileHandler(output_log_file)
36 | fileLogger.setLevel(logging.DEBUG)
37 | fileLogger.setFormatter(formatter)
38 | logger.addHandler(fileLogger)
39 |
40 | propaganda_techniques = pt.Propaganda_Techniques(propaganda_techniques_list_file)
41 | an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques)
42 |
43 | user_annotations = ans.Annotations()
44 | user_annotations.load_annotation_list_from_file(user_submission_file)
45 | for article in user_annotations.get_article_id_list():
46 | user_annotations.get_article_annotations_obj(article).sort_spans()
47 |
48 | gold_annotations = ans.Annotations()
49 | gold_annotations.load_annotation_list_from_file(gold_file)
50 | for article in gold_annotations.get_article_id_list():
51 | gold_annotations.get_article_annotations_obj(article).sort_spans()
52 |
53 | logger.info("Checking format: User Predictions -- Gold Annotations")
54 | if not user_annotations.compare_annotations_identical_article_lists(gold_annotations) or not user_annotations.compare_annotations_identical(gold_annotations):
55 | logger.error("wrong format, no scoring will be performed")
56 | sys.exit()
57 | logger.info("OK: submission file format appears to be correct")
58 | res_for_output, res_for_script = user_annotations.TC_score_to_string(gold_annotations, output_for_script)
59 | logger.info("Scoring submission" + res_for_output)
60 | if output_for_script:
61 | print(res_for_script)
62 |
63 |
64 | if __name__ == "__main__":
65 |
66 | parser = argparse.ArgumentParser("Scorer for SemEval 2020 Task 11 subtask TC.\n" +
67 | "Example: python3 task-TC_scorer.py -s data/submission-task-TC.tsv -r data/article736757214.task-FLC.labels -p data/propaganda-techniques-names-semeval2020task11.txt")
68 |
69 | parser.add_argument('-s', '--submission-file', dest='submission', required=True, help="file with the submission of the team")
70 | parser.add_argument('-r', '--reference-file', dest='gold', required=True, help="file with the gold labels.")
71 | parser.add_argument('-d', '--enable-debug-on-standard-output', dest='debug_on_std', required=False,
72 | action='store_true', help="Print debug info also on standard output.")
73 | parser.add_argument('-l', '--log-file', dest='log_file', required=False, help="Output logger file.")
74 | parser.add_argument('-p', '--propaganda-techniques-list-file', dest='propaganda_techniques_list_file', required=True,
75 | help="file with list of propaganda techniques (one per line).")
76 | parser.add_argument('-o', '--output-for-script', dest='output_for_script', required=False, action='store_true',
77 | default=False, help="Prints the output in a format easy to parse for a script")
78 | main(parser.parse_args())
79 |
--------------------------------------------------------------------------------
/visualization_example/visualization/html_template.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | HIGHLIGHT_COLORS = [
4 | "blue",
5 | "green",
6 | "pink",
7 | "orange",
8 | "purple",
9 | "teal",
10 | "tan",
11 | "red",
12 | "cobalt",
13 | "brown",
14 | "slate",
15 | "fuchsia",
16 | "gray",
17 | "blue"
18 | ]
19 |
20 | def get_highlight_color(index):
21 |     if index < len(HIGHLIGHT_COLORS):
22 |         return HIGHLIGHT_COLORS[index]
23 |     else:
24 |         return HIGHLIGHT_COLORS[index % len(HIGHLIGHT_COLORS)]
25 |
26 | # Transforms tokens and clusters into a tree representation
27 | def transform_to_tree(tokens, clusters):
28 | def contains(span, index):
29 | return index >= span[0] and index <= span[1]
30 |
31 | inside_clusters = [{
32 | 'cluster': -1,
33 | 'contents': [],
34 | 'end': -1
35 | }]
36 |
37 | for i, token in enumerate(tokens):
38 | # Find all the new clusters we are entering at the current index
39 | new_clusters = []
40 | for j, cluster in enumerate(clusters):
41 | #Make sure we're not already in this cluster
42 | if j not in [c['cluster'] for c in inside_clusters]:
43 | for span in cluster:
44 |                 if contains(span, i):
45 | new_clusters.append({ 'end': span[1], 'cluster': j })
46 |
47 | # Enter each new cluster, starting with the leftmost
48 | new_clusters = sorted(new_clusters, key=functools.cmp_to_key(lambda a, b: b['end'] - a['end']))
49 | for new_cluster in new_clusters:
50 | #Descend into the new cluster
51 | inside_clusters.append({
52 | 'cluster': new_cluster['cluster'],
53 | 'contents': [],
54 | 'end': new_cluster['end']
55 | })
56 |
57 | #Add the current token into the current cluster
58 | inside_clusters[-1]['contents'].append(token)
59 |
60 | # Exit each cluster we're at the end of
61 | while (len(inside_clusters) > 0 and inside_clusters[-1]['end'] == i):
62 | top_cluster = inside_clusters[-1]
63 | inside_clusters.pop()
64 | inside_clusters[-1]['contents'].append(top_cluster)
65 |
66 | return inside_clusters[0]['contents']
67 |
68 |
69 | mapping = {i: el for i, el in enumerate(['Appeal_to_Authority', 'Doubt', 'Repetition',
70 | 'Appeal_to_fear-prejudice', 'Slogans', 'Black-and-White_Fallacy',
71 | 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling',
72 | 'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification',
73 | 'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
74 | 'Thought-terminating_Cliches'])}
75 |
76 | #This is the function that calls itself when we recurse over the span tree.
77 | def gen_elem(token, idx, depth, task):
78 | if isinstance(token, dict) or isinstance(token, list):
79 | if task == 'TC':
80 | title = mapping[token['cluster']]
81 | elif task == 'SI':
82 | title = 'PROP'
83 | else:
84 | title = token['cluster']
85 |         # span markup reconstructed: the original HTML tags were stripped from this dump,
86 |         # but the six format arguments are unchanged
87 |         return '<span id="{}" class="highlight {} depth-{}" title="{}" data-label="{}">{}</span>'.format(
88 |             idx,
89 |             get_highlight_color(token['cluster']),
90 |             depth,
91 |             title,
92 |             title,
93 |             ' '.join(span_wrapper(token['contents'], depth + 1, task)))
94 |     else:
95 |         return '{} '.format(token)
96 |
97 | # Wraps the tree representation into spans indicating cluster-wise depth
98 | def span_wrapper(tree, depth, task):
99 | return [gen_elem(token, idx, depth, task) for idx, token in enumerate(tree)]
--------------------------------------------------------------------------------
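A small walkthrough of transform_to_tree above: tokens covered by a cluster span get nested under a dict node, everything else stays flat (toy input, not repository code):

tokens = ["a", "loaded", "phrase", "here"]
clusters = [[[1, 2]]]          # one cluster with a single span over tokens 1..2
print(transform_to_tree(tokens, clusters))
# -> ['a', {'cluster': 0, 'contents': ['loaded', 'phrase'], 'end': 2}, 'here']
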
/technique_classification/dataset.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import glob
3 | import os
4 | import numpy as np
5 | import pandas as pd
6 | from nltk.tokenize.punkt import PunktSentenceTokenizer
7 | from sklearn.model_selection import train_test_split
8 |
9 |
10 | def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
11 | file_list = glob.glob(os.path.join(folder_name, file_pattern))
12 | articles = {}
13 | article_id_list, sentence_id_list, sentence_list = ([], [], [])
14 | for filename in sorted(file_list):
15 | article_id = os.path.basename(filename).split(".")[0][7:]
16 | with codecs.open(filename, "r", encoding="utf8") as f:
17 | articles[article_id] = f.read()
18 | return articles
19 |
20 |
21 | def read_predictions_from_file(filename):
22 | articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
23 | with open(filename, "r") as f:
24 | for row in f.readlines():
25 | article_id, gold_label, span_start, span_end = row.rstrip().split("\t")
26 | articles_id.append(article_id)
27 | gold_labels.append(gold_label)
28 | span_starts.append(span_start)
29 | span_ends.append(span_end)
30 | return articles_id, span_starts, span_ends, gold_labels
31 |
32 |
33 | def load_data(data_folder, labels_file):
34 | articles = read_articles_from_file_list(data_folder)
35 | ref_articles_id, ref_span_starts, ref_span_ends, labels = read_predictions_from_file(labels_file)
36 | return articles, ref_articles_id, ref_span_starts, ref_span_ends, labels
37 |
38 |
39 | def sents_token_bounds(text):
40 | sents_starts = []
41 | for start, end in PunktSentenceTokenizer().span_tokenize(text):
42 | sents_starts.append(start)
43 |     sents_starts.append(100000)  # sentinel so spans in the last sentence still find an end bound
44 | return np.array(sents_starts)
45 |
46 |
47 | def clear(text):
48 | return text.strip().replace('\t', ' ').replace('\n', ' ')
49 |
50 |
51 | def get_context(article, span_start, span_end):
52 | bounds = sents_token_bounds(article)
53 | context_start = bounds[np.where(bounds <= span_start)[0][-1]]
54 | context_end = bounds[np.where(bounds >= span_end)[0][0]]
55 | return clear(article[context_start:context_end])
56 |
57 |
58 | def balance_pandas(data):
59 | lst = [data]
60 | max_size = data['label'].value_counts().max()
61 | for class_index, group in data.groupby('label'):
62 | lst.append(group.sample(max_size - len(group), replace=True))
63 | return pd.concat(lst)
64 |
65 |
66 | def dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels):
67 | data = pd.DataFrame.from_dict({'article_id': ref_articles_id,
68 | 'article': [articles[id] for id in ref_articles_id],
69 | 'span_start': np.array(ref_span_starts).astype(int),
70 | 'span_end': np.array(ref_span_ends).astype(int),
71 | 'label': train_gold_labels
72 | })
73 | data['span'] = data.apply(lambda x: clear(x['article'][x['span_start']:x['span_end']]), axis=1)
74 | data['context'] = data.apply(lambda x: get_context(x['article'], x['span_start'], x['span_end']), axis=1)
75 | return data[['article_id', 'span_start', 'span_end', 'span', 'context', 'label']]
76 |
77 |
78 | def get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, train_file, dev_file,
79 | split_by_ids=False, dev_size=0.3, random_state=40, balance=False, shuffle=True):
80 | data = dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels)
81 | if split_by_ids:
82 | train_ids, dev_ids = train_test_split(data.article_id.unique(), test_size=dev_size, random_state=random_state)
83 | train = data[data.article_id.isin(train_ids)]
84 | dev = data[data.article_id.isin(dev_ids)]
85 | else:
86 | train, dev = train_test_split(data, test_size=dev_size, random_state=random_state)
87 |
88 | if balance:
89 | train = balance_pandas(train)
90 | if shuffle:
91 | train = train.sample(frac=1).reset_index(drop=True)
92 |
93 | save_dataset(train, train_file)
94 | save_dataset(dev, dev_file)
95 |
96 |
97 | def get_test_file(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, test_file):
98 | test = dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels)
99 | save_dataset(test, test_file)
100 |
101 |
102 | def save_dataset(data, file_path):
103 | data.to_csv(file_path, sep='\t', index=False)
104 |
--------------------------------------------------------------------------------
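A toy example of get_context above: a span's character offsets are widened to the enclosing sentence boundaries (assumes NLTK's Punkt splits this simple text on ". "):

article = "Sentence one is here. Sentence two is longer."
print(get_context(article, 5, 8))   # should print "Sentence one is here."
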
/tools/src/annotation.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import sys
3 | import logging.handlers
4 | import src.propaganda_techniques as pt
5 | import src.annotation_w_o_label as anwol
6 |
7 | __author__ = "Giovanni Da San Martino"
8 | __copyright__ = "Copyright 2019"
9 | __credits__ = ["Giovanni Da San Martino"]
10 | __license__ = "GPL"
11 | __version__ = "0.1"
12 | __maintainer__ = "Giovanni Da San Martino"
13 | __email__ = "gmartino@hbku.edu.qa"
14 | __status__ = "Beta"
15 |
16 | logger = logging.getLogger("propaganda_scorer")
17 |
18 |
19 | class Annotation(anwol.AnnotationWithOutLabel):
20 |
21 | """
22 | One annotation is represented by a span (two integer indices indicating the
23 | starting and ending position of the span) and the propaganda technique name
24 | (a label attached to the span).
25 |     The class provides basic manipulation functions for one annotation.
26 | """
27 |
28 | # input file format variables
29 | separator = "\t"
30 | ARTICLE_ID_COL = 0
31 | TECHNIQUE_NAME_COL = 1
32 | FRAGMENT_START_COL = 2
33 | FRAGMENT_END_COL = 3
34 | propaganda_techniques:pt.Propaganda_Techniques = None
35 |
36 |
37 | def __init__(self, label:str=None, start_offset:str = None, end_offset:str=None):
38 |
39 | super().__init__(start_offset, end_offset)
40 | self.label = label
41 |
42 |
43 | def __str__(self):
44 |
45 | return super().__str__() + " -> " + self.get_label()
46 | #return self.get_label() + "\t" + super().__str__()
47 |
48 |
49 | def __eq__(self, second_annotation:Annotation):
50 | """
51 | Checks whether two annotations are identical, i.e. if their spans are
52 |     identical and if their labels coincide
53 | """
54 | return super().__eq__(second_annotation) and self.get_label()==second_annotation.get_label()
55 |
56 |
57 | def get_label(self)->str:
58 |
59 | return self.label
60 |
61 |
62 | def get_propaganda_techniques(self)->list:
63 |
64 | if self.propaganda_techniques is None:
65 | logger.error("trying to access propaganda techniques list before initialising the corresponding object")
66 | sys.exit()
67 | return self.propaganda_techniques.get_propaganda_techniques_list()
68 |
69 |
70 | @classmethod
71 | def set_propaganda_technique_list_obj(cls, propaganda_technique_obj:pt.Propaganda_Techniques)->None:
72 | """
73 | propaganda_technique_obj is an object from the module src.propaganda_techniques.
74 |         Typical invocation:
75 | `
76 | propaganda_techniques = pt.Propaganda_Techniques(filename=propaganda_techniques_list_file)
77 | an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques)
78 | `
79 | """
80 | cls.propaganda_techniques = propaganda_technique_obj
81 |
82 |
83 | @staticmethod
84 | def load_annotation_from_string(annotation_string:str, row_num:int=None, filename:str=None)->(Annotation, str):
85 | """
86 | Read annotations from a csv-like string, with fields separated
87 | by the class variable `separator`:
88 |
89 |         article id <tab> technique name <tab> starting_position <tab> ending_position
90 | Fields order is determined by the class variables ARTICLE_ID_COL,
91 | TECHNIQUE_NAME_COL, FRAGMENT_START_COL, FRAGMENT_END_COL
92 |
93 | Besides reading the data, it performs basic checks.
94 |
95 | :return a tuple (Annotation object, id of the article)
96 | """
97 |
98 | row = annotation_string.rstrip().split(Annotation.separator)
99 | if len(row) != 4:
100 | logger.error("Row%s%s is supposed to have 4 columns. Found %d: -%s-."
101 | % (" " + str(row_num) if row_num is not None else "",
102 | " in file " + filename if filename is not None else "", len(row), annotation_string))
103 | sys.exit()
104 |
105 | article_id = row[Annotation.ARTICLE_ID_COL]
106 | label = row[Annotation.TECHNIQUE_NAME_COL]
107 |         try:
108 |             start_offset = int(row[Annotation.FRAGMENT_START_COL])
109 |         except ValueError:
110 |             logger.error("The column %d in row%s%s is supposed to be an integer: -%s-" % (Annotation.FRAGMENT_START_COL,
111 |                 " " + str(row_num) if row_num is not None else "", " in file " + filename if filename is not None else "", annotation_string))
112 |             sys.exit()
113 |         try:
114 |             end_offset = int(row[Annotation.FRAGMENT_END_COL])
115 |         except ValueError:
116 |             logger.error("The column %d in row%s%s is supposed to be an integer: -%s-" % (Annotation.FRAGMENT_END_COL,
117 |                 " " + str(row_num) if row_num is not None else "", " in file " + filename if filename is not None else "", annotation_string))
118 |             sys.exit()
119 |
120 | return Annotation(label, start_offset, end_offset), article_id
121 |
122 |
123 | def is_technique_name_valid(self)->bool:
124 | """
125 | Checks whether the technique names are correct
126 | """
127 | if self.propaganda_techniques is None:
128 | sys.exit("ERROR: propaganda techniques object has not been initialised")
129 | if not self.propaganda_techniques.is_valid_technique(self.get_label()):
130 | logger.error("label %s is not valid. Possible values are: %s"%(self.get_label(), self.propaganda_techniques))
131 | return False
132 | return True
133 |
134 |
135 | def check_format_of_annotation_in_file(self):
136 | """
137 | Performs some checks on the fields of the annotation
138 | """
139 | if not self.is_technique_name_valid():
140 | sys.exit()
141 | if not self.is_span_valid():
142 | sys.exit()
143 |
144 |
--------------------------------------------------------------------------------
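Parsing a single TC-format row with the class above (a hypothetical snippet; fields are tab-separated as documented in load_annotation_from_string):

ann, article_id = Annotation.load_annotation_from_string("736757214\tDoubt\t171\t181")
print(article_id)          # 736757214
print(ann.get_label())     # Doubt
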
/span_identification/ner/bert_lstm_crf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | based on
4 | @File: bert_lstm_crf.py
5 | @Copyright: 2019 Michael Zhu
6 | @License:the Apache License, Version 2.0
7 | @Author:Michael Zhu
8 | """
9 |
10 | 
11 | 
12 | import copy
13 | from typing import cast, List
14 | import numpy as np
15 |
16 | import torch.nn as nn
17 |
18 | from torch.autograd import Variable
19 | import torch
20 |
21 | from .conditional_random_field import ConditionalRandomField, allowed_transitions
22 |
23 |
24 | class BertLstmCrf(nn.Module):
25 | """
26 | bert_lstm_crf model
27 | """
28 |
29 | def __init__(self, bert_model,
30 | num_labels=9,
31 | embedding_dim=512,
32 | hidden_dim=512,
33 | rnn_layers=1,
34 | rnn_dropout=0.1,
35 | output_dropout=0.1,
36 | use_cuda=False):
37 | super(BertLstmCrf, self).__init__()
38 | self.bert_encoder = bert_model
39 |
40 | self.embedding_dim = embedding_dim
41 | self.hidden_dim = hidden_dim
42 | self.rnn_layers = rnn_layers
43 |
44 | self.lstm = None
45 | if rnn_layers > 0:
46 | self.lstm = nn.LSTM(
47 | embedding_dim,
48 | hidden_dim,
49 | num_layers=rnn_layers,
50 | bidirectional=True,
51 | dropout=rnn_dropout,
52 | batch_first=True
53 | )
54 |
55 | # self.crf = CRF(
56 | # target_size=num_labels,
57 | # average_batch=True,
58 | # use_cuda=use_cuda
59 | # )
60 |
61 |         # constrain CRF transitions to valid BIO sequences
62 | constraints = allowed_transitions('BIO', dict(enumerate(["O", "B", "I"])))
63 | include_start_end_transitions = True
64 | self.crf = ConditionalRandomField(
65 | num_labels,
66 | constraints,
67 | include_start_end_transitions=include_start_end_transitions
68 | )
69 |
70 |         self.liner = nn.Linear(hidden_dim * 2, num_labels)  # output projection (attribute name kept for checkpoint compatibility)
71 | self.num_labels = num_labels
72 |
73 | self.output_dropout = nn.Dropout(p=output_dropout)
74 |
75 | def rand_init_hidden(self, batch_size):
76 | """
77 | random initialize hidden variable
78 | """
79 | return Variable(
80 | torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)), Variable(
81 | torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim))
82 |
83 | def clear_subtokens(self, logits, labels, mask):
84 | clear_labels = torch.zeros_like(labels)
85 | clear_logits = torch.zeros_like(logits)
86 | clear_mask = torch.zeros_like(mask)
87 |
88 | for i in range(len(labels)):
89 | assert (mask[i][labels[i] != -100] == 1).all()
90 | cor = labels[i][labels[i] != -100]
91 | clear_labels[i][:len(cor)] = cor
92 | clear_logits[i][:len(cor)] = logits[i][labels[i] != - 100]
93 | clear_mask[i][:len(cor)] = 1
94 | return clear_logits, clear_labels, clear_mask
95 |
96 | def forward(self, **kwargs):
97 |         '''
98 |         args:
99 |             kwargs: encoder inputs (input_ids, attention_mask, ...), optionally
100 |                 with a "labels" tensor in which -100 marks ignored subtoken positions
101 | 
102 |         return:
103 |             (loss or None, logits, predicted tag sequences per batch element)
104 |         '''
105 |
106 | kwargs_copy = copy.deepcopy(kwargs)
107 | if "labels" in kwargs_copy:
108 | kwargs_copy.pop("labels")
109 |
110 | batch_size = kwargs["input_ids"].size(0)
111 | seq_length = kwargs["input_ids"].size(1)
112 |
113 |         bert_outputs = self.bert_encoder(
114 |             **kwargs_copy  # labels were popped above; the encoder must not receive them
115 |         )
116 | sequence_output = bert_outputs[1]
117 |
118 | if self.lstm is not None:
119 | hidden = self.rand_init_hidden(batch_size)
120 | if kwargs["input_ids"].is_cuda:
121 |                 hidden = tuple(i.cuda() for i in hidden)  # nn.LSTM expects a tuple, not a generator
122 | sequence_output, hidden = self.lstm(sequence_output, hidden)
123 | sequence_output = sequence_output.contiguous().view(-1, self.hidden_dim * 2)
124 | sequence_output = self.output_dropout(sequence_output)
125 |
126 | sequence_output = self.liner(sequence_output)
127 |
128 | #out = self.liner(sequence_output)
129 | out = sequence_output
130 | logits = out.contiguous().view(batch_size, seq_length, -1)
131 |
132 |         if kwargs.get("labels") is not None:
133 |             # drop the ignored subtoken positions (labels == -100) before the CRF
134 |             clear_logits, clear_labels, clear_mask = self.clear_subtokens(logits, kwargs["labels"], kwargs["attention_mask"])
135 |             best_paths = self.crf.viterbi_tags(clear_logits, clear_mask.long(), top_k=1)
136 |             # just get the top tags and ignore the scores
137 |             predicted_tags = cast(List[List[int]], [x[0][0] for x in best_paths])
138 |             labels = kwargs["labels"].cpu()
139 |             log_likelihood = self.crf(clear_logits, clear_labels, clear_mask)
140 |             loss = -log_likelihood
141 |             # scatter the predictions back onto the original subtokenized positions
142 |             correct_predicted_tags = np.zeros_like(labels)
143 |             for i in range(len(labels)):
144 |                 correct_predicted_tags[i][labels[i] != -100] = predicted_tags[i]
145 |             return (loss, logits, list(correct_predicted_tags))
146 | 
147 |         # no labels (inference): decode over all attended positions
148 |         best_paths = self.crf.viterbi_tags(logits, kwargs["attention_mask"].long(), top_k=1)
149 |         predicted_tags = cast(List[List[int]], [x[0][0] for x in best_paths])
150 |         return (None, logits, predicted_tags)
151 | 
152 | 
153 | if __name__ == "__main__":
154 |     pass
155 | 
--------------------------------------------------------------------------------
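clear_subtokens never touches self, so its squeeze-out behaviour can be checked in isolation (a toy check, not repository code; assumes the package and its imports resolve from the repo root):

import torch
from span_identification.ner.bert_lstm_crf import BertLstmCrf

labels = torch.tensor([[0, -100, 1, 2, -100]])   # -100 marks subword continuations
logits = torch.randn(1, 5, 3)
mask = torch.ones(1, 5, dtype=torch.long)
cl_logits, cl_labels, cl_mask = BertLstmCrf.clear_subtokens(None, logits, labels, mask)
assert cl_labels[0, :3].tolist() == [0, 1, 2]
assert cl_mask[0].tolist() == [1, 1, 1, 0, 0]
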
/span_identification/submission.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import numpy as np
3 | from unidecode import unidecode
4 | import string
5 | import nltk
6 | from nltk.corpus import stopwords
7 |
8 |
9 | def merge_spans(spans, articles_id, articles_content):
10 | res = dict()
11 | articles_content_dict = dict(zip(articles_id, articles_content))
12 | for article_id in spans:
13 | article = articles_content_dict[article_id]
14 | res[article_id] = []
15 | mask = np.zeros(len(article))
16 | for span in spans[article_id]:
17 | mask[span[0]: span[1]] = 1
18 | start = -1
19 | length = 0
20 | for i in range(len(mask)):
21 | if mask[i] == 0:
22 | if start != -1:
23 | res[article_id].append((start, start + length))
24 | start = -1
25 | length = 0
26 | if mask[i] == 1:
27 | if start == -1:
28 | start = i
29 | length = 1
30 | else:
31 | length += 1
32 | return res
33 |
34 |
35 | def correct_spans(spans, articles_id, articles_content):
36 | stop_words = set(stopwords.words('english'))
37 | res = dict()
38 | articles_content_dict = dict(zip(articles_id, articles_content))
39 | for article_id in spans:
40 | article = articles_content_dict[article_id]
41 | res[article_id] = []
42 | mask = np.zeros(len(article))
43 | for span in spans[article_id]:
44 | mask[span[0]: span[1] + 1] = 1
45 | start = -1
46 | length = 0
47 | for i in range(len(mask)):
48 | if mask[i] == 0:
49 | if start != -1:
50 | end = start + length
51 |
52 | if unidecode(article[start - 1]) == '"':
53 | start -= 1
54 | else:
55 | while not article[start].isalnum():
56 | start += 1
57 | if unidecode(article[end]) == '"':
58 | end += 1
59 |
60 | if unidecode(article[end - 1]) != '"':
61 | while not article[end - 1].isalnum():
62 | end -= 1
63 | if end - start > 1:
64 | if article[start: end].lower() not in stop_words:
65 | res[article_id].append((start, end))
66 | '''
67 | while article[end - 1].isspace():
68 | end -= 1
69 | if end > start:
70 | res[article_id].append((start, end))
71 | '''
72 | start = -1
73 | length = 0
74 |
75 | if mask[i] == 1:
76 | if start == -1:
77 | start = i
78 | length = 1
79 | else:
80 | length += 1
81 |
82 | if start != -1:
83 | if unidecode(article[start - 1]) == '"':
84 | start -= 1
85 | length += 1
86 | if unidecode(article[start + length]) == '"':
87 | length += 1
88 | if unidecode(article[start + length - 1]) != '"':
89 | while not article[start + length - 1].isalnum():
90 | length -= 1
91 | if length > 0:
92 | res[article_id].append((start, start + length))
93 | return res # merge_spans(res, articles_id, articles_content)
94 |
95 |
96 | def get_spans_from_file(file, articles_id, articles_content, nlp):
97 | pred_spans = dict()
98 | with open(file, 'r') as f:
99 | for article_id, text in zip(articles_id, articles_content):
100 | pred_spans.setdefault(article_id, [])
101 | tokens = [(token.idx, token.text) for token in nlp(text)]
102 | idx = np.array(tokens)[:,0]
103 | tokens = np.array(tokens)[:,1]
104 | tokens = [token.strip().replace('\n', ' ').replace('\t', ' ') for token in tokens]
105 |
106 |             prev_tok = '\n'  # must be initialised before the loop (read in the else branch below)
107 | start = -1
108 | for i in range(len(tokens)):
109 | tok = tokens[i]
110 | if len(tok) != 0 and repr(tok) != repr('\ufeff') and repr(tok) != repr('\u200f'):
111 | token, label = f.readline().split('\t')
112 | label = label.strip()
113 | if label == 'B-PROP' or (label == 'I-PROP' and start == -1):
114 | if start != -1:
115 | pred_spans[article_id].append((start, int(idx[i - 1]) + len(tokens[i - 1])))
116 | start = int(idx[i])
117 | if label == 'O':
118 | if start != -1:
119 | pred_spans[article_id].append((start, int(idx[i - 1]) + len(tokens[i - 1])))
120 | start = -1
121 | assert token == tok
122 | assert tok == text[int(idx[i]): int(idx[i]) + len(tok)]
123 | prev_label = label
124 | prev_tok = tok
125 | else:
126 | if prev_tok != '\n':
127 | f.readline()
128 | prev_tok = '\n'
129 | prev_label = 'O'
130 |
131 | return correct_spans(pred_spans, articles_id, articles_content)
132 |
133 |
134 | def get_submission_format(predicted_labels_files, articles_id, articles_content, nlp, output_file):
135 | agg_result = dict()
136 | for file in predicted_labels_files:
137 | result = get_spans_from_file(file, articles_id, articles_content, nlp)
138 | for el in result:
139 | agg_result[el] = agg_result.get(el, []) + result[el]
140 | agg_result = merge_spans(agg_result, articles_id, articles_content)
141 |
142 | with open(output_file, "w") as fout:
143 | for article_id, spans in agg_result.items():
144 | for span in spans:
145 | fout.write("%s\t%s\t%s\n" % (article_id, span[0], span[1]))
146 |
--------------------------------------------------------------------------------
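merge_spans above builds a character mask per article, so overlapping or adjacent predictions collapse into maximal spans. A toy run (hypothetical input):

spans = {"736757214": [(0, 10), (5, 20), (30, 35)]}
print(merge_spans(spans, ["736757214"], ["x" * 40]))
# -> {'736757214': [(0, 20), (30, 35)]}
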
/span_identification/dataset.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import glob
3 | import os
4 | from shutil import copyfile, rmtree
5 | import random
6 | import pandas as pd
7 | import numpy as np
8 | from sklearn.model_selection import train_test_split
9 | from tqdm import tqdm
10 |
11 |
12 | def load_data(data_folder, propaganda_techniques_file):
13 | file_list = glob.glob(os.path.join(data_folder, "*.txt"))
14 | articles_content, articles_id = ([], [])
15 | for filename in sorted(file_list):
16 | with open(filename, "r", encoding="utf-8") as f:
17 | articles_content.append(f.read())
18 | articles_id.append(os.path.basename(filename).split(".")[0][7:])
19 |
20 | with open(propaganda_techniques_file, "r") as f:
21 | propaganda_techniques_names = [line.rstrip() for line in f.readlines()]
22 |
23 | return articles_content, articles_id, propaganda_techniques_names
24 |
25 |
26 | def read_predictions_from_file(filename):
27 | articles_id, gold_spans = ([], [])
28 | with open(filename, "r") as f:
29 | for row in f.readlines():
30 | article_id, gold_span_start, gold_span_end = row.rstrip().split("\t")
31 | articles_id.append(article_id)
32 | gold_spans.append(tuple(int(el) for el in [gold_span_start, gold_span_end]))
33 | return articles_id, gold_spans
34 |
35 |
36 | def group_spans_by_article_ids(span_list):
37 | data = {}
38 | for el in span_list:
39 | article_id, span = el[0], el[1]
40 | data.setdefault(article_id, [])
41 | data[article_id].append(span)
42 | return data
43 |
44 |
45 | def get_train_dev_files(articles_id, articles_content, nlp, labels_path, train_file, dev_file, split_by_ids=True,
46 | dev_size=0.3, random_state=42):
47 | articles_content_dict = dict(zip(articles_id, articles_content))
48 | articles_id, gold_spans = read_predictions_from_file(labels_path)
49 | span_list = list(zip(articles_id, gold_spans))
50 |
51 | if split_by_ids:
52 | data = group_spans_by_article_ids(span_list)
53 | train_ids, dev_ids = train_test_split(np.unique(articles_id), test_size=dev_size, random_state=random_state)
54 | train_data = sorted([(key, value) for (key, value) in data.items() if key in train_ids])
55 | dev_data = sorted([(key, value) for (key, value) in data.items() if key in dev_ids])
56 | else:
57 | span_list_train, span_list_test = train_test_split(span_list, test_size=dev_size, random_state=random_state)
58 | train_data = sorted(group_spans_by_article_ids(span_list_train).items())
59 |         dev_data = sorted(group_spans_by_article_ids(span_list_test).items())
60 | train_ids = [example[0] for example in train_data]
61 | dev_ids = [example[0] for example in dev_data]
62 |
63 | create_BIO_labeled(train_file, train_data, articles_content_dict, nlp)
64 | create_BIO_labeled(dev_file, dev_data, articles_content_dict, nlp)
65 |
66 | return train_ids, dev_ids
67 |
68 |
69 | def get_test_file(file, articles_id, articles_content, nlp):
70 | create_BIO_unlabeled(file, articles_id, articles_content, nlp)
71 |
72 |
73 | def token_label_from_spans(pos, spans):
74 | for el in spans:
75 | if el[0] <= int(pos) < el[1]:
76 | return "PROP"
77 | return 'O'
78 |
79 |
80 | def create_BIO_labeled(file, data, articles_content_dict, nlp):
81 | prev_label = 'O'
82 | with open(file, 'w') as f:
83 | for article_id, spans in tqdm(data):
84 | text = articles_content_dict[article_id]
85 | tokens = [(token.idx, token.text) for token in nlp(text)]
86 | idx = np.array(tokens)[:,0]
87 | tokens = np.array(tokens)[:,1]
88 | prev_tok = '\n'
89 |
90 | for i in range(len(tokens)):
91 | tok = tokens[i].replace('\n', ' ').replace('\t', ' ').strip()
92 | if len(tok) != 0 and repr(tok) != repr('\ufeff') and repr(tok) != repr('\u200f'):
93 | tok = tokens[i].strip().replace('\n', ' ').replace('\t', ' ')
94 | label = token_label_from_spans(idx[i], spans)
95 | if label != 'O':
96 | if prev_label != 'O':
97 | label = 'I-' + 'PROP'
98 | else:
99 | label = 'B-' + 'PROP'
100 | f.write(tok + '\t' + label + '\n')
101 | prev_label = label
102 | prev_tok = tok
103 | else:
104 | if prev_tok != '\n':
105 | f.write('\n')
106 | prev_tok = '\n'
107 | prev_label = 'O'
108 |
109 |
110 | def create_BIO_unlabeled(file, articles_id, articles_content, nlp):
111 | prev_label = 'O'
112 | with open(file, 'w') as f:
113 | for article_id, text in tqdm(zip(articles_id, articles_content)):
114 | tokens = [(token.idx, token.text) for token in nlp(text)]
115 | idx = np.array(tokens)[:,0]
116 | tokens = np.array(tokens)[:,1]
117 | prev_tok = '\n'
118 |
119 | for i in range(len(tokens)):
120 | tok = tokens[i].replace('\n', ' ').replace('\t', ' ').strip()
121 | if len(tok) != 0 and repr(tok) != repr('\ufeff') and repr(tok) != repr('\u200f'):
122 | tok = tokens[i].strip().replace('\n', ' ').replace('\t', ' ')
123 | label = 'O'
124 | f.write(tok + '\t' + label + '\n')
125 | prev_label = label
126 | prev_tok = tok
127 | else:
128 | if prev_tok != '\n':
129 | f.write('\n')
130 | prev_tok = '\n'
131 | prev_label = 'O'
132 |
133 |
134 | def create_subfolder(subfolder, source_folder, articles_id):
135 | if os.path.exists(subfolder):
136 | rmtree(subfolder)
137 | os.makedirs(subfolder)
138 | for article_id in articles_id:
139 | file = 'article' + str(article_id) + '.txt'
140 | copyfile(os.path.join(source_folder, file), os.path.join(subfolder, file))
141 |
--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
1 |
2 | Scorers for the Propaganda Techniques Corpus Version 2
3 |
4 | Contents
5 |
6 | 1. Tasks
7 | 2. Evaluation scripts
8 | 3. Data format
9 | 4. Tools
10 | 5. Citation
11 | 6. Changes from version 1
12 |
13 |
14 | Tasks
15 | --------------------------------------------
16 | The Propaganda Techniques Corpus (PTC) is a corpus of articles annotated
17 | with propaganda techniques at a fine-grained level. The list of
18 | techniques is in file data/propaganda-techniques-names-semeval2020task11.txt.
19 | Among the different tasks that the corpus enables, SemEval 2020 Task 11 focuses on the following ones:
20 |
21 | Subtask 1 (SI). Span Identification.
22 | Given a plain-text document, identify those specific fragments that contain at least one propaganda technique. This is a binary sequence tagging task.
23 |
24 | Subtask 2 (TC). Technique Classification.
25 | Given a text fragment identified as propaganda and its document context, identify the propaganda technique applied in it. This is a multi-class classification problem.
26 |
27 | See the paper in the section "Citation" for further details.
28 |
29 |
30 | Evaluation scripts
31 | --------------------------------------------
32 |
33 | -Task SI (task-SI_scorer.py)
34 |
35 | The evaluation script computes a variant of precision, recall, and F-measure
36 | that takes into account partial overlaps between fragments (see
37 | http://propaganda.qcri.org/semeval2020-task11/data/propaganda_tasks_evaluation.pdf
38 | for more details).
39 |
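 | For intuition (a simplified sketch, not the official scorer), the variant is
 | based on an overlap function of the form C(s,t,h) = |s ∩ t| / h, summed over
 | all pairs of a predicted span s and a gold span t from the same article:
 | 
 |     def si_scores(pred, gold):
 |         # pred, gold: non-empty lists of (start, end) character spans of one
 |         # article, interpreted as half-open ranges [start, end)
 |         def C(s, t, h):
 |             overlap = max(0, min(s[1], t[1]) - max(s[0], t[0]))
 |             return overlap / h
 |         precision = sum(C(s, t, s[1] - s[0]) for s in pred for t in gold) / len(pred)
 |         recall = sum(C(s, t, t[1] - t[0]) for s in pred for t in gold) / len(gold)
 |         return precision, recall
 | 
 | See the PDF linked above for the exact definition (including the handling of
 | overlapping spans, cf. the -m flag below).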
40 | The script can be run as follows:
41 |
42 | python3 task-SI_scorer.py -s [prediction_file] -r [gold_folder] -m
43 |
44 | Note that all files *.labels in [gold_folder] will be considered as containing gold labels.
45 | As an example, we provide a prediction file, data/submission-task-SI.tsv,
46 | and you can run it as follows:
47 |
48 | ===
49 |
50 | $ python3 task-SI_scorer.py -s data/submission-task-SI.tsv -r data -m
51 | 2019-09-20 19:47:26,427 - INFO - Checking user submitted file
52 | 2019-09-20 19:47:26,429 - INFO - Scoring the submission with precision and recall method
53 | 2019-09-20 19:47:26,430 - INFO - Precision=1.929825/2=0.964912 Recall=1.947458/4=0.486864
54 | 2019-09-20 19:47:26,430 - INFO - F1=0.647181
55 |
56 |
57 | ===
58 |
59 | The scorer for the TC task is task-TC_scorer.py.
60 | The scorer requires file data/propaganda-techniques-names-semeval2020task11.txt.
61 | This file contains the list of techniques used for scoring.
62 | Adding or removing items from the list will affect the outcome of the scorer.
63 | It can be run as follows:
64 |
65 | python3 task-TC_scorer.py -s [prediction_file] -r [gold_file] -p data/propaganda-techniques-names-semeval2020task11.txt
66 |
67 | For example:
68 |
69 | $ python3 task-TC_scorer.py -s data/submission-task-TC.tsv -r data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt 2>/dev/null
70 | 2019-09-20 19:39:21,286 - INFO - Checking format: User Predictions -- Gold Annotations
71 | 2019-09-20 19:39:21,287 - INFO - OK: submission file format appears to be correct
72 | 2019-09-20 19:39:21,293 - INFO - Scoring submission
73 | F1=0.600000
74 | Precision=0.600000
75 | Recall=0.600000
76 | F1_Appeal_to_Authority=0.0
77 | F1_Appeal_to_fear-prejudice=0.0
78 | F1_Bandwagon,Reductio_ad_hitlerum=0.0
79 | F1_Black-and-White_Fallacy=0.0
80 | F1_Causal_Oversimplification=0.0
81 | F1_Doubt=0.0
82 | F1_Exaggeration,Minimisation=1.0
83 | F1_Flag-Waving=0.0
84 | F1_Loaded_Language=0.6666666666666666
85 | F1_Name_Calling,Labeling=0.6666666666666666
86 | F1_Repetition=0.0
87 | F1_Slogans=0.0
88 | F1_Thought-terminating_Cliches=0.0
89 | F1_Whataboutism,Straw_Men,Red_Herring=0.0
90 |
91 |
92 | Data format
93 | --------------------------------------------
94 |
95 | -Task SI
96 |
97 | The corpus includes one tab-separated file per article in the following
98 | format:
99 |
100 | id begin_offset end_offset
101 |
102 | where
103 | id is the identifier of the article
104 | begin_offset is the character where the covered span begins (inclusive)
105 | end_offset is the character where the covered span ends (exclusive)
106 |
107 | An example of such a file is data/article736757214.task-SI.labels.
108 |
109 | -Task TC
110 |
111 | The corpus includes one tab-separated file per article in the following format:
112 |
113 | id technique begin_offset end_offset
114 |
115 | The fields are the same as for task SI, but each row now also includes "technique", i.e., the propaganda technique applied in the instance.
116 |
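 | For example, a task-TC annotation file can be read with a few lines of Python
 | (a minimal sketch; the classes under src/ provide the full, checked loaders):
 | 
 |     with open("data/article736757214.labels-task-TC") as f:
 |         for line in f:
 |             article_id, technique, start, end = line.rstrip("\n").split("\t")
 |             span = (int(start), int(end))  # [begin_offset, end_offset)
 | 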
117 |
118 | Tools
119 | --------------------------------------------
120 |
121 | - The script print_spans.py highlights the annotations in an article.
122 |
123 | python3 print_spans.py -s [annotations_file] -t [article_file] -p [propaganda_techniques_file]
124 |
125 | For example:
126 |
127 | python3 print_spans.py -t data/article736757214.txt -s data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt
128 |
129 |
130 | Citation
131 | --------------------------------------------
132 |
133 | Please cite the following publication when using the PTC corpus:
134 |
135 | G. Da San Martino, S. Yu, A. Barrón-Cedeño, R. Petrov and P. Nakov, "Fine-Grained Analysis of Propaganda in News Articles", to appear at the Conference on Empirical Methods in Natural Language Processing (EMNLP 2019), Hong Kong, China, November 3-7, 2019.
136 |
137 | @InProceedings{EMNLP19DaSanMartino,
138 | author = {Da San Martino, Giovanni and
139 | Yu, Seunghak and
140 | Barr\'{o}n-Cede\~no, Alberto and
141 | Petrov, Rostislav and
142 | Nakov, Preslav},
143 | title = {Fine-Grained Analysis of Propaganda in News Articles},
144 | booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, November 3-7, 2019},
145 | series = {EMNLP-IJCNLP 2019},
146 | year = {2019},
147 | address = {Hong Kong, China},
148 | month = {November},
149 | }
150 |
151 |
152 | Changes from version 1
153 | --------------------------------------------
154 |
155 | Fixed a bug in the evaluation function for task TC that, in certain cases, prevented finding the best alignment between the labels of identical spans.
156 |
157 | Now print_spans.py has a parameter -p specifying the file with the list of propaganda techniques.
158 |
159 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Semeval 2020, Task 11
2 |
3 | ## Overview
4 | This repository provides code for the SemEval-2020 Task 11 competition (Detection of Propaganda Techniques in News Articles).
5 |
6 | The competition webpage: https://propaganda.qcri.org/semeval2020-task11/
7 |
8 | The description of the architecture of models can be found in our paper [Aschern at SemEval-2020 Task 11: It Takes Three to Tango: RoBERTa, CRF, and Transfer Learning](https://www.aclweb.org/anthology/2020.semeval-1.191/).
9 |
10 | ## Requirements
11 | ```
12 | pip install -r ./requirements.txt
13 | ```
14 |
15 | ## Project structure
16 |
17 | - `configs`: yaml configs for the system
18 | - `datasets`: contains the task datasets, which can be downloaded from the competition webpage
19 | - `results`: the folder for submissions
20 | - `span_identification`: code for the task SI
21 | - `ner`: pytorch-transformers RoBERTa model with CRF (end-to-end)
22 |   - `dataset`: the scripts for loading and preprocessing the source dataset
23 | - `submission`: the scripts for obtaining and evaluating results
24 | - `technique_classification`: code for the task TC (the folder has the same structure as `span_identification`)
25 | - `tools`: tools provided by the competition organizers; they contain useful functions for reading datasets and evaluating submissions
26 | - `visualization_example`: example of visualization of results for both tasks
27 |
28 | ## Running the models
29 |
30 | All commands are run from the root directory of the repository.
31 |
32 | ### Span Identification
33 |
34 | 1. Configure the `configs/si_config.yml` file if needed. `data_dir` is the path to the cache of the original train/eval sub-datasets and their BIO versions. In addition to using the config, arguments can also be passed on the command line.
35 |
36 | 2. Split the dataset for local evaluation (if `--overwrite_cache` is set, previous files will be replaced). This produces files with BIO-format tagging for spans (B-PROP, I-PROP, O) in your `--data_dir`; see the sample lines after the command.
37 | ```bash
38 | python -m span_identification --config configs/si_config.yml --split_dataset --overwrite_cache
39 | ```
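 | The generated train/dev files contain one token per line with a tab-separated
 | tag, and an empty line between segments (the tokens below are purely
 | illustrative):
 | ```
 | Fear	B-PROP
 | not	I-PROP
 | ,	O
 | citizens	O
 | ```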
40 | 3. Train and evaluate the model (the model parameters are specified in the config; you need to change the paths). The use of CRF is controlled by the flag `--use_crf`. For the first run you can use `--model_name_or_path roberta-large`.
41 | ```bash
42 | python -m span_identification --config configs/si_config.yml --do_train --do_eval
43 | ```
44 | 4. Apply the trained model to the `test_file` (in BIO format) specified in the config. The file will be created from the `test_data_folder` folder if it is missing or if the flag `--overwrite_cache` is specified.
45 | ```bash
46 | python -m span_identification --config configs/si_config.yml --do_predict
47 | ```
48 | 5. Create the submission file `output_file` in the `results` folder. It will obtain spans from the result files with the token labeling specified in `predicted_labels_files`. At the aggregation stage, the span predictions are simply joined.
49 | ```bash
50 | python -m span_identification --config configs/si_config.yml --create_submission_file
51 | ```
52 | 6. If you have the correct markup in the `test_file` or a gold `--gold_annot_file` (source competition format), you can run the competition evaluation script.
53 | ```bash
54 | python -m span_identification --config configs/si_config.yml --do_eval_spans
55 | ```
56 | 7. Use `visualization_example/visualization.ipynb` if you want to visualize labels.
57 |
58 | ### Technique Classification
59 |
60 | The commands and settings are almost the same as for the SI task.
61 |
62 | 1. Configure the `configs/tc_config.yml` file if needed.
63 |
64 | 2. Split the dataset for local evaluation.
65 | ```bash
66 | python -m technique_classification --config configs/tc_config.yml --split_dataset --overwrite_cache
67 | ```
68 | 3. Train and evaluate the model. We used two setups, with and without the flags `--join_embeddings --use_length` (with them, you get our RoBERTa-Joined). For the first run you can use `--model_name_or_path roberta-large`.
69 | ```bash
70 | python -m technique_classification --config configs/tc_config.yml --do_train --do_eval
71 | ```
72 | or, for distributed training:
73 | ```bash
74 | CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node 4 technique_classification --config configs/tc_config.yml --do_train --do_eval
75 | ```
76 | 4. Apply the trained model to the `test_file` specified in the config. The file will be created from the `test_data_folder` folder and the `test_template_labels_path` file if it is missing or if the flag `--overwrite_cache` is specified.
77 | ```bash
78 | python -m technique_classification --config configs/tc_config.yml --do_predict --join_embeddings --use_length
79 | ```
80 | 5. Create the submission file `output_file`. It will combine predictions from the list `predicted_logits_files` with the coefficients specified in `--weights` (optional) and apply some post-processing; a condensed sketch of the aggregation follows the command.
81 | ```bash
82 | python -m technique_classification --config configs/tc_config.yml --create_submission_file
83 | ```
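 | The aggregation itself is a weighted average of temperature-softmaxed logits,
 | roughly as sketched below (condensed from `create_submission_file` in
 | `technique_classification/submission.py`; the post-processing step is omitted):
 | ```python
 | import pickle
 | import numpy as np
 | 
 | def softmax_with_temperature(z, T):
 |     z = z / T
 |     exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
 |     return exp_z / np.sum(exp_z, axis=1, keepdims=True)
 | 
 | def aggregate_logits(predicted_logits_files, weights):
 |     # one pickled (n_examples, n_classes) logits array per file
 |     agg = None
 |     for path, w in zip(predicted_logits_files, weights):
 |         with open(path, "rb") as f:
 |             logits = pickle.load(f)
 |         part = float(w) * softmax_with_temperature(logits, 1)
 |         agg = part if agg is None else agg + part
 |     return agg
 | ```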
84 | 6. If you have the correct markup in the `test_file` or a gold `--test_labels_path` (source competition format), you can check your accuracy (micro F1-score) and the F1-score per class.
85 | ```bash
86 | python -m technique_classification --config configs/tc_config.yml --eval_submission
87 | ```
88 | 7. Use `visualization_example/visualization.ipynb` if you want to visualize labels.
89 |
90 | Our pretrained RoBERTa-CRF (SI task) and RoBERTa-Joined (TC task) models are available on [Google Drive](https://drive.google.com/drive/folders/1Gph7FKMaxOBJdkrk0nM72uFpCGgn-2kC?usp=sharing).
91 |
92 | ## Citation
93 |
94 | If you find this repository helpful, feel free to cite our publication [Aschern at SemEval-2020 Task 11: It Takes Three to Tango: RoBERTa, CRF, and Transfer Learning](https://www.aclweb.org/anthology/2020.semeval-1.191/):
95 | ```
96 | @inproceedings{chernyavskiy-etal-2020-aschern,
97 | title = "Aschern at {S}em{E}val-2020 Task 11: It Takes Three to Tango: {R}o{BERT}a, {CRF}, and Transfer Learning",
98 | author = "Chernyavskiy, Anton and
99 | Ilvovsky, Dmitry and
100 | Nakov, Preslav",
101 | booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
102 | month = dec,
103 | year = "2020",
104 | address = "Barcelona (online)",
105 | publisher = "International Committee for Computational Linguistics",
106 | url = "https://www.aclweb.org/anthology/2020.semeval-1.191",
107 | pages = "1462--1468"
108 | }
109 | ```
110 |
--------------------------------------------------------------------------------
/tools/src/annotation_w_o_label.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import sys
3 | import src.propaganda_techniques as pt
4 | import logging.handlers
5 |
6 | __author__ = "Giovanni Da San Martino"
7 | __copyright__ = "Copyright 2019"
8 | __credits__ = ["Giovanni Da San Martino"]
9 | __license__ = "GPL"
10 | __version__ = "0.1"
11 | __maintainer__ = "Giovanni Da San Martino"
12 | __email__ = "gmartino@hbku.edu.qa"
13 | __status__ = "Beta"
14 |
15 | logger = logging.getLogger("propaganda_scorer")
16 |
17 |
18 | class AnnotationWithOutLabel(object):
19 |
20 | """
21 | One annotation is represented by a span (two integer indices indicating the
22 | starting and ending position of the span).
23 |     The class provides basic manipulation functions for one annotation.
24 | """
25 |
26 | # input file format variables
27 | separator = "\t"
28 | ARTICLE_ID_COL = 0
29 | FRAGMENT_START_COL = 1
30 | FRAGMENT_END_COL = 2
31 |
32 |
33 | def __init__(self, start_offset:str = None, end_offset:str=None):
34 |
35 | self.start_offset = int(start_offset)
36 | self.end_offset = int(end_offset)
37 |
38 |
39 | def __str__(self):
40 |
41 | return "[%d, %d]"%(self.start_offset, self.end_offset)
42 | #return "%d\t%d"%(self.start_offset, self.end_offset)
43 |
44 |
45 | def is_span_equal_to(self, second_annotation:AnnotationWithOutLabel)->bool:
46 | """
47 | Checks whether two annotations are identical, i.e. whether the two spans are identical.
48 | """
49 | if self.get_start_offset() != second_annotation.get_start_offset() or self.get_end_offset() != second_annotation.get_end_offset():
50 | return False
51 | return True
52 |
53 |
54 | def __eq__(self, second_annotation:AnnotationWithOutLabel):
55 |
56 | return self.is_span_equal_to(second_annotation)
57 |
58 |
59 | def get_start_offset(self)->int:
60 |
61 | return self.start_offset
62 |
63 |
64 | def get_end_offset(self)->int:
65 |
66 | return self.end_offset
67 |
68 |
69 | def get_span(self)->set:
70 | """
71 | Returns a set of positions of all characters in the span
72 | """
73 | return set(range(self.get_start_offset(), self.get_end_offset()))
74 |
75 |
76 | @staticmethod
77 | def load_annotation_from_string(annotation_string:str, row_num:int=None, filename:str=None)->(AnnotationWithOutLabel, str):
78 | """
79 | Read annotations from a csv-like string, with fields separated
80 | by the class variable `separator`:
81 |
82 |         article id<separator>starting_position<separator>ending_position
83 | Fields order is determined by the class variables ARTICLE_ID_COL,
84 | FRAGMENT_START_COL, FRAGMENT_END_COL
85 |
86 | Besides reading the data, it performs basic checks.
87 |
88 | :return a tuple (AnnotationWithOutLabel object, id of the article)
89 | """
90 |
91 | row = annotation_string.rstrip().split(AnnotationWithOutLabel.separator)
92 | if len(row) != 3:
93 | logger.error("Row%s%s is supposed to have 3 columns. Found %d: -%s-."
94 | % (" " + str(row_num) if row_num is not None else "",
95 | " in file " + filename if filename is not None else "", len(row), annotation_string))
96 | sys.exit()
97 |
98 | article_id = row[AnnotationWithOutLabel.ARTICLE_ID_COL]
99 | try:
100 | start_offset = int(row[AnnotationWithOutLabel.FRAGMENT_START_COL])
101 |         except ValueError:
102 | logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
103 | %(AnnotationWithOutLabel.FRAGMENT_START_COL, " " + str(row_num) if row_num is not None else "", " in file " + filename if filename is not None else "", annotation_string))
104 | try:
105 | end_offset = int(row[AnnotationWithOutLabel.FRAGMENT_END_COL])
106 |         except ValueError:
107 | logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
108 | %(AnnotationWithOutLabel.FRAGMENT_END_COL, " " + str(row_num) if row_num is not None else "",
109 | " in file " + filename if filename is not None else "", annotation_string))
110 |
111 | return AnnotationWithOutLabel(start_offset, end_offset), article_id
112 |
113 |
114 | def merge_spans(self, second_annotation:AnnotationWithOutLabel)->None:
115 | """
116 | Merge the spans of two annotations. The function does not check whether the spans overlap.
117 |
118 | :param second_annotation: the AnnotationWithOutLabel object whose span is being merged
119 | :return:
120 | """
121 | self.set_start_offset(min(self.get_start_offset(), second_annotation.get_start_offset()))
122 | self.set_end_offset(max(self.get_end_offset(), second_annotation.get_end_offset()))
123 |
124 |
125 | def set_start_offset(self, new_start_offset:int)->None:
126 |
127 | self.start_offset = new_start_offset
128 |
129 |
130 | def set_end_offset(self, new_end_offset:int)->None:
131 |
132 | self.end_offset = new_end_offset
133 |
134 |
135 | def shift_annotation(self, offset:int)->None:
136 |
137 | self.set_start_offset(self.get_start_offset() + offset)
138 | self.set_end_offset(self.get_end_offset() + offset)
139 |
140 |
141 | def span_overlapping(self, second_annotation:AnnotationWithOutLabel)->bool:
142 | return len(self.get_span().intersection(second_annotation.get_span())) > 0
143 |
144 |
145 | def is_span_valid(self)->bool:
146 | """
147 | Checks whether the span is valid, i.e. if the following conditions are met:
148 | 1) start and end offsets >= 0
149 | 2) start offset < end offset
150 | """
151 | if self.get_start_offset() < 0 or self.get_end_offset() < 0:
152 | logger.error("Start and end of position of the fragment must be non-negative: %d, %d"
153 | %(self.get_start_offset(), self.get_end_offset()))
154 | return False
155 | if self.get_start_offset() >= self.get_end_offset():
156 | logger.error("End position of the fragment must be greater than the starting one: start=%d, end=%d"%(self.get_start_offset(), self.get_end_offset()))
157 | return False
158 | return True
159 |
160 |
161 | def check_format_of_annotation_in_file(self):
162 | """
163 | Performs some checks on the fields of the annotation
164 | """
165 | if not self.is_span_valid():
166 | sys.exit()
167 |
168 |
--------------------------------------------------------------------------------
/span_identification/ner/utils_ner.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
17 |
18 | from __future__ import absolute_import, division, print_function
19 |
20 | import logging
21 | import os
22 | from io import open
23 |
24 | logger = logging.getLogger(__name__)
25 |
26 |
27 | class InputExample(object):
28 | """A single training/test example for token classification."""
29 |
30 | def __init__(self, guid, words, labels):
31 | """Constructs a InputExample.
32 |
33 | Args:
34 | guid: Unique id for the example.
35 | words: list. The words of the sequence.
36 | labels: (Optional) list. The labels for each word of the sequence. This should be
37 | specified for train and dev examples, but not for test examples.
38 | """
39 | self.guid = guid
40 | self.words = words
41 | self.labels = labels
42 |
43 |
44 | class InputFeatures(object):
45 | """A single set of features of data."""
46 |
47 | def __init__(self, input_ids, input_mask, segment_ids, label_ids):
48 | self.input_ids = input_ids
49 | self.input_mask = input_mask
50 | self.segment_ids = segment_ids
51 | self.label_ids = label_ids
52 |
53 |
54 | def read_examples_from_file(file_path, mode):
55 | guid_index = 1
56 | examples = []
57 | with open(file_path, encoding="utf-8") as f:
58 | words = []
59 | labels = []
60 | for line in f:
61 | if line.startswith("-DOCSTART-") or line == "" or line == "\n":
62 | if words:
63 | examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
64 | words=words,
65 | labels=labels))
66 | guid_index += 1
67 | words = []
68 | labels = []
69 | else:
70 | splits = line.split('\t') # " "
71 | words.append(splits[0])
72 | if len(splits) > 1:
73 | labels.append(splits[-1].replace("\n", ""))
74 | else:
75 | # Examples could have no label for mode = "test"
76 | labels.append("O")
77 | if words:
78 |             examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
79 | words=words,
80 | labels=labels))
81 | return examples
82 |
83 |
84 | def convert_examples_to_features(examples,
85 | label_list,
86 | max_seq_length,
87 | tokenizer,
88 | cls_token_at_end=False,
89 | cls_token="[CLS]",
90 | cls_token_segment_id=1,
91 | sep_token="[SEP]",
92 | sep_token_extra=False,
93 | pad_on_left=False,
94 | pad_token=0,
95 | pad_token_segment_id=0,
96 | pad_token_label_id=-1,
97 | sequence_a_segment_id=0,
98 | mask_padding_with_zero=True):
99 | """ Loads a data file into a list of `InputBatch`s
100 | `cls_token_at_end` define the location of the CLS token:
101 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
102 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
103 | `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
104 | """
105 |
106 | label_map = {label: i for i, label in enumerate(label_list)}
107 |
108 | features = []
109 | for (ex_index, example) in enumerate(examples):
110 | if ex_index % 10000 == 0:
111 | logger.info("Writing example %d of %d", ex_index, len(examples))
112 |
113 | tokens = []
114 | label_ids = []
115 | for word, label in zip(example.words, example.labels):
116 | word_tokens = tokenizer.tokenize(word)
117 | tokens.extend(word_tokens)
118 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens
119 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
120 |
121 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
122 | special_tokens_count = 3 if sep_token_extra else 2
123 | if len(tokens) > max_seq_length - special_tokens_count:
124 | tokens = tokens[:(max_seq_length - special_tokens_count)]
125 | label_ids = label_ids[:(max_seq_length - special_tokens_count)]
126 |
127 | # The convention in BERT is:
128 | # (a) For sequence pairs:
129 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
130 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
131 | # (b) For single sequences:
132 | # tokens: [CLS] the dog is hairy . [SEP]
133 | # type_ids: 0 0 0 0 0 0 0
134 | #
135 | # Where "type_ids" are used to indicate whether this is the first
136 | # sequence or the second sequence. The embedding vectors for `type=0` and
137 | # `type=1` were learned during pre-training and are added to the wordpiece
138 | # embedding vector (and position vector). This is not *strictly* necessary
139 | # since the [SEP] token unambiguously separates the sequences, but it makes
140 | # it easier for the model to learn the concept of sequences.
141 | #
142 | # For classification tasks, the first vector (corresponding to [CLS]) is
143 |         # used as the "sentence vector". Note that this only makes sense because
144 | # the entire model is fine-tuned.
145 | tokens += [sep_token]
146 | label_ids += [pad_token_label_id]
147 | if sep_token_extra:
148 | # roberta uses an extra separator b/w pairs of sentences
149 | tokens += [sep_token]
150 | label_ids += [pad_token_label_id]
151 | segment_ids = [sequence_a_segment_id] * len(tokens)
152 |
153 | if cls_token_at_end:
154 | tokens += [cls_token]
155 | label_ids += [pad_token_label_id]
156 | segment_ids += [cls_token_segment_id]
157 | else:
158 | tokens = [cls_token] + tokens
159 | label_ids = [pad_token_label_id] + label_ids
160 | segment_ids = [cls_token_segment_id] + segment_ids
161 |
162 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
163 |
164 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
165 | # tokens are attended to.
166 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
167 |
168 | # Zero-pad up to the sequence length.
169 | padding_length = max_seq_length - len(input_ids)
170 | if pad_on_left:
171 | input_ids = ([pad_token] * padding_length) + input_ids
172 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
173 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
174 | label_ids = ([pad_token_label_id] * padding_length) + label_ids
175 | else:
176 | input_ids += ([pad_token] * padding_length)
177 | input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
178 | segment_ids += ([pad_token_segment_id] * padding_length)
179 | label_ids += ([pad_token_label_id] * padding_length)
180 |
181 | assert len(input_ids) == max_seq_length
182 | assert len(input_mask) == max_seq_length
183 | assert len(segment_ids) == max_seq_length
184 | assert len(label_ids) == max_seq_length
185 |
186 | if ex_index < 5:
187 | logger.info("*** Example ***")
188 | logger.info("guid: %s", example.guid)
189 | logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
190 | logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
191 | logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
192 | logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
193 | logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
194 |
195 | features.append(
196 | InputFeatures(input_ids=input_ids,
197 | input_mask=input_mask,
198 | segment_ids=segment_ids,
199 | label_ids=label_ids))
200 | return features
201 |
202 |
203 | def get_labels(path):
204 | if path:
205 | with open(path, "r") as f:
206 | labels = f.read().splitlines()
207 | if "O" not in labels:
208 | labels = ["O"] + labels
209 | return labels
210 | else:
211 | #return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
212 | #return ["O", "B-PROP", "I-PROP", 'E-PROP', 'U-PROP']
213 | return ["O", "B-PROP", "I-PROP"]
214 |
--------------------------------------------------------------------------------
/technique_classification/submission.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import numpy as np
3 | import pandas as pd
4 | from nltk.stem import PorterStemmer
5 | from nltk.tokenize import word_tokenize
6 | from collections import defaultdict
7 | from sklearn.utils.extmath import softmax
8 | from sklearn.metrics import accuracy_score, f1_score
9 | from nltk.corpus import stopwords
10 | import string
11 | import pickle
12 | import os
13 | from unidecode import unidecode
14 | from joblib import dump, load
15 |
16 |
17 | def get_insides(data):
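 |     """Count, over the training data, how often a span labeled `inner` is
 |     strictly nested inside a span labeled `outer` of the same article.
 |     Returns a dict of dicts: insides[inner][outer] -> count."""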
18 | insides = defaultdict(dict)
19 | spans_coords = list(zip(data['span_start'].values, data['span_end'].values))
20 | labels = data['label'].values
21 | article_ids = data['article_id'].values
22 | for i in range(len(spans_coords)):
23 | for j in range(i):
24 | if article_ids[i] == article_ids[j]:
25 | if spans_coords[i][0] >= spans_coords[j][0] and spans_coords[i][1] <= spans_coords[j][1]:
26 | if spans_coords[i][0] != spans_coords[j][0] or spans_coords[i][1] != spans_coords[j][1]:
27 | insides[labels[i]][labels[j]] = insides[labels[i]].get(labels[j], 0) + 1
28 | if spans_coords[j][0] >= spans_coords[i][0] and spans_coords[j][1] <= spans_coords[i][1]:
29 | if spans_coords[j][0] != spans_coords[i][0] or spans_coords[j][1] != spans_coords[i][1]:
30 | insides[labels[j]][labels[i]] = insides[labels[j]].get(labels[i], 0) + 1
31 | return insides
32 |
33 |
34 | def correct_preds_for_insides(preds, spans_coords, logits, insides, mapping, inverse_mapping):
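 |     """Make nested predictions consistent with nestings seen in training: for
 |     each pair of strictly nested spans whose label pair never co-occurred as
 |     nested in the training data, reassign the less confident prediction to its
 |     next-best class; restore the original pair of labels when the top-2
 |     probability ratio of the prediction to be changed exceeds 1.4."""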
35 | for i in range(len(preds)):
36 | for j in range(len(preds)):
37 | if spans_coords[j][0] >= spans_coords[i][0] and spans_coords[j][1] <= spans_coords[i][1]:
38 | if spans_coords[j][0] != spans_coords[i][0] or spans_coords[j][1] != spans_coords[i][1]:
39 | def_i = preds[i]
40 | def_j = preds[j]
41 | log = softmax([logits[i]])[0]
42 | login = softmax([logits[j]])[0]
43 | def_prob_i = log[inverse_mapping[preds[i]]]
44 | def_prob_j = login[inverse_mapping[preds[j]]]
45 | while preds[j] not in insides.get(preds[i], []):
46 | if log[inverse_mapping[preds[i]]] > login[inverse_mapping[preds[j]]]:
47 | values = np.sort(login)[-2:]
48 | if values[1] / (values[0] + 1e-6) > 1.4:
49 | preds[i] = def_i
50 | preds[j] = def_j
51 | break
52 | login[inverse_mapping[preds[j]]] = 0
53 | preds[j] = mapping[np.argmax(login)]
54 | else:
55 | values = np.sort(log)[-2:]
56 | if values[1] / (values[0] + 1e-6) > 1.4:
57 | preds[i] = def_i
58 | preds[j] = def_j
59 | break
60 | log[inverse_mapping[preds[i]]] = 0
61 | preds[i] = mapping[np.argmax(log)]
62 | return preds
63 |
64 |
65 | def stem_spans(spans):
66 | ps = PorterStemmer()
67 | res = []
68 | for el in spans:
69 | result = " ".join(ps.stem(word) for word in word_tokenize(el.lower()))
70 | if len(result) > 0:
71 | res.append(result)
72 | return res
73 |
74 |
75 | def get_train_instances(data, data_dir, save=True):
76 | train_instances = dict()
77 | stemmed_spans = stem_spans(data.span.values)
78 | labels = data.label.values
79 | for i in range(len(stemmed_spans)):
80 | if labels[i] != 'Repetition':
81 | span = stemmed_spans[i]
82 | train_instances.setdefault(span, set())
83 | train_instances[span].add(labels[i])
84 | if save:
85 | with open(os.path.join(data_dir, 'train_instances_train'), 'wb') as f:
86 | pickle.dump(train_instances, f)
87 | return train_instances
88 |
89 |
90 | def postprocess(x, mapping, inverse_mapping, insides, stop_words, ps, train_instances):
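 |     """Heuristic post-processing of the predictions of one article: force or
 |     block "Repetition" depending on how often the stemmed span text occurs in
 |     the article, boost labels that the same (stemmed) span had in the training
 |     data, map spans starting with '#' to "Slogans", zero out labels already
 |     assigned to an identical span, and finally reconcile nested spans via
 |     correct_preds_for_insides."""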
91 | spans_coords = list(zip(x['span_start'].values, x['span_end'].values))
92 | spans_source = x['span'].values
93 | spans_text = [' '.join([ps.stem(word) for word in word_tokenize(span.lower())]) for span in spans_source]
94 | spans = [' '.join([ps.stem(word) for word in word_tokenize(unidecode(span.lower()))
95 | if word not in stop_words and word not in string.punctuation]) for span in spans_source]
96 |
97 | counts = dict()
98 | for i in range(len(spans)):
99 | counts.setdefault(spans[i], set())
100 | counts[spans[i]].add(spans_coords[i][0])
101 | for el in counts:
102 | counts[el] = len(counts[el])
103 |
104 | preds = x['pred'].values
105 | logits = [np.array(log.split(), dtype=np.float32) for log in x['logits']]
106 | for i in range(len(preds)):
107 | log = logits[i]
108 |
109 | if counts[spans[i]] >= 3 or (counts[spans[i]] >= 2 and logits[i][inverse_mapping["Repetition"]] > 0.001):
110 | log[inverse_mapping["Repetition"]] = 100
111 |
112 | if counts[spans[i]] == 1 and (logits[i][inverse_mapping["Repetition"]] < 0.99 or len(spans[i].split()) <= 1):
113 | log[inverse_mapping["Repetition"]] = 0
114 |
115 | for prediction in train_instances.get(spans_text[i], set()):
116 | log[inverse_mapping[prediction]] += 0.5
117 | if spans_source[i].startswith('#'):
118 | log[inverse_mapping['Slogans']] = 20
119 |
120 |
121 | prev_same = []
122 | for j in range(i):
123 | if spans_coords[j][0] == spans_coords[i][0] and spans_coords[j][1] == spans_coords[i][1]:
124 | prev_same.append(j)
125 | if len(prev_same) > 0:
126 | for prediction in preds[prev_same]:
127 | log[inverse_mapping[prediction]] = 0
128 |
129 | logits[i] = log
130 | preds[i] = mapping[np.argmax(log)]
131 |
132 | x["pred"] = correct_preds_for_insides(preds, spans_coords, logits, insides, mapping, inverse_mapping)
133 | #x["pred"] = preds
134 | return x
135 |
136 |
137 | def postprocess_predictions(predictions_logits, data, insides, train_instances):
138 | mapping = {i: el for i, el in enumerate(
139 | ['Appeal_to_Authority', 'Doubt', 'Repetition', 'Appeal_to_fear-prejudice', 'Slogans', 'Black-and-White_Fallacy',
140 | 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling', 'Whataboutism,Straw_Men,Red_Herring',
141 | 'Causal_Oversimplification', 'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
142 | 'Thought-terminating_Cliches']
143 | )}
144 | inverse_mapping = {b: a for (a, b) in mapping.items()}
145 |
146 | stop_words = set(stopwords.words('english'))
147 | ps = PorterStemmer()
148 |
149 | predictions = np.argmax(predictions_logits, axis=1)
150 | data['pred'] = [mapping[p] for p in predictions]
151 | data['logits'] = [' '.join(np.array(log, dtype=str)) for log in predictions_logits]
152 | data = data.groupby('article_id', as_index=False).apply(postprocess, mapping, inverse_mapping, insides,
153 | stop_words, ps, train_instances)
154 | return np.array(data["pred"].values)
155 |
156 |
157 | def softmax_with_temperature(z, T):
158 | z = z / T
159 | max_z = np.max(z, axis=1).reshape(-1, 1)
160 | exp_z = np.exp(z - max_z)
161 | return exp_z / np.sum(exp_z, axis=1).reshape(-1, 1)
162 |
163 |
164 | def create_submission_file(predicted_logits_files, train_file_path, dev_file_path, test_file_path,
165 | article_ids, span_starts, span_ends, output_file, weights=None, data_dir=None, agg_model=None):
166 | data_train = pd.read_csv(train_file_path, sep='\t')
167 | data_eval = pd.read_csv(dev_file_path, sep='\t')
168 | #data_train = pd.concat([data_train, data_eval], ignore_index=True)
169 |
170 | insides = get_insides(data_train)
171 | train_instances = get_train_instances(data_train, data_dir)
172 |
173 | data = pd.read_csv(test_file_path, sep='\t')
174 |
175 | if weights is None:
176 | weights = [1. / len(predicted_logits_files) for _ in range(len(predicted_logits_files))]
177 | assert len(weights) == len(predicted_logits_files)
178 |
179 | predictions_logits = None
180 | predictions_logits_list = []
181 | for file, weight in zip(predicted_logits_files, weights):
182 | with open(file, 'rb') as f:
183 | logits = pickle.load(f)
184 | if predictions_logits is None:
185 | predictions_logits = float(weight) * softmax_with_temperature(logits, 1)
186 | else:
187 | predictions_logits += float(weight) * softmax_with_temperature(logits, 1)
188 | if agg_model is not None:
189 | predictions_logits_list.append(logits)
190 |
191 | predictions = postprocess_predictions(predictions_logits, data, insides, train_instances)
192 |
193 | if agg_model is not None:
194 | clf = load(agg_model)
195 | predictions_sklearn_agg = clf.predict(np.concatenate(predictions_logits_list, axis=1))
196 | predictions_sklearn_agg[predictions_sklearn_agg == 'Repetition'] = predictions[predictions_sklearn_agg == 'Repetition']
197 | predictions_sklearn_agg[predictions == 'Repetition'] = 'Repetition'
198 | predictions = predictions_sklearn_agg
199 |
200 | with open(output_file, "w") as fout:
201 | for article_id, prediction, span_start, span_end in zip(article_ids, predictions, span_starts, span_ends):
202 | fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))
203 |
204 |
205 | def load_result(file):
206 | result = defaultdict(dict)
207 | with open(file, "r") as f:
208 | for line in f:
209 | article_id, prediction, spl, spr = line.split('\t')
210 | result[article_id].setdefault(prediction, [])
211 | result[article_id][prediction].append([int(spl), int(spr)])
212 | return result
213 |
214 |
215 | def read_ground_truth(gt_file_path, label_names):
216 | ground_truth = []
217 | with open(gt_file_path, "r") as f:
218 | for line in f:
219 | gold_label = line.split('\t')[-1].strip()
220 | if gold_label in label_names:
221 | ground_truth.append(gold_label)
222 | return ground_truth
223 |
224 |
225 | def eval_submission(result_file_path, gt_file_path):
226 | predictions = []
227 | with open(result_file_path, "r") as f:
228 | for line in f:
229 | prediction = line.split('\t')[1].strip()
230 | predictions.append(prediction)
231 |
232 | label_names = sorted(['Appeal_to_Authority', 'Doubt', 'Repetition', 'Appeal_to_fear-prejudice', 'Slogans',
233 | 'Black-and-White_Fallacy', 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling',
234 | 'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification', 'Exaggeration,Minimisation',
235 | 'Bandwagon,Reductio_ad_hitlerum', 'Thought-terminating_Cliches'])
236 | ground_truth = read_ground_truth(gt_file_path, label_names)
237 |
238 | acc = accuracy_score(ground_truth, predictions)
239 | f1 = list(zip(label_names, f1_score(ground_truth, predictions, average=None, labels=label_names)))
240 | return acc, f1
241 |
--------------------------------------------------------------------------------
/span_identification/__main__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from .ner import transformers_ner_crf, transformers_ner
3 | from .dataset import load_data, get_train_dev_files, get_test_file, create_subfolder
4 | from .submission import get_submission_format
5 | except ImportError:
6 | from ner import transformers_ner_crf, transformers_ner
7 | from dataset import load_data, get_train_dev_files, get_test_file, create_subfolder
8 | from submission import get_submission_format
9 |
10 | import configargparse
11 | import spacy
12 | import logging
13 | import os
14 | import subprocess
15 | import tempfile
16 |
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
21 | def Main(args):
22 | nlp = spacy.load("en")
23 | if not os.path.exists(args.data_dir):
24 | os.makedirs(args.data_dir)
25 |
26 | if args.do_train or args.do_eval or args.split_dataset:
27 | articles_content, articles_id, propaganda_techniques_names = load_data(args.train_data_folder,
28 | args.propaganda_techniques_file)
29 | train_file_path = os.path.join(args.data_dir, args.train_file)
30 | dev_file_path = os.path.join(args.data_dir, args.dev_file)
31 | if not os.path.exists(train_file_path) or not os.path.exists(dev_file_path) or args.overwrite_cache:
32 | logger.info("Creating 'ner' train/dev files: %s, %s", train_file_path, dev_file_path)
33 | train_ids, dev_ids = get_train_dev_files(articles_id, articles_content, nlp, args.labels_path, train_file_path,
34 | dev_file_path, args.split_by_ids, args.dev_size, args.random_state)
35 | if args.split_dataset:
36 | create_subfolder(os.path.join(args.data_dir, 'train-train-articles'), args.train_data_folder, train_ids)
37 | create_subfolder(os.path.join(args.data_dir, 'train-dev-articles'), args.train_data_folder, dev_ids)
38 |
39 | if args.do_predict or args.create_submission_file or args.do_eval_spans:
40 | test_articles_content, test_articles_id, _ = load_data(args.test_data_folder, args.propaganda_techniques_file)
41 | test_file_path = os.path.join(args.data_dir, args.test_file)
42 | if (not os.path.exists(test_file_path) or args.overwrite_cache) and not args.do_eval_spans:
43 | logger.info("Creating 'ner' test file: %s", test_file_path)
44 | get_test_file(test_file_path, test_articles_id, test_articles_content, nlp)
45 |
46 | if args.do_train or args.do_eval or args.do_predict:
47 | if args.use_crf:
48 | transformers_ner_crf(args)
49 | else:
50 | transformers_ner(args)
51 |
52 | if args.do_eval_spans:
53 | logger.info("Evaluating file %s with competition metrics", args.output_file)
54 | output_file = os.path.join('results', args.output_file)
55 | get_submission_format(args.predicted_labels_files, test_articles_id, test_articles_content, nlp, output_file)
56 | if args.gold_annot_file is None:
57 | gold_annot_file = next(tempfile._get_candidate_names())
58 | get_submission_format([test_file_path], test_articles_id, test_articles_content, nlp, gold_annot_file)
59 | else:
60 | gold_annot_file = args.gold_annot_file
61 | cmd = "python tools/task-SI_scorer.py -s {} -r {}".format(output_file, gold_annot_file)
62 | subprocess.run(cmd, shell=True)
63 | if args.gold_annot_file is None:
64 | os.remove(gold_annot_file)
65 |
66 | if args.create_submission_file:
67 | if not os.path.exists('results'):
68 | os.makedirs('results')
69 | output_file = os.path.join('results', args.output_file)
70 | logger.info("Creating a submission file: %s", output_file)
71 | get_submission_format(args.predicted_labels_files, test_articles_id, test_articles_content, nlp, output_file)
72 |
73 |
74 | def main():
75 | parser = configargparse.ArgumentParser()
76 |
77 | parser.add_argument('--config', required=True, is_config_file=True, help='Config file path.')
78 | parser.add_argument("--train_data_folder", default=None, type=str, required=True,
79 | help="Source directory with the train articles.")
80 | parser.add_argument("--test_data_folder", default=None, type=str, required=True,
81 | help="Source directory with the test articles.")
82 | parser.add_argument("--propaganda_techniques_file", default=None, type=str, required=True,
83 | help="The file with propaganda techniques.")
84 | parser.add_argument("--labels_path", default=None, type=str, required=True,
85 | help="The file with train labels.")
86 | parser.add_argument("--data_dir", default=None, type=str, required=True,
87 | help="The directory for cached preprocessed data.")
88 | parser.add_argument("--train_file", default=None, type=str, required=True,
89 | help="The filename for cached preprocessed train data.")
90 | parser.add_argument("--dev_file", default=None, type=str, required=True,
91 | help="The filename for cached preprocessed dev data.")
92 | parser.add_argument("--test_file", default=None, type=str, required=True,
93 | help="The filename for cached preprocessed test data.")
94 | parser.add_argument("--predicted_labels_files", default=None, nargs='*', required=True,
95 | help="The predicted filenames of labels that will be used to form the final result")
96 | parser.add_argument("--output_file", default=None, type=str, required=True,
97 | help="The submission filename")
98 | parser.add_argument("--dev_size", default=0.3, type=float, help="Dev data size.")
99 | parser.add_argument("--split_dataset", action="store_true",
100 | help="Split the dataset into the train/dev parts")
101 | parser.add_argument("--split_by_ids", action="store_true",
102 | help="Use articles ids while splitting the dataset into the train/dev parts.")
103 | parser.add_argument("--create_submission_file", action="store_true",
104 |                         help="Creates a file in the submission (source) format.")
105 | parser.add_argument("--random_state", default=42, type=int, help='Random state for the dataset splitting.')
106 | parser.add_argument("--do_eval_spans", action="store_true",
107 | help="Whether to run eval on the dev set with the competition metrics.")
108 | parser.add_argument("--gold_annot_file", default=None, type=str, help="Gold annotation file.")
109 |
110 | parser.add_argument("--use_crf", action="store_true", help="Use Conditional Random Field over the model")
111 | parser.add_argument("--use_quotes", action="store_true")
112 |
113 | MODEL_CLASSES = ["bert", "roberta", "distilbert", "camembert"]
114 | parser.add_argument("--model_type", default=None, type=str, required=True,
115 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES))
116 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
117 | help="Path to pre-trained model or shortcut name")
118 | parser.add_argument("--output_dir", default=None, type=str, required=True,
119 | help="The output directory where the model predictions and checkpoints will be written.")
120 |
121 | parser.add_argument("--labels", default="", type=str,
122 | help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
123 | parser.add_argument("--config_name", default="", type=str,
124 | help="Pretrained config name or path if not the same as model_name")
125 | parser.add_argument("--tokenizer_name", default="", type=str,
126 | help="Pretrained tokenizer name or path if not the same as model_name")
127 | parser.add_argument("--cache_dir", default="", type=str,
128 | help="Where do you want to store the pre-trained models downloaded from s3")
129 | parser.add_argument("--max_seq_length", default=128, type=int,
130 | help="The maximum total input sequence length after tokenization. Sequences longer "
131 | "than this will be truncated, sequences shorter will be padded.")
132 | parser.add_argument("--do_train", action="store_true",
133 | help="Whether to run training.")
134 | parser.add_argument("--do_eval", action="store_true",
135 | help="Whether to run eval on the dev set.")
136 | parser.add_argument("--do_predict", action="store_true",
137 | help="Whether to run predictions on the test set.")
138 | parser.add_argument("--evaluate_during_training", action="store_true",
139 | help="Whether to run evaluation during training at each logging step.")
140 | parser.add_argument("--do_lower_case", action="store_true",
141 | help="Set this flag if you are using an uncased model.")
142 |
143 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
144 | help="Batch size per GPU/CPU for training.")
145 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
146 | help="Batch size per GPU/CPU for evaluation.")
147 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
148 | help="Number of updates steps to accumulate before performing a backward/update pass.")
149 | parser.add_argument("--learning_rate", default=5e-5, type=float,
150 | help="The initial learning rate for Adam.")
151 | parser.add_argument("--weight_decay", default=0.0, type=float,
152 | help="Weight decay if we apply some.")
153 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
154 | help="Epsilon for Adam optimizer.")
155 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
156 | help="Max gradient norm.")
157 | parser.add_argument("--num_train_epochs", default=3.0, type=float,
158 | help="Total number of training epochs to perform.")
159 | parser.add_argument("--max_steps", default=-1, type=int,
160 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
161 | parser.add_argument("--warmup_steps", default=0, type=int,
162 | help="Linear warmup over warmup_steps.")
163 |
164 | parser.add_argument("--logging_steps", type=int, default=50,
165 | help="Log every X updates steps.")
166 | parser.add_argument("--save_steps", type=int, default=50,
167 | help="Save checkpoint every X updates steps.")
168 | parser.add_argument("--eval_all_checkpoints", action="store_true",
169 |                         help="Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number")
170 | parser.add_argument("--no_cuda", action="store_true",
171 | help="Avoid using CUDA when available")
172 | parser.add_argument("--overwrite_output_dir", action="store_true",
173 | help="Overwrite the content of the output directory")
174 | parser.add_argument("--overwrite_cache", action="store_true",
175 | help="Overwrite the cached training and evaluation sets")
176 | parser.add_argument("--seed", type=int, default=42,
177 | help="random seed for initialization")
178 |
179 | parser.add_argument("--fp16", action="store_true",
180 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
181 | parser.add_argument("--fp16_opt_level", type=str, default="O1",
182 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
183 | "See details at https://nvidia.github.io/apex/amp.html")
184 | parser.add_argument("--local_rank", type=int, default=-1,
185 | help="For distributed training: local_rank")
186 | parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
187 | parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
188 | args = parser.parse_args()
189 |
190 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
191 | datefmt="%m/%d/%Y %H:%M:%S",
192 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
193 |
194 | Main(args)
195 |
196 |
197 | if __name__ == "__main__":
198 | main()
199 |
--------------------------------------------------------------------------------
/technique_classification/__main__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from .transformers_classifier import transformers_clf
3 | from .dataset import load_data, get_train_dev_files, get_test_file
4 | from .submission import create_submission_file, eval_submission
5 | except ImportError:
6 | from transformers_classifier import transformers_clf
7 | from dataset import load_data, get_train_dev_files, get_test_file
8 | from submission import create_submission_file, eval_submission
9 |
10 | import configargparse
11 | import logging
12 | import os
13 | import subprocess
14 |
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | def Main(args):
20 | if not os.path.exists(args.data_dir):
21 | os.makedirs(args.data_dir)
22 |
23 | if args.do_train or args.do_eval or args.split_dataset or args.create_submission_file:
24 | articles, ref_articles_id, ref_span_starts, ref_span_ends, labels = load_data(args.train_data_folder,
25 | args.labels_path)
26 | train_file_path = os.path.join(args.data_dir, args.train_file)
27 | dev_file_path = os.path.join(args.data_dir, args.dev_file)
28 | if not os.path.exists(train_file_path) or not os.path.exists(dev_file_path) or args.overwrite_cache:
29 | logger.info("Creating train/dev files: %s, %s", train_file_path, dev_file_path)
30 | get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, train_file_path,
31 | dev_file_path, args.split_by_ids, args.dev_size, args.random_state, args.balance,
32 | args.shuffle)
33 |
34 | if args.do_predict or args.create_submission_file or args.eval_submission:
35 | test_file_path = os.path.join(args.data_dir, args.test_file)
36 | test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels = load_data(args.test_data_folder,
37 | args.test_template_labels_path)
38 | if not os.path.exists(test_file_path) or args.overwrite_cache:
39 | logger.info("Creating roberta-type test file: %s", test_file_path)
40 | get_test_file(test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels, test_file_path)
41 |
42 | if args.do_train or args.do_eval or args.do_predict:
43 | transformers_clf(args)
44 |
45 | if args.create_submission_file:
46 | if not os.path.exists('results'):
47 | os.makedirs('results')
48 | output_file = os.path.join('results', args.output_file)
49 | logger.info("Creating the submission file: %s", output_file)
50 | create_submission_file(args.predicted_logits_files, train_file_path, dev_file_path, test_file_path,
51 | test_articles_id, test_span_starts, test_span_ends, output_file, args.weights, args.data_dir)
52 |
53 | if args.eval_submission:
54 | output_file = os.path.join('results', args.output_file)
55 | logger.info("Evaluating the submission file: %s", output_file)
56 | if args.test_labels_path is None:
57 | acc, f1 = eval_submission(output_file, test_file_path)
58 | logger.info('accuracy: %f', acc)
59 |             print('f1 per class:', f1)
60 | else:
61 | cmd = "python tools/task-TC_scorer.py -s {} -r {} -p {}".format(output_file, args.test_labels_path,
62 | args.propaganda_techniques_file)
63 | subprocess.run(cmd, shell=True)
64 |
65 |
66 | def main():
67 | parser = configargparse.ArgumentParser()
68 |
69 | parser.add_argument('--config', required=True, is_config_file=True, help='Config file path.')
70 | parser.add_argument("--train_data_folder", default=None, type=str, required=True,
71 | help="Source directory with the train articles.")
72 | parser.add_argument("--test_data_folder", default=None, type=str, required=True,
73 | help="Source directory with the test articles.")
74 | parser.add_argument("--propaganda_techniques_file", default=None, type=str, required=True,
75 | help="The file with propaganda techniques.")
76 | parser.add_argument("--labels_path", default=None, type=str, required=True,
77 | help="The file with train labels.")
78 | parser.add_argument("--test_template_labels_path", default=None, type=str, required=True,
79 | help="The file with test template labels.")
80 | parser.add_argument("--data_dir", default=None, type=str, required=True,
81 | help="The directory for cached preprocessed data.")
82 | parser.add_argument("--train_file", default=None, type=str, required=True,
83 | help="The filename for cached preprocessed train data.")
84 | parser.add_argument("--dev_file", default=None, type=str, required=True,
85 | help="The filename for cached preprocessed dev data.")
86 | parser.add_argument("--test_file", default=None, type=str, required=True,
87 | help="The filename for cached preprocessed test data.")
88 | parser.add_argument("--predicted_logits_files", default=None, nargs='*', required=True,
89 | help="The predicted filenames of logits that will be used to obtain the final result")
90 | parser.add_argument("--weights", default=None, nargs='*', required=False,
91 | help="The list of weights for predicted logits at the aggregation stage")
92 | parser.add_argument("--output_file", default=None, type=str, required=True,
93 | help="The submission filename")
94 | parser.add_argument("--dev_size", default=0.3, type=float, help="Dev data size.")
95 | parser.add_argument("--split_dataset", action="store_true",
96 | help="Split the dataset into the train/dev parts.")
97 | parser.add_argument("--split_by_ids", action="store_true",
98 | help="Use articles ids while splitting the dataset into the train/dev parts.")
99 | parser.add_argument("--random_state", default=42, type=int, help='Random state for the dataset splitting.')
100 | parser.add_argument("--shuffle", action="store_true", help="Shuffle the train dataset.")
101 | parser.add_argument("--balance", action="store_true", help="Balance the train dataset with oversampling.")
102 | parser.add_argument("--create_submission_file", action="store_true",
103 |                         help="Creates a file in the submission (source) format.")
104 |     parser.add_argument("--eval_submission", action="store_true", help="Run evaluation on the dev subset.")
105 |
106 | parser.add_argument('--use_length', action='store_true')
107 | parser.add_argument('--join_embeddings', action='store_true')
108 | parser.add_argument('--use_matchings', action='store_true')
109 |
110 | MODEL_CLASSES = ["bert", "roberta", "distilbert", "camembert"]
111 | parser.add_argument("--model_type", default=None, type=str, required=True,
112 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES))
113 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
114 | help="Path to pre-trained model or shortcut name.")
115 | parser.add_argument("--task_name", default=None, type=str, required=True,
116 | help="The name of the task to train.")
117 | parser.add_argument("--output_dir", default=None, type=str, required=True,
118 | help="The output directory where the model predictions and checkpoints will be written.")
119 | parser.add_argument("--test_labels_path", default=None, type=str, required=False)
120 |
121 | ## Other parameters
122 | parser.add_argument("--config_name", default="", type=str,
123 | help="Pretrained config name or path if not the same as model_name")
124 | parser.add_argument("--tokenizer_name", default="", type=str,
125 | help="Pretrained tokenizer name or path if not the same as model_name")
126 | parser.add_argument("--cache_dir", default="", type=str,
127 | help="Where do you want to store the pre-trained models downloaded from s3")
128 | parser.add_argument("--max_seq_length", default=128, type=int,
129 | help="The maximum total input sequence length after tokenization. Sequences longer "
130 | "than this will be truncated, sequences shorter will be padded.")
131 | parser.add_argument("--do_train", action='store_true',
132 | help="Whether to run training.")
133 | parser.add_argument("--do_eval", action='store_true',
134 | help="Whether to run eval on the dev set.")
135 | parser.add_argument("--do_predict", action='store_true',
136 | help="Whether to run prediction")
137 | parser.add_argument("--evaluate_during_training", action='store_true',
138 |                         help="Run evaluation during training at each logging step.")
139 | parser.add_argument("--do_lower_case", action='store_true',
140 | help="Set this flag if you are using an uncased model.")
141 |
142 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
143 | help="Batch size per GPU/CPU for training.")
144 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
145 | help="Batch size per GPU/CPU for evaluation.")
146 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
147 | help="Number of updates steps to accumulate before performing a backward/update pass.")
148 | parser.add_argument("--learning_rate", default=5e-5, type=float,
149 | help="The initial learning rate for Adam.")
150 | parser.add_argument("--weight_decay", default=0.0, type=float,
151 |                         help="Weight decay to apply, if any.")
152 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
153 | help="Epsilon for Adam optimizer.")
154 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
155 | help="Max gradient norm.")
156 | parser.add_argument("--num_train_epochs", default=3.0, type=float,
157 | help="Total number of training epochs to perform.")
158 | parser.add_argument("--max_steps", default=-1, type=int,
159 |                         help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
160 | parser.add_argument("--warmup_steps", default=0, type=int,
161 | help="Linear warmup over warmup_steps.")
162 |
163 | parser.add_argument('--logging_steps', type=int, default=50,
164 | help="Log every X updates steps.")
165 | parser.add_argument('--save_steps', type=int, default=50,
166 | help="Save checkpoint every X updates steps.")
167 | parser.add_argument("--eval_all_checkpoints", action='store_true',
168 |                         help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
169 | parser.add_argument("--no_cuda", action='store_true',
170 | help="Avoid using CUDA when available")
171 | parser.add_argument('--overwrite_output_dir', action='store_true',
172 | help="Overwrite the content of the output directory")
173 | parser.add_argument('--overwrite_cache', action='store_true',
174 | help="Overwrite the cached training and evaluation sets")
175 | parser.add_argument('--seed', type=int, default=42,
176 | help="random seed for initialization")
177 |
178 | parser.add_argument('--fp16', action='store_true',
179 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
180 | parser.add_argument('--fp16_opt_level', type=str, default='O1',
181 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
182 | "See details at https://nvidia.github.io/apex/amp.html")
183 | parser.add_argument("--local_rank", type=int, default=-1,
184 | help="For distributed training: local_rank")
185 |     parser.add_argument('--server_ip', type=str, default='', help="For remote debugging.")
186 |     parser.add_argument('--server_port', type=str, default='', help="For remote debugging.")
187 | args = parser.parse_args()
188 |
189 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
190 | datefmt="%m/%d/%Y %H:%M:%S",
191 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
192 |
193 | Main(args)
194 |
195 |
196 | if __name__ == "__main__":
197 | main()
198 |
--------------------------------------------------------------------------------
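A note on the block above: `--config` is declared with `is_config_file=True`, so every long option can equally be supplied through the config file passed on the command line (the repository ships `configs/si_config.yml` and `configs/tc_config.yml` for this purpose). A minimal sketch of what such a file could look like — the keys mirror the option names, but the values are illustrative placeholders, not the repository's actual settings, and the required options omitted here (e.g. `data_dir`, `train_file`, `predicted_logits_files`) would have to appear as well:

```yaml
# Hypothetical excerpt of a config in the style of configs/tc_config.yml.
# Adjust all paths to your local data layout.
train_data_folder: datasets/train-articles
test_data_folder: datasets/test-articles
propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt
labels_path: datasets/train-task2-TC.labels
output_file: results/TC_output.txt
model_type: roberta
model_name_or_path: roberta-large
task_name: propaganda            # assumed task identifier
output_dir: results/tc_model
max_seq_length: 128
per_gpu_train_batch_size: 8
learning_rate: 5e-5
num_train_epochs: 3.0
do_train: true                   # store_true flags accept booleans in configargparse
```

Any value set in the file can still be overridden on the command line, since configargparse gives command-line arguments precedence over config-file entries.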
/visualization_example/visualization/highlight.css:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tools/src/annotations.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Dict
3 | import sys
4 | import re
5 | import os.path
6 | import glob
7 | import logging.handlers
8 | from sklearn.metrics import f1_score
9 | from sklearn.metrics import precision_score
10 | from sklearn.metrics import recall_score
11 | import src.article_annotations as aa
12 | import src.annotation as an
13 |
14 | __author__ = "Giovanni Da San Martino"
15 | __copyright__ = "Copyright 2019"
16 | __credits__ = ["Giovanni Da San Martino"]
17 | __license__ = "GPL"
18 | __version__ = "0.1"
19 | __maintainer__ = "Giovanni Da San Martino"
20 | __email__ = "gmartino@hbku.edu.qa"
21 | __status__ = "Beta"
22 |
23 | logger = logging.getLogger("propaganda_scorer")
24 |
25 |
26 | class Annotations(object):
27 | """
28 | Dictionary of Articles_annotations objects.
29 | (basically a dataset of article_annotations objects)
30 |
31 | """
32 |
33 |     def __init__(self, annotations:Dict[str, aa.Articles_annotations]=None):
34 |
35 | if annotations is None:
36 | self.annotations:Dict[str, aa.Articles_annotations] = {}
37 | else:
38 | self.annotations = annotations
39 |
40 |
41 | def __len__(self):
42 | """
43 | Returns the number of articles in the object
44 | """
45 | return len(self.get_article_id_list())
46 |
47 |
48 | def add_annotation(self, annotation:an.Annotation, article_id:str):
49 | """
50 | Add a single annotation to the article with id article_id.
51 |         If no such article exists, its Articles_annotations object is created first.
52 | """
53 | if not self.has_article(article_id):
54 | self.create_article_annotations_object(article_id)
55 | self.annotations[article_id].add_annotation(annotation)
56 |
57 |
58 | def check_annotation_spans_with_category_matching(self, merge_overlapping_spans:bool=False):
59 | """
60 | Check whether there are overlapping spans for the same technique in the same article.
61 |         Two spans count as overlapping only if their offsets intersect and their associated techniques match (according to category_matching_func).
62 | If merge_overlapping_spans==True then the overlapping spans are merged, otherwise an error is raised.
63 |
64 | :param merge_overlapping_spans: if True merges the overlapping spans
65 | :return:
66 | """
67 |
68 | for article_id in self.get_article_id_list():
69 |
70 | annotation_list = self.get_article_annotations_obj(article_id).groupby_technique()
71 | if merge_overlapping_spans:
72 | for technique in annotation_list.keys():
73 | for i in range(1, len(annotation_list[technique])):
74 | annotation_list[technique][i].merge_spans(annotation_list[technique], i-1)
75 | if not self.get_article_annotations_obj(article_id):
76 | return False
77 | # annotation_list = {}
78 | # for annotation in self.annotations.get_article_annotations(article_id):
79 | # technique = annotation.get_label()
80 | # if technique not in annotation_list.keys():
81 | # annotation_list[technique] = [[technique, curr_span]]
82 | # else:
83 | # if merge_overlapping_spans:
84 | # annotation_list[technique].append([technique, curr_span])
85 | # merge_spans(annotation_list[technique], len(annotation_list[technique]) - 1)
86 | # else:
87 | # for matching_technique, span in annotation_list[technique]:
88 | # if len(curr_span.intersection(span)) > 0:
89 | # logger.error("In article %s, the span of the annotation %s, [%s,%s] overlap with "
90 | # "the following one from the same article:%s, [%s,%s]" % (
91 | # article_id, matching_technique,
92 | # min(span), max(span), technique, min(curr_span), max(curr_span)))
93 | # return False
94 | # annotation_list[technique].append([technique, curr_span])
95 | # if merge_overlapping_spans:
96 | # annotations[article_id] = []
97 | # for technique in annotation_list.keys():
98 | # annotations[article_id] += annotation_list[technique]
99 | return True
100 |
101 |
102 | def compare_annotations_identical_article_lists(self, second_annotations:Annotations):
103 | """
104 |         Check whether self and second_annotations have identical article id lists.
105 | :return: True if the lists are identical and False otherwise.
106 | """
107 |         # check that the number of articles in self and second_annotations is the same
108 | if len(self.get_article_id_list()) != len(second_annotations.get_article_id_list()):
109 | logger.error("The number of articles in the annotations is different: %d, %d"
110 | % (len(self.get_article_id_list()), len(second_annotations.get_article_id_list())))
111 | return False
112 | diff = set(self.get_article_id_list()).difference(set(second_annotations.get_article_id_list()))
113 | if len(diff) > 0:
114 | logger.error("The two lists of article ids differ: %s"%(diff))
115 | return False
116 |
117 | logger.debug("OK: the list of article ids in the two sets of annotations is identical")
118 | return True
119 |
120 |
121 | def compare_annotations_identical(self, second_annotations:Annotations)->bool:
122 | """
123 |         Check whether self and second_annotations have identical annotations (without considering the technique labels).
124 | :return: True if the lists are identical and False otherwise.
125 | """
126 | for article_id in self.get_article_id_list():
127 | an1_article_annotations = self.get_article_annotations_list(article_id)
128 | an2_article_annotations = second_annotations.get_article_annotations_list(article_id)
129 | if len(an1_article_annotations) != len(an2_article_annotations):
130 | logger.error("The number of annotations for article %s differs: %d vs %d"%(article_id, len(an1_article_annotations), len(an2_article_annotations)))
131 | return False
132 | for an1, an2 in zip(an1_article_annotations, an2_article_annotations):
133 | if not an1.is_span_equal_to(an2):
134 | logger.error("The spans of the annotations of article %s do not match: [%s, %s] vs [%s, %s]"%(article_id, an1.get_start_offset(), an1.get_end_offset(), an2.get_start_offset(), an2.get_end_offset()))
135 | return False
136 | return True
137 |
138 |
139 | # def compute_SI_score(self, second_annotations:anwol.AnnotationWithOutLabel):
140 | # def compute_score_pr(submission_annotations, gold_annotations, technique_names, prop_vs_non_propaganda=False,
141 | # per_article_evaluation=False):
142 | # pass
143 | # prec_denominator = sum([len(annotations) for annotations in submission_annotations.values()])
144 | # rec_denominator = sum([len(annotations) for annotations in gold_annotations.values()])
145 | # technique_Spr_prec = {propaganda_technique: 0 for propaganda_technique in technique_names}
146 | # technique_Spr_rec = {propaganda_technique: 0 for propaganda_technique in technique_names}
147 | # cumulative_Spr_prec, cumulative_Spr_rec = (0, 0)
148 | # f1_articles = []
149 |
150 | # for article_id in submission_annotations.keys():
151 | # gold_data = gold_annotations[article_id]
152 | # logger.debug("Computing contribution to the score of article id %s\nand tuples %s\n%s\n"
153 | # % (article_id, str(submission_annotations[article_id]), str(gold_data)))
154 |
155 | # article_cumulative_Spr_prec, article_cumulative_Spr_rec = (0, 0)
156 | # for j, sd in enumerate(submission_annotations[article_id]): #submission annotations for article article_id:
157 | # s=""
158 | # sd_annotation_length = len(sd[1])
159 | # for i, gd in enumerate(gold_data):
160 | # if prop_vs_non_propaganda or gd[0]==sd[0]:
161 | # #s += "\tmatch %s %s-%s - %s %s-%s"%(sd[0],sd[1], sd[2], gd[0], gd[1], gd[2])
162 | # intersection = len(sd[1].intersection(gd[1]))
163 | # gd_annotation_length = len(gd[1])
164 | # Spr_prec = intersection/sd_annotation_length
165 | # article_cumulative_Spr_prec += Spr_prec
166 | # cumulative_Spr_prec += Spr_prec
167 | # s += "\tmatch %s %s-%s - %s %s-%s: S(p,r)=|intersect(r, p)|/|p| = %d/%d = %f (cumulative S(p,r)=%f)\n"\
168 | # %(sd[0],min(sd[1]), max(sd[1]), gd[0], min(gd[1]), max(gd[1]), intersection, sd_annotation_length, Spr_prec, cumulative_Spr_prec)
169 | # technique_Spr_prec[gd[0]] += Spr_prec
170 |
171 | # Spr_rec = intersection/gd_annotation_length
172 | # article_cumulative_Spr_rec += Spr_rec
173 | # cumulative_Spr_rec += Spr_rec
174 | # s += "\tmatch %s %s-%s - %s %s-%s: S(p,r)=|intersect(r, p)|/|r| = %d/%d = %f (cumulative S(p,r)=%f)\n"\
175 | # %(sd[0],min(sd[1]), max(sd[1]), gd[0], min(gd[1]), max(gd[1]), intersection, gd_annotation_length, Spr_rec, cumulative_Spr_rec)
176 | # technique_Spr_rec[gd[0]] += Spr_rec
177 | # logger.debug("\n%s"%(s))
178 |
179 | # p_article, r_article, f1_article =compute_prec_rec_f1(article_cumulative_Spr_prec,
180 | # len(submission_annotations[article_id]),
181 | # article_cumulative_Spr_rec,
182 | # len(gold_annotations[article_id]), False)
183 | # f1_articles.append(f1_article)
184 |
185 | # p,r,f1 = compute_prec_rec_f1(cumulative_Spr_prec, prec_denominator, cumulative_Spr_rec, rec_denominator)
186 |
187 | # if not prop_vs_non_propaganda:
188 | # for technique_name in technique_Spr_prec.keys():
189 | # prec_tech, rec_tech, f1_tech = compute_prec_rec_f1(technique_Spr_prec[technique_name],
190 | # compute_technique_frequency(submission_annotations.values(), technique_name),
191 | # technique_Spr_prec[technique_name],
192 | # compute_technique_frequency(gold_annotations.values(), technique_name), False)
193 | # logger.info("%s: P=%f R=%f F1=%f" % (technique_name, prec_tech, rec_tech, f1_tech))
194 |
195 | # if per_article_evaluation:
196 | # logger.info("Per article evaluation F1=%s"%(",".join([ str(f1_value) for f1_value in f1_articles])))
197 |
198 | # return f1
199 |
200 |
201 | def align_annotations(self, second_annotations:Annotations)->None:
202 | """
203 | Reorder all annotations such that the matching between annotations' labels
204 | and the ones from second_annotations is maximised.
205 | """
206 | for article_id in second_annotations.get_article_id_list():
207 | self.get_article_annotations_obj(article_id).align_annotations(second_annotations.get_article_annotations_obj(article_id))
208 |
209 |
210 | def compute_TC_score(self, second_annotations:Annotations):
211 | """
212 | second_annotations: gold labels
213 | """
214 |
215 | self.align_annotations(second_annotations)
216 | gold_labels = [ x.get_label() for x in second_annotations.get_full_list_of_annotations() ]
217 | submission_labels = [ x.get_label() for x in self.get_full_list_of_annotations() ]
218 |
219 | precision = precision_score(gold_labels, submission_labels, pos_label=None, average='micro')
220 | recall = recall_score(gold_labels, submission_labels, pos_label=None, average='micro')
221 | f1 = f1_score(gold_labels, submission_labels, pos_label=None, average='micro')
222 | if an.Annotation.propaganda_techniques is not None:
223 | propaganda_techniques_list = an.Annotation.propaganda_techniques.get_propaganda_techniques_list_sorted()
224 | f1_per_class = f1_score(gold_labels, submission_labels, average=None, labels=propaganda_techniques_list)
225 | return precision, recall, f1, f1_per_class
226 | return precision, recall, f1
227 |
228 |
229 | def create_article_annotations_object(self, article_id:str)->None:
230 | self.annotations[article_id] = aa.Articles_annotations(article_id=article_id)
231 |
232 |
233 | def TC_score_to_string(self, second_annotation:Annotations, output_for_script=False):
234 |
235 |         if an.Annotation.propaganda_techniques is None:  # TODO: raise an error here instead
236 | precision, recall, f1 = self.compute_TC_score(second_annotation)
237 | res = "\nPrecision=%f\nRecall=%f\nF1=%f\n"%(precision, recall, f1)
238 | else:
239 | precision, recall, f1, f1_per_class = self.compute_TC_score(second_annotation)
240 |             res_for_screen = "\nF1=%f\nPrecision=%f\nRecall=%f\n%s\n" % (f1, precision, recall, "\n".join([ "F1_"+pr+"="+str(f) for pr, f in zip(an.Annotation.propaganda_techniques.get_propaganda_techniques_list(), f1_per_class)]))
241 | if output_for_script:
242 | res_for_script = "%f\t%f\t%f\t"%(f1, precision, recall)
243 | res_for_script += "\t".join([ str(x) for x in f1_per_class])
244 | else:
245 | res_for_script = ""
246 | return res_for_screen, res_for_script
247 |
248 |
249 | def get_full_list_of_annotations(self):
250 | full_list = []
251 | for article_id in self.get_article_id_list():
252 |             for annotation in self.get_article_annotations_list(article_id):
253 |                 full_list.append(annotation)
254 | return full_list
255 |
256 |
257 | def has_article(self, article_id:str)->bool:
258 | """
259 | Check whether article_id is in the list of articles whose annotations are in the object.
260 | """
261 | return article_id in self.get_article_id_list()
262 |
263 |
264 | def get_article_id_list(self):
265 | """
266 | All ids of the article in the object
267 | """
268 | return self.annotations.keys()
269 |
270 |
271 | def get_article_annotations_obj(self, article_id:str):
272 | """
273 | Returns all annotations of an article as an Article_annotations object.
274 | """
275 | return self.annotations[article_id]
276 |
277 |
278 | def get_article_annotations_list(self, article_id:str):
279 | """
280 | Returns all annotations of an article as a list of Annotation objects.
281 | """
282 | return self.annotations[article_id].get_article_annotations()
283 |
284 |
285 | def _guess_article_id_from_file_name(self, filename:str)->str:
286 |
287 | regex = re.compile("article([0-9]+).*")
288 | article_id = regex.match(os.path.basename(filename)).group(1)
289 | return article_id
290 |
291 |
292 | def load_annotation_list_from_file(self, filename):
293 | """
294 |         Loads all annotations from file filename. The file is supposed to contain annotations for multiple articles. To load annotations for a single article, use the function with the same name from module src.article_annotations.
295 | Each annotation is checked according to check_format_of_annotation_in_file()
296 | """
297 | with open(filename, "r") as f:
298 | for i, line in enumerate(f.readlines(), 1):
299 | ann, article_id = an.Annotation.load_annotation_from_string(line.rstrip(), i, filename)
300 | ann.check_format_of_annotation_in_file()
301 | self.add_annotation(ann, article_id)
302 |
303 |
304 | def load_annotation_list_from_folder(self, folder_name, pattern="*.labels"):
305 | """
306 |         Loads all annotations from all files in folder folder_name.
307 |         Files in the folder are selected according to pattern.
308 | """
309 | if not os.path.exists(folder_name):
310 |             logger.error("trying to load annotations from folder %s, which does not exist"%(folder_name))
311 | return False
312 | if not os.path.isdir(folder_name):
313 | logger.error("trying to load annotations from folder %s, which does not appear to be a valid folder"%(folder_name))
314 | return False
315 | file_list = glob.glob(os.path.join(folder_name, pattern))
316 | if len(file_list) == 0:
317 | logger.error("Cannot load file list %s/%s"%(folder_name, pattern))
318 | sys.exit()
319 | for filename in file_list:
320 | self.create_article_annotations_object(self._guess_article_id_from_file_name(filename))
321 | self.load_annotation_list_from_file(filename)
322 | return True
323 |
324 | # def compute_technique_frequency(annotations_list, technique_name):
325 | # return sum([len([example_annotation for example_annotation in x if example_annotation[0] == technique_name])
326 | # for x in self.a])
327 |
328 |
329 | # def print_annotations(annotation_list):
330 | # s = ""
331 | # i=0
332 | # for technique, span in annotation_list:
333 | # s += "%d) %s: %d - %d\n"%(i, technique, min(span), max(span))
334 | # i += 1
335 | # return s
336 |
--------------------------------------------------------------------------------
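Taken together, the class above supports a simple TC scoring workflow: load a submission and the gold labels, optionally register the technique list so per-class F1 is computed, then call compute_TC_score (which first aligns the two annotation sets). A minimal usage sketch, assuming it is run from the tools/ directory so the src/ package resolves as in the imports above; the Propaganda_Techniques constructor signature and the chosen paths are assumptions for illustration:

```python
# Hypothetical driver script; only the Annotations API is taken from the module above.
import src.annotation as an
from src.annotations import Annotations
from src.propaganda_techniques import Propaganda_Techniques  # constructor assumed

# Optional: with this class attribute set, compute_TC_score() also returns per-class F1.
an.Annotation.propaganda_techniques = Propaganda_Techniques(
    "data/propaganda-techniques-names-semeval2020task11.txt")

submission = Annotations()
submission.load_annotation_list_from_file("data/submission-task-TC.tsv")

gold = Annotations()
gold.load_annotation_list_from_file("data/article736757214.labels-task-TC")

# Sanity-check the two sets cover the same articles before scoring.
if submission.compare_annotations_identical_article_lists(gold):
    precision, recall, f1, f1_per_class = submission.compute_TC_score(gold)
    print("P=%f R=%f F1=%f" % (precision, recall, f1))
```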
/results/SI_output.txt:
--------------------------------------------------------------------------------
1 | 111111114 1705 1824
2 | 111111117 671 753
3 | 111111131 84 97
4 | 111111131 102 109
5 | 111111131 180 190
6 | 111111131 207 214
7 | 111111131 326 336
8 | 111111131 352 365
9 | 111111131 382 395
10 | 111111131 398 413
11 | 111111131 723 731
12 | 111111131 804 811
13 | 111111131 823 865
14 | 111111131 1030 1068
15 | 111111131 1977 1992
16 | 111111131 2660 2671
17 | 111111131 2728 2739
18 | 111111131 2897 2908
19 | 111111131 2912 2924
20 | 111111131 2952 2997
21 | 111111131 3396 3416
22 | 111111131 3437 3455
23 | 111111131 3738 3748
24 | 111111131 4008 4014
25 | 111111131 4030 4038
26 | 111111131 4152 4169
27 | 111111131 4224 4230
28 | 111111131 4256 4264
29 | 111111131 4339 4352
30 | 111111131 4531 4546
31 | 111111131 4635 4643
32 | 111111131 4752 4768
33 | 111111131 4882 4899
34 | 111111131 5174 5186
35 | 111111131 5244 5262
36 | 111111131 5281 5289
37 | 111111131 5368 5402
38 | 111111131 5904 5916
39 | 111111131 5938 5950
40 | 111111131 6920 6937
41 | 111111131 6957 6971
42 | 111111131 7314 7324
43 | 111111131 7700 7708
44 | 111111137 143 183
45 | 111111137 2058 2064
46 | 111111137 2320 2333
47 | 696694316 603 661
48 | 696694316 1020 1094
49 | 696694316 3276 3379
50 | 696694316 3471 3608
51 | 696694316 3610 4009
52 | 696694316 4376 4395
53 | 696694316 4423 4440
54 | 696694316 4478 4500
55 | 696694316 7026 7097
56 | 696694316 7631 7780
57 | 696694316 7971 8295
58 | 696694316 8298 8640
59 | 696694316 9373 9584
60 | 696694316 9586 9819
61 | 696694316 10661 10866
62 | 696694316 11102 11126
63 | 696694316 12634 12736
64 | 696694316 12738 12848
65 | 696694316 13262 13316
66 | 696694316 13456 13555
67 | 697444415 512 539
68 | 697444415 1471 1596
69 | 697444415 2815 2860
70 | 697444415 2959 3022
71 | 698018235 305 335
72 | 698018235 555 581
73 | 698018235 641 652
74 | 698018235 657 687
75 | 698018235 975 1047
76 | 698018235 1482 1493
77 | 698018235 1658 1881
78 | 698018235 1910 1919
79 | 698018235 2132 2410
80 | 698018235 2431 2460
81 | 698018235 2723 2730
82 | 698018235 2917 3000
83 | 698018235 3283 3380
84 | 698018235 3514 3649
85 | 698719689 131 144
86 | 698719689 190 321
87 | 698719689 1440 1449
88 | 698719689 1542 1611
89 | 698719689 2324 2335
90 | 700461600 717 760
91 | 700461600 1547 1745
92 | 700461600 2318 2496
93 | 700461600 2726 2787
94 | 700461600 2805 2868
95 | 700461600 3073 3255
96 | 700461600 3548 3559
97 | 700461600 3963 4080
98 | 700461600 4209 4426
99 | 700461600 4506 4530
100 | 700461600 4657 4709
101 | 701225819 77 99
102 | 701225819 111 143
103 | 701225819 177 187
104 | 701225819 305 313
105 | 701225819 996 1017
106 | 701225819 1201 1299
107 | 701225819 1493 1603
108 | 701225819 1767 1771
109 | 701225819 1872 1889
110 | 701225819 1967 1986
111 | 701225819 2174 2182
112 | 701225819 2616 2621
113 | 701225819 2689 2694
114 | 701225819 2747 2752
115 | 701225819 2787 2792
116 | 701225819 2982 2991
117 | 701225819 3309 3315
118 | 701225819 3523 3541
119 | 701225819 3787 3803
120 | 701225819 3837 3860
121 | 701225819 4268 4313
122 | 701225819 4606 4636
123 | 701225819 4720 4737
124 | 701225819 5914 5927
125 | 701225819 6102 6112
126 | 701553469 31 41
127 | 701553469 77 143
128 | 701553469 205 209
129 | 701553469 288 302
130 | 701553469 351 361
131 | 701553469 1716 1744
132 | 701553469 1864 1887
133 | 701553469 1983 2143
134 | 701553469 2486 2651
135 | 701553469 2666 2781
136 | 701553469 3139 3159
137 | 701553469 3455 3476
138 | 701553469 3695 3715
139 | 701553469 3898 3916
140 | 701553469 4175 4280
141 | 701837665 761 801
142 | 701837665 803 833
143 | 701837665 1364 1449
144 | 701837665 1806 2008
145 | 701837665 2456 2470
146 | 701837665 2534 2575
147 | 701837665 2618 2726
148 | 701837665 2740 2762
149 | 701837665 2792 2940
150 | 701837665 2942 2991
151 | 701837665 3251 3423
152 | 701837665 3510 3534
153 | 701837665 3824 3883
154 | 701837665 3958 4067
155 | 701837665 5040 5084
156 | 701837665 5821 5840
157 | 701837665 6697 6873
158 | 701837665 7193 7357
159 | 701837665 7401 7455
160 | 701837665 7529 7666
161 | 701837665 7727 7746
162 | 701837665 7785 7841
163 | 701837665 8115 8155
164 | 701837665 8276 8312
165 | 701837665 8453 8556
166 | 701837665 9299 9329
167 | 701837665 9331 9371
168 | 701837665 9425 9433
169 | 701837665 9471 9500
170 | 701837665 9741 9756
171 | 701837665 9780 9814
172 | 701837665 10090 10206
173 | 701837665 10208 10227
174 | 701837665 10802 10972
175 | 701837665 11844 11915
176 | 701837665 11964 12007
177 | 701837665 12031 12047
178 | 701837665 12178 12197
179 | 701837665 12381 12476
180 | 701837665 12594 12665
181 | 703821117 114 174
182 | 703821117 179 236
183 | 703821117 472 532
184 | 703821117 833 880
185 | 703821117 2350 2366
186 | 703821117 2475 2523
187 | 703821117 3368 3383
188 | 703821117 3483 3500
189 | 703821117 3519 3571
190 | 703821117 3893 3974
191 | 703821117 5022 5095
192 | 703821117 5281 5373
193 | 703821117 6046 6098
194 | 703821117 6445 6453
195 | 703821117 6520 6578
196 | 703821117 6580 6609
197 | 703821117 6620 6637
198 | 703821117 7264 7314
199 | 703821117 7674 7693
200 | 703821117 10469 10570
201 | 703821117 10572 10680
202 | 703821117 10697 10805
203 | 703821117 10820 10845
204 | 703821117 10860 10995
205 | 703821117 11087 11097
206 | 703821117 11099 11105
207 | 703821117 11221 11328
208 | 703821117 11655 11671
209 | 703821117 12003 12039
210 | 703821117 12149 12242
211 | 703821117 12349 12363
212 | 703821117 13152 13166
213 | 703821117 13316 13369
214 | 703821117 13682 13691
215 | 703821117 13902 13958
216 | 703821117 13960 14030
217 | 703821117 14142 14158
218 | 703821117 14419 14458
219 | 703821117 14511 14640
220 | 703821117 14653 14658
221 | 703821117 14752 14920
222 | 703821117 15047 15069
223 | 703821117 15216 15279
224 | 703821117 15785 15851
225 | 703821117 16195 16231
226 | 703821117 16335 16411
227 | 703821117 16781 16835
228 | 703821117 16976 17013
229 | 703821117 17016 17047
230 | 703821117 17150 17244
231 | 703821117 17552 17689
232 | 703821117 18269 18323
233 | 703821117 18378 18396
234 | 703821117 18447 18550
235 | 703821117 18604 18639
236 | 704591553 71 103
237 | 704591553 238 265
238 | 704591553 278 285
239 | 704591553 697 822
240 | 704591553 933 954
241 | 704591553 1016 1029
242 | 704591553 1125 1141
243 | 704591553 1280 1292
244 | 704591553 1696 1718
245 | 704591553 1805 1832
246 | 704591553 1868 1883
247 | 704591553 1999 2059
248 | 704591553 2180 2228
249 | 704591553 2765 2780
250 | 704591553 2802 2816
251 | 704591553 3153 3163
252 | 704591553 3221 3359
253 | 704591553 3827 3881
254 | 704591553 3883 3911
255 | 704591553 4058 4124
256 | 704591553 4398 4405
257 | 704591553 4454 4480
258 | 704591553 4493 4508
259 | 704591553 4965 5082
260 | 704856340 4007 4185
261 | 704856340 4187 4324
262 | 706636401 992 1001
263 | 706636401 2911 2939
264 | 706636401 3353 3362
265 | 706636401 3724 3747
266 | 706636401 3802 3966
267 | 709732928 12 21
268 | 709732928 160 172
269 | 709732928 251 259
270 | 709732928 1428 1432
271 | 709732928 1811 1820
272 | 709732928 1957 1966
273 | 709732928 2169 2173
274 | 709732928 2177 2184
275 | 709732928 2583 2587
276 | 709732928 3682 3689
277 | 709732928 3732 3742
278 | 709732928 6464 6474
279 | 709732928 7346 7359
280 | 709732928 7579 7590
281 | 709732928 8158 8168
282 | 709732928 8491 8498
283 | 709732928 8521 8530
284 | 709732928 8532 8546
285 | 709732928 8613 8616
286 | 709732928 9458 9465
287 | 709732928 10416 10440
288 | 709732928 10525 10552
289 | 709732928 10660 10675
290 | 709732928 10768 10775
291 | 709732928 10826 10833
292 | 709732928 10865 10882
293 | 709732928 11982 11997
294 | 710100700 1203 1375
295 | 711596363 13 32
296 | 711596363 258 277
297 | 711596363 1194 1252
298 | 711596363 1408 1421
299 | 711596363 1944 1954
300 | 711596363 3065 3083
301 | 711596363 3136 3154
302 | 711596363 3173 3189
303 | 711596363 3277 3285
304 | 711596363 3700 3870
305 | 711596363 3894 4008
306 | 711596363 4274 4281
307 | 711596363 4373 4389
308 | 711596363 4573 4624
309 | 711596363 4626 4664
310 | 711596363 4738 4767
311 | 711596363 4985 5098
312 | 711596363 5391 5412
313 | 711596363 5627 5645
314 | 711596363 5647 5678
315 | 711596363 5745 5779
316 | 711622457 457 573
317 | 711622457 575 597
318 | 711622457 616 732
319 | 711622457 734 756
320 | 711622457 813 847
321 | 711622457 934 997
322 | 711622457 1095 1250
323 | 711622457 1329 1355
324 | 711622457 1882 1967
325 | 711622457 2008 2120
326 | 711622457 2409 2697
327 | 711622457 2754 2812
328 | 711622457 3303 3458
329 | 711716996 30 85
330 | 711716996 298 304
331 | 711716996 724 862
332 | 711716996 957 992
333 | 711716996 1444 1453
334 | 711716996 1600 1632
335 | 711716996 1936 2064
336 | 711716996 2231 2268
337 | 711716996 2308 2314
338 | 711716996 2320 2345
339 | 711716996 2760 2843
340 | 711716996 2910 2941
341 | 711716996 3978 3992
342 | 711716996 4002 4025
343 | 711716996 4068 4082
344 | 711716996 4563 4569
345 | 711716996 4959 4973
346 | 711716996 5284 5298
347 | 715588833 0 17
348 | 715588833 412 498
349 | 715588833 624 679
350 | 715588833 1046 1074
351 | 715588833 1753 1770
352 | 715588833 2061 2160
353 | 715588833 2437 2464
354 | 715588833 2622 2773
355 | 715588833 7098 7129
356 | 715588833 7155 7185
357 | 715588833 7638 7736
358 | 715588833 7829 7855
359 | 715588833 7857 7909
360 | 715588833 8479 8546
361 | 715588833 8548 8641
362 | 715588833 8643 8940
363 | 715588833 9232 9268
364 | 715588833 9456 9504
365 | 715588833 9836 10047
366 | 715588833 10678 10918
367 | 715588833 11273 11388
368 | 715588833 11390 11426
369 | 715588833 11575 11832
370 | 715588833 11839 11843
371 | 715588833 11846 11970
372 | 715588833 11972 12085
373 | 722507879 1369 1391
374 | 722507879 2307 2350
375 | 722507879 2356 2413
376 | 722507879 2432 2516
377 | 722507879 3019 3045
378 | 722507879 3937 3948
379 | 722507879 4053 4079
380 | 723793978 1106 1256
381 | 723793978 1418 1430
382 | 727493378 493 502
383 | 727493378 563 822
384 | 727493378 1272 1285
385 | 727493378 1768 1866
386 | 727493378 1943 1995
387 | 727493378 1997 2265
388 | 727493378 3072 3149
389 | 727736557 85 94
390 | 727736557 186 212
391 | 727736557 305 328
392 | 727736557 650 669
393 | 727736557 983 1001
394 | 727736557 1203 1347
395 | 727736557 1761 1770
396 | 727736557 1819 1858
397 | 727736557 2226 2239
398 | 727736557 2351 2382
399 | 727736557 2429 2447
400 | 727736557 2840 2884
401 | 727736557 4017 4068
402 | 727736557 4511 4544
403 | 727736557 4574 4602
404 | 727736557 4715 4742
405 | 727736557 5073 5132
406 | 728169864 0 8
407 | 728169864 423 440
408 | 728169864 1628 1641
409 | 728169864 2632 2640
410 | 728169864 2644 2651
411 | 728169864 2655 2662
412 | 728169864 2666 2674
413 | 728169864 2678 2684
414 | 728169864 2688 2694
415 | 728169864 2698 2712
416 | 728169864 2720 2744
417 | 728169864 2747 2878
418 | 728169864 3161 3184
419 | 728169864 5313 5330
420 | 728169864 5517 5525
421 | 728169864 5753 5772
422 | 728169864 6198 6259
423 | 728758697 31 49
424 | 728758697 51 89
425 | 728758697 819 1034
426 | 728758697 1232 1454
427 | 728758697 1462 1509
428 | 728758697 1512 1599
429 | 728758697 1697 1744
430 | 728758697 1746 1788
431 | 728758697 1790 1836
432 | 729410793 29 76
433 | 729410793 316 395
434 | 729410793 657 705
435 | 729410793 708 752
436 | 729410793 754 1015
437 | 729410793 1018 1341
438 | 729410793 1563 1601
439 | 729410793 3356 3480
440 | 729410793 3510 3676
441 | 729410793 4126 4166
442 | 729410793 4237 4363
443 | 729410793 4586 4624
444 | 729410793 4626 4676
445 | 729561658 39 89
446 | 729561658 251 312
447 | 729561658 754 778
448 | 729561658 809 833
449 | 729561658 1006 1052
450 | 729561658 1494 1537
451 | 729561658 1539 1573
452 | 729561658 1575 1643
453 | 729561658 1645 1738
454 | 730559808 955 992
455 | 730559808 998 1088
456 | 730559808 1251 1340
457 | 730559808 2483 2640
458 | 730559808 3006 3186
459 | 730559808 3215 3490
460 | 730559808 3492 3611
461 | 730559808 3953 4017
462 | 730559808 4279 4361
463 | 730559808 4404 4441
464 | 730559808 4447 4537
465 | 730559808 5353 5419
466 | 730559808 5659 5889
467 | 730559808 5891 5969
468 | 730573740 45 66
469 | 730573740 834 846
470 | 730573740 983 1125
471 | 730573740 1302 1322
472 | 730573740 1338 1505
473 | 730573740 2177 2227
474 | 730573740 2240 2345
475 | 730573740 2479 2572
476 | 730573740 2682 2751
477 | 731927633 962 1016
478 | 731927633 1018 1053
479 | 731927633 1189 1224
480 | 731927633 1226 1280
481 | 731927633 1973 2005
482 | 731927633 2063 2092
483 | 731927633 2237 2323
484 | 731927633 2434 2444
485 | 731927633 2784 2832
486 | 731927633 3450 3514
487 | 732154721 27 61
488 | 732154721 281 316
489 | 732154721 397 528
490 | 732154721 987 1153
491 | 732154721 1172 1283
492 | 732154721 1930 1965
493 | 732154721 2384 2427
494 | 732154721 2607 2666
495 | 732154721 2755 2819
496 | 735855251 2216 2268
497 | 735855251 2357 2453
498 | 735855251 2455 2563
499 | 755814432 166 168
500 | 755814432 1728 1735
501 | 755814432 1978 2004
502 | 755814432 2418 2564
503 | 755814432 3178 3201
504 | 755814432 3304 3321
505 | 755814432 3846 3960
506 | 757243988 339 350
507 | 757243988 461 467
508 | 757243988 1447 1457
509 | 757243988 1658 1671
510 | 757243988 2267 2280
511 | 757243988 2473 2483
512 | 757243988 2635 2644
513 | 761969038 119 141
514 | 761969038 305 321
515 | 761969038 974 988
516 | 761969038 1755 1783
517 | 761969038 1971 2046
518 | 761969038 4048 4053
519 | 761969038 4149 4170
520 | 761969038 4271 4292
521 | 761969038 4391 4412
522 | 761969038 4521 4541
523 | 761969038 5552 5571
524 | 761969038 5614 5663
525 | 761969692 393 404
526 | 761969692 482 492
527 | 761969692 601 725
528 | 761969692 1557 1574
529 | 761969692 1894 1907
530 | 761969692 2005 2043
531 | 761969692 2057 2089
532 | 761969692 2276 2289
533 | 761969692 2291 2489
534 | 761969692 2858 2885
535 | 761969692 2993 3013
536 | 761969692 3017 3033
537 | 761969692 4010 4090
538 | 761969692 5796 5816
539 | 761969692 5915 5935
540 | 761969692 6064 6084
541 | 761969692 6222 6242
542 | 761969692 6522 6541
543 | 761969692 6636 6662
544 | 763260610 0 17
545 | 763260610 254 290
546 | 763260610 705 719
547 | 763260610 970 981
548 | 763260610 1053 1105
549 | 763260610 1284 1363
550 | 763260610 1365 1375
551 | 763260610 1411 1424
552 | 763260610 1470 1483
553 | 763260610 1516 1529
554 | 763260610 1593 1619
555 | 763260610 1621 1675
556 | 763260610 2132 2150
557 | 763260610 2205 2238
558 | 763260610 2311 2359
559 | 763260610 2430 2443
560 | 763260610 2812 2868
561 | 763260610 3031 3086
562 | 763260610 3134 3156
563 | 763260610 3559 3589
564 | 763260610 3782 3801
565 | 763260610 3811 3868
566 | 763260610 3911 3931
567 | 763260610 3956 3996
568 | 763260610 4015 4037
569 | 763260610 4166 4190
570 | 763260610 4192 4309
571 | 763260610 4369 4390
572 | 763260610 4500 4518
573 | 763260610 4639 4664
574 | 763260610 4724 4742
575 | 763260610 4819 4851
576 | 763260610 5066 5120
577 | 763260610 5391 5423
578 | 763260610 5495 5554
579 | 763260610 5881 5892
580 | 763260610 6160 6182
581 | 763260610 6230 6271
582 | 763260610 6409 6450
583 | 763260610 6660 6740
584 | 763260610 6845 6907
585 | 763260610 7459 7495
586 | 763260610 7606 7639
587 | 763260610 7661 7680
588 | 763260610 7890 7931
589 | 763260610 8032 8048
590 | 763260610 8129 8166
591 | 763260610 8247 8265
592 | 763260610 8777 8790
593 | 763260610 8957 8999
594 | 763260610 9160 9180
595 | 763260610 9186 9203
596 | 763260610 9302 9329
597 | 763260610 9362 9378
598 | 763260610 9405 9471
599 | 763260610 9689 9714
600 | 763260610 9818 9846
601 | 763260610 9916 9946
602 | 763260610 9967 10009
603 | 763260610 10201 10274
604 | 763260610 10457 10469
605 | 763260610 10568 10629
606 | 763260610 11055 11168
607 | 763260610 11280 11303
608 | 763260610 12230 12261
609 | 763260610 12529 12547
610 | 763260610 13164 13180
611 | 763260610 13644 13664
612 | 763260610 13771 13837
613 | 763260610 14075 14095
614 | 763260610 14289 14314
615 | 763260610 14328 14403
616 | 763260610 14424 14441
617 | 763761219 116 140
618 | 763761219 1452 1465
619 | 763761219 1645 1669
620 | 763761219 1875 1890
621 | 763761219 2068 2129
622 | 763761219 2295 2298
623 | 763761219 2343 2352
624 | 764609985 288 296
625 | 764609985 500 507
626 | 764609985 675 694
627 | 764609985 1681 1685
628 | 764609985 2352 2362
629 | 764609985 2463 2471
630 | 764609985 2550 2764
631 | 764609985 2992 3003
632 | 764609985 3574 3584
633 | 764609985 4675 4681
634 | 764609985 4924 4933
635 | 764609985 5496 5541
636 | 764609985 5891 6090
637 | 764609985 6332 6366
638 | 764609985 7112 7118
639 | 764609985 7224 7235
640 | 764609985 7247 7251
641 | 764609985 8497 8514
642 | 764609985 8701 8718
643 | 764609985 8747 8765
644 | 764609985 10188 10203
645 | 764609985 10403 10413
646 | 764609985 11038 11048
647 | 764715911 132 142
648 | 764715911 254 348
649 | 764715911 573 584
650 | 764715911 937 1001
651 | 764715911 1667 1713
652 | 764715911 1728 1734
653 | 764715911 2558 2561
654 | 764715911 2859 2916
655 | 764715911 3722 3734
656 | 764715911 4015 4028
657 | 764715911 4380 4391
658 | 764715911 5779 5788
659 | 764715911 6379 6427
660 | 764715911 6646 6649
661 | 764715911 6832 6852
662 | 764715911 6958 6981
663 | 764715911 7265 7459
664 | 764715911 7479 7596
665 | 764715911 7652 7700
666 | 765953146 787 816
667 | 765953146 982 1011
668 | 765953146 1099 1110
669 | 765953146 2168 2186
670 | 765953146 2193 2209
671 | 765953146 2497 2507
672 | 765953146 5320 5335
673 | 767129999 59 78
674 | 767129999 1498 1513
675 | 767129999 1739 1803
676 | 767129999 1943 1951
677 | 767129999 2058 2092
678 | 767129999 2478 2499
679 | 767129999 2522 2531
680 | 770156173 0 18
681 | 770156173 34 47
682 | 770156173 1021 1029
683 | 770156173 1106 1184
684 | 770156173 1556 1738
685 | 770156173 1740 1836
686 | 770156173 1919 1938
687 | 770156173 2094 2117
688 | 770156173 2158 2164
689 | 770156173 2330 2348
690 | 770156173 2469 2498
691 | 770156173 2819 2827
692 | 770156173 2934 2989
693 | 770156173 3012 3018
694 | 770156173 3266 3280
695 | 770156173 3692 3737
696 | 770156173 3924 3946
697 | 770156173 3970 3985
698 | 770156173 3991 4004
699 | 770156173 4523 4531
700 | 770156173 4682 4692
701 | 770156173 4758 4781
702 | 770156173 4814 4929
703 | 770156173 5254 5261
704 | 770156173 5330 5340
705 | 770156173 5645 5652
706 | 770156173 5920 5936
707 | 770156173 6056 6072
708 | 770877978 900 905
709 | 770877978 1000 1018
710 | 770877978 1020 1067
711 | 770877978 1252 1270
712 | 770877978 3117 3146
713 | 770877978 3244 3292
714 | 770956434 482 487
715 | 770956434 1360 1380
716 | 770956434 1390 1430
717 | 770956434 1560 1571
718 | 770956434 1659 1665
719 | 770956434 1925 1943
720 | 770956434 1945 1992
721 | 770956434 2176 2194
722 | 770956434 2583 2592
723 | 770956434 2884 2891
724 | 776368676 45 70
725 | 776368676 276 297
726 | 776368676 314 343
727 | 776368676 378 408
728 | 776368676 424 457
729 | 776368676 542 559
730 | 776368676 571 623
731 | 776368676 640 658
732 | 776368676 664 738
733 | 776368676 871 893
734 | 776368676 905 957
735 | 776368676 974 992
736 | 776368676 998 1078
737 | 776368676 1164 1176
738 | 776368676 1461 1494
739 | 776368676 1596 1633
740 | 776368676 2017 2033
741 | 776368676 2050 2074
742 | 776368676 2085 2118
743 | 776368676 2132 2147
744 | 776368676 2164 2174
745 | 776368676 2178 2210
746 | 776368676 3803 3823
747 | 780619695 44 62
748 | 780619695 120 160
749 | 780619695 162 168
750 | 780619695 174 183
751 | 780619695 1245 1262
752 | 780619695 1321 1337
753 | 780619695 1538 1554
754 | 780619695 1728 1744
755 | 780619695 1770 1794
756 | 780619695 1935 1959
757 | 780619695 2018 2034
758 | 780619695 2245 2261
759 | 780619695 2329 2338
760 | 780619695 2343 2359
761 | 780619695 2838 2854
762 | 780619695 3066 3082
763 | 780619695 3207 3224
764 | 780619695 3735 3740
765 | 780619695 3933 3941
766 | 780619695 4123 4129
767 | 780619695 4212 4229
768 | 780619695 4298 4309
769 | 780619695 4504 4513
770 | 780619695 4696 4736
771 | 780619695 4856 4874
772 | 780619695 5547 5553
773 | 780619695 5985 6015
774 | 780619695 6232 6246
775 | 780619695 6301 6314
776 | 781577820 14 25
777 | 781577820 163 190
778 | 781577820 215 246
779 | 781577820 274 302
780 | 781577820 470 494
781 | 781577820 554 561
782 | 781577820 589 661
783 | 781577820 731 741
784 | 781577820 968 1028
785 | 781577820 1031 1218
786 | 781577820 1476 1608
787 | 781577820 1642 1679
788 | 786527921 1 16
789 | 786527921 259 300
790 | 786527921 729 736
791 | 786527921 827 898
792 | 786527921 1281 1329
793 | 786527921 1465 1480
794 | 786527921 1881 1904
795 | 786527921 1955 2170
796 | 786527921 2176 2238
797 | 786527921 2415 2440
798 | 786527921 2753 2765
799 | 786527921 3033 3065
800 | 786527921 5134 5376
801 | 786527921 5622 5643
802 | 786527921 5787 5810
803 | 786527921 6132 6149
804 | 786527921 6683 6695
805 | 786527921 6880 6908
806 | 786527921 7032 7051
807 | 786527921 7482 7583
808 | 786527921 7598 7633
809 | 786527921 7747 7771
810 | 786527921 8099 8114
811 | 786527921 8403 8449
812 | 786527921 8548 8577
813 | 786527921 8838 8856
814 | 786527921 8903 8923
815 | 786527921 9063 9078
816 | 786527921 9859 9869
817 | 786527921 9942 10004
818 | 786527921 10254 10267
819 | 786527921 10496 10512
820 | 786527921 10665 10684
821 | 786527921 10713 10737
822 | 786527921 10920 10973
823 | 786527921 11260 11284
824 | 786527921 11294 11311
825 | 786527921 11506 11530
826 | 786527921 12164 12267
827 | 786527921 12385 12404
828 | 786527921 12856 12878
829 | 786527921 12974 13022
830 | 786527921 13029 13062
831 | 786527921 13191 13235
832 | 786527921 13589 13733
833 | 786527921 13931 13982
834 | 786527921 14062 14097
835 | 786527921 14149 14328
836 | 786527921 14488 14500
837 | 786527921 14505 14550
838 | 786527921 14918 14942
839 | 786527921 15077 15112
840 | 786527921 16017 16053
841 | 786527921 16104 16121
842 | 786527921 16130 16177
843 | 786527921 16221 16253
844 | 786527921 16530 16544
845 | 786527921 16804 16827
846 | 786527921 16846 16868
847 | 786527921 16919 16949
848 | 787142429 1106 1123
849 | 787142429 1150 1204
850 | 787142429 2684 2815
851 | 787529309 16 39
852 | 787529309 70 87
853 | 787529309 288 319
854 | 787529309 323 350
855 | 787529309 921 960
856 | 787529309 2338 2356
857 | 787529309 2405 2430
858 | 787529309 2694 2708
859 | 787529309 3158 3173
860 | 787529309 5159 5195
861 | 787529309 5394 5414
862 | 787529309 6056 6347
863 | 787529309 6678 6784
864 | 787529309 7626 7677
865 | 787529309 7814 7853
866 | 787529309 8200 8310
867 | 787529309 8337 8365
868 | 787529309 8567 8585
869 | 787529309 8686 8710
870 | 787529309 9173 9282
871 | 787529309 9303 9384
872 | 787529309 9661 9697
873 | 787759779 331 343
874 | 787759779 495 507
875 | 787759779 590 623
876 | 787759779 670 679
877 | 787759779 681 696
878 | 787759779 1011 1032
879 | 788900262 0 91
880 | 788900262 93 269
881 | 788900262 1484 1576
882 | 788900262 1722 1749
883 | 788900262 1817 1830
884 | 788900262 1858 1891
885 | 788900262 2218 2270
886 | 788900262 2272 2383
887 | 788900262 2401 2428
888 | 788900262 2696 2711
889 | 788900262 4190 4224
890 | 788900262 4478 4561
891 | 788900262 4877 5017
892 | 788900262 5064 5083
893 | 788900262 5112 5137
894 | 788900262 5263 5313
895 | 788900262 6020 6109
896 | 788900262 6331 6408
897 | 788900262 6490 6583
898 | 788900262 6601 6675
899 | 789370909 53 78
900 | 789370909 459 491
901 | 789370909 1894 1919
902 | 789370909 2009 2030
903 | 789370909 2148 2171
904 | 789370909 3212 3231
905 | 789370909 4056 4071
906 | 789370909 4416 4439
907 | 789370909 4736 4756
908 | 789370909 5762 5843
909 | 789370909 5939 6056
910 | 789370909 6861 6881
911 | 789370909 8063 8083
912 | 789370909 8517 8597
913 | 789370909 8727 8772
914 | 789370909 8829 8884
915 | 789370909 8985 9015
916 | 789370909 9134 9210
917 | 789370909 9255 9271
918 | 789370909 9306 9433
919 | 789370909 9840 9870
920 | 789370909 10444 10485
921 | 789370909 10867 10940
922 | 789370909 11058 11070
923 | 789370909 11625 11705
924 | 789370909 11722 11746
925 | 789370909 11837 11843
926 | 789370909 11848 11871
927 | 795703371 647 654
928 | 795703371 813 829
929 | 795703371 1202 1264
930 | 795703371 1268 1271
931 | 795703371 1774 1811
932 | 795703371 2464 2475
933 | 795703371 2479 2490
934 | 795703371 2572 2575
935 | 795703371 2889 2951
936 | 795703371 3759 3901
937 | 795703371 3904 4050
938 | 795703371 4103 4111
939 | 795703371 4126 4135
940 | 795703371 4268 4327
941 | 795703371 4398 4442
942 | 999000149 113 127
943 | 999000149 243 295
944 | 999000149 300 326
945 | 999000149 813 830
946 | 999000149 1303 1374
947 | 999000149 1602 1689
948 | 999000149 2952 2969
949 | 999000149 2974 2987
950 | 999000149 2989 3077
951 | 999000149 3119 3140
952 | 999000159 422 433
953 | 999000159 499 510
954 | 999000159 949 966
955 | 999000159 1287 1303
956 | 999000159 1701 1735
957 | 999000159 1905 1993
958 | 999000565 25 50
959 | 999000565 127 177
960 | 999000565 276 296
961 | 999000565 344 378
962 | 999000565 419 449
963 | 999000565 575 607
964 | 999000894 74 82
965 | 999000894 148 151
966 | 999000894 411 436
967 | 999000894 501 512
968 | 999000894 1393 1409
969 | 999000894 3391 3409
970 | 999000894 4245 4276
971 | 999000894 4927 4940
972 | 999000894 5185 5200
973 | 999000894 5314 5317
974 | 999001033 38 55
975 | 999001033 134 151
976 | 999001033 400 406
977 | 999001033 687 739
978 | 999001033 826 841
979 | 999001033 875 926
980 | 999001033 941 968
981 | 999001033 1160 1225
982 | 999001033 1245 1274
983 | 999001033 1417 1426
984 | 999001033 1445 1466
985 | 999001033 1468 1488
986 | 999001033 1519 1551
987 | 999001033 1564 1575
988 | 999001033 1879 1901
989 | 999001033 1935 1957
990 | 999001033 1985 2025
991 | 999001033 2382 2408
992 | 999001033 2532 2563
993 | 999001033 2595 2603
994 | 999001033 3206 3219
995 | 999001033 3224 3245
996 | 999001033 3362 3422
997 | 999001033 3560 3581
998 | 999001033 3595 3607
999 | 999001033 3713 3724
1000 | 999001297 773 789
1001 | 999001297 1496 1515
1002 | 999001297 2633 2670
1003 | 999001297 2676 2715
1004 | 999001297 2883 2895
1005 | 999001621 0 68
1006 | 999001621 88 156
1007 | 999001621 325 338
1008 | 999001621 382 401
1009 | 999001621 467 473
1010 | 999001621 612 621
1011 | 999001621 769 783
1012 | 999001621 1628 1655
1013 | 999001621 3471 3495
1014 | 999001621 3976 4062
1015 | 999001621 4085 4137
1016 | 999001621 4208 4225
1017 | 999001621 4485 4527
1018 | 999001621 5047 5061
1019 | 999001621 5088 5132
1020 | 999001621 5159 5170
1021 | 999001621 5890 5902
1022 | 999001621 7085 7091
1023 | 999001621 7988 7999
1024 | 999001621 8473 8481
1025 | 999001621 8493 8500
1026 | 999001621 8550 8570
1027 | 999001621 8600 8611
1028 | 999001621 8627 8694
1029 | 999001621 8892 8897
1030 | 999001621 9108 9121
1031 | 999001621 9801 9859
1032 | 999001621 10035 10062
1033 | 999001621 10151 10169
1034 | 999001621 10180 10185
1035 | 999001621 10536 10541
1036 | 999001621 10779 10800
1037 | 999001621 10931 11012
1038 | 999001621 11213 11537
1039 | 999001621 12106 12112
1040 | 999001621 12143 12184
1041 |
--------------------------------------------------------------------------------
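For reference, each row of SI_output.txt above follows the task-SI submission format: article id, span start, and span end, where the offsets are character positions into the article's plain text and the fields are whitespace/tab separated. A minimal sketch for reading the file back into per-article span lists (the path matches the file above; everything else is illustrative):

```python
from collections import defaultdict

def load_si_spans(path="results/SI_output.txt"):
    """Map article id -> list of (start, end) character-offset spans."""
    spans = defaultdict(list)
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # the file may end with a blank line, as above
            article_id, start, end = line.split()
            spans[article_id].append((int(start), int(end)))
    return spans
```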
/span_identification/ner/run_ner.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
17 |
18 | from __future__ import absolute_import, division, print_function
19 |
20 | import argparse
21 | import glob
22 | import logging
23 | import os
24 | import random
25 |
26 | from unidecode import unidecode
27 |
28 | import pickle
29 | import numpy as np
30 | import torch
31 | from seqeval.metrics import precision_score, recall_score, f1_score
32 | from sklearn_crfsuite import metrics
33 | from tensorboardX import SummaryWriter
34 | from torch.nn import CrossEntropyLoss
35 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
36 | from torch.utils.data.distributed import DistributedSampler
37 | from tqdm import tqdm, trange
38 | from .utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
39 |
40 | from transformers import AdamW, get_linear_schedule_with_warmup
41 | from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
42 | from transformers import RobertaConfig, RobertaTokenizer
43 | from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
44 | from transformers import XLNetConfig, XLNetForTokenClassification, XLNetTokenizer
45 | from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
46 | from scipy.special import softmax
47 |
48 | from .modeling_roberta import RobertaForTokenClassification
49 |
50 | logger = logging.getLogger(__name__)
51 |
52 | ALL_MODELS = sum(
53 | (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
54 | ())
55 |
56 | MODEL_CLASSES = {
57 | "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
58 | "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
59 | "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
60 | "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
61 | "xlnet": (XLNetConfig, XLNetForTokenClassification, XLNetTokenizer)
62 | }
63 |
64 |
65 | def set_seed(args):
66 | random.seed(args.seed)
67 | np.random.seed(args.seed)
68 | torch.manual_seed(args.seed)
69 | if args.n_gpu > 0:
70 | torch.cuda.manual_seed_all(args.seed)
71 |
72 |
73 | def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
74 | """ Train the model """
75 | if args.local_rank in [-1, 0]:
76 | tb_writer = SummaryWriter()
77 |
78 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
79 | train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
80 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
81 |
82 | if args.max_steps > 0:
83 | t_total = args.max_steps
84 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
85 | else:
86 | t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
87 |
88 | # Prepare optimizer and schedule (linear warmup and decay)
89 | no_decay = ["bias", "LayerNorm.weight"]
90 | optimizer_grouped_parameters = [
91 | {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
92 | "weight_decay": args.weight_decay},
93 | {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
94 | ]
95 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
96 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
97 | if args.fp16:
98 | try:
99 | from apex import amp
100 | except ImportError:
101 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
102 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
103 |
104 | # multi-gpu training (should be after apex fp16 initialization)
105 | if args.n_gpu > 1:
106 | model = torch.nn.DataParallel(model)
107 |
108 | # Distributed training (should be after apex fp16 initialization)
109 | if args.local_rank != -1:
110 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
111 | output_device=args.local_rank,
112 | find_unused_parameters=True)
113 |
114 | # Train!
115 | logger.info("***** Running training *****")
116 | logger.info(" Num examples = %d", len(train_dataset))
117 | logger.info(" Num Epochs = %d", args.num_train_epochs)
118 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
119 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
120 | args.train_batch_size * args.gradient_accumulation_steps * (
121 | torch.distributed.get_world_size() if args.local_rank != -1 else 1))
122 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
123 | logger.info(" Total optimization steps = %d", t_total)
124 |
125 | global_step = 0
126 | tr_loss, logging_loss = 0.0, 0.0
127 | model.zero_grad()
128 | train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
129 |     set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
130 | for _ in train_iterator:
131 | epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0], position=0, leave=True)
132 | for step, batch in enumerate(epoch_iterator):
133 | model.train()
134 | batch = tuple(t.to(args.device) for t in batch)
135 | inputs = {"input_ids": batch[0],
136 | "attention_mask": batch[1],
137 | "labels": batch[3]}
138 | if args.model_type != "distilbert":
139 |                 inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don't use segment_ids
140 | if args.use_quotes:
141 | inputs['quotes'] = batch[4]
142 | outputs = model(**inputs)
143 | loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
144 |
145 | if args.n_gpu > 1:
146 | loss = loss.mean() # mean() to average on multi-gpu parallel training
147 | if args.gradient_accumulation_steps > 1:
148 | loss = loss / args.gradient_accumulation_steps
149 |
150 | if args.fp16:
151 | with amp.scale_loss(loss, optimizer) as scaled_loss:
152 | scaled_loss.backward()
153 | else:
154 | loss.backward()
155 |
156 | tr_loss += loss.item()
157 | if (step + 1) % args.gradient_accumulation_steps == 0:
158 | if args.fp16:
159 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
160 | else:
161 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
162 |
163 |                 optimizer.step()
164 |                 scheduler.step()  # Update learning rate schedule (after optimizer.step(), per PyTorch >= 1.1)
165 | model.zero_grad()
166 | global_step += 1
167 |
168 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
169 | # Log metrics
170 | if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
171 | results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
172 | for key, value in results.items():
173 | tb_writer.add_scalar("eval_{}".format(key), value, global_step)
174 | tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
175 | tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
176 | logging_loss = tr_loss
177 |
178 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
179 | # Save model checkpoint
180 | output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
181 | if not os.path.exists(output_dir):
182 | os.makedirs(output_dir)
183 | model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
184 | model_to_save.save_pretrained(output_dir)
185 | torch.save(args, os.path.join(output_dir, "training_args.bin"))
186 | logger.info("Saving model checkpoint to %s", output_dir)
187 |
188 | if args.max_steps > 0 and global_step > args.max_steps:
189 | epoch_iterator.close()
190 | break
191 | if args.max_steps > 0 and global_step > args.max_steps:
192 | train_iterator.close()
193 | break
194 |
195 | if args.local_rank in [-1, 0]:
196 | tb_writer.close()
197 |
198 | return global_step, tr_loss / global_step
199 |
200 |
201 | def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
202 | eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
203 |
204 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
205 | # Note that DistributedSampler samples randomly
206 | eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
207 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
208 |
209 | # multi-gpu evaluate
210 | if args.n_gpu > 1:
211 | model = torch.nn.DataParallel(model)
212 |
213 | # Eval!
214 | logger.info("***** Running evaluation %s *****", prefix)
215 | logger.info(" Num examples = %d", len(eval_dataset))
216 | logger.info(" Batch size = %d", args.eval_batch_size)
217 | eval_loss = 0.0
218 | nb_eval_steps = 0
219 | preds = None
220 | out_label_ids = None
221 | model.eval()
222 | for batch in tqdm(eval_dataloader, desc="Evaluating"):
223 | batch = tuple(t.to(args.device) for t in batch)
224 |
225 | with torch.no_grad():
226 | inputs = {"input_ids": batch[0],
227 | "attention_mask": batch[1],
228 | "labels": batch[3]}
229 | if args.model_type != "distilbert":
230 |                 inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don't use segment_ids
231 | if args.use_quotes:
232 | inputs['quotes'] = batch[4]
233 | outputs = model(**inputs)
234 | tmp_eval_loss, logits = outputs[:2]
235 |
236 | if args.n_gpu > 1:
237 |                 tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average over multi-GPU parallel evaluation
238 |
239 | eval_loss += tmp_eval_loss.item()
240 | nb_eval_steps += 1
241 | if preds is None:
242 | preds = logits.detach().cpu().numpy()
243 | out_label_ids = inputs["labels"].detach().cpu().numpy()
244 | else:
245 | preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
246 | out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
247 |
248 | eval_loss = eval_loss / nb_eval_steps
249 |     preds_probs = softmax(preds, axis=2)  # per-token class probabilities (post-softmax values, not logits)
250 | preds = np.argmax(preds, axis=2)
251 |
252 | label_map = {i: label for i, label in enumerate(labels)}
253 |
254 | out_label_list = [[] for _ in range(out_label_ids.shape[0])]
255 | preds_list = [[] for _ in range(out_label_ids.shape[0])]
256 |
257 | for i in range(out_label_ids.shape[0]):
258 | for j in range(out_label_ids.shape[1]):
259 | if out_label_ids[i, j] != pad_token_label_id:
260 | out_label_list[i].append(label_map[out_label_ids[i][j]])
261 |                 if np.max(preds_probs[i][j]) > 0:  # confidence cutoff; softmax outputs are strictly positive, so 0 keeps every prediction
262 | preds_list[i].append(label_map[preds[i][j]])
263 | else:
264 | preds_list[i].append('O')
265 |
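    |     # precision/recall/f1 are span-level scores over the BIO sequences; flat_f1 is token-level micro-F1
    |     # restricted to B-PROP/I-PROP (presumably seqeval and sklearn_crfsuite.metrics, imported above)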
266 | results = {
267 | "loss": eval_loss,
268 | "precision": precision_score(out_label_list, preds_list),
269 | "recall": recall_score(out_label_list, preds_list),
270 | "f1": f1_score(out_label_list, preds_list),
271 | "flat_f1": metrics.flat_f1_score(out_label_list, preds_list, average='micro', labels=["B-PROP", "I-PROP"])
272 | }
273 |
274 | logger.info("***** Eval results %s *****", prefix)
275 | for key in sorted(results.keys()):
276 | logger.info(" %s = %s", key, str(results[key]))
277 |
278 | return results, preds_list
279 |
280 |
281 | def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
282 |     if args.local_rank not in [-1, 0] and mode == "train":
283 |         torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others use the cache
284 |
285 | # Load data features from cache or dataset file
286 | cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
287 | list(filter(None, args.model_name_or_path.split("/"))).pop(),
288 | str(args.max_seq_length)))
289 |     if False and os.path.exists(cached_features_file) and not args.overwrite_cache:  # NOTE: "False and" deliberately disables cache loading, so features are always rebuilt
290 | logger.info("Loading features from cached file %s", cached_features_file)
291 | features = torch.load(cached_features_file)
292 | else:
293 | logger.info("Creating features from dataset file at %s", args.data_dir)
294 | files = {'train': args.train_file, 'dev': args.dev_file, 'test': args.test_file}
295 | examples = read_examples_from_file(os.path.join(args.data_dir, files[mode]), mode)
296 | features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
297 | cls_token_at_end=bool(args.model_type in ["xlnet"]),
298 | # xlnet has a cls token at the end
299 | cls_token=tokenizer.cls_token,
300 | cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
301 | sep_token=tokenizer.sep_token,
302 | sep_token_extra=bool(args.model_type in ["roberta"]),
303 | # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
304 | pad_on_left=bool(args.model_type in ["xlnet"]),
305 | # pad on the left for xlnet
306 | pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
307 | pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
308 | pad_token_label_id=pad_token_label_id
309 | )
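    |         # Optional handcrafted feature: a 0/1 flag per (CLS-shifted) subword position marking double-quote
    |         # tokens, shaped (max_seq_length, 1); presumably the quote-aware model concatenates it with the
    |         # token representations when config.use_quotes is set.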
310 | if args.use_quotes:
311 | assert len(features) == len(examples)
312 | for i in range(len(features)):
313 | tokens = []
314 | for word in examples[i].words:
315 | word_tokens = tokenizer.tokenize(word)
316 | tokens.extend(word_tokens)
317 |                 tokens = [tokenizer.cls_token] + tokens  # prepend the real CLS token so indices line up with the model inputs
318 | quotes = np.zeros(args.max_seq_length, dtype=np.float32)
319 | for j in range(1, min(len(tokens), args.max_seq_length)):
320 | if unidecode(tokens[j]) == '"':
321 | quotes[j] = 1
322 | features[i].quotes = quotes[:, None]
323 |
324 | if args.local_rank in [-1, 0]:
325 | logger.info("Saving features into cached file %s", cached_features_file)
326 | torch.save(features, cached_features_file)
327 |
328 |     if args.local_rank == 0 and mode == "train":
329 |         torch.distributed.barrier()  # Let the remaining processes load the features that the first process just cached
330 |
331 | # Convert to Tensors and build dataset
332 | all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
333 | all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
334 | all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
335 | all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
336 | if args.use_quotes:
337 | all_quotes = torch.tensor([f.quotes for f in features], dtype=torch.long)
338 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_quotes)
339 | else:
340 | dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
341 | return dataset
342 |
343 |
344 | def transformers_ner(args):
345 | if os.path.exists(args.output_dir) and os.listdir(
346 | args.output_dir) and args.do_train and not args.overwrite_output_dir:
347 |         raise ValueError(
348 |             "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overwrite it.".format(
349 |                 args.output_dir))
350 |
351 | # Setup distant debugging if needed
352 | if args.server_ip and args.server_port:
353 | # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
354 | import ptvsd
355 | print("Waiting for debugger attach")
356 | ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
357 | ptvsd.wait_for_attach()
358 |
359 | # Setup CUDA, GPU & distributed training
360 | if args.local_rank == -1 or args.no_cuda:
361 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
362 | args.n_gpu = torch.cuda.device_count()
363 |     else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
364 | torch.cuda.set_device(args.local_rank)
365 | device = torch.device("cuda", args.local_rank)
366 | torch.distributed.init_process_group(backend="nccl")
367 | args.n_gpu = 1
368 | args.device = device
369 |
370 | # Setup logging
371 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
372 | datefmt="%m/%d/%Y %H:%M:%S",
373 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
374 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
375 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
376 |
377 | # Set seed
378 | set_seed(args)
379 |
380 |     # Prepare the sequence labeling task (CoNLL-style BIO labels)
381 | labels = get_labels(args.labels)
382 | num_labels = len(labels)
383 | # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
384 | pad_token_label_id = CrossEntropyLoss().ignore_index
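    |     # CrossEntropyLoss().ignore_index is -100, the same sentinel the transformers examples use for padding/sub-word positions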
385 |
386 | # Load pretrained model and tokenizer
387 | if args.local_rank not in [-1, 0]:
388 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
389 |
390 | args.model_type = args.model_type.lower()
391 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
392 |
393 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
394 | num_labels=num_labels,
395 | cache_dir=args.cache_dir if args.cache_dir else None)
396 | config.use_quotes = args.use_quotes
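    |     # stash the flag on the config so the (custom) quote-aware model class can read it at construction time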
397 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
398 | do_lower_case=args.do_lower_case,
399 | cache_dir=args.cache_dir if args.cache_dir else None)
400 | model = model_class.from_pretrained(args.model_name_or_path,
401 | from_tf=bool(".ckpt" in args.model_name_or_path),
402 | config=config,
403 | cache_dir=args.cache_dir if args.cache_dir else None)
404 |
405 | if args.local_rank == 0:
406 | torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
407 |
408 | model.to(args.device)
409 |
410 | logger.info("Training/evaluation parameters %s", args)
411 |
412 | # Training
413 | if args.do_train:
414 | train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
415 | global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
416 | logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
417 |
418 |     # Saving best practices: if you use default names for the model, you can reload it using from_pretrained()
419 | if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
420 | # Create output directory if needed
421 | if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
422 | os.makedirs(args.output_dir)
423 |
424 | logger.info("Saving model checkpoint to %s", args.output_dir)
425 | # Save a trained model, configuration and tokenizer using `save_pretrained()`.
426 | # They can then be reloaded using `from_pretrained()`
427 | model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
428 | #model_to_save.save_pretrained(args.output_dir)
429 | model_save_path_ = os.path.join(args.output_dir, "pytorch_model.bin")
430 | torch.save(model_to_save.state_dict(), model_save_path_)
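    |         # the raw state_dict is saved instead of save_pretrained() (commented out above), presumably so the
    |         # extra quote-feature parameters survive; from_pretrained() can still load it as pytorch_model.bin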
431 | tokenizer.save_pretrained(args.output_dir)
432 |
433 | # Good practice: save your training arguments together with the trained model
434 | torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
435 |
436 | # Evaluation
437 | results = {}
438 | if args.do_eval and args.local_rank in [-1, 0]:
439 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
440 | checkpoints = [args.output_dir]
441 | if args.eval_all_checkpoints:
442 | checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
443 | logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
444 | logger.info("Evaluate the following checkpoints: %s", checkpoints)
445 | for checkpoint in checkpoints:
446 | global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
447 |
448 | model = model_class.from_pretrained(checkpoint)
449 | model.to(args.device)
450 | result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
451 | if global_step:
452 | result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
453 | results.update(result)
454 | output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
455 | with open(output_eval_file, "w") as writer:
456 | for key in sorted(results.keys()):
457 | writer.write("{} = {}\n".format(key, str(results[key])))
458 |
459 | if args.do_predict and args.local_rank in [-1, 0]:
460 | tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
461 | checkpoints = [args.output_dir]
462 | if args.eval_all_checkpoints:
463 | checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
464 | logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
465 |         logger.info("Predict with the following checkpoints: %s", checkpoints)
466 | for checkpoint in checkpoints:
467 | global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
468 |
469 | model = model_class.from_pretrained(checkpoint)
470 | model.to(args.device)
471 | result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
472 | if global_step:
473 | result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
474 | results.update(result)
475 | # Save results
476 | output_test_results_file = os.path.join(checkpoint, "test_results.txt")
477 | with open(output_test_results_file, "w") as writer:
478 | for key in sorted(result.keys()):
479 | writer.write("{} = {}\n".format(key, str(result[key])))
480 | # Save predictions
481 | output_test_predictions_file = os.path.join(checkpoint, "test_predictions.txt")
482 | with open(output_test_predictions_file, "w") as writer:
483 | with open(os.path.join(args.data_dir, args.test_file), "r") as f:
484 | example_id = 0
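    |                 # Walk the source file in lockstep with the flattened predictions: blank/-DOCSTART- lines
    |                 # delimit examples, each token line consumes one predicted label via pop(0), and tokens
    |                 # dropped by truncation at max_seq_length trigger the warning below.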
485 | for line in f:
486 | if line.startswith("-DOCSTART-") or line == "" or line == "\n":
487 | writer.write(line)
488 | if not predictions[example_id]:
489 | example_id += 1
490 | elif predictions[example_id]:
491 | output_line = line.split('\t')[0] + "\t" + predictions[example_id].pop(0) + "\n"
492 | writer.write(output_line)
493 | else:
494 | logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
495 |
496 | return results
497 |
--------------------------------------------------------------------------------