├── span_identification
    ├── __init__.py
    ├── ner
    │   ├── __init__.py
    │   ├── bert_lstm_crf.py
    │   ├── utils_ner.py
    │   └── run_ner.py
    ├── submission.py
    ├── dataset.py
    └── __main__.py
├── .gitattributes
├── technique_classification
    ├── __init__.py
    ├── transformers_classifier
    │   ├── __init__.py
    │   └── utils.py
    ├── dataset.py
    ├── submission.py
    └── __main__.py
├── tools
    ├── ._README.md
    ├── data
    │   ├── submission-task-SI.tsv
    │   ├── article736757214.task-SI.labels
    │   ├── submission-task-TC.tsv
    │   ├── article736757214.labels-task-TC
    │   ├── propaganda-techniques-names-semeval2020task11.txt
    │   ├── propaganda-techniques-names.txt
    │   └── article736757214.txt
    ├── src
    │   ├── annotation_task_si.py
    │   ├── propaganda_techniques.py
    │   ├── annotation.py
    │   ├── annotation_w_o_label.py
    │   └── annotations.py
    ├── print_spans.py
    ├── task-TC_scorer.py
    └── README.md
├── visualization_example
    └── visualization
    │   ├── highlight.js
    │   ├── __init__.py
    │   ├── html_template.py
    │   └── highlight.css
├── requirements.txt
├── configs
    ├── si_config.yml
    └── tc_config.yml
├── .gitignore
├── README.md
└── results
    └── SI_output.txt


/span_identification/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py linguist-detectable=true
2 | *.ipynb linguist-detectable=false
3 | 


--------------------------------------------------------------------------------
/technique_classification/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformers_classifier import transformers_clf


--------------------------------------------------------------------------------
/tools/._README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aschern/semeval2020_task11/HEAD/tools/._README.md


--------------------------------------------------------------------------------
/tools/data/submission-task-SI.tsv:
--------------------------------------------------------------------------------
1 | 736757214	0	50
2 | 736757214	161	172
3 | 736757214	0	10
4 | 736757214	115	167
5 | 


--------------------------------------------------------------------------------
/tools/data/article736757214.task-SI.labels:
--------------------------------------------------------------------------------
1 | 736757214	0	59
2 | 736757214	171	181
3 | 736757214	0	9
4 | 736757214	115	167
5 | 736757214	740	759
6 | 


--------------------------------------------------------------------------------
/tools/data/submission-task-TC.tsv:
--------------------------------------------------------------------------------
1 | 736757214	Exaggeration,Minimisation	0	59
2 | 736757214	Doubt	171	181
3 | 736757214	Name_Calling,Labeling	0	9
4 | 736757214	Name_Calling,Labeling	115	167
5 | 736757214	Loaded_Language	740	759
6 | 


--------------------------------------------------------------------------------
/technique_classification/transformers_classifier/__init__.py:
--------------------------------------------------------------------------------
1 | from .run_glue import transformers_clf
2 | from .modeling_roberta import RobertaForSequenceClassification
3 | from .utils import glue_processors, glue_output_modes, glue_compute_metrics
4 | 


--------------------------------------------------------------------------------
/tools/data/article736757214.labels-task-TC:
--------------------------------------------------------------------------------
1 | 736757214	Exaggeration,Minimisation	0	59
2 | 736757214	Whataboutism,Straw_Men,Red_Herring	171	181
3 | 736757214	Name_Calling,Labeling	0	9
4 | 736757214	Loaded_Language	115	167
5 | 736757214	Loaded_Language	740	759
6 | 


--------------------------------------------------------------------------------
/visualization_example/visualization/highlight.js:
--------------------------------------------------------------------------------
 1 | <script>
 2 |     
 3 | function handleHighlightMouseOver(el) {
 4 |     $('[id='+el.getAttribute('id')+']').addClass('active');
 5 |   }
 6 | 
 7 | function handleHighlightMouseOut(el) {
 8 |     $('[id='+el.getAttribute('id')+']').removeClass('active');
 9 | }
10 |   
11 | </script>


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | torch==1.2.0
 2 | transformers==2.3.0
 3 | scipy==1.4.1
 4 | numpy==1.16.4
 5 | joblib==0.13.2
 6 | nltk==3.4.5
 7 | ConfigArgParse==1.0
 8 | sklearn_crfsuite==0.3.6
 9 | apex==0.1
10 | seqeval==0.0.5
11 | spacy==2.2.3
12 | Unidecode==1.1.1
13 | tqdm==4.43.0
14 | pandas==1.0.1
15 | ipython==7.13.0
16 | ptvsd==4.3.2
17 | scikit_learn==0.22.2.post1
18 | tensorboardX==2.0
19 | 


--------------------------------------------------------------------------------
/span_identification/ner/__init__.py:
--------------------------------------------------------------------------------
1 | from .run_ner import transformers_ner
2 | from .modeling_roberta import RobertaForTokenClassification
3 | from .utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
4 | from .run_ner_crf import transformers_ner_crf
5 | from .bert_lstm_crf import BertLstmCrf
6 | from .conditional_random_field import ConditionalRandomField, allowed_transitions
7 | 


--------------------------------------------------------------------------------
/tools/data/propaganda-techniques-names-semeval2020task11.txt:
--------------------------------------------------------------------------------
 1 | Appeal_to_Authority
 2 | Appeal_to_fear-prejudice
 3 | Bandwagon,Reductio_ad_hitlerum
 4 | Black-and-White_Fallacy
 5 | Causal_Oversimplification
 6 | Doubt
 7 | Exaggeration,Minimisation
 8 | Flag-Waving
 9 | Loaded_Language
10 | Name_Calling,Labeling
11 | Repetition
12 | Slogans
13 | Thought-terminating_Cliches
14 | Whataboutism,Straw_Men,Red_Herring
15 | 


--------------------------------------------------------------------------------
/tools/data/propaganda-techniques-names.txt:
--------------------------------------------------------------------------------
 1 | Appeal_to_Authority
 2 | Appeal_to_fear-prejudice
 3 | Bandwagon
 4 | Black-and-White_Fallacy
 5 | Causal_Oversimplification
 6 | Doubt
 7 | Exaggeration,Minimisation
 8 | Flag-Waving
 9 | Loaded_Language
10 | Name_Calling,Labeling
11 | Obfuscation,Intentional_Vagueness,Confusion
12 | Red_Herring
13 | Reductio_ad_hitlerum
14 | Repetition
15 | Slogans
16 | Straw_Men
17 | Thought-terminating_Cliches
18 | Whataboutism
19 | 


--------------------------------------------------------------------------------
/tools/src/annotation_task_si.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import src.annotation as an
 3 | 
 4 | class AnnotationTaskSI(Annotation):
 5 | 
 6 |     def __init__(self, label=None, start_offset = None, end_offset=None): #, article_id=None):
 7 |         
 8 |         self.label = label
 9 |         self.start_offset = int(start_offset)
10 |         self.end_offset = int(end_offset)
11 | 
12 | 
13 |     def get_label(self):
14 | 
15 |         sys.error("ERRRO: trying to access technique label from file in SI task format")
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/tools/src/propaganda_techniques.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class Propaganda_Techniques():
 3 | 
 4 | 
 5 |     TECHNIQUE_NAMES_FILE="data/propaganda-techniques-names.txt"
 6 | 
 7 |     def __init__(self, filename=TECHNIQUE_NAMES_FILE):
 8 | 
 9 |         with open(filename, "r") as f:
10 |              self.techniques = [ line.rstrip() for line in f.readlines() ]
11 | 
12 | 
13 |     def get_propaganda_techniques_list(self)->list:
14 | 
15 |         return self.techniques
16 | 
17 | 
18 |     def get_propaganda_techniques_list_sorted(self)->list:
19 | 
20 |         return sorted(self.techniques)
21 | 
22 | 
23 |     def is_valid_technique(self, technique_name):
24 | 
25 |         return technique_name in self.techniques
26 | 
27 | 
28 |     def __str__(self):
29 | 
30 |         return "\n".join(self.techniques)
31 | 
32 | 
33 |     def __getitem__(self, index):
34 |         return self.techniques[index]
35 | 
36 | 
37 |     def get_technique(self, index):
38 |         return self.techniques[index]
39 | 
40 | 
41 |     def indexOf(self, technique_name):
42 |         return self.techniques.index(technique_name)
43 | 


--------------------------------------------------------------------------------
/configs/si_config.yml:
--------------------------------------------------------------------------------
 1 | ---------------dataset params---------------
 2 | 
 3 | train_data_folder: datasets/train-articles
 4 | test_data_folder: datasets/dev-articles
 5 | labels_path: datasets/train-task1-SI.labels
 6 | gold_annot_file: results/dev-task-SI.labels
 7 | propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt
 8 | data_dir: cached_datasets/SI/
 9 | train_file: train.tsv
10 | dev_file: dev.tsv
11 | test_file: test.tsv
12 | split_by_ids: True
13 | dev_size: 0.18
14 | overwrite_cache: False
15 | 
16 | 
17 | ----------------model params----------------
18 | 
19 | use_crf: True
20 | output_file: SI_output_dev.txt
21 | predicted_labels_files: [model_checkpoints/si_roberta_crf/test_predictions.txt]
22 | 
23 | 
24 | -------------transformers params------------
25 | 
26 | model_type: roberta
27 | config_name: roberta-large
28 | model_name_or_path: model_checkpoints/ner_roberta_large_uncased_crf_7700
29 | max_seq_length: 256
30 | per_gpu_train_batch_size: 8
31 | per_gpu_eval_batch_size: 1
32 | learning_rate: 2e-5
33 | save_steps: 700
34 | warmup_steps: 500
35 | num_train_epochs: 27
36 | output_dir: model_checkpoints/ner_roberta_large_uncased_crf_7700/
37 | do_lower_case: True
38 | 


--------------------------------------------------------------------------------
/tools/data/article736757214.txt:
--------------------------------------------------------------------------------
 1 | Sanctuary City Mayor Protected Illegal Alien Mexican Rapist
 2 | 
 3 | Oakland Mayor Libby Schaaf claims to fight for women.
 4 | Except when she's fighting for their rapists instead.
 5 | A Democratic mayor’s warning to illegal immigrants of an incoming ICE raid in northern California may have led to a number of illegal immigrants with violent and sex-related convictions evading capture and deportation.
 6 | Oakland Mayor Libby Schaaf tweeted out an impending warning of the four-day raid last week, alerting targeted individuals to the imminent arrests, and infuriating Immigrations and Customs Enforcement (ICE) officials, who say that many more could have been caught if they hadn't been warned.
 7 | A spokesperson for ICE gave Fox News examples of some of the unsavory characters who evaded officals during the raid.
 8 | One Mexican citizen had convictions for unlawful sexual intercourse with a minor and a conviction for driving under the influence (DUI), and had been deported in 2003.
 9 | Another who evaded capture had a conviction for sodomizing a drugged victim in 2012, as well as a DUI from this year -- that Mexican citizen had also been previously deported in 2013.
10 | Another illegal immigrant from Mexico, previously deported in 2014 for a conviction for armed robbery, also evaded capture.
11 | 


--------------------------------------------------------------------------------
/configs/tc_config.yml:
--------------------------------------------------------------------------------
 1 | ---------------dataset params---------------
 2 | 
 3 | propaganda_techniques_file: tools/data/propaganda-techniques-names-semeval2020task11.txt
 4 | train_data_folder: datasets/train-articles
 5 | #test_data_folder: datasets/train-articles
 6 | test_data_folder: datasets/dev-articles
 7 | #test_data_folder: datasets/test/test-articles
 8 | labels_path: datasets/train-task2-TC.labels
 9 | #test_template_labels_path: results/mydev-task-TC.labelss
10 | test_template_labels_path: datasets/dev-task-TC-template.out
11 | #test_template_labels_path: datasets/test/test-task-TC-template.out
12 | data_dir: cached_datasets/TC/
13 | train_file: train.tsv
14 | dev_file: dev.tsv
15 | #test_file: dev.tsv
16 | #test_file: eval_tc_new.tsv
17 | test_file: test.tsv
18 | split_by_ids: True
19 | dev_size: 0.18
20 | balance: False
21 | shuffle: True
22 | overwrite_cache: False
23 | 
24 | 
25 | ----------------model params----------------
26 | 
27 | output_file: TC_output_dev_sc.txt
28 | #weights: [1, 0]
29 | predicted_logits_files: [model_checkpoints/tc_roberta_joineds/predicted_logits]
30 | 
31 | 
32 | -------------transformers params------------
33 | 
34 | task_name: prop
35 | model_type: roberta
36 | #model_name_or_path: model_checkpoints/tc_roberta_large_cased_transfer_joined
37 | model_name_or_path: model_checkpoints/tc_roberta_large_cased_transfer_3500
38 | max_seq_length: 256
39 | per_gpu_train_batch_size: 8
40 | per_gpu_eval_batch_size: 8
41 | learning_rate: 2e-5
42 | save_steps: 700
43 | warmup_steps: 500
44 | num_train_epochs: 10
45 | #output_dir: model_checkpoints/tc_roberta_large_cased_transfer_joined
46 | output_dir: model_checkpoints/tc_roberta_large_cased_transfer_3500
47 | do_lower_case: False
48 | 


--------------------------------------------------------------------------------
/tools/print_spans.py:
--------------------------------------------------------------------------------
 1 | __author__ = "Giovanni Da San Martino"
 2 | __copyright__ = "Copyright 2019"
 3 | __credits__ = ["Giovanni Da San Martino"]
 4 | __license__ = "GPL"
 5 | __version__ = "0.1"
 6 | __maintainer__ = "Giovanni Da San Martino"
 7 | __email__ = "gmartino@hbku.edu.qa"
 8 | __status__ = "Beta"
 9 | 
10 | import codecs
11 | import argparse
12 | import src.annotation as an
13 | import src.article_annotations as aa
14 | import src.propaganda_techniques as pt
15 | 
16 | 
def main(args):
    """Print an article with its annotated spans marked, followed by footnotes.

    Reads ``spans_file``, ``article_file`` and
    ``propaganda_techniques_list_file`` from *args*, loads the technique list
    and the span annotations, marks the article text and prints the marked
    text and the footnotes.
    """
    span_file, article_file, techniques_file = (
        args.spans_file,
        args.article_file,
        args.propaganda_techniques_list_file,
    )

    techniques = pt.Propaganda_Techniques(techniques_file)
    annotations = aa.Articles_annotations()
    # The technique list is attached to the class, not to the instance.
    aa.Articles_annotations.techniques = techniques

    annotations.load_article_annotations_from_csv_file(span_file)

    with codecs.open(article_file, "r", encoding="utf8") as article:
        article_content = article.read()

    # mark_text returns three values; the third (legend) is not printed.
    output_text, footnotes, _legend = annotations.mark_text(article_content)

    for chunk in (output_text, footnotes):
        print(chunk)
38 | 
39 | 
if __name__ == "__main__":
    # Command-line entry point: build the argument parser and hand the parsed
    # arguments to main().
    cli_description = (
        "Add tags to mark spans in a text file. \n"
        "Example: print_spans.py -s data/article736757214.task-FLC.labels -t data/article736757214.txt"
    )
    parser = argparse.ArgumentParser(description=cli_description)
    parser.add_argument(
        '-t', '--text-file',
        dest='article_file',
        required=True,
        help="file with text document",
    )
    parser.add_argument(
        '-s', '--spans-file',
        dest='spans_file',
        required=True,
        help="file with spans to be highlighted. One line of the span file",
    )
    parser.add_argument(
        '-p', '--propaganda-techniques-list-file',
        dest='propaganda_techniques_list_file',
        required=False,
        default="data/propaganda-techniques-names.txt",
        help="file with list of propaganda techniques (one per line).",
    )

    cli_args = parser.parse_args()
    main(cli_args)
52 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | cover/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | .pybuilder/
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | #   For a library or package, you might want to ignore these files since the code is
 88 | #   intended to run in multiple environments; otherwise, check them in:
 89 | # .python-version
 90 | 
 91 | # pipenv
 92 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 93 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 94 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 95 | #   install all needed dependencies.
 96 | #Pipfile.lock
 97 | 
 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 99 | __pypackages__/
100 | 
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 | 
105 | # SageMath parsed files
106 | *.sage.py
107 | 
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 | 
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 | 
121 | # Rope project settings
122 | .ropeproject
123 | 
124 | # mkdocs documentation
125 | /site
126 | 
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 | 
132 | # Pyre type checker
133 | .pyre/
134 | 
135 | # pytype static type analyzer
136 | .pytype/
137 | 
138 | # Cython debug symbols
139 | cython_debug/
140 | 
141 | # static files generated from Django application using `collectstatic`
142 | media
143 | static


--------------------------------------------------------------------------------
/visualization_example/visualization/__init__.py:
--------------------------------------------------------------------------------
  1 | from IPython.core.display import display, HTML
  2 | from .html_template import transform_to_tree, span_wrapper
  3 | 
  4 | import pandas as pd
  5 | import numpy as np
  6 | 
  7 | def unify_data_format(fn):
  8 |     def unified_data(data, **kwargs):
  9 |         if kwargs.get('stanford', False):
 10 |             tokens, clusters = stanford_data_adapter(data)
 11 |         if kwargs.get('allen', False):
 12 |             tokens, clusters = allen_data_adapter(data)
 13 |         if kwargs.get('huggingface', False):
 14 |             tokens, clusters = huggingface_data_adapter(data)
 15 |         if kwargs.get('proref', False):
 16 |             tokens, clusters = labelled_pronoun(data)
 17 | 
 18 |         return fn(tokens, clusters, **kwargs)
 19 | 
 20 |     return unified_data
 21 | 
 22 | # Either return the html string or render in a jupyter notebook output
 23 | # Function signature based on displacy render functionality
 24 | 
 25 | def render(tokens,
 26 |             clusters,
 27 |             style='coref', 
 28 |             stanford=False, 
 29 |             allen=False, 
 30 |             huggingface=False, 
 31 |             proref=False,
 32 |             jupyter=True,
 33 |             task=None):
 34 | 
 35 |     html = to_html(tokens, clusters, task)
 36 | 
 37 |     if jupyter:
 38 |         display(HTML(html))
 39 |     else:
 40 |         return html
 41 | 
 42 | def stanford_data_adapter(data):
 43 |     sents = []
 44 |     for sent in data['sentences']:
 45 |         sents.append([])
 46 |         for token in sent['tokens']:
 47 |             sents[-1].append(token['originalText'])
 48 | 
 49 |     clusters = []
 50 |     if data['corefs'] is not None:
 51 |         for num, mentions in data['corefs'].items():
 52 |             clusters.append([])
 53 |             for mention in mentions:
 54 |                 start = np.cumsum([0]+list(map(len, sents)))[mention['sentNum']-1] + mention['startIndex']-1
 55 |                 end = np.cumsum([0]+list(map(len, sents)))[mention['sentNum']-1] + mention['endIndex']-2
 56 |                 clusters[-1].append([start, end])
 57 |             
 58 |     return sum(sents, []), clusters
 59 | 
 60 | def allen_data_adapter(data):
 61 |     return data['document'], data['clusters']
 62 | 
 63 | def huggingface_data_adapter(doc):
 64 |     tokens = [token.text for token in doc]
 65 | 
 66 |     clusters = []
 67 |     if doc._.coref_clusters is not None:
 68 |         for cluster in doc._.coref_clusters:
 69 |             clusters.append([])
 70 |             for mention in cluster.mentions:
 71 |                 clusters[-1].append([mention.start, mention.end-1])
 72 | 
 73 |     return tokens, clusters
 74 | 
 75 | def labelled_pronoun(row):
 76 |     txt = row.text
 77 | 
 78 |     # map char indices to token indices
 79 |     tokens = txt.split(' ')
 80 |     start_a = len(txt[:row.a_offset].split(' '))-1
 81 |     start_b = len(txt[:row.b_offset].split(' '))-1
 82 | 
 83 |     clusters = [[[start_a, start_a+len(row.a.split(' '))-1]], [[start_b, start_b+len(row.b.split(' '))-1]]]
 84 | 
 85 |     # add pronoun token to the labelled cluster
 86 |     start_p = len(txt[:row.pronoun_offset].split(' '))-1
 87 |     if row.a_coref:
 88 |         clusters[0].append([start_p, start_p+len(row.pronoun.split(' '))-1])
 89 |     elif row.b_coref:
 90 |         clusters[1].append([start_p, start_p+len(row.pronoun.split(' '))-1])
 91 |     else:
 92 |         clusters.append([[start_p, start_p+len(row.pronoun.split(' '))-1]])
 93 | 
 94 |     return tokens, clusters
 95 |     
 96 | def to_html(tokens, clusters, task):
 97 |     tree = transform_to_tree(tokens, clusters)
 98 |     html = ''.join(span_wrapper(tree, 0, task))
 99 |     html = '<div style="padding: 16px;">{}</div>'.format(html)
100 |     return html


--------------------------------------------------------------------------------
/technique_classification/transformers_classifier/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from transformers import DataProcessor, InputExample
  3 | from sklearn.metrics import f1_score
  4 | from unidecode import unidecode
  5 | import string
  6 | import random
  7 | from autocorrect import Speller
  8 | 
  9 | 
 10 | def generate_misspelling(phrase, p=0.5):
 11 |     new_phrase = []
 12 |     words = phrase.split(' ')
 13 |     for word in words:
 14 |         outcome = random.random()
 15 |         if outcome <= p:
 16 |             ix = random.choice(range(len(word)))
 17 |             new_word = ''.join([word[w] if w != ix else random.choice(string.ascii_letters) for w in range(len(word))])
 18 |             new_phrase.append(new_word)
 19 |         else:
 20 |             new_phrase.append(word)
 21 |     return ' '.join(new_phrase) 
 22 | 
 23 | 
 24 | def simple_accuracy(preds, labels):
 25 |     return (preds == labels).mean()
 26 | 
 27 | 
 28 | def acc_and_f1_macro(preds, labels):
 29 |     acc = simple_accuracy(preds, labels)
 30 |     f1 = f1_score(y_true=labels, y_pred=preds, average='macro')
 31 |     return {
 32 |         "acc": acc,
 33 |         "f1": f1,
 34 |         "acc_and_f1": (acc + f1) / 2,
 35 |     }
 36 | 
 37 | 
 38 | def glue_compute_metrics(task_name, preds, labels):
 39 |     assert len(preds) == len(labels)
 40 |     if task_name == "prop":
 41 |         return acc_and_f1_macro(preds, labels)
 42 |     else:
 43 |         raise KeyError(task_name)
 44 | 
 45 | 
 46 | class PropProcessor(DataProcessor):
 47 |     def get_train_examples(self, file_path):
 48 |         """See base class."""
 49 |         return self._create_examples(self._read_tsv(file_path), "train")
 50 | 
 51 |     def get_dev_examples(self, file_path):
 52 |         """See base class."""
 53 |         return self._create_examples(self._read_tsv(file_path), "dev_matched")
 54 | 
 55 |     def get_test_examples(self, file_path):
 56 |         """See base class."""
 57 |         return self._create_examples(self._read_tsv(file_path), "test")
 58 | 
 59 |     def get_labels(self):
 60 |         """See base class."""
 61 |         return ['Appeal_to_Authority', 'Doubt', 'Repetition',
 62 |            'Appeal_to_fear-prejudice', 'Slogans', 'Black-and-White_Fallacy',
 63 |            'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling',
 64 |            'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification',
 65 |            'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
 66 |            'Thought-terminating_Cliches']
 67 | 
 68 |     def _create_examples(self, lines, set_type):
 69 |         """Creates examples for the training and dev sets."""
 70 |         examples = []
 71 |         spell = Speller(lang='en')
 72 |         for (i, line) in enumerate(lines):
 73 |             if i == 0 or line == []:
 74 |                 continue
 75 |             guid = "%s-%s" % (set_type, i)
 76 |             text_a = line[3] # generate_misspelling(line[3])
 77 |             #try:
 78 |             #    text_a = spell(text_a)
 79 |             #except:
 80 |             #    pass
 81 |             
 82 |             text_b = line[4]
 83 | 
 84 |             #pos = text_b.find(text_a)
 85 |             #text_a = text_b[:pos] + " <b> " + text_b[pos:pos + len(text_a)] + " </b> " + text_b[pos + len(text_a):]
 86 |             #text_b = None
 87 | 
 88 |             if len(line) < 6 or line[5] == '?':
 89 |                 label = self.get_labels()[0]
 90 |             else:
 91 |                 label = line[5]
 92 |             examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
 93 |         return examples
 94 | 
 95 | 
 96 | glue_tasks_num_labels = {
 97 |     "prop": 14
 98 | }
 99 | 
100 | 
# Processor class for each task name.
glue_processors = dict(prop=PropProcessor)
104 | 
105 | 
# Output mode for each task name.
glue_output_modes = dict(prop="classification")
109 | 


--------------------------------------------------------------------------------
/tools/task-TC_scorer.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import argparse
 3 | import logging.handlers
 4 | from sklearn.metrics import f1_score
 5 | from sklearn.metrics import precision_score
 6 | from sklearn.metrics import recall_score
 7 | import src.annotation as an
 8 | import src.annotations as ans
 9 | import src.propaganda_techniques as pt
10 | 
# Module-level logging objects shared by main(): a formatter, a stdout
# handler at INFO level (attached to the logger later, inside main) and the
# "propaganda_scorer" logger at INFO level.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)
ch.setLevel(logging.INFO)

logger = logging.getLogger("propaganda_scorer")
logger.setLevel(logging.INFO)
17 | 
18 | 
def _load_sorted_annotations(annotation_file):
    """Load annotations from *annotation_file* and sort the spans of each article."""
    annotations = ans.Annotations()
    annotations.load_annotation_list_from_file(annotation_file)
    for article in annotations.get_article_id_list():
        annotations.get_article_annotations_obj(article).sort_spans()
    return annotations


def main(args):
    """Score a TC submission against the gold annotations.

    Reads ``submission``, ``gold``, ``log_file``,
    ``propaganda_techniques_list_file``, ``output_for_script`` and
    ``debug_on_std`` from *args*. After configuring logging it loads both
    annotation files, checks that the submission matches the gold file in
    format, and logs the score. When ``output_for_script`` is set, the
    stdout handler is not attached and the script-friendly result is
    printed instead.

    Exits with status 1 when the format check fails, so that callers can
    detect the failure (previously the exit status was 0).
    """
    user_submission_file = args.submission
    gold_file = args.gold
    output_log_file = args.log_file
    propaganda_techniques_list_file = args.propaganda_techniques_list_file
    output_for_script = bool(args.output_for_script)

    if not output_for_script:
        logger.addHandler(ch)

    if args.debug_on_std:
        ch.setLevel(logging.DEBUG)
        # A logger drops records below its own level before any handler sees
        # them, so the logger must be lowered too for DEBUG to reach stdout.
        logger.setLevel(logging.DEBUG)

    if output_log_file is not None:
        logger.info("Logging execution to file " + output_log_file)
        fileLogger = logging.FileHandler(output_log_file)
        fileLogger.setLevel(logging.DEBUG)
        fileLogger.setFormatter(formatter)
        logger.addHandler(fileLogger)
        # Same reason as above: let DEBUG records reach the file handler.
        # The stdout handler keeps filtering at its own level.
        logger.setLevel(logging.DEBUG)

    propaganda_techniques = pt.Propaganda_Techniques(propaganda_techniques_list_file)
    an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques)

    user_annotations = _load_sorted_annotations(user_submission_file)
    gold_annotations = _load_sorted_annotations(gold_file)

    logger.info("Checking format: User Predictions -- Gold Annotations")
    if not user_annotations.compare_annotations_identical_article_lists(gold_annotations) or not user_annotations.compare_annotations_identical(gold_annotations):
        logger.error("wrong format, no scoring will be performed")
        sys.exit(1)
    logger.info("OK: submission file format appears to be correct")
    res_for_output, res_for_script = user_annotations.TC_score_to_string(gold_annotations, output_for_script)
    logger.info("Scoring submission" + res_for_output)
    if output_for_script:
        print(res_for_script)
62 | 
63 | 
if __name__ == "__main__":
    # Command-line entry point: build the argument parser and hand the parsed
    # arguments to main().
    # The text is passed as ``description``; the first positional parameter of
    # ArgumentParser is ``prog``, which would put this text in the usage line
    # as the program name. The example names the TC gold file in tools/data.
    parser = argparse.ArgumentParser(
        description="Scorer for SemEval 2020 Task 11 subtask TC.\n" +
        "Example: python3 task-TC_scorer.py -s data/submission-task-TC.tsv -r data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt")

    parser.add_argument('-s', '--submission-file', dest='submission', required=True, help="file with the submission of the team")
    parser.add_argument('-r', '--reference-file', dest='gold', required=True, help="file with the gold labels.")
    parser.add_argument('-d', '--enable-debug-on-standard-output', dest='debug_on_std', required=False,
                        action='store_true', help="Print debug info also on standard output.")
    parser.add_argument('-l', '--log-file', dest='log_file', required=False, help="Output logger file.")
    parser.add_argument('-p', '--propaganda-techniques-list-file', dest='propaganda_techniques_list_file', required=True,
                        help="file with list of propaganda techniques (one per line).")
    parser.add_argument('-o', '--output-for-script', dest='output_for_script', required=False, action='store_true',
                        default=False, help="Prints the output in a format easy to parse for a script")
    main(parser.parse_args())
79 | 


--------------------------------------------------------------------------------
/visualization_example/visualization/html_template.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | 
 3 | HIGHLIGHT_COLORS = [
 4 |     "blue",
 5 |     "green",
 6 |     "pink",
 7 |     "orange",
 8 |     "purple",
 9 |     "teal",
10 |     "tan",
11 |     "red",
12 |     "cobalt",
13 |     "brown",
14 |     "slate",
15 |     "fuchsia",
16 |     "gray",
17 |     "blue"
18 | ]
19 | 
20 | def get_highlight_color(index):
21 |     if index <= len(HIGHLIGHT_COLORS):
22 |         return HIGHLIGHT_COLORS[index]
23 |     else:
24 |         return HIGHLIGHT_COLORS[index - (len(HIGHLIGHT_COLORS) * math.floor(index / len(HIGHLIGHT_COLORS)))]
25 | 
# Transforms tokens and clusters into a tree representation
def transform_to_tree(tokens, clusters):
    """Nest ``tokens`` into a tree according to the spans in ``clusters``.

    ``clusters`` is a sequence of clusters, each a sequence of spans; a span
    is indexed as ``span[0]`` / ``span[1]`` and ``span[1]`` is the index of the
    last token of the span. A cluster that is not currently open is entered at
    a token index that occurs in one of its spans (``index in span``).

    :return: list whose items are either plain tokens or dicts of the form
             ``{'cluster': <cluster index>, 'contents': [...], 'end': <int>}``
             with ``contents`` nested the same way
    """
    # Stack of the clusters we are currently inside; the bottom entry is a
    # synthetic root (cluster -1) that collects the top-level nodes.
    stack = [{'cluster': -1, 'contents': [], 'end': -1}]

    for position, token in enumerate(tokens):
        # The stack is not modified while scanning, so the set of open
        # cluster ids can be computed once per token.
        open_ids = [node['cluster'] for node in stack]

        # Find all the new clusters we are entering at the current index
        entering = []
        for cluster_id, cluster in enumerate(clusters):
            if cluster_id in open_ids:
                continue
            for span in cluster:
                if position in span:
                    entering.append({'end': span[1], 'cluster': cluster_id})

        # Open the widest span first so that shorter ones nest inside it
        # (stable sort: ties keep their discovery order).
        entering = sorted(entering, key=lambda item: item['end'], reverse=True)
        for item in entering:
            stack.append({
                'cluster': item['cluster'],
                'contents': [],
                'end': item['end']
            })

        # The token belongs to the innermost open cluster
        stack[-1]['contents'].append(token)

        # Close every cluster that ends on this token and attach it to its parent
        while stack and stack[-1]['end'] == position:
            finished = stack.pop()
            stack[-1]['contents'].append(finished)

    return stack[0]['contents']
67 | 
68 | 
# Integer index -> label name; gen_elem looks up ``mapping[token['cluster']]``
# to title a highlighted span when task == 'TC'.
mapping = dict(enumerate([
    'Appeal_to_Authority',
    'Doubt',
    'Repetition',
    'Appeal_to_fear-prejudice',
    'Slogans',
    'Black-and-White_Fallacy',
    'Loaded_Language',
    'Flag-Waving',
    'Name_Calling,Labeling',
    'Whataboutism,Straw_Men,Red_Herring',
    'Causal_Oversimplification',
    'Exaggeration,Minimisation',
    'Bandwagon,Reductio_ad_hitlerum',
    'Thought-terminating_Cliches',
]))
75 | 
#This is the function that calls itself when we recurse over the span tree.
def gen_elem(token, idx, depth, task):
    """Render one node of the span tree as an HTML string.

    :param token: either a cluster node (a dict with ``'cluster'`` and
                  ``'contents'`` keys) or a leaf token
    :param idx: position of the node among its siblings; written to ``key``
    :param depth: nesting depth, written to the ``depth`` attribute
    :param task: ``'TC'`` titles the span with ``mapping[cluster]``, ``'SI'``
                 with ``'PROP'``; any other value uses the raw cluster id
    :return: HTML for the node; cluster nodes recurse through ``span_wrapper``
    """
    # Function-scope import: only needed for the leaf branch below.
    import html

    # NOTE(review): a ``list`` node passes this check but would fail on
    # ``token['cluster']``; transform_to_tree only produces dict nodes.
    if isinstance(token, (dict, list)):
        if task == 'TC':
            title = mapping[token['cluster']]
        elif task == 'SI':
            title = 'PROP'
        else:
            title = token['cluster']
        return '<span key={} class="highlight {}" depth={} id={} onmouseover="handleHighlightMouseOver(this)" \
                onmouseout="handleHighlightMouseOut(this)" labelPosition="left">\
                <span class="highlight__label"><strong>{}</strong></span>\
                <span class="highlight__content">{}</span></span>'.format(idx,
                                                                          get_highlight_color(token['cluster']),
                                                                          depth,
                                                                          title,
                                                                          title,
                                                                          ' '.join(span_wrapper(token['contents'], depth + 1, task)))

    # Leaf token: escape &, < and > so text taken from an article cannot be
    # interpreted as markup. Quotes are left alone because the text is element
    # content, not an attribute value.
    return '<span>{} </span>'.format(html.escape(str(token), quote=False))
96 |  
# Wraps the tree representation into spans indicating cluster-wise depth
def span_wrapper(tree, depth, task):
    """Render each node of ``tree`` with ``gen_elem`` and return the HTML strings.

    :param tree: iterable of nodes (leaf tokens or cluster dicts)
    :param depth: nesting depth passed on to every node
    :param task: task name passed on to ``gen_elem``
    :return: list with one rendered string per node, in order
    """
    rendered = []
    for position, node in enumerate(tree):
        rendered.append(gen_elem(node, position, depth, task))
    return rendered


--------------------------------------------------------------------------------
/technique_classification/dataset.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import glob
  3 | import os
  4 | import numpy as np
  5 | import pandas as pd
  6 | from nltk.tokenize.punkt import PunktSentenceTokenizer
  7 | from sklearn.model_selection import train_test_split
  8 | 
  9 | 
 10 | def read_articles_from_file_list(folder_name, file_pattern="*.txt"):
 11 |     file_list = glob.glob(os.path.join(folder_name, file_pattern))
 12 |     articles = {}
 13 |     article_id_list, sentence_id_list, sentence_list = ([], [], [])
 14 |     for filename in sorted(file_list):
 15 |         article_id = os.path.basename(filename).split(".")[0][7:]
 16 |         with codecs.open(filename, "r", encoding="utf8") as f:
 17 |             articles[article_id] = f.read()
 18 |     return articles
 19 | 
 20 | 
 21 | def read_predictions_from_file(filename):
 22 |     articles_id, span_starts, span_ends, gold_labels = ([], [], [], [])
 23 |     with open(filename, "r") as f:
 24 |         for row in f.readlines():
 25 |             article_id, gold_label, span_start, span_end = row.rstrip().split("\t")
 26 |             articles_id.append(article_id)
 27 |             gold_labels.append(gold_label)
 28 |             span_starts.append(span_start)
 29 |             span_ends.append(span_end)
 30 |     return articles_id, span_starts, span_ends, gold_labels
 31 | 
 32 | 
 33 | def load_data(data_folder, labels_file):
 34 |     articles = read_articles_from_file_list(data_folder)
 35 |     ref_articles_id, ref_span_starts, ref_span_ends, labels = read_predictions_from_file(labels_file)
 36 |     return articles, ref_articles_id, ref_span_starts, ref_span_ends, labels
 37 | 
 38 | 
 39 | def sents_token_bounds(text):
 40 |     sents_starts = []
 41 |     for start, end in PunktSentenceTokenizer().span_tokenize(text):
 42 |         sents_starts.append(start)
 43 |     sents_starts.append(100000)
 44 |     return np.array(sents_starts)
 45 | 
 46 | 
 47 | def clear(text):
 48 |     return text.strip().replace('\t', ' ').replace('\n', ' ')
 49 | 
 50 | 
 51 | def get_context(article, span_start, span_end):
 52 |     bounds = sents_token_bounds(article)
 53 |     context_start = bounds[np.where(bounds <= span_start)[0][-1]]
 54 |     context_end = bounds[np.where(bounds >= span_end)[0][0]]
 55 |     return clear(article[context_start:context_end])
 56 | 
 57 | 
 58 | def balance_pandas(data):
 59 |     lst = [data]
 60 |     max_size = data['label'].value_counts().max()
 61 |     for class_index, group in data.groupby('label'):
 62 |         lst.append(group.sample(max_size - len(group), replace=True))
 63 |     return pd.concat(lst)
 64 | 
 65 | 
 66 | def dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, train_gold_labels):
 67 |     data = pd.DataFrame.from_dict({'article_id': ref_articles_id, 
 68 |               'article': [articles[id] for id in ref_articles_id], 
 69 |               'span_start': np.array(ref_span_starts).astype(int), 
 70 |               'span_end': np.array(ref_span_ends).astype(int),
 71 |               'label': train_gold_labels
 72 |              })
 73 |     data['span'] = data.apply(lambda x: clear(x['article'][x['span_start']:x['span_end']]), axis=1)
 74 |     data['context'] = data.apply(lambda x: get_context(x['article'], x['span_start'], x['span_end']), axis=1)
 75 |     return data[['article_id', 'span_start', 'span_end', 'span', 'context', 'label']]
 76 | 
 77 | 
 78 | def get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, train_file, dev_file,
 79 |                      split_by_ids=False, dev_size=0.3, random_state=40, balance=False, shuffle=True):
 80 |     data = dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels)
 81 |     if split_by_ids:
 82 |         train_ids, dev_ids = train_test_split(data.article_id.unique(), test_size=dev_size, random_state=random_state)
 83 |         train = data[data.article_id.isin(train_ids)]
 84 |         dev = data[data.article_id.isin(dev_ids)]
 85 |     else:
 86 |         train, dev = train_test_split(data, test_size=dev_size, random_state=random_state)
 87 |         
 88 |     if balance:
 89 |         train = balance_pandas(train)
 90 |     if shuffle:
 91 |         train = train.sample(frac=1).reset_index(drop=True)
 92 |     
 93 |     save_dataset(train, train_file)
 94 |     save_dataset(dev, dev_file)
 95 | 
 96 |     
 97 | def get_test_file(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, test_file):
 98 |     test = dataset_to_pandas(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels)
 99 |     save_dataset(test, test_file)
100 |     
101 | 
def save_dataset(data, file_path):
    """Write ``data`` to ``file_path`` as tab-separated values without the index column."""
    data.to_csv(path_or_buf=file_path, index=False, sep='\t')
104 | 


--------------------------------------------------------------------------------
/tools/src/annotation.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import sys
  3 | import logging.handlers
  4 | import src.propaganda_techniques as pt
  5 | import src.annotation_w_o_label as anwol
  6 | 
  7 | __author__ = "Giovanni Da San Martino"
  8 | __copyright__ = "Copyright 2019"
  9 | __credits__ = ["Giovanni Da San Martino"]
 10 | __license__ = "GPL"
 11 | __version__ = "0.1"
 12 | __maintainer__ = "Giovanni Da San Martino"
 13 | __email__ = "gmartino@hbku.edu.qa"
 14 | __status__ = "Beta"
 15 | 
 16 | logger = logging.getLogger("propaganda_scorer")
 17 | 
 18 | 
 19 | class Annotation(anwol.AnnotationWithOutLabel):
 20 | 
 21 |     """
 22 |     One annotation is represented by a span (two integer indices indicating the 
 23 |     starting and ending position of the span) and the propaganda technique name 
 24 |     (a label attached to the span). 
 25 |     The class provides basic maniputation functions for one annotation. 
 26 |     """
 27 | 
 28 |     # input file format variables
 29 |     separator = "\t"
 30 |     ARTICLE_ID_COL = 0
 31 |     TECHNIQUE_NAME_COL = 1
 32 |     FRAGMENT_START_COL = 2
 33 |     FRAGMENT_END_COL = 3
 34 |     propaganda_techniques:pt.Propaganda_Techniques = None
 35 | 
 36 | 
 37 |     def __init__(self, label:str=None, start_offset:str = None, end_offset:str=None): 
 38 |         
 39 |         super().__init__(start_offset, end_offset)
 40 |         self.label = label
 41 | 
 42 | 
 43 |     def __str__(self):
 44 | 
 45 |         return super().__str__() + " -> " + self.get_label()
 46 |         #return self.get_label() + "\t" + super().__str__()
 47 | 
 48 | 
 49 |     def __eq__(self, second_annotation:Annotation):
 50 |         """
 51 |         Checks whether two annotations are identical, i.e. if their spans are 
 52 |         identical and if they labels coincide
 53 |         """        
 54 |         return super().__eq__(second_annotation) and self.get_label()==second_annotation.get_label()
 55 | 
 56 | 
 57 |     def get_label(self)->str:
 58 | 
 59 |         return self.label
 60 | 
 61 |     
 62 |     def get_propaganda_techniques(self)->list:
 63 | 
 64 |         if self.propaganda_techniques is None:
 65 |             logger.error("trying to access propaganda techniques list before initialising the corresponding object")
 66 |             sys.exit()
 67 |         return self.propaganda_techniques.get_propaganda_techniques_list()
 68 | 
 69 |     
 70 |     @classmethod
 71 |     def set_propaganda_technique_list_obj(cls, propaganda_technique_obj:pt.Propaganda_Techniques)->None:
 72 |         """
 73 |         propaganda_technique_obj is an object from the module src.propaganda_techniques.
 74 |         Typical invokation: 
 75 |         `
 76 |             propaganda_techniques = pt.Propaganda_Techniques(filename=propaganda_techniques_list_file)
 77 |             an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques)
 78 |         `
 79 |         """
 80 |         cls.propaganda_techniques = propaganda_technique_obj
 81 | 
 82 | 
 83 |     @staticmethod
 84 |     def load_annotation_from_string(annotation_string:str, row_num:int=None, filename:str=None)->(Annotation, str):
 85 |         """
 86 |         Read annotations from a csv-like string, with fields separated
 87 |         by the class variable `separator`: 
 88 | 
 89 |         article id<separator>technique name<separator>starting_position<separator>ending_position
 90 |         Fields order is determined by the class variables ARTICLE_ID_COL,
 91 |         TECHNIQUE_NAME_COL, FRAGMENT_START_COL, FRAGMENT_END_COL
 92 | 
 93 |         Besides reading the data, it performs basic checks.
 94 | 
 95 |         :return a tuple (Annotation object, id of the article)
 96 |         """
 97 | 
 98 |         row = annotation_string.rstrip().split(Annotation.separator)
 99 |         if len(row) != 4:
100 |             logger.error("Row%s%s is supposed to have 4 columns. Found %d: -%s-."
101 |                          % (" " + str(row_num) if row_num is not None else "",
102 |                             " in file " + filename if filename is not None else "", len(row), annotation_string))
103 |             sys.exit()
104 | 
105 |         article_id = row[Annotation.ARTICLE_ID_COL]
106 |         label = row[Annotation.TECHNIQUE_NAME_COL]
107 |         try:
108 |             start_offset = int(row[Annotation.FRAGMENT_START_COL])
109 |         except:
110 |             logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
111 |                          %(Annotation.FRAGMENT_START_COL, " " + str(row_num) if row_num is not None else "",
112 |                             " in file " + filename if filename is not None else "", annotation_string))
113 |         try:
114 |             end_offset = int(row[Annotation.FRAGMENT_END_COL])
115 |         except:
116 |             logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
117 |                          %(Annotation.FRAGMENT_END_COL, " " + str(row_num) if row_num is not None else "",
118 |                             " in file " + filename if filename is not None else "", annotation_string))
119 | 
120 |         return Annotation(label, start_offset, end_offset), article_id
121 |         
122 | 
123 |     def is_technique_name_valid(self)->bool:
124 |         """
125 |         Checks whether the technique names are correct
126 |         """
127 |         if self.propaganda_techniques is None:
128 |             sys.exit("ERROR: propaganda techniques object has not been initialised")
129 |         if not self.propaganda_techniques.is_valid_technique(self.get_label()):
130 |             logger.error("label %s is not valid. Possible values are: %s"%(self.get_label(), self.propaganda_techniques))
131 |             return False
132 |         return True
133 | 
134 | 
135 |     def check_format_of_annotation_in_file(self):
136 |         """
137 |         Performs some checks on the fields of the annotation
138 |         """
139 |         if not self.is_technique_name_valid():
140 |             sys.exit()
141 |         if not self.is_span_valid():
142 |             sys.exit()
143 | 
144 | 


--------------------------------------------------------------------------------
/span_identification/ner/bert_lstm_crf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | based on
  4 | @File: bert_lstm_crf.py
  5 | @Copyright: 2019 Michael Zhu
  6 | @License：the Apache License, Version 2.0
  7 | @Author：Michael Zhu
  8 | """
  9 | 
 10 | # coding=utf-8
 11 | # coding=utf-8
 12 | import copy
 13 | from typing import cast, List
 14 | import numpy as np
 15 | 
 16 | import torch.nn as nn
 17 | 
 18 | from torch.autograd import Variable
 19 | import torch
 20 | 
 21 | from .conditional_random_field import ConditionalRandomField, allowed_transitions
 22 | 
 23 | 
 24 | class BertLstmCrf(nn.Module):
 25 |     """
 26 |     bert_lstm_crf model
 27 |     """
 28 | 
 29 |     def __init__(self, bert_model,
 30 |                  num_labels=9,
 31 |                  embedding_dim=512,
 32 |                  hidden_dim=512,
 33 |                  rnn_layers=1,
 34 |                  rnn_dropout=0.1,
 35 |                  output_dropout=0.1,
 36 |                  use_cuda=False):
 37 |         super(BertLstmCrf, self).__init__()
 38 |         self.bert_encoder = bert_model
 39 | 
 40 |         self.embedding_dim = embedding_dim
 41 |         self.hidden_dim = hidden_dim
 42 |         self.rnn_layers = rnn_layers
 43 | 
 44 |         self.lstm = None
 45 |         if rnn_layers > 0:
 46 |             self.lstm = nn.LSTM(
 47 |                 embedding_dim,
 48 |                 hidden_dim,
 49 |                 num_layers=rnn_layers,
 50 |                 bidirectional=True,
 51 |                 dropout=rnn_dropout,
 52 |                 batch_first=True
 53 |             )
 54 | 
 55 |         # self.crf = CRF(
 56 |         #     target_size=num_labels,
 57 |         #     average_batch=True,
 58 |         #     use_cuda=use_cuda
 59 |         # )
 60 | 
 61 |         # TODO: add contraints
 62 |         constraints = allowed_transitions('BIO', dict(enumerate(["O", "B", "I"])))
 63 |         include_start_end_transitions = True
 64 |         self.crf = ConditionalRandomField(
 65 |             num_labels,
 66 |             constraints,
 67 |             include_start_end_transitions=include_start_end_transitions
 68 |         )
 69 | 
 70 |         self.liner = nn.Linear(hidden_dim * 2, num_labels)
 71 |         self.num_labels = num_labels
 72 | 
 73 |         self.output_dropout = nn.Dropout(p=output_dropout)
 74 | 
 75 |     def rand_init_hidden(self, batch_size):
 76 |         """
 77 |         random initialize hidden variable
 78 |         """
 79 |         return Variable(
 80 |             torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)), Variable(
 81 |             torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim))
 82 |     
 83 |     def clear_subtokens(self, logits, labels, mask):
 84 |         clear_labels = torch.zeros_like(labels)
 85 |         clear_logits = torch.zeros_like(logits)
 86 |         clear_mask = torch.zeros_like(mask)
 87 | 
 88 |         for i in range(len(labels)):
 89 |             assert (mask[i][labels[i] != -100] == 1).all()
 90 |             cor = labels[i][labels[i] != -100]
 91 |             clear_labels[i][:len(cor)] = cor
 92 |             clear_logits[i][:len(cor)] = logits[i][labels[i] != - 100]
 93 |             clear_mask[i][:len(cor)] = 1
 94 |         return clear_logits, clear_labels, clear_mask
 95 | 
 96 |     def forward(self, **kwargs):
 97 |         '''
 98 |         args:
 99 |             sentence (word_seq_len, batch_size) : word-level representation of sentence
100 |             hidden: initial hidden state
101 | 
102 |         return:
103 |             crf output (word_seq_len, batch_size, tag_size, tag_size), hidden
104 |         '''
105 | 
106 |         kwargs_copy = copy.deepcopy(kwargs)
107 |         if "labels" in kwargs_copy:
108 |             kwargs_copy.pop("labels")
109 | 
110 |         batch_size = kwargs["input_ids"].size(0)
111 |         seq_length = kwargs["input_ids"].size(1)
112 | 
113 |         bert_outputs = self.bert_encoder(
114 |             **kwargs
115 |         )
116 |         sequence_output = bert_outputs[1]
117 | 
118 |         if self.lstm is not None:
119 |             hidden = self.rand_init_hidden(batch_size)
120 |             if kwargs["input_ids"].is_cuda:
121 |                 hidden = (i.cuda() for i in hidden)
122 |             sequence_output, hidden = self.lstm(sequence_output, hidden)
123 |             sequence_output = sequence_output.contiguous().view(-1, self.hidden_dim * 2)
124 |             sequence_output = self.output_dropout(sequence_output)
125 |             
126 |             sequence_output = self.liner(sequence_output)
127 | 
128 |         #out = self.liner(sequence_output)
129 |         out = sequence_output
130 |         logits = out.contiguous().view(batch_size, seq_length, -1)
131 |         
132 |         clear_logits, clear_labels, clear_mask = self.clear_subtokens(logits, kwargs['labels'], kwargs["attention_mask"])
133 |         
134 |         """
135 |         best_paths = self.crf.viterbi_tags(
136 |             logits,
137 |             kwargs["attention_mask"].long(),
138 |             top_k=1
139 |         )
140 |         """
141 |         best_paths = self.crf.viterbi_tags(
142 |             clear_logits,
143 |             clear_mask.long(),
144 |             top_k=1
145 |         )
146 |         # Just get the top tags and ignore the scores.
147 |         predicted_tags = cast(List[List[int]], [x[0][0] for x in best_paths])
148 |         
149 |         if kwargs.get("labels") is not None:
150 |             labels = kwargs.get("labels").cpu()
151 |             #log_likelihood = self.crf(logits, kwargs.get("labels"), kwargs["attention_mask"])
152 |             log_likelihood = self.crf(clear_logits, clear_labels, clear_mask)
153 |             loss = -log_likelihood
154 |             correct_predicted_tags = np.zeros_like(labels)
155 |             for i in range(len(labels)):
156 |                 correct_predicted_tags[i][labels[i] != -100] = predicted_tags[i]
157 |             return (loss, logits, list(correct_predicted_tags))
158 | 
159 |         return (None, logits, predicted_tags)
160 | 
161 | 
# Placeholder entry point: running this module as a script does nothing.
if __name__ == "__main__":
    pass
164 | 


--------------------------------------------------------------------------------
/span_identification/submission.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import numpy as np
  3 | from unidecode import unidecode
  4 | import string
  5 | import nltk
  6 | from nltk.corpus import stopwords
  7 | 
  8 | 
  9 | def merge_spans(spans, articles_id, articles_content):
 10 |     res = dict()
 11 |     articles_content_dict = dict(zip(articles_id, articles_content))
 12 |     for article_id in spans:
 13 |         article = articles_content_dict[article_id]
 14 |         res[article_id] = []
 15 |         mask = np.zeros(len(article))
 16 |         for span in spans[article_id]:
 17 |             mask[span[0]: span[1]] = 1
 18 |         start = -1
 19 |         length = 0
 20 |         for i in range(len(mask)):
 21 |             if mask[i] == 0:
 22 |                 if start != -1:
 23 |                     res[article_id].append((start, start + length))
 24 |                     start = -1
 25 |                     length = 0
 26 |             if mask[i] == 1:
 27 |                 if start == -1:
 28 |                     start = i
 29 |                     length = 1
 30 |                 else:
 31 |                     length += 1     
 32 |     return res
 33 | 
 34 | 
 35 | def correct_spans(spans, articles_id, articles_content):
 36 |     stop_words = set(stopwords.words('english'))
 37 |     res = dict()
 38 |     articles_content_dict = dict(zip(articles_id, articles_content))
 39 |     for article_id in spans:
 40 |         article = articles_content_dict[article_id]
 41 |         res[article_id] = []
 42 |         mask = np.zeros(len(article))
 43 |         for span in spans[article_id]:
 44 |             mask[span[0]: span[1] + 1] = 1
 45 |         start = -1
 46 |         length = 0
 47 |         for i in range(len(mask)):
 48 |             if mask[i] == 0:
 49 |                 if start != -1:
 50 |                     end = start + length
 51 |                     
 52 |                     if unidecode(article[start - 1]) == '"':
 53 |                         start -= 1
 54 |                     else:
 55 |                         while not article[start].isalnum():
 56 |                             start += 1
 57 |                     if unidecode(article[end]) == '"':
 58 |                         end += 1
 59 |                     
 60 |                     if unidecode(article[end - 1]) != '"':
 61 |                         while not article[end - 1].isalnum():
 62 |                             end -= 1
 63 |                     if end - start > 1:
 64 |                         if article[start: end].lower() not in stop_words:
 65 |                             res[article_id].append((start, end))
 66 |                     '''
 67 |                     while article[end - 1].isspace():
 68 |                         end -= 1
 69 |                     if end > start:
 70 |                         res[article_id].append((start, end))
 71 |                     '''
 72 |                     start = -1
 73 |                     length = 0
 74 |             
 75 |             if mask[i] == 1:
 76 |                 if start == -1:
 77 |                     start = i
 78 |                     length = 1
 79 |                 else:
 80 |                     length += 1
 81 |         
 82 |         if start != -1:
 83 |             if unidecode(article[start - 1]) == '"':
 84 |                 start -= 1
 85 |                 length += 1
 86 |             if unidecode(article[start + length]) == '"':
 87 |                 length += 1
 88 |             if unidecode(article[start + length - 1]) != '"':
 89 |                 while not article[start + length - 1].isalnum():
 90 |                     length -= 1
 91 |             if length > 0:
 92 |                 res[article_id].append((start, start + length))
 93 |     return res # merge_spans(res, articles_id, articles_content)
 94 | 
 95 | 
def get_spans_from_file(file, articles_id, articles_content, nlp):
    """Rebuild character spans from a file of per-token labels.

    The articles are re-tokenized with ``nlp`` and the label file is read in
    lock-step: one ``token<TAB>label`` line per non-empty token. Consecutive
    tokens labelled ``B-PROP`` / ``I-PROP`` form a span; the collected spans
    are post-processed by ``correct_spans``.

    :param file: path of the label file
    :param articles_id: article ids, parallel to ``articles_content``
    :param articles_content: article texts
    :param nlp: callable returning tokens that expose ``idx`` and ``text``
    :return: the result of ``correct_spans`` for the collected spans
    :raises AssertionError: if a token read from the file differs from the
                            token produced by ``nlp``
    """
    pred_spans = dict()
    with open(file, 'r') as f:
        for article_id, text in zip(articles_id, articles_content):
            pred_spans.setdefault(article_id, [])
            tokens = [(token.idx, token.text) for token in nlp(text)]
            # Column 0 holds the character offsets, column 1 the token texts;
            # offsets are converted back with int() wherever they are used.
            idx = np.array(tokens)[:,0]
            tokens = np.array(tokens)[:,1]
            tokens = [token.strip().replace('\n', ' ').replace('\t', ' ') for token in tokens]
            
            i = 0
            # Start offset of the span currently being built, -1 when none is open.
            # NOTE(review): a span still open when the tokens of an article run
            # out is not appended; ``start`` is simply reset here for the next
            # article.
            start = -1
            for i in range(len(tokens)):
                tok = tokens[i]
                # Only tokens that are non-empty after cleaning (and are not a
                # BOM / right-to-left mark) have a line in the file.
                if len(tok) != 0 and repr(tok) != repr('\ufeff') and repr(tok) != repr('\u200f'):
                    token, label = f.readline().split('\t')
                    label = label.strip()
                    # A B- tag, or an I- tag with no open span, starts a new
                    # span; a span that is still open is closed at the end of
                    # the previous token first.
                    if label == 'B-PROP' or (label == 'I-PROP' and start == -1):
                        if start != -1:
                            pred_spans[article_id].append((start, int(idx[i - 1]) + len(tokens[i - 1])))
                        start = int(idx[i])                        
                    if label == 'O':
                        if start != -1:
                            pred_spans[article_id].append((start, int(idx[i - 1]) + len(tokens[i - 1])))
                        start = -1
                    # Sanity checks: file and tokenizer must stay in sync, and
                    # the offset must point at the token inside the text.
                    assert token == tok
                    assert tok == text[int(idx[i]): int(idx[i]) + len(tok)]
                    # prev_label is assigned but never read in this function.
                    prev_label = label
                    prev_tok = tok
                else:
                    # The first skipped token after a real one consumes one
                    # line of the file - presumably the blank separator line
                    # written by the dataset writer; TODO confirm.
                    # NOTE(review): prev_tok has no initial value, so this
                    # raises UnboundLocalError if the very first token of the
                    # first article is skipped.
                    if prev_tok != '\n':
                        f.readline()
                        prev_tok = '\n'
                    prev_label = 'O'
    
    return correct_spans(pred_spans, articles_id, articles_content)
132 | 
133 | 
def get_submission_format(predicted_labels_files, articles_id, articles_content, nlp, output_file):    
    """Combine the spans predicted in several label files into one submission file.

    The spans of all files are pooled per article, merged with
    ``merge_spans`` and written as ``article_id<TAB>start<TAB>end`` lines.

    :param predicted_labels_files: paths of the per-token label files
    :param articles_id: article ids, parallel to ``articles_content``
    :param articles_content: article texts
    :param nlp: tokenizer handed to ``get_spans_from_file``
    :param output_file: path of the file to write
    """
    pooled = dict()
    for labels_file in predicted_labels_files:
        file_spans = get_spans_from_file(labels_file, articles_id, articles_content, nlp)
        for article_id, found in file_spans.items():
            # Build a new list each time so the per-file results stay untouched
            pooled[article_id] = pooled.get(article_id, []) + found
    pooled = merge_spans(pooled, articles_id, articles_content)

    with open(output_file, "w") as fout:
        for article_id, merged in pooled.items():
            for span in merged:
                fout.write("%s\t%s\t%s\n" % (article_id, span[0], span[1]))
146 | 


--------------------------------------------------------------------------------
/span_identification/dataset.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import glob
  3 | import os
  4 | from shutil import copyfile, rmtree
  5 | import random
  6 | import pandas as pd
  7 | import numpy as np
  8 | from sklearn.model_selection import train_test_split
  9 | from tqdm import tqdm
 10 | 
 11 | 
 12 | def load_data(data_folder, propaganda_techniques_file):
 13 |     file_list = glob.glob(os.path.join(data_folder, "*.txt"))
 14 |     articles_content, articles_id = ([], [])
 15 |     for filename in sorted(file_list):
 16 |         with open(filename, "r", encoding="utf-8") as f:
 17 |             articles_content.append(f.read())
 18 |             articles_id.append(os.path.basename(filename).split(".")[0][7:])
 19 | 
 20 |     with open(propaganda_techniques_file, "r") as f:
 21 |         propaganda_techniques_names = [line.rstrip() for line in f.readlines()]
 22 |     
 23 |     return articles_content, articles_id, propaganda_techniques_names
 24 | 
 25 | 
 26 | def read_predictions_from_file(filename):
 27 |     articles_id, gold_spans = ([], [])
 28 |     with open(filename, "r") as f:
 29 |         for row in f.readlines():
 30 |             article_id, gold_span_start, gold_span_end = row.rstrip().split("\t")
 31 |             articles_id.append(article_id)
 32 |             gold_spans.append(tuple(int(el) for el in [gold_span_start, gold_span_end]))
 33 |     return articles_id, gold_spans
 34 | 
 35 | 
 36 | def group_spans_by_article_ids(span_list):
 37 |     data = {}
 38 |     for el in span_list:
 39 |         article_id, span = el[0], el[1]
 40 |         data.setdefault(article_id, [])
 41 |         data[article_id].append(span)
 42 |     return data
 43 | 
 44 | 
 45 | def get_train_dev_files(articles_id, articles_content, nlp, labels_path, train_file, dev_file, split_by_ids=True, 
 46 |                      dev_size=0.3, random_state=42):
 47 |     articles_content_dict = dict(zip(articles_id, articles_content))
 48 |     articles_id, gold_spans = read_predictions_from_file(labels_path)
 49 |     span_list = list(zip(articles_id, gold_spans))
 50 |     
 51 |     if split_by_ids:
 52 |         data = group_spans_by_article_ids(span_list)
 53 |         train_ids, dev_ids = train_test_split(np.unique(articles_id), test_size=dev_size, random_state=random_state)
 54 |         train_data = sorted([(key, value) for (key, value) in data.items() if key in train_ids])
 55 |         dev_data = sorted([(key, value) for (key, value) in data.items() if key in dev_ids])
 56 |     else:
 57 |         span_list_train, span_list_test = train_test_split(span_list, test_size=dev_size, random_state=random_state)
 58 |         train_data = sorted(group_spans_by_article_ids(span_list_train).items())
 59 |         dev_data = sorted(group_spans_by_article_ids(span_list_train).items())
 60 |         train_ids = [example[0] for example in train_data]
 61 |         dev_ids = [example[0] for example in dev_data]
 62 |     
 63 |     create_BIO_labeled(train_file, train_data, articles_content_dict, nlp)
 64 |     create_BIO_labeled(dev_file, dev_data, articles_content_dict, nlp)
 65 |     
 66 |     return train_ids, dev_ids
 67 |     
 68 |                     
def get_test_file(file, articles_id, articles_content, nlp):
    """Write the test file by delegating to ``create_BIO_unlabeled``.

    All four arguments are forwarded unchanged; nothing is returned.
    """
    create_BIO_unlabeled(file, articles_id, articles_content, nlp)
 71 |     
 72 | 
 73 | def token_label_from_spans(pos, spans):
 74 |     for el in spans:
 75 |         if el[0] <= int(pos) < el[1]:
 76 |             return "PROP"
 77 |     return 'O'
 78 | 
 79 |                     
 80 | def create_BIO_labeled(file, data, articles_content_dict, nlp):
 81 |     prev_label = 'O'
 82 |     with open(file, 'w') as f:
 83 |         for article_id, spans in tqdm(data):
 84 |             text = articles_content_dict[article_id]
 85 |             tokens = [(token.idx, token.text) for token in nlp(text)]
 86 |             idx = np.array(tokens)[:,0]
 87 |             tokens = np.array(tokens)[:,1]
 88 |             prev_tok = '\n'
 89 |             
 90 |             for i in range(len(tokens)):
 91 |                 tok = tokens[i].replace('\n', ' ').replace('\t', ' ').strip()
 92 |                 if len(tok) != 0 and repr(tok) != repr('\ufeff') and repr(tok) != repr('\u200f'):
 93 |                     tok = tokens[i].strip().replace('\n', ' ').replace('\t', ' ')
 94 |                     label =  token_label_from_spans(idx[i], spans)
 95 |                     if label != 'O':
 96 |                         if prev_label != 'O':
 97 |                             label = 'I-' + 'PROP'
 98 |                         else:
 99 |                             label = 'B-' + 'PROP'
100 |                     f.write(tok + '\t' + label + '\n')
101 |                     prev_label = label
102 |                     prev_tok = tok
103 |                 else:
104 |                     if prev_tok != '\n':
105 |                         f.write('\n')
106 |                         prev_tok = '\n'
107 |                     prev_label = 'O'
108 | 
109 |                     
def create_BIO_unlabeled(file, articles_id, articles_content, nlp):
    """Write the tokens of the given articles, all labelled ``O``, one token per line.

    Each written line is ``token<TAB>O``. Tokens that are empty after
    whitespace normalisation, or that consist only of ``'\\ufeff'`` /
    ``'\\u200f'``, are not written; a run of such tokens produces a single
    empty line instead.

    :param file: path of the output file (overwritten)
    :param articles_id: ids of the articles, parallel to ``articles_content``
        (only used to pair up with the texts)
    :param articles_content: texts of the articles
    :param nlp: callable turning a text into an iterable of tokens that expose
        ``text``
    """
    with open(file, 'w') as f:
        for article_id, text in tqdm(zip(articles_id, articles_content)):
            prev_tok = '\n'

            # Iterate over the tokens directly: going through a 2-D numpy
            # array fails with IndexError when an article yields no tokens.
            for token in nlp(text):
                tok = token.text.replace('\n', ' ').replace('\t', ' ').strip()
                if tok and tok != '\ufeff' and tok != '\u200f':
                    label = 'O'
                    f.write(tok + '\t' + label + '\n')
                    prev_tok = tok
                else:
                    # Whitespace-only token: emit one separator line at most.
                    if prev_tok != '\n':
                        f.write('\n')
                        prev_tok = '\n'
132 | 
133 |                     
def create_subfolder(subfolder, source_folder, articles_id):
    """Recreate ``subfolder`` and copy the selected article files into it.

    An existing ``subfolder`` is removed with all its content first. For every
    id the file ``article<id>.txt`` is copied from ``source_folder``.

    :param subfolder: destination directory (deleted and recreated)
    :param source_folder: directory holding the ``article<id>.txt`` files
    :param articles_id: iterable of article ids to copy
    """
    already_present = os.path.exists(subfolder)
    if already_present:
        rmtree(subfolder)
    os.makedirs(subfolder)

    file_names = ('article' + str(article_id) + '.txt' for article_id in articles_id)
    for name in file_names:
        source_path = os.path.join(source_folder, name)
        target_path = os.path.join(subfolder, name)
        copyfile(source_path, target_path)
141 | 


--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | Scorers for the Propaganda Techniques Corpus Version 2
  3 | 
  4 | Contents
  5 | 
  6 | 1. Tasks
  7 | 2. Evaluation scripts
  8 | 3. Data format
  9 | 4. Tools
 10 | 5. Citation 
 11 | 6. Changes from version 1 
 12 | 
 13 | 
 14 | Tasks
 15 | --------------------------------------------
 16 | The Propaganda Techniques Corpus (PTC) is a corpus of articles annotated 
 17 | with propaganda techniques at a fine-grained level. The list of 
 18 | techniques is in file data/propaganda-techniques-names-semeval2020task11.txt.
 19 | Among the different tasks that the corpus enables SemEval 2020 Task 11 focuses on the following ones:
 20 | 
 21 | Subtask 1 (SI). Propaganda Identification.
 22 | Given a plain-text document, identify those specific fragments that contain one propaganda technique. This is a binary sequence tagging task.
 23 | 
 24 | Subtask 2 (TC). Propaganda Technique Labeling.
 25 | Given a text fragment identified as propaganda and its document context, identify the applied propaganda technique at hand. This is a multi-class classification problem.
 26 | 
 27 | See the paper in the section "Citation" for further details. 
 28 | 
 29 | 
 30 | Evaluation scripts
 31 | --------------------------------------------
 32 | 
 33 | -Task SI (task-SI_scorer.py)
 34 | 
 35 | The evaluation script computes a variant of precision, recall, and F-measure 
 36 | that takes into account partial overlaps between fragments (see 
 37 | http://propaganda.qcri.org/semeval2020-task11/data/propaganda_tasks_evaluation.pdf
 38 | for more details). 
 39 | 
 40 | The script can be run as follows:
 41 | 
 42 | python3 task-SI_scorer.py -s [prediction_file] -r [gold_folder] -m
 43 | 
 44 | Note that all *.labels files in [gold_folder] are assumed to contain gold labels.
 45 | As an example, we provide a "prediction_file" data/submission-task-SI.tsv 
 46 | and you can run it as follows:
 47 | 
 48 | ===
 49 | 
 50 | $ python3 task-SI_scorer.py -s data/submission-task-SI.tsv -r data -m
 51 | 2019-09-20 19:47:26,427 - INFO - Checking user submitted file
 52 | 2019-09-20 19:47:26,429 - INFO - Scoring the submission with precision and recall method
 53 | 2019-09-20 19:47:26,430 - INFO - Precision=1.929825/2=0.964912  Recall=1.947458/4=0.486864
 54 | 2019-09-20 19:47:26,430 - INFO - F1=0.647181
 55 | 
 56 | 
 57 | ===
 58 | 
 59 | The scorer for the TC task is task-TC_scorer.py. 
 60 | The scorer requires file data/propaganda-techniques-names-semeval2020task11.txt. 
 61 | Such file contains the list of techniques used for scoring. 
 62 | Adding and removing items from the list will affect the outcome of the scorer. 
 63 | It can be run as follows
 64 | 
 65 | python3 task-TC_scorer.py -s [prediction_file] -r [gold_file] -p data/propaganda-techniques-names-semeval2020task11.txt
 66 | 
 67 | For example:
 68 | 
 69 | $ python3 task-TC_scorer.py -s data/submission-task-TC.tsv -r data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt 2>/dev/null
 70 | 2019-09-20 19:39:21,286 - INFO - Checking format: User Predictions -- Gold Annotations
 71 | 2019-09-20 19:39:21,287 - INFO - OK: submission file format appears to be correct
 72 | 2019-09-20 19:39:21,293 - INFO - Scoring submission
 73 | F1=0.600000
 74 | Precision=0.600000
 75 | Recall=0.600000
 76 | F1_Appeal_to_Authority=0.0
 77 | F1_Appeal_to_fear-prejudice=0.0
 78 | F1_Bandwagon,Reductio_ad_hitlerum=0.0
 79 | F1_Black-and-White_Fallacy=0.0
 80 | F1_Causal_Oversimplification=0.0
 81 | F1_Doubt=0.0
 82 | F1_Exaggeration,Minimisation=1.0
 83 | F1_Flag-Waving=0.0
 84 | F1_Loaded_Language=0.6666666666666666
 85 | F1_Name_Calling,Labeling=0.6666666666666666
 86 | F1_Repetition=0.0
 87 | F1_Slogans=0.0
 88 | F1_Thought-terminating_Cliches=0.0
 89 | F1_Whataboutism,Straw_Men,Red_Herring=0.0
 90 | 
 91 | 
 92 | Data format
 93 | --------------------------------------------
 94 | 
 95 | -Task SI
 96 | 
 97 | The corpus includes one tab-separated file per article in the following 
 98 | format: 
 99 | 
100 | id   begin_offset     end_offset
101 | 
102 | where 
103 | 	id is the identifier of the article
104 | 	begin_offset is the character where the covered span begins (inclusive)
105 | 	end_offset is the character where the covered span ends (exclusive)
106 | 
107 | An example of such a file is data/article736757214.task-SI.labels. 
108 | 
109 | -Task TC
110 | 
111 | The corpus includes one tab-separated file per article in the following format:
112 | 
113 | id   technique    begin_offset     end_offset
114 | 
115 | The fields are the same as for task SI, but it now also includes "technique", i.e., the propaganda technique applied in the instance. 
116 | 
117 | 
118 | Tools
119 | --------------------------------------------
120 | 
121 | - The script print_spans.py highlights the annotations in an article.
122 | 
123 | python3 print_spans.py -s [annotations_file] -t [article_file] -p [propaganda_techniques_file]
124 | 
125 | For example:
126 | 
127 | python3 print_spans.py -t data/article736757214.txt -s data/article736757214.labels-task-TC -p data/propaganda-techniques-names-semeval2020task11.txt
128 | 
129 | 
130 | Citation 
131 | --------------------------------------------
132 | 
133 | Please cite the following publication when using the PTC corpus:
134 | 
135 | G. Da San Martino, S. Yu, A. Barrón-Cedeño, R. Petrov and P. Nakov, "Fine-Grained Analysis of Propaganda in News Articles", to appear at the Conference on Empirical Methods in Natural Language Processing (EMNLP 2019), Hong Kong, China, November 3-7, 2019.
136 | 
137 | @InProceedings{EMNLP19DaSanMartino,
138 | author = {Da San Martino, Giovanni and
139 | Yu, Seunghak and
140 | Barr\'{o}n-Cede\~no, Alberto and
141 | Petrov, Rostislav and
142 | Nakov, Preslav},
143 | title = {Fine-Grained Analysis of Propaganda in News Articles},
144 | booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and 9th International Joint Conference on Natural Language Processing, EMNLP-IJCNLP 2019, Hong Kong, China, November 3-7, 2019},
145 | series = {EMNLP-IJCNLP 2019},
146 | year = {2019},
147 | address = {Hong Kong, China},
148 | month = {November},
149 | }
150 | 
151 | 
152 | Changes from version 1
153 | --------------------------------------------
154 | 
155 | Fixed a bug in the evaluation function for task TC that, in certain cases, prevented it from finding the best alignment between the labels of identical spans.
156 | 
157 | Now print_spans.py has a parameter -p specifying the file with the list of propaganda techniques
158 | 
159 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Semeval 2020, Task 11
  2 | 
  3 | ## Overview
  4 | This repository provides code for the SemEval-2020 Task 11 competition (Detection of Propaganda Techniques in News Articles).
  5 | 
  6 | The competition webpage: https://propaganda.qcri.org/semeval2020-task11/
  7 | 
  8 | The description of the architecture of models can be found in our paper [Aschern at SemEval-2020 Task 11: It Takes Three to Tango: RoBERTa, CRF, and Transfer Learning](https://www.aclweb.org/anthology/2020.semeval-1.191/).
  9 | 
 10 | ## Requirements
 11 | ```
 12 | pip install -r ./requirements.txt
 13 | ```
 14 | 
 15 | ## Project structure
 16 | 
 17 | - `configs`: yaml configs for the system
 18 | - `datasets`: contains the task datasets, which can be downloaded from the team competition webpage
 19 | - `results`: the folder for submissions
 20 | - `span_identification`: code for the task SI
 21 |   - `ner`: pytorch-transformers RoBERTa model with CRF (end-to-end)
 22 |   - `dataset`: the scripts for loading and preprocessing source dataset
 23 |   - `submission`: the scripts for obtaining and evaluating results
 24 | - `technique_classification`: code for the task TC (the folder has the same structure as `span_identification`)
 25 | - `tools`: tools provided by the competition organizers; contain useful functions for reading datasets and evaluating submissions
 26 | - `visualization_example`: example of visualization of results for both tasks
 27 | 
 28 | ## Running the models
 29 | 
 30 | All commands are run from the root directory of the repository.
 31 | 
 32 | ### Span Identification
 33 | 
 34 | 1. Configure `configs/si_config.yml` file, if it is needed. data_dir is the path to the cache of original train/eval sub-datasets and their BIO versions. In addition to using the config, it is also possible to specify arguments through the command line.
 35 | 
 36 | 2. Split the dataset for local evaluation (if `--overwrite_cache`, previous files will be replaced). It will produce files with the BIO-format tagging for spans (B-PROP, I-PROP, O) in your `--data_dir`.
 37 |     ```bash
 38 |     python -m span_identification --config configs/si_config.yml --split_dataset --overwrite_cache
 39 |     ```
 40 | 3. Train and eval model (the model parameters are specified in the config, you need to change the paths). The use of CRF is regulated by the flag `--use_crf`. For the first run you can use `--model_name_or_path roberta-large`.
 41 |     ```bash
 42 |     python -m span_identification --config configs/si_config.yml --do_train --do_eval
 43 |     ```
 44 | 4. Apply the trained model to the `test_file` (in BIO-format) specified in the config. The file will be created from the `test_data_folder` folder if it is missing or if the flag `--overwrite_cache` is specified.
 45 |     ```bash
 46 |     python -m span_identification --config configs/si_config.yml --do_predict
 47 |     ```
 48 | 5. Create the submission file `output_file` in the `results` folder. It will obtain spans from the result files with the token labeling specified in `predicted_labels_files`. At the aggregation stage, the span prediction results are simply joined.
 49 |     ```bash
 50 |     python -m span_identification --config configs/si_config.yml --create_submission_file
 51 |     ```
 52 | 6. In case you have the correct markup in the `test_file` or gold `--gold_annot_file` (source competition format), you can run the evaluation competition script.
 53 |     ```bash
 54 |     python -m span_identification --config configs/si_config.yml --do_eval_spans
 55 |     ```
 56 | 7. Use `visualization_example/visualization.ipynb` if you want to visualize labels.
 57 | 
 58 | ### Technique Classification
 59 | 
 60 | Here you need almost the same commands and settings as in the SI task.
 61 | 
 62 | 1. Configure `configs/tc_config.yml` file, if it is needed.
 63 | 
 64 | 2. Split the dataset for local evaluation.
 65 |     ```bash
 66 |     python -m technique_classification --config configs/tc_config.yml --split_dataset --overwrite_cache
 67 |     ```
 68 | 3. Train and eval model. We used two setups with and without flags `--join_embeddings --use_length` (to get our RoBERTa-Joined). For the first run you can use `--model_name_or_path roberta-large`.
 69 |     ```bash
 70 |     python -m technique_classification --config configs/tc_config.yml --do_train --do_eval
 71 |     ```
 72 |     or distributed
 73 |     ```
 74 |     CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node 4 technique_classification --config configs/tc_config.yml --do_train --do_eval
 75 |     ```
 76 | 4. Apply the trained model to the `test_file` specified in the config. The file will be created from the `test_data_folder` folder and the `test_template_labels_path` file if it is missing or if the flag `--overwrite_cache` is specified.
 77 |     ```bash
 78 |     python -m technique_classification --config configs/tc_config.yml --do_predict --join_embeddings --use_length
 79 |     ```
 80 | 5. Create the submission file `output_file`. It will combine predictions from the list `predicted_logits_files` with coefficients specified in `--weights` (optional) and apply some post-processing.
 81 |     ```bash
 82 |     python -m technique_classification --config configs/tc_config.yml --create_submission_file
 83 |     ```
 84 | 6. In case you have the correct markup in the `test_file` or gold `--test_labels_path` (source competition format), you can check your accuracy (micro f1-score) and f1-score per classes.
 85 |     ```bash
 86 |     python -m technique_classification --config configs/tc_config.yml  --eval_submission
 87 |     ```
 88 | 7. Use `visualization_example/visualization.ipynb` if you want to visualize labels.
 89 | 
 90 | Our pretrained RoBERTa-CRF (SI task) and RoBERTa-Joined (TC task) models are available in [Google Drive](https://drive.google.com/drive/folders/1Gph7FKMaxOBJdkrk0nM72uFpCGgn-2kC?usp=sharing).
 91 | 
 92 | ## Citation
 93 | 
 94 | If you find this repository helpful, feel free to cite our publication [Aschern at SemEval-2020 Task 11: It Takes Three to Tango: RoBERTa, CRF, and Transfer Learning](https://www.aclweb.org/anthology/2020.semeval-1.191/):
 95 | ```
 96 | @inproceedings{chernyavskiy-etal-2020-aschern,
 97 |     title = "Aschern at {S}em{E}val-2020 Task 11: It Takes Three to Tango: {R}o{BERT}a, {CRF}, and Transfer Learning",
 98 |     author = "Chernyavskiy, Anton  and
 99 |       Ilvovsky, Dmitry  and
100 |       Nakov, Preslav",
101 |     booktitle = "Proceedings of the Fourteenth Workshop on Semantic Evaluation",
102 |     month = dec,
103 |     year = "2020",
104 |     address = "Barcelona (online)",
105 |     publisher = "International Committee for Computational Linguistics",
106 |     url = "https://www.aclweb.org/anthology/2020.semeval-1.191",
107 |     pages = "1462--1468"
108 | }
109 | ``` 
110 | 


--------------------------------------------------------------------------------
/tools/src/annotation_w_o_label.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import sys
  3 | import src.propaganda_techniques as pt
  4 | import logging.handlers
  5 | 
  6 | __author__ = "Giovanni Da San Martino"
  7 | __copyright__ = "Copyright 2019"
  8 | __credits__ = ["Giovanni Da San Martino"]
  9 | __license__ = "GPL"
 10 | __version__ = "0.1"
 11 | __maintainer__ = "Giovanni Da San Martino"
 12 | __email__ = "gmartino@hbku.edu.qa"
 13 | __status__ = "Beta"
 14 | 
 15 | logger = logging.getLogger("propaganda_scorer")
 16 | 
 17 | 
 18 | class AnnotationWithOutLabel(object):
 19 | 
 20 |     """
 21 |     One annotation is represented by a span (two integer indices indicating the 
 22 |     starting and ending position of the span). 
 23 |     The class provides basic maniputation functions for one annotation. 
 24 |     """
 25 | 
 26 |     # input file format variables
 27 |     separator = "\t"
 28 |     ARTICLE_ID_COL = 0
 29 |     FRAGMENT_START_COL = 1
 30 |     FRAGMENT_END_COL = 2
 31 | 
 32 | 
 33 |     def __init__(self, start_offset:str = None, end_offset:str=None): 
 34 |         
 35 |         self.start_offset = int(start_offset)
 36 |         self.end_offset = int(end_offset)
 37 | 
 38 | 
 39 |     def __str__(self):
 40 | 
 41 |         return "[%d, %d]"%(self.start_offset, self.end_offset)
 42 |         #return "%d\t%d"%(self.start_offset, self.end_offset)
 43 | 
 44 | 
 45 |     def is_span_equal_to(self, second_annotation:AnnotationWithOutLabel)->bool:
 46 |         """
 47 |         Checks whether two annotations are identical, i.e. whether the two spans are identical. 
 48 |         """
 49 |         if self.get_start_offset() != second_annotation.get_start_offset() or self.get_end_offset() != second_annotation.get_end_offset():
 50 |             return False
 51 |         return True
 52 | 
 53 | 
 54 |     def __eq__(self, second_annotation:AnnotationWithOutLabel):
 55 |         
 56 |         return self.is_span_equal_to(second_annotation)
 57 | 
 58 | 
 59 |     def get_start_offset(self)->int:
 60 | 
 61 |         return self.start_offset
 62 | 
 63 |         
 64 |     def get_end_offset(self)->int:
 65 | 
 66 |         return self.end_offset
 67 | 
 68 |     
 69 |     def get_span(self)->set:
 70 |         """
 71 |         Returns a set of positions of all characters in the span
 72 |         """
 73 |         return set(range(self.get_start_offset(), self.get_end_offset()))
 74 | 
 75 |     
 76 |     @staticmethod
 77 |     def load_annotation_from_string(annotation_string:str, row_num:int=None, filename:str=None)->(AnnotationWithOutLabel, str):
 78 |         """
 79 |         Read annotations from a csv-like string, with fields separated
 80 |         by the class variable `separator`: 
 81 | 
 82 |         article id<separator>starting_position<separator>ending_position
 83 |         Fields order is determined by the class variables ARTICLE_ID_COL,
 84 |         FRAGMENT_START_COL, FRAGMENT_END_COL
 85 | 
 86 |         Besides reading the data, it performs basic checks.
 87 | 
 88 |         :return a tuple (AnnotationWithOutLabel object, id of the article)
 89 |         """
 90 | 
 91 |         row = annotation_string.rstrip().split(AnnotationWithOutLabel.separator)
 92 |         if len(row) != 3:
 93 |             logger.error("Row%s%s is supposed to have 3 columns. Found %d: -%s-."
 94 |                          % (" " + str(row_num) if row_num is not None else "",
 95 |                             " in file " + filename if filename is not None else "", len(row), annotation_string))
 96 |             sys.exit()
 97 | 
 98 |         article_id = row[AnnotationWithOutLabel.ARTICLE_ID_COL]
 99 |         try:
100 |             start_offset = int(row[AnnotationWithOutLabel.FRAGMENT_START_COL])
101 |         except:
102 |             logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
103 |                          %(AnnotationWithOutLabel.FRAGMENT_START_COL, " " + str(row_num) if row_num is not None else "", " in file " + filename if filename is not None else "", annotation_string))
104 |         try:
105 |             end_offset = int(row[AnnotationWithOutLabel.FRAGMENT_END_COL])
106 |         except:
107 |             logger.error("The column %d in row%s%s is supposed to be an integer: -%s-"
108 |                          %(AnnotationWithOutLabel.FRAGMENT_END_COL, " " + str(row_num) if row_num is not None else "",
109 |                             " in file " + filename if filename is not None else "", annotation_string))
110 | 
111 |         return AnnotationWithOutLabel(start_offset, end_offset), article_id
112 | 
113 | 
114 |     def merge_spans(self, second_annotation:AnnotationWithOutLabel)->None:
115 |         """
116 |         Merge the spans of two annotations. The function does not check whether the spans overlap. 
117 | 
118 |         :param second_annotation: the AnnotationWithOutLabel object whose span is being merged
119 |         :return:
120 |         """
121 |         self.set_start_offset(min(self.get_start_offset(), second_annotation.get_start_offset()))
122 |         self.set_end_offset(max(self.get_end_offset(), second_annotation.get_end_offset()))
123 | 
124 | 
125 |     def set_start_offset(self, new_start_offset:int)->None:
126 | 
127 |         self.start_offset = new_start_offset
128 | 
129 | 
130 |     def set_end_offset(self, new_end_offset:int)->None:
131 | 
132 |         self.end_offset = new_end_offset
133 | 
134 | 
135 |     def shift_annotation(self, offset:int)->None:
136 |         
137 |         self.set_start_offset(self.get_start_offset() + offset)
138 |         self.set_end_offset(self.get_end_offset() + offset)
139 |         
140 | 
141 |     def span_overlapping(self, second_annotation:AnnotationWithOutLabel)->bool:
142 |         return len(self.get_span().intersection(second_annotation.get_span())) > 0
143 | 
144 | 
145 |     def is_span_valid(self)->bool:
146 |         """
147 |         Checks whether the span is valid, i.e. if the following conditions are met: 
148 |         1) start and end offsets >= 0 
149 |         2) start offset < end offset
150 |         """
151 |         if self.get_start_offset() < 0 or self.get_end_offset() < 0:
152 |             logger.error("Start and end of position of the fragment must be non-negative: %d, %d"
153 |                          %(self.get_start_offset(), self.get_end_offset()))
154 |             return False
155 |         if self.get_start_offset() >= self.get_end_offset():
156 |             logger.error("End position of the fragment must be greater than the starting one: start=%d, end=%d"%(self.get_start_offset(), self.get_end_offset()))
157 |             return False
158 |         return True
159 |         
160 | 
161 |     def check_format_of_annotation_in_file(self):
162 |         """
163 |         Performs some checks on the fields of the annotation
164 |         """
165 |         if not self.is_span_valid():
166 |             sys.exit()
167 | 
168 | 


--------------------------------------------------------------------------------
/span_identification/ner/utils_ner.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
 17 | 
 18 | from __future__ import absolute_import, division, print_function
 19 | 
 20 | import logging
 21 | import os
 22 | from io import open
 23 | 
 24 | logger = logging.getLogger(__name__)
 25 | 
 26 | 
 27 | class InputExample(object):
 28 |     """A single training/test example for token classification."""
 29 | 
 30 |     def __init__(self, guid, words, labels):
 31 |         """Constructs a InputExample.
 32 | 
 33 |         Args:
 34 |             guid: Unique id for the example.
 35 |             words: list. The words of the sequence.
 36 |             labels: (Optional) list. The labels for each word of the sequence. This should be
 37 |             specified for train and dev examples, but not for test examples.
 38 |         """
 39 |         self.guid = guid
 40 |         self.words = words
 41 |         self.labels = labels
 42 | 
 43 | 
 44 | class InputFeatures(object):
 45 |     """A single set of features of data."""
 46 | 
 47 |     def __init__(self, input_ids, input_mask, segment_ids, label_ids):
 48 |         self.input_ids = input_ids
 49 |         self.input_mask = input_mask
 50 |         self.segment_ids = segment_ids
 51 |         self.label_ids = label_ids
 52 | 
 53 | 
 54 | def read_examples_from_file(file_path, mode):
 55 |     guid_index = 1
 56 |     examples = []
 57 |     with open(file_path, encoding="utf-8") as f:
 58 |         words = []
 59 |         labels = []
 60 |         for line in f:
 61 |             if line.startswith("-DOCSTART-") or line == "" or line == "\n":
 62 |                 if words:
 63 |                     examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
 64 |                                                  words=words,
 65 |                                                  labels=labels))
 66 |                     guid_index += 1
 67 |                     words = []
 68 |                     labels = []
 69 |             else:
 70 |                 splits = line.split('\t') # " "
 71 |                 words.append(splits[0])
 72 |                 if len(splits) > 1:
 73 |                     labels.append(splits[-1].replace("\n", ""))
 74 |                 else:
 75 |                     # Examples could have no label for mode = "test"
 76 |                     labels.append("O")
 77 |         if words:
 78 |             examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
 79 |                                          words=words,
 80 |                                          labels=labels))
 81 |     return examples
 82 | 
 83 | 
 84 | def convert_examples_to_features(examples,
 85 |                                  label_list,
 86 |                                  max_seq_length,
 87 |                                  tokenizer,
 88 |                                  cls_token_at_end=False,
 89 |                                  cls_token="[CLS]",
 90 |                                  cls_token_segment_id=1,
 91 |                                  sep_token="[SEP]",
 92 |                                  sep_token_extra=False,
 93 |                                  pad_on_left=False,
 94 |                                  pad_token=0,
 95 |                                  pad_token_segment_id=0,
 96 |                                  pad_token_label_id=-1,
 97 |                                  sequence_a_segment_id=0,
 98 |                                  mask_padding_with_zero=True):
 99 |     """ Loads a data file into a list of `InputBatch`s
100 |         `cls_token_at_end` define the location of the CLS token:
101 |             - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
102 |             - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
103 |         `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
104 |     """
105 | 
106 |     label_map = {label: i for i, label in enumerate(label_list)}
107 | 
108 |     features = []
109 |     for (ex_index, example) in enumerate(examples):
110 |         if ex_index % 10000 == 0:
111 |             logger.info("Writing example %d of %d", ex_index, len(examples))
112 | 
113 |         tokens = []
114 |         label_ids = []
115 |         for word, label in zip(example.words, example.labels):
116 |             word_tokens = tokenizer.tokenize(word)
117 |             tokens.extend(word_tokens)
118 |             # Use the real label id for the first token of the word, and padding ids for the remaining tokens
119 |             label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
120 | 
121 |         # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
122 |         special_tokens_count = 3 if sep_token_extra else 2
123 |         if len(tokens) > max_seq_length - special_tokens_count:
124 |             tokens = tokens[:(max_seq_length - special_tokens_count)]
125 |             label_ids = label_ids[:(max_seq_length - special_tokens_count)]
126 | 
127 |         # The convention in BERT is:
128 |         # (a) For sequence pairs:
129 |         #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
130 |         #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
131 |         # (b) For single sequences:
132 |         #  tokens:   [CLS] the dog is hairy . [SEP]
133 |         #  type_ids:   0   0   0   0  0     0   0
134 |         #
135 |         # Where "type_ids" are used to indicate whether this is the first
136 |         # sequence or the second sequence. The embedding vectors for `type=0` and
137 |         # `type=1` were learned during pre-training and are added to the wordpiece
138 |         # embedding vector (and position vector). This is not *strictly* necessary
139 |         # since the [SEP] token unambiguously separates the sequences, but it makes
140 |         # it easier for the model to learn the concept of sequences.
141 |         #
142 |         # For classification tasks, the first vector (corresponding to [CLS]) is
143 |         # used as as the "sentence vector". Note that this only makes sense because
144 |         # the entire model is fine-tuned.
145 |         tokens += [sep_token]
146 |         label_ids += [pad_token_label_id]
147 |         if sep_token_extra:
148 |             # roberta uses an extra separator b/w pairs of sentences
149 |             tokens += [sep_token]
150 |             label_ids += [pad_token_label_id]
151 |         segment_ids = [sequence_a_segment_id] * len(tokens)
152 | 
153 |         if cls_token_at_end:
154 |             tokens += [cls_token]
155 |             label_ids += [pad_token_label_id]
156 |             segment_ids += [cls_token_segment_id]
157 |         else:
158 |             tokens = [cls_token] + tokens
159 |             label_ids = [pad_token_label_id] + label_ids
160 |             segment_ids = [cls_token_segment_id] + segment_ids
161 | 
162 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
163 | 
164 |         # The mask has 1 for real tokens and 0 for padding tokens. Only real
165 |         # tokens are attended to.
166 |         input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
167 | 
168 |         # Zero-pad up to the sequence length.
169 |         padding_length = max_seq_length - len(input_ids)
170 |         if pad_on_left:
171 |             input_ids = ([pad_token] * padding_length) + input_ids
172 |             input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
173 |             segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
174 |             label_ids = ([pad_token_label_id] * padding_length) + label_ids
175 |         else:
176 |             input_ids += ([pad_token] * padding_length)
177 |             input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
178 |             segment_ids += ([pad_token_segment_id] * padding_length)
179 |             label_ids += ([pad_token_label_id] * padding_length)
180 | 
181 |         assert len(input_ids) == max_seq_length
182 |         assert len(input_mask) == max_seq_length
183 |         assert len(segment_ids) == max_seq_length
184 |         assert len(label_ids) == max_seq_length
185 | 
186 |         if ex_index < 5:
187 |             logger.info("*** Example ***")
188 |             logger.info("guid: %s", example.guid)
189 |             logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
190 |             logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
191 |             logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
192 |             logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
193 |             logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
194 | 
195 |         features.append(
196 |                 InputFeatures(input_ids=input_ids,
197 |                               input_mask=input_mask,
198 |                               segment_ids=segment_ids,
199 |                               label_ids=label_ids))
200 |     return features
201 | 
202 | 
def get_labels(path):
    """Return the tag vocabulary used by the sequence-labelling model.

    Args:
        path: Path to a text file with one label per line. Any falsy value
            (``None``, ``""``) selects the built-in tag set instead.

    Returns:
        The labels read from ``path``, with ``"O"`` put in front when the file
        does not list it, or ``["O", "B-PROP", "I-PROP"]`` when no path is
        given.
    """
    if not path:
        # Built-in BIO scheme with a single PROP entity type.
        return ["O", "B-PROP", "I-PROP"]

    with open(path, "r") as label_file:
        file_labels = label_file.read().splitlines()
    # The outside tag must always be part of the vocabulary.
    return file_labels if "O" in file_labels else ["O"] + file_labels
214 | 


--------------------------------------------------------------------------------
/technique_classification/submission.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import numpy as np
  3 | import pandas as pd
  4 | from nltk.stem import PorterStemmer
  5 | from nltk.tokenize import word_tokenize
  6 | from collections import defaultdict
  7 | from sklearn.utils.extmath import softmax
  8 | from sklearn.metrics import accuracy_score, f1_score
  9 | from nltk.corpus import stopwords
 10 | import string
 11 | import pickle
 12 | import os
 13 | from unidecode import unidecode
 14 | from joblib import dump, load
 15 | 
 16 | 
 17 | def get_insides(data):  
 18 |     insides = defaultdict(dict)
 19 |     spans_coords = list(zip(data['span_start'].values, data['span_end'].values))
 20 |     labels = data['label'].values
 21 |     article_ids = data['article_id'].values
 22 |     for i in range(len(spans_coords)):
 23 |         for j in range(i):
 24 |             if article_ids[i] == article_ids[j]:
 25 |                 if spans_coords[i][0] >= spans_coords[j][0] and spans_coords[i][1] <= spans_coords[j][1]:
 26 |                     if spans_coords[i][0] != spans_coords[j][0] or spans_coords[i][1] != spans_coords[j][1]:
 27 |                         insides[labels[i]][labels[j]] = insides[labels[i]].get(labels[j], 0) + 1
 28 |                 if spans_coords[j][0] >= spans_coords[i][0] and spans_coords[j][1] <= spans_coords[i][1]:
 29 |                     if spans_coords[j][0] != spans_coords[i][0] or spans_coords[j][1] != spans_coords[i][1]:
 30 |                         insides[labels[j]][labels[i]] = insides[labels[j]].get(labels[i], 0) + 1
 31 |     return insides
 32 | 
 33 | 
def correct_preds_for_insides(preds, spans_coords, logits, insides, mapping, inverse_mapping):
    """Re-label nested span pairs whose predicted label combination is not in ``insides``.

    For every ordered pair ``(i, j)`` where span ``j`` lies within span ``i``
    and the two spans are not identical, the predictions are revised until
    ``preds[j]`` is a key of ``insides[preds[i]]`` or until the side being
    revised turns out to be confident, in which case both labels are restored.

    Args:
        preds: Indexable, mutable sequence of predicted label names. It is
            modified in place.
        spans_coords: Sequence of ``(start, end)`` pairs, parallel to ``preds``.
        logits: Sequence of per-span score vectors, parallel to ``preds``; each
            one is passed through ``softmax`` before use.
        insides: Mapping of label -> mapping of label, looked up as
            ``insides.get(preds[i], [])``.
        mapping: Class index -> label name.
        inverse_mapping: Label name -> class index.

    Returns:
        The same ``preds`` object that was passed in.
    """
    for i in range(len(preds)):
        for j in range(len(preds)):
            # Span j is covered by span i ...
            if spans_coords[j][0] >= spans_coords[i][0] and spans_coords[j][1] <= spans_coords[i][1]:
                # ... and differs from it in at least one boundary.
                if spans_coords[j][0] != spans_coords[i][0] or spans_coords[j][1] != spans_coords[i][1]:
                    # Labels to fall back to if the search below is abandoned.
                    def_i = preds[i]
                    def_j = preds[j]
                    # Working copies of the class distributions; entries are
                    # zeroed below as candidate labels are rejected.
                    log = softmax([logits[i]])[0]
                    login = softmax([logits[j]])[0]
                    # NOTE(review): def_prob_i / def_prob_j are assigned but never read.
                    def_prob_i = log[inverse_mapping[preds[i]]]
                    def_prob_j = login[inverse_mapping[preds[j]]]
                    # NOTE(review): here the outer span's label (preds[i]) is the
                    # first-level key, whereas get_insides in this file stores
                    # insides[inner_label][outer_label] - confirm the orientation
                    # is intended.
                    # NOTE(review): nothing visibly bounds this loop if no
                    # admissible pair exists and the ratio test never fires.
                    while preds[j] not in insides.get(preds[i], []):
                        if log[inverse_mapping[preds[i]]] > login[inverse_mapping[preds[j]]]:
                            # The outer label currently has the higher probability,
                            # so the inner span is the one to revise.
                            values = np.sort(login)[-2:]
                            # If the inner span's best remaining class beats the
                            # runner-up by more than a factor of 1.4, give up and
                            # restore both original labels.
                            if values[1] / (values[0] + 1e-6) > 1.4:
                                preds[i] = def_i
                                preds[j] = def_j
                                break
                            # Otherwise discard the current inner label and take
                            # the next most probable class.
                            login[inverse_mapping[preds[j]]] = 0
                            preds[j] = mapping[np.argmax(login)]
                        else:
                            # Same procedure applied to the outer span.
                            values = np.sort(log)[-2:]
                            if values[1] / (values[0] + 1e-6) > 1.4:
                                preds[i] = def_i
                                preds[j] = def_j
                                break
                            log[inverse_mapping[preds[i]]] = 0
                            preds[i] = mapping[np.argmax(log)]
    return preds
 63 | 
 64 |                             
 65 | def stem_spans(spans):
 66 |     ps = PorterStemmer()
 67 |     res = []
 68 |     for el in spans:
 69 |         result = " ".join(ps.stem(word) for word in word_tokenize(el.lower()))
 70 |         if len(result) > 0:
 71 |             res.append(result)
 72 |     return res
 73 | 
 74 | 
 75 | def get_train_instances(data, data_dir, save=True):
 76 |     train_instances = dict()
 77 |     stemmed_spans = stem_spans(data.span.values)
 78 |     labels = data.label.values
 79 |     for i in range(len(stemmed_spans)):
 80 |         if labels[i] != 'Repetition':
 81 |             span = stemmed_spans[i]
 82 |             train_instances.setdefault(span, set())
 83 |             train_instances[span].add(labels[i])
 84 |     if save:
 85 |         with open(os.path.join(data_dir, 'train_instances_train'), 'wb') as f:
 86 |             pickle.dump(train_instances, f)
 87 |     return train_instances
 88 |                             
 89 |     
def postprocess(x, mapping, inverse_mapping, insides, stop_words, ps, train_instances):
    """Apply rule-based corrections to the predictions of one group of spans.

    Within this file the function is invoked through
    ``groupby('article_id').apply``, so ``x`` holds the spans of one article.

    Args:
        x: Table with the columns ``span_start``, ``span_end``, ``span``,
            ``pred`` and ``logits``; ``logits`` holds whitespace-separated
            numbers as text.
        mapping: Class index -> label name.
        inverse_mapping: Label name -> class index.
        insides: Nesting statistics forwarded to ``correct_preds_for_insides``.
        stop_words: Container of tokens to drop before counting repetitions.
        ps: Object with a ``stem(word)`` method.
        train_instances: Mapping of stemmed span text -> labels seen in training.

    Returns:
        ``x`` with its ``pred`` column replaced by the corrected labels.
    """
    spans_coords = list(zip(x['span_start'].values, x['span_end'].values))
    spans_source = x['span'].values
    # Stemmed text with nothing removed; this is the lookup key for train_instances.
    spans_text = [' '.join([ps.stem(word) for word in word_tokenize(span.lower())]) for span in spans_source]
    # More aggressive normalisation used only for repetition counting: transliterated
    # with unidecode, stop words removed, punctuation removed. Note that
    # `word not in string.punctuation` is a substring test on the punctuation string,
    # and that the filters look at the token before it is stemmed.
    spans = [' '.join([ps.stem(word) for word in word_tokenize(unidecode(span.lower())) 
                       if word not in stop_words and word not in string.punctuation]) for span in spans_source]
    
    # counts[text] = number of distinct start offsets at which the normalised text occurs.
    counts = dict()
    for i in range(len(spans)):
        counts.setdefault(spans[i], set())
        counts[spans[i]].add(spans_coords[i][0])
    for el in counts:
        counts[el] = len(counts[el])
        
    preds = x['pred'].values
    # Parse the serialised score vectors back into float arrays.
    logits = [np.array(log.split(), dtype=np.float32) for log in x['logits']]
    for i in range(len(preds)):
        # `log` is the same array object as logits[i]; writes to one are visible in the other.
        log = logits[i]
        
        # Force "Repetition" when the normalised text occurs at 3+ positions, or at 2+
        # positions with a Repetition score above 0.001.
        # NOTE(review): the 0.001 / 0.99 thresholds suggest the scores are probabilities;
        # create_submission_file in this file passes weighted softmax outputs - confirm
        # for other callers.
        if counts[spans[i]] >= 3 or (counts[spans[i]] >= 2 and logits[i][inverse_mapping["Repetition"]] > 0.001):
            log[inverse_mapping["Repetition"]] = 100
        
        # Rule out "Repetition" for text seen at a single position unless its score is
        # at least 0.99 and the normalised text has more than one token.
        if counts[spans[i]] == 1 and (logits[i][inverse_mapping["Repetition"]] < 0.99 or len(spans[i].split()) <= 1):
            log[inverse_mapping["Repetition"]] = 0
        
        # Boost every label this exact stemmed text carried in the training data.
        for prediction in train_instances.get(spans_text[i], set()):
            log[inverse_mapping[prediction]] += 0.5
        # Spans whose raw text starts with '#' get a fixed high "Slogans" score.
        if spans_source[i].startswith('#'):
            log[inverse_mapping['Slogans']] = 20
         
        
        # Earlier rows with identical boundaries: the labels already assigned to them
        # are zeroed so the same span does not receive the same label twice.
        prev_same = []
        for j in range(i):
            if spans_coords[j][0] == spans_coords[i][0] and spans_coords[j][1] == spans_coords[i][1]:
                prev_same.append(j)
        if len(prev_same) > 0:
            for prediction in preds[prev_same]:
                log[inverse_mapping[prediction]] = 0
        
        logits[i] = log
        preds[i] = mapping[np.argmax(log)]
        
    # Final pass: reconcile the labels of nested spans.
    x["pred"] = correct_preds_for_insides(preds, spans_coords, logits, insides, mapping, inverse_mapping)
    #x["pred"] = preds
    return x
135 | 
136 | 
def postprocess_predictions(predictions_logits, data, insides, train_instances):
    """Turn per-span class scores into label names and post-process them per article.

    Args:
        predictions_logits: Two-dimensional array-like of class scores, one row
            per row of ``data``.
        data: Table of spans; it gains a ``pred`` and a ``logits`` column as a
            side effect.
        insides: Nesting statistics forwarded to ``postprocess``.
        train_instances: Stemmed span text -> training labels, forwarded to
            ``postprocess``.

    Returns:
        numpy array with the ``pred`` column of the grouped and post-processed
        table.
    """
    technique_names = [
        'Appeal_to_Authority', 'Doubt', 'Repetition', 'Appeal_to_fear-prejudice', 'Slogans',
        'Black-and-White_Fallacy', 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling',
        'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification',
        'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
        'Thought-terminating_Cliches',
    ]
    # Position in the list is the class index.
    mapping = dict(enumerate(technique_names))
    inverse_mapping = {name: index for index, name in mapping.items()}

    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    best_classes = np.argmax(predictions_logits, axis=1)
    data['pred'] = [mapping[index] for index in best_classes]
    # Scores are stored as text so that they travel through groupby/apply as one column.
    data['logits'] = [' '.join(np.array(row, dtype=str)) for row in predictions_logits]

    per_article = data.groupby('article_id', as_index=False)
    data = per_article.apply(postprocess, mapping, inverse_mapping, insides,
                             english_stop_words, stemmer, train_instances)
    return np.array(data["pred"].values)
155 | 
156 | 
def softmax_with_temperature(z, T):
    """Row-wise softmax of ``z / T``.

    Args:
        z: Array of scores; the softmax is taken along axis 1.
        T: Temperature the scores are divided by before exponentiation.

    Returns:
        Array of the same shape as ``z`` whose rows sum to one.
    """
    scaled = z / T
    # Subtracting the row maximum keeps the exponentials from overflowing.
    row_peak = np.max(scaled, axis=1).reshape(-1, 1)
    unnormalised = np.exp(scaled - row_peak)
    row_total = np.sum(unnormalised, axis=1).reshape(-1, 1)
    return unnormalised / row_total
162 | 
163 | 
def create_submission_file(predicted_logits_files, train_file_path, dev_file_path, test_file_path, 
                        article_ids, span_starts, span_ends, output_file, weights=None, data_dir=None, agg_model=None): 
    """Blend model outputs, post-process them and write the submission file.

    Args:
        predicted_logits_files: Paths of pickled score arrays, one per model.
        train_file_path: Tab-separated training table; source of the nesting
            statistics and of the known training spans.
        dev_file_path: Tab-separated dev table. It is parsed but its content is
            not used.
        test_file_path: Tab-separated table of the spans to label.
        article_ids, span_starts, span_ends: Sequences written next to the
            predictions, one element per output row.
        output_file: Destination path.
        weights: One weight per logits file; equal weights when ``None``.
        data_dir: Forwarded to ``get_train_instances``.
        agg_model: Optional path of a fitted estimator loaded with joblib. Its
            predictions replace the blended ones except where either side
            says ``'Repetition'``.
    """
    train_table = pd.read_csv(train_file_path, sep='\t')
    # Parsed for parity with the training table; the result is not used below.
    pd.read_csv(dev_file_path, sep='\t')

    insides = get_insides(train_table)
    train_instances = get_train_instances(train_table, data_dir)

    test_table = pd.read_csv(test_file_path, sep='\t')

    model_count = len(predicted_logits_files)
    if weights is None:
        weights = [1. / model_count for _ in range(model_count)]
    assert len(weights) == len(predicted_logits_files)

    blended = None
    raw_logits_per_model = []
    for logits_path, weight in zip(predicted_logits_files, weights):
        # NOTE: pickle.load executes arbitrary code; only feed it trusted files.
        with open(logits_path, 'rb') as f:
            logits = pickle.load(f)
        weighted_probs = float(weight) * softmax_with_temperature(logits, 1)
        if blended is None:
            blended = weighted_probs
        else:
            blended += weighted_probs
        if agg_model is not None:
            raw_logits_per_model.append(logits)

    predictions = postprocess_predictions(blended, test_table, insides, train_instances)

    if agg_model is not None:
        clf = load(agg_model)
        agg_predictions = clf.predict(np.concatenate(raw_logits_per_model, axis=1))
        # Where the aggregate model says 'Repetition', defer to the rule-based result ...
        agg_says_repetition = agg_predictions == 'Repetition'
        agg_predictions[agg_says_repetition] = predictions[agg_says_repetition]
        # ... and where the rule-based result says 'Repetition', keep that as well.
        agg_predictions[predictions == 'Repetition'] = 'Repetition'
        predictions = agg_predictions

    with open(output_file, "w") as fout:
        rows = zip(article_ids, predictions, span_starts, span_ends)
        for article_id, prediction, span_start, span_end in rows:
            fout.write("%s\t%s\t%s\t%s\n" % (article_id, prediction, span_start, span_end))
203 | 
204 |             
def load_result(file):
    """Read a submission file into a nested ``article -> label -> spans`` mapping.

    Args:
        file: Path of a text file whose lines hold exactly four tab-separated
            fields: article id, label, span start, span end.

    Returns:
        ``defaultdict(dict)`` mapping article id (str) -> label ->
        list of ``[start, end]`` integer pairs in file order.
    """
    spans_by_article = defaultdict(dict)
    with open(file, "r") as f:
        for row in f:
            article_id, prediction, left, right = row.split('\t')
            # int() tolerates the trailing newline left on the last field.
            spans_by_article[article_id].setdefault(prediction, []).append([int(left), int(right)])
    return spans_by_article
213 | 
214 | 
def read_ground_truth(gt_file_path, label_names):
    """Collect the gold labels stored in the last column of a tab-separated file.

    Args:
        gt_file_path: Path of the file to read.
        label_names: Container of accepted label names.

    Returns:
        List of the stripped last fields, in file order, restricted to those
        found in ``label_names`` (so header or malformed rows are skipped).
    """
    with open(gt_file_path, "r") as f:
        last_fields = (row.split('\t')[-1].strip() for row in f)
        return [label for label in last_fields if label in label_names]
223 | 
224 |             
def eval_submission(result_file_path, gt_file_path):
    """Score a submission file against a gold file.

    Args:
        result_file_path: Tab-separated submission; the label is the second
            field of every line.
        gt_file_path: Tab-separated gold file; the label is the last field.

    Returns:
        Tuple ``(accuracy, per_class_f1)`` where ``per_class_f1`` is a list of
        ``(label_name, f1)`` pairs ordered by label name.
    """
    with open(result_file_path, "r") as f:
        predictions = [row.split('\t')[1].strip() for row in f]

    label_names = sorted([
        'Appeal_to_Authority', 'Doubt', 'Repetition', 'Appeal_to_fear-prejudice', 'Slogans',
        'Black-and-White_Fallacy', 'Loaded_Language', 'Flag-Waving', 'Name_Calling,Labeling',
        'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification',
        'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum',
        'Thought-terminating_Cliches',
    ])
    ground_truth = read_ground_truth(gt_file_path, label_names)

    acc = accuracy_score(ground_truth, predictions)
    # average=None yields one F1 value per entry of `labels`, in that order.
    per_class_scores = f1_score(ground_truth, predictions, average=None, labels=label_names)
    return acc, list(zip(label_names, per_class_scores))
241 | 


--------------------------------------------------------------------------------
/span_identification/__main__.py:
--------------------------------------------------------------------------------
  1 | try:
  2 |     from .ner import transformers_ner_crf, transformers_ner
  3 |     from .dataset import load_data, get_train_dev_files, get_test_file, create_subfolder
  4 |     from .submission import get_submission_format
  5 | except:
  6 |     from ner import transformers_ner_crf, transformers_ner
  7 |     from dataset import load_data, get_train_dev_files, get_test_file, create_subfolder
  8 |     from submission import get_submission_format
  9 |     
 10 | import configargparse
 11 | import spacy
 12 | import logging
 13 | import os
 14 | import subprocess
 15 | import tempfile
 16 | 
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | 
 21 | def Main(args):
 22 |     nlp = spacy.load("en")
 23 |     if not os.path.exists(args.data_dir):
 24 |         os.makedirs(args.data_dir)
 25 |     
 26 |     if args.do_train or args.do_eval or args.split_dataset:
 27 |         articles_content, articles_id, propaganda_techniques_names = load_data(args.train_data_folder, 
 28 |                                                                            args.propaganda_techniques_file)
 29 |         train_file_path = os.path.join(args.data_dir, args.train_file)
 30 |         dev_file_path = os.path.join(args.data_dir, args.dev_file)
 31 |         if not os.path.exists(train_file_path) or not os.path.exists(dev_file_path) or args.overwrite_cache:
 32 |             logger.info("Creating 'ner' train/dev files: %s, %s", train_file_path, dev_file_path)
 33 |             train_ids, dev_ids = get_train_dev_files(articles_id, articles_content, nlp, args.labels_path, train_file_path,
 34 |                                                      dev_file_path, args.split_by_ids, args.dev_size, args.random_state)
 35 |             if args.split_dataset:
 36 |                 create_subfolder(os.path.join(args.data_dir, 'train-train-articles'),  args.train_data_folder, train_ids)
 37 |                 create_subfolder(os.path.join(args.data_dir, 'train-dev-articles'),  args.train_data_folder, dev_ids)
 38 |     
 39 |     if args.do_predict or args.create_submission_file or args.do_eval_spans:
 40 |         test_articles_content, test_articles_id, _ = load_data(args.test_data_folder, args.propaganda_techniques_file)
 41 |         test_file_path = os.path.join(args.data_dir, args.test_file)
 42 |         if (not os.path.exists(test_file_path) or args.overwrite_cache) and not args.do_eval_spans:
 43 |             logger.info("Creating 'ner' test file: %s", test_file_path)
 44 |             get_test_file(test_file_path, test_articles_id, test_articles_content, nlp)            
 45 |     
 46 |     if args.do_train or args.do_eval or args.do_predict:
 47 |         if args.use_crf:
 48 |             transformers_ner_crf(args)
 49 |         else:
 50 |             transformers_ner(args)
 51 |             
 52 |     if args.do_eval_spans:
 53 |         logger.info("Evaluating file %s with competition metrics", args.output_file)
 54 |         output_file = os.path.join('results', args.output_file)
 55 |         get_submission_format(args.predicted_labels_files, test_articles_id, test_articles_content, nlp, output_file)
 56 |         if args.gold_annot_file is None:
 57 |             gold_annot_file = next(tempfile._get_candidate_names())
 58 |             get_submission_format([test_file_path], test_articles_id, test_articles_content, nlp, gold_annot_file)
 59 |         else:
 60 |             gold_annot_file = args.gold_annot_file
 61 |         cmd = "python tools/task-SI_scorer.py -s {} -r {}".format(output_file, gold_annot_file)
 62 |         subprocess.run(cmd, shell=True)
 63 |         if args.gold_annot_file is None:
 64 |             os.remove(gold_annot_file)
 65 |     
 66 |     if args.create_submission_file:
 67 |         if not os.path.exists('results'):
 68 |             os.makedirs('results')
 69 |         output_file = os.path.join('results', args.output_file)
 70 |         logger.info("Creating a submission file: %s", output_file)
 71 |         get_submission_format(args.predicted_labels_files, test_articles_id, test_articles_content, nlp, output_file)
 72 | 
 73 | 
def main(): 
    """Parse the command line / config file, configure logging and call ``Main``.

    Options are read with ``configargparse``; ``--config`` names a config file
    from which the remaining options may be taken.
    """
    parser = configargparse.ArgumentParser()
    
    # --- data locations, caching and submission options ---
    parser.add_argument('--config', required=True, is_config_file=True, help='Config file path.')
    parser.add_argument("--train_data_folder", default=None, type=str, required=True,
                        help="Source directory with the train articles.")
    parser.add_argument("--test_data_folder", default=None, type=str, required=True,
                        help="Source directory with the test articles.")
    parser.add_argument("--propaganda_techniques_file", default=None, type=str, required=True,
                        help="The file with propaganda techniques.")
    parser.add_argument("--labels_path", default=None, type=str, required=True,
                        help="The file with train labels.")
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The directory for cached preprocessed data.")
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The filename for cached preprocessed train data.")
    parser.add_argument("--dev_file", default=None, type=str, required=True,
                        help="The filename for cached preprocessed dev data.")
    parser.add_argument("--test_file", default=None, type=str, required=True,
                        help="The filename for cached preprocessed test data.")
    parser.add_argument("--predicted_labels_files", default=None, nargs='*', required=True,
                        help="The predicted filenames of labels that will be used to form the final result")
    parser.add_argument("--output_file", default=None, type=str, required=True,
                        help="The submission filename")
    parser.add_argument("--dev_size", default=0.3, type=float, help="Dev data size.")
    parser.add_argument("--split_dataset", action="store_true", 
                        help="Split the dataset into the train/dev parts")
    parser.add_argument("--split_by_ids", action="store_true", 
                        help="Use articles ids while splitting the dataset into the train/dev parts.")
    parser.add_argument("--create_submission_file", action="store_true", 
                        help="Creats file in the submission (source) format")
    parser.add_argument("--random_state", default=42, type=int, help='Random state for the dataset splitting.')
    parser.add_argument("--do_eval_spans", action="store_true", 
                        help="Whether to run eval on the dev set with the competition metrics.")
    parser.add_argument("--gold_annot_file", default=None, type=str, help="Gold annotation file.")

    # --- model variant switches ---
    parser.add_argument("--use_crf", action="store_true", help="Use Conditional Random Field over the model")
    parser.add_argument("--use_quotes", action="store_true")
    
    # --- model selection and output location ---
    # MODEL_CLASSES is only used to build the help text below; it is not passed as `choices`.
    MODEL_CLASSES = ["bert", "roberta", "distilbert", "camembert"]
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # --- tokenizer / config overrides and run modes ---
    parser.add_argument("--labels", default="", type=str,
                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument("--evaluate_during_training", action="store_true",
                        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")

    # --- optimisation hyper-parameters ---
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    # --- logging, checkpointing and reproducibility ---
    parser.add_argument("--logging_steps", type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")

    # --- mixed precision, distributed training and remote debugging ---
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()
    
    # INFO-level logging only when local_rank is -1 or 0; other ranks log at WARN.
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    
    Main(args)
195 |     
196 |     
# Script entry point: parse the options and run the pipeline only when this
# module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
199 | 


--------------------------------------------------------------------------------
/technique_classification/__main__.py:
--------------------------------------------------------------------------------
  1 | try:
  2 |     from .transformers_classifier import transformers_clf
  3 |     from .dataset import load_data, get_train_dev_files, get_test_file
  4 |     from .submission import create_submission_file, eval_submission
  5 | except:
  6 |     from transformers_classifier import transformers_clf
  7 |     from dataset import load_data, get_train_dev_files, get_test_file
  8 |     from submission import create_submission_file, eval_submission
  9 |     
 10 | import configargparse
 11 | import logging
 12 | import os
 13 | import subprocess
 14 | 
 15 | 
 16 | logger = logging.getLogger(__name__)
 17 | 
 18 | 
 19 | def Main(args):
 20 |     if not os.path.exists(args.data_dir):
 21 |         os.makedirs(args.data_dir)
 22 |     
 23 |     if args.do_train or args.do_eval or args.split_dataset or args.create_submission_file:
 24 |         articles, ref_articles_id, ref_span_starts, ref_span_ends, labels = load_data(args.train_data_folder, 
 25 |                                                                            args.labels_path)
 26 |         train_file_path = os.path.join(args.data_dir, args.train_file)
 27 |         dev_file_path = os.path.join(args.data_dir, args.dev_file)
 28 |         if not os.path.exists(train_file_path) or not os.path.exists(dev_file_path) or args.overwrite_cache:
 29 |             logger.info("Creating train/dev files: %s, %s", train_file_path, dev_file_path)
 30 |             get_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, train_file_path, 
 31 |                                 dev_file_path, args.split_by_ids, args.dev_size, args.random_state, args.balance,
 32 |                                 args.shuffle)
 33 |     
 34 |     if args.do_predict or args.create_submission_file or args.eval_submission:
 35 |         test_file_path = os.path.join(args.data_dir, args.test_file)
 36 |         test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels = load_data(args.test_data_folder,
 37 |                                                                                       args.test_template_labels_path)
 38 |         if not os.path.exists(test_file_path) or args.overwrite_cache:
 39 |             logger.info("Creating roberta-type test file: %s", test_file_path)
 40 |             get_test_file(test_articles, test_articles_id, test_span_starts, test_span_ends, test_labels, test_file_path)
 41 |            
 42 |     if args.do_train or args.do_eval or args.do_predict:
 43 |         transformers_clf(args)
 44 |     
 45 |     if args.create_submission_file:
 46 |         if not os.path.exists('results'):
 47 |             os.makedirs('results')
 48 |         output_file = os.path.join('results', args.output_file)
 49 |         logger.info("Creating the submission file: %s", output_file)        
 50 |         create_submission_file(args.predicted_logits_files, train_file_path, dev_file_path, test_file_path, 
 51 |                             test_articles_id, test_span_starts, test_span_ends, output_file, args.weights, args.data_dir)
 52 |         
 53 |     if args.eval_submission:
 54 |         output_file = os.path.join('results', args.output_file)
 55 |         logger.info("Evaluating the submission file: %s", output_file)
 56 |         if args.test_labels_path is None:
 57 |             acc, f1 = eval_submission(output_file, test_file_path)
 58 |             logger.info('accuracy: %f', acc)
 59 |             print('f1-macro:', f1)
 60 |         else:
 61 |             cmd = "python tools/task-TC_scorer.py -s {} -r {} -p {}".format(output_file, args.test_labels_path,
 62 |                                                                           args.propaganda_techniques_file)
 63 |             subprocess.run(cmd, shell=True)
 64 | 
 65 | 
 66 | def main(): 
 67 |     parser = configargparse.ArgumentParser()
 68 |     
 69 |     parser.add_argument('--config', required=True, is_config_file=True, help='Config file path.')
 70 |     parser.add_argument("--train_data_folder", default=None, type=str, required=True,
 71 |                         help="Source directory with the train articles.")
 72 |     parser.add_argument("--test_data_folder", default=None, type=str, required=True,
 73 |                         help="Source directory with the test articles.")
 74 |     parser.add_argument("--propaganda_techniques_file", default=None, type=str, required=True,
 75 |                     help="The file with propaganda techniques.")
 76 |     parser.add_argument("--labels_path", default=None, type=str, required=True,
 77 |                         help="The file with train labels.")
 78 |     parser.add_argument("--test_template_labels_path", default=None, type=str, required=True,
 79 |                         help="The file with test template labels.")
 80 |     parser.add_argument("--data_dir", default=None, type=str, required=True,
 81 |                         help="The directory for cached preprocessed data.")
 82 |     parser.add_argument("--train_file", default=None, type=str, required=True,
 83 |                         help="The filename for cached preprocessed train data.")
 84 |     parser.add_argument("--dev_file", default=None, type=str, required=True,
 85 |                         help="The filename for cached preprocessed dev data.")
 86 |     parser.add_argument("--test_file", default=None, type=str, required=True,
 87 |                         help="The filename for cached preprocessed test data.")
 88 |     parser.add_argument("--predicted_logits_files", default=None, nargs='*', required=True,
 89 |                         help="The predicted filenames of logits that will be used to obtain the final result")
 90 |     parser.add_argument("--weights", default=None, nargs='*', required=False,
 91 |                         help="The list of weights for predicted logits at the aggregation stage")
 92 |     parser.add_argument("--output_file", default=None, type=str, required=True,
 93 |                         help="The submission filename")
 94 |     parser.add_argument("--dev_size", default=0.3, type=float, help="Dev data size.")
 95 |     parser.add_argument("--split_dataset", action="store_true", 
 96 |                         help="Split the dataset into the train/dev parts.")
 97 |     parser.add_argument("--split_by_ids", action="store_true", 
 98 |                         help="Use articles ids while splitting the dataset into the train/dev parts.")
 99 |     parser.add_argument("--random_state", default=42, type=int, help='Random state for the dataset splitting.')
100 |     parser.add_argument("--shuffle", action="store_true", help="Shuffle the train dataset.")
101 |     parser.add_argument("--balance", action="store_true", help="Balance the train dataset with oversampling.")
102 |     parser.add_argument("--create_submission_file", action="store_true", 
103 |                         help="Creats file in the submission (source) format")
104 |     parser.add_argument("--eval_submission", action="store_true", help="Do evaluating for the dev subset.")
105 |     
106 |     parser.add_argument('--use_length', action='store_true')
107 |     parser.add_argument('--join_embeddings', action='store_true')
108 |     parser.add_argument('--use_matchings', action='store_true')
109 |     
110 |     MODEL_CLASSES = ["bert", "roberta", "distilbert", "camembert"]
111 |     parser.add_argument("--model_type", default=None, type=str, required=True,
112 |                         help="Model type selected in the list: " + ", ".join(MODEL_CLASSES))
113 |     parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
114 |                         help="Path to pre-trained model or shortcut name.")
115 |     parser.add_argument("--task_name", default=None, type=str, required=True,
116 |                         help="The name of the task to train.")
117 |     parser.add_argument("--output_dir", default=None, type=str, required=True,
118 |                         help="The output directory where the model predictions and checkpoints will be written.")
119 |     parser.add_argument("--test_labels_path", default=None, type=str, required=False)
120 | 
121 |     ## Other parameters
122 |     parser.add_argument("--config_name", default="", type=str,
123 |                         help="Pretrained config name or path if not the same as model_name")
124 |     parser.add_argument("--tokenizer_name", default="", type=str,
125 |                         help="Pretrained tokenizer name or path if not the same as model_name")
126 |     parser.add_argument("--cache_dir", default="", type=str,
127 |                         help="Where do you want to store the pre-trained models downloaded from s3")
128 |     parser.add_argument("--max_seq_length", default=128, type=int,
129 |                         help="The maximum total input sequence length after tokenization. Sequences longer "
130 |                              "than this will be truncated, sequences shorter will be padded.")
131 |     parser.add_argument("--do_train", action='store_true',
132 |                         help="Whether to run training.")
133 |     parser.add_argument("--do_eval", action='store_true',
134 |                         help="Whether to run eval on the dev set.")
135 |     parser.add_argument("--do_predict", action='store_true',
136 |                         help="Whether to run prediction")
137 |     parser.add_argument("--evaluate_during_training", action='store_true',
138 |                         help="Rul evaluation during training at each logging step.")
139 |     parser.add_argument("--do_lower_case", action='store_true',
140 |                         help="Set this flag if you are using an uncased model.")
141 | 
142 |     parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
143 |                         help="Batch size per GPU/CPU for training.")
144 |     parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
145 |                         help="Batch size per GPU/CPU for evaluation.")
146 |     parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
147 |                         help="Number of updates steps to accumulate before performing a backward/update pass.")     
148 |     parser.add_argument("--learning_rate", default=5e-5, type=float,
149 |                         help="The initial learning rate for Adam.")
150 |     parser.add_argument("--weight_decay", default=0.0, type=float,
151 |                         help="Weight deay if we apply some.")
152 |     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
153 |                         help="Epsilon for Adam optimizer.")
154 |     parser.add_argument("--max_grad_norm", default=1.0, type=float,
155 |                         help="Max gradient norm.")
156 |     parser.add_argument("--num_train_epochs", default=3.0, type=float,
157 |                         help="Total number of training epochs to perform.")
158 |     parser.add_argument("--max_steps", default=-1, type=int,
159 |                         help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
160 |     parser.add_argument("--warmup_steps", default=0, type=int,
161 |                         help="Linear warmup over warmup_steps.")
162 | 
163 |     parser.add_argument('--logging_steps', type=int, default=50,
164 |                         help="Log every X updates steps.")
165 |     parser.add_argument('--save_steps', type=int, default=50,
166 |                         help="Save checkpoint every X updates steps.")
167 |     parser.add_argument("--eval_all_checkpoints", action='store_true',
168 |                         help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
169 |     parser.add_argument("--no_cuda", action='store_true',
170 |                         help="Avoid using CUDA when available")
171 |     parser.add_argument('--overwrite_output_dir', action='store_true',
172 |                         help="Overwrite the content of the output directory")
173 |     parser.add_argument('--overwrite_cache', action='store_true',
174 |                         help="Overwrite the cached training and evaluation sets")
175 |     parser.add_argument('--seed', type=int, default=42,
176 |                         help="random seed for initialization")
177 | 
178 |     parser.add_argument('--fp16', action='store_true',
179 |                         help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
180 |     parser.add_argument('--fp16_opt_level', type=str, default='O1',
181 |                         help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
182 |                              "See details at https://nvidia.github.io/apex/amp.html")
183 |     parser.add_argument("--local_rank", type=int, default=-1,
184 |                         help="For distributed training: local_rank")
185 |     parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
186 |     parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
187 |     args = parser.parse_args()
188 |     
189 |     logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
190 |                         datefmt="%m/%d/%Y %H:%M:%S",
191 |                         level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
192 |     
193 |     Main(args)
194 |     
195 |     
196 | if __name__ == "__main__":
197 |     main()
198 | 


--------------------------------------------------------------------------------
/visualization_example/visualization/highlight.css:
--------------------------------------------------------------------------------
  1 | <style>
  2 | 
  3 | .highlight { /* base chip; every color theme extends this class */
  4 |   border: 2px solid; /* no color given here: each theme rule supplies border-color */
  5 |   color: #232323;
  6 |   margin: 4px 6px 4px 3px;
  7 |   vertical-align: middle;
  8 |   box-shadow: 2px 4px 20px rgba(0,0,0,0.1);
  9 |   position: relative; /* positioning context for the absolutely positioned tooltip */
 10 |   cursor: default;
 11 |   min-width: 26px;
 12 |   line-height: 22px;
 13 |   display: inline-flex;
 14 | }
 15 | 
 16 | .highlight:last-child {
 17 |   margin-right: 4px;
 18 | }
 19 | 
 20 | .highlight:first-child {
 21 |   margin-left: 0;
 22 | }
 23 | 
 24 | .highlight,
 25 | .highlight span { /* short fades for the properties that the state rules change */
 26 |   transition: background-color .1s ease,
 27 |               color .1s ease,
 28 |               box-shadow .1s ease,
 29 |               opacity .1s ease;
 30 | }
 31 | 
 32 | .highlight.short-text {
 33 |   text-align: center;
 34 | }
 35 | 
 36 | .highlight__label { /* NOTE(review): align-items/justify-content need a flex/grid display, which this rule does not set */
 37 |   align-items: center;
 38 |   justify-content: center;
 39 |   padding: 0 8px;
 40 |   text-align: center;
 41 |   user-select: none;
 42 | }
 43 | 
 44 | .highlight__label strong,
 45 | .highlight__label span.highlight__label__secondary-label { /* label text: small, white, letter-spaced */
 46 |   display: block;
 47 |   font-size: 11px;
 48 |   color: #fff;
 49 |   -webkit-font-smoothing: subpixel-antialiased;
 50 |   letter-spacing: 0.1em;
 51 | }
 52 | 
 53 | .highlight__label strong {
 54 |   text-transform: uppercase;
 55 | }
 56 | 
 57 | .highlight__label span.highlight__label__secondary-label { /* secondary label is dimmed to 75% opacity */
 58 |   opacity: .75;
 59 |   padding-left: 6px;
 60 | }
 61 | 
 62 | .highlight__content {
 63 |   flex-wrap: wrap;
 64 |   align-items: center;
 65 |   padding: 2px 2px 2px 6px;
 66 | }
 67 | 
 68 | 
 69 | .highlight-container.highlight-container--bottom-labels .highlight.bottom { /* extra top gap inside bottom-label containers */
 70 |   margin-top: 6px;
 71 | }
 72 | 
 73 | .highlight.bottom { /* "bottom" variant: block box instead of the base inline-flex */
 74 |   display: block;
 75 |   white-space: normal; /* overrides the white-space: pre set on .highlight-container */
 76 | }
 77 | 
 78 | .highlight.bottom .highlight__content:after { /* appends a single space plus 3px padding after the content */
 79 |   content: " ";
 80 |   padding-right: 3px;
 81 | }
 82 | 
 83 | .highlight.bottom .highlight__label {
 84 |   line-height: 14px;
 85 |   padding-top: 1px;
 86 | }
 87 | 
 88 | 
 89 | .highlight.top { /* "top" variant: keeps inline-flex but stacks children in a column */
 90 |   flex-direction: column;
 91 |   white-space: normal; /* overrides the white-space: pre set on .highlight-container */
 92 | }
 93 | 
 94 | .highlight.top .highlight__label {
 95 |   min-height: 22px;
 96 | }
 97 | 
 98 | 
 99 | .highlight.active,
100 | .highlight.active span { /* active chip: white text (themes supply the solid background) */
101 |   color: #fff;
102 | }
103 | 
104 | .highlight.active .highlight:not(.active) span { /* nested non-active chips keep the base dark text */
105 |   color: #232323;
106 | }
107 | 
108 | .highlight.clickable {
109 |   cursor: pointer;
110 | }
111 | 
112 | .highlight.clickable.clickable.selected { /* NOTE(review): .clickable is repeated - presumably to raise specificity; confirm it is intentional */
113 |   cursor: default;
114 | }
115 | 
116 | .highlight.clickable.clicking { /* pressed state: dim immediately, without the fade */
117 |   opacity: 0.66;
118 |   transition-duration: 0s;
119 | }
120 | 
121 | .clicking .highlight,
122 | .clicking .highlight span,
123 | .clicking .highlight:before,
124 | .clicking .highlight:after { /* chips inside a .clicking ancestor also skip transitions */
125 |   transition-duration: 0s;
126 | }
127 | 
128 | 
129 | .highlight.gray { /* gray theme: pale background while inactive */
130 |   background: #f2f4f6;
131 | }
132 | 
133 | .highlight.gray,
134 | .highlight-arrow--gray .highlight-arrow__triangle { /* accent used as border color */
135 |   border-color: #a0aab5;
136 | }
137 | 
138 | .highlight.gray > .highlight__label, /* child combinator, as in every other theme: do not recolor labels of nested chips */
139 | .highlight-arrow--gray .highlight-arrow__stalk,
140 | .highlight.gray .highlight__button .highlight__button__body {
141 |   background-color: #a0aab5;
142 | }
143 | 
144 | .highlight.gray.active { /* active chip: filled with the accent */
145 |   background: #a0aab5;
146 | }
147 | 
148 | .highlight.gray.active > .highlight__label { /* own label only; slightly lighter than the accent */
149 |   background-color: #aab3bd;
150 | }
151 | 
152 | .highlight.gray .highlight__button svg { /* button icon filled with the accent */
153 |   fill: #a0aab5;
154 | }
155 | 
156 | 
157 | 
158 | .highlight.blue { /* blue theme: pale background while inactive */
159 |   background: #edf4fa;
160 | }
161 | 
162 | .highlight.blue,
163 | .highlight-arrow--blue .highlight-arrow__triangle { /* accent used as border color */
164 |   border-color: #4db1f7;
165 | }
166 | 
167 | .highlight.blue > .highlight__label,
168 | .highlight-arrow--blue .highlight-arrow__stalk,
169 | .highlight.blue .highlight__button .highlight__button__body { /* accent used as fill */
170 |   background-color: #4db1f7;
171 | }
172 | 
173 | .highlight.blue.active { /* active chip: filled with the accent */
174 |   background: #4db1f7;
175 | }
176 | 
177 | .highlight.blue.active > .highlight__label { /* slightly lighter than the accent */
178 |   background-color: #5fb9f8;
179 | }
180 | 
181 | .highlight.blue .highlight__button svg { /* button icon filled with the accent */
182 |   fill: #4db1f7;
183 | }
184 | 
185 | 
186 | .highlight.green { /* green theme: pale background while inactive */
187 |   background: #f1f4f1;
188 | }
189 | 
190 | .highlight.green,
191 | .highlight-arrow--green .highlight-arrow__triangle { /* accent used as border color */
192 |   border-color: #90ac4e;
193 | }
194 | 
195 | .highlight.green > .highlight__label,
196 | .highlight-arrow--green .highlight-arrow__stalk,
197 | .highlight.green .highlight__button .highlight__button__body { /* accent used as fill */
198 |   background-color: #90ac4e;
199 | }
200 | 
201 | .highlight.green.active { /* active chip: filled with the accent */
202 |   background: #90ac4e;
203 | }
204 | 
205 | .highlight.green.active > .highlight__label { /* slightly lighter than the accent */
206 |   background-color: #9bb460;
207 | }
208 | 
209 | .highlight.green .highlight__button svg { /* button icon filled with the accent */
210 |   fill: #90ac4e;
211 | }
212 | 
213 | 
214 | .highlight.pink { /* pink theme: pale background while inactive */
215 |   background: #f4f1f4;
216 | }
217 | 
218 | .highlight.pink,
219 | .highlight-arrow--pink .highlight-arrow__triangle { /* accent used as border color */
220 |   border-color: #ce6587;
221 | }
222 | 
223 | .highlight.pink > .highlight__label,
224 | .highlight-arrow--pink .highlight-arrow__stalk,
225 | .highlight.pink .highlight__button .highlight__button__body { /* accent used as fill */
226 |   background-color: #ce6587;
227 | }
228 | 
229 | .highlight.pink.active { /* active chip: filled with the accent */
230 |   background: #ce6587;
231 | }
232 | 
233 | .highlight.pink.active > .highlight__label { /* slightly lighter than the accent */
234 |   background-color: #d37593;
235 | }
236 | 
237 | .highlight.pink .highlight__button svg { /* button icon filled with the accent */
238 |   fill: #ce6587;
239 | }
240 | 
241 | 
242 | .highlight.orange { /* orange theme: pale background while inactive */
243 |   background: #f2f4f4;
244 | }
245 | 
246 | .highlight.orange,
247 | .highlight-arrow--orange .highlight-arrow__triangle { /* accent used as border color */
248 |   border-color: #dd9e3e;
249 | }
250 | 
251 | .highlight.orange > .highlight__label,
252 | .highlight-arrow--orange .highlight-arrow__stalk,
253 | .highlight.orange .highlight__button .highlight__button__body { /* accent used as fill */
254 |   background-color: #dd9e3e;
255 | }
256 | 
257 | .highlight.orange.active { /* active chip: filled with the accent */
258 |   background: #dd9e3e;
259 | }
260 | 
261 | .highlight.orange.active > .highlight__label { /* slightly lighter than the accent */
262 |   background-color: #e0a852;
263 | }
264 | 
265 | .highlight.orange .highlight__button svg { /* button icon filled with the accent */
266 |   fill: #dd9e3e;
267 | }
268 | 
269 | 
270 | .highlight.purple { /* purple theme: pale background while inactive */
271 |   background: #f1f0f7;
272 | }
273 | 
274 | .highlight.purple,
275 | .highlight-arrow--purple .highlight-arrow__triangle { /* accent used as border color */
276 |   border-color: #9a5eba;
277 | }
278 | 
279 | .highlight.purple > .highlight__label,
280 | .highlight-arrow--purple .highlight-arrow__stalk,
281 | .highlight.purple .highlight__button .highlight__button__body { /* accent used as fill */
282 |   background-color: #9a5eba;
283 | }
284 | 
285 | .highlight.purple.active { /* active chip: filled with the accent */
286 |   background: #9a5eba;
287 | }
288 | 
289 | .highlight.purple.active > .highlight__label { /* slightly lighter than the accent */
290 |   background-color: #a46ec1;
291 | }
292 | 
293 | .highlight.purple .highlight__button svg { /* button icon filled with the accent */
294 |   fill: #9a5eba;
295 | }
296 | 
297 | 
298 | .highlight.teal { /* teal theme: pale background while inactive */
299 |   background: #eef4f6;
300 | }
301 | 
302 | .highlight.teal,
303 | .highlight-arrow--teal .highlight-arrow__triangle { /* accent used as border color */
304 |   border-color: #5bb1ad;
305 | }
306 | 
307 | .highlight.teal > .highlight__label,
308 | .highlight-arrow--teal .highlight-arrow__stalk,
309 | .highlight.teal .highlight__button .highlight__button__body { /* accent used as fill */
310 |   background-color: #5bb1ad;
311 | }
312 | 
313 | .highlight.teal.active { /* active chip: filled with the accent */
314 |   background: #5bb1ad;
315 | }
316 | 
317 | .highlight.teal.active > .highlight__label { /* slightly lighter than the accent */
318 |   background-color: #6cb9b5;
319 | }
320 | 
321 | .highlight.teal .highlight__button svg { /* button icon filled with the accent */
322 |   fill: #5bb1ad;
323 | }
324 | 
325 | 
326 | .highlight.tan { /* tan theme: pale background while inactive */
327 |   background: #f2f4f4;
328 | }
329 | 
330 | .highlight.tan,
331 | .highlight-arrow--tan .highlight-arrow__triangle { /* accent used as border color */
332 |   border-color: #b0a481;
333 | }
334 | 
335 | .highlight.tan > .highlight__label,
336 | .highlight-arrow--tan .highlight-arrow__stalk,
337 | .highlight.tan .highlight__button .highlight__button__body { /* accent used as fill */
338 |   background-color: #b0a481;
339 | }
340 | 
341 | .highlight.tan.active { /* active chip: filled with the accent */
342 |   background: #b0a481;
343 | }
344 | 
345 | .highlight.tan.active > .highlight__label { /* slightly lighter than the accent */
346 |   background-color: #b8ad8e;
347 | }
348 | 
349 | .highlight.tan .highlight__button svg { /* button icon filled with the accent */
350 |   fill: #b0a481;
351 | }
352 | 
353 | 
354 | .highlight.red { /* red theme: pale background while inactive */
355 |   background: #f5eef0;
356 | }
357 | 
358 | .highlight.red,
359 | .highlight-arrow--red .highlight-arrow__triangle { /* accent used as border color */
360 |   border-color: #df3838;
361 | }
362 | 
363 | .highlight.red > .highlight__label,
364 | .highlight-arrow--red .highlight-arrow__stalk,
365 | .highlight.red .highlight__button .highlight__button__body { /* accent used as fill */
366 |   background-color: #df3838;
367 | }
368 | 
369 | .highlight.red.active { /* active chip: filled with the accent */
370 |   background: #df3838;
371 | }
372 | 
373 | .highlight.red.active > .highlight__label { /* slightly lighter than the accent */
374 |   background-color: #e24c4c;
375 | }
376 | 
377 | .highlight.red .highlight__button svg { /* button icon filled with the accent */
378 |   fill: #df3838;
379 | }
380 | 
381 | 
382 | .highlight.cobalt { /* cobalt theme: pale background while inactive */
383 |   background: #eef0f5;
384 | }
385 | 
386 | .highlight.cobalt,
387 | .highlight-arrow--cobalt .highlight-arrow__triangle { /* accent used as border color */
388 |   border-color: #5f5b97;
389 | }
390 | 
391 | .highlight.cobalt > .highlight__label,
392 | .highlight-arrow--cobalt .highlight-arrow__stalk,
393 | .highlight.cobalt .highlight__button .highlight__button__body { /* accent used as fill */
394 |   background-color: #5f5b97;
395 | }
396 | 
397 | .highlight.cobalt.active { /* active chip: filled with the accent */
398 |   background: #5f5b97;
399 | }
400 | 
401 | .highlight.cobalt.active > .highlight__label { /* slightly lighter than the accent */
402 |   background-color: #6f6ca2;
403 | }
404 | 
405 | .highlight.cobalt .highlight__button svg { /* button icon filled with the accent */
406 |   fill: #5f5b97;
407 | }
408 | 
409 | 
410 | .highlight.brown { /* brown theme: pale background while inactive */
411 |   background: #f2f4f6;
412 | }
413 | 
414 | .highlight.brown,
415 | .highlight-arrow--brown .highlight-arrow__triangle { /* accent used as border color */
416 |   border-color: #6a4e3d;
417 | }
418 | 
419 | .highlight.brown > .highlight__label,
420 | .highlight-arrow--brown .highlight-arrow__stalk,
421 | .highlight.brown .highlight__button .highlight__button__body { /* accent used as fill */
422 |   background-color: #6a4e3d;
423 | }
424 | 
425 | .highlight.brown.active { /* active chip: filled with the accent */
426 |   background: #6a4e3d;
427 | }
428 | 
429 | .highlight.brown.active > .highlight__label { /* slightly lighter than the accent */
430 |   background-color: #796051;
431 | }
432 | 
433 | .highlight.brown .highlight__button svg { /* button icon filled with the accent */
434 |   fill: #6a4e3d;
435 | }
436 | 
437 | 
438 | .highlight.slate { /* slate theme: pale background while inactive */
439 |   background: #eceff1;
440 | }
441 | 
442 | .highlight.slate,
443 | .highlight-arrow--slate .highlight-arrow__triangle { /* accent used as border color */
444 |   border-color: #3b4247;
445 | }
446 | 
447 | .highlight.slate > .highlight__label,
448 | .highlight-arrow--slate .highlight-arrow__stalk,
449 | .highlight.slate .highlight__button .highlight__button__body { /* accent used as fill */
450 |   background-color: #3b4247;
451 | }
452 | 
453 | .highlight.slate.active { /* active chip: filled with the accent */
454 |   background: #3b4247;
455 | }
456 | 
457 | .highlight.slate.active > .highlight__label { /* slightly lighter than the accent */
458 |   background-color: #4f555a;
459 | }
460 | 
461 | .highlight.slate .highlight__button svg { /* button icon filled with the accent */
462 |   fill: #3b4247;
463 | }
464 | 
465 | .highlight.fuchsia { /* fuchsia theme: pale background while inactive */
466 |   background: #f5f1f9;
467 | }
468 | 
469 | .highlight.fuchsia,
470 | .highlight-arrow--fuchsia .highlight-arrow__triangle { /* accent used as border color */
471 |   border-color: #e875e8;
472 | }
473 | 
474 | .highlight.fuchsia > .highlight__label,
475 | .highlight-arrow--fuchsia .highlight-arrow__stalk,
476 | .highlight.fuchsia .highlight__button .highlight__button__body { /* accent used as fill */
477 |   background-color: #e875e8;
478 | }
479 | 
480 | .highlight.fuchsia.active { /* active chip: filled with the accent */
481 |   background: #e875e8;
482 | }
483 | 
484 | .highlight.fuchsia.active > .highlight__label { /* slightly lighter than the accent */
485 |   background-color: #ea83ea;
486 | }
487 | 
488 | .highlight.fuchsia .highlight__button svg { /* button icon filled with the accent */
489 |   fill: #e875e8;
490 | }
491 | 
492 | 
493 | .highlight__tooltip { /* tooltip bubble; invisible until its .highlight ancestor is hovered */
494 |   display: block;
495 |   position: absolute; /* .highlight is position: relative, so offsets below are relative to the chip */
496 |   box-shadow: 0 0 30px rgba(0,0,0,.2);
497 |   border-radius: 6px;
498 |   background: rgba(70,70,70,.9);
499 |   padding: 4px 9px 5px 9px;
500 |   opacity: 0; /* raised to 1 by the :hover rule below */
501 |   z-index: -9; /* raised to 9 by the :hover rule below */
502 |   left: 50%;
503 |   top: 100%; /* starts at the chip's bottom edge */
504 |   margin-top: 10px;
505 |   font-size: 14px;
506 |   color: #fff;
507 |   transform: translate(-50%, -6px); /* -50% centers it on left: 50%; the -6px lift is removed on hover */
508 |   transition: opacity .2s ease,
509 |               z-index .2s ease,
510 |               transform .2s ease .3s; /* transform has a .3s delay here; the hover rule sets the delay to 0s */
511 |   font-weight: bold;
512 |   white-space: nowrap;
513 |   user-select: none;
514 |   cursor: default;
515 | }
516 | 
517 | .highlight__tooltip:before { /* zero-size box whose only colored border is the bottom one: an upward-pointing triangle */
518 |   display: block;
519 |   position: absolute;
520 |   left: 50%;
521 |   top: 0;
522 |   margin-top: -6px; /* sits just above the bubble's top edge */
523 |   margin-left: -6px; /* half of the 12px base, so the tip is centered on left: 50% */
524 |   content: "";
525 |   width: 0;
526 |   height: 0;
527 |   border-style: solid;
528 |   border-width: 0 6px 6px 6px;
529 |   border-color: transparent transparent rgba(70,70,70,.9) transparent; /* same color as the bubble background */
530 | }
531 | 
532 | .highlight:hover .highlight__tooltip { /* shown state */
533 |   z-index: 9;
534 |   opacity: 1;
535 |   transform: translate(-50%, 0);
536 |   transition-delay: 0s;
537 | }
538 | 
539 | .highlight__tooltip:hover { /* hovering the bubble itself forces the hidden z-index back */
540 |   z-index: -9 !important;
541 | }
542 | 
543 | .highlight-container { /* wrapping flex row that holds the chips */
544 |   line-height: 42px !important;
545 |   align-items: center;
546 |   display: flex;
547 |   flex-wrap: wrap;
548 |   white-space: pre; /* whitespace preserved here; .highlight.bottom/.top reset it to normal */
549 |   cursor: default;
550 | }
551 | 
552 | 
553 | .highlight-container.highlight-container--bottom-labels { /* bottom-label layout: padded, items aligned to the top */
554 |   padding: 10px 1.125em;
555 |   align-items: flex-start;
556 | }
557 | 
558 | .highlight-container.highlight-container--diagram { /* diagram layout: items aligned to the top */
559 |   align-items: flex-start;
560 | }
561 | 
562 | .highlight-container.highlight-container--diagram.passage.model__content__summary { /* applies only when all four classes are present */
563 |   background: transparent;
564 |   align-items: stretch;
565 |   padding: 0;
566 | }
567 | 
568 | </style>


--------------------------------------------------------------------------------
/tools/src/annotations.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | from typing import Dict
  3 | import sys
  4 | import re
  5 | import os.path
  6 | import glob
  7 | import logging.handlers
  8 | from sklearn.metrics import f1_score
  9 | from sklearn.metrics import precision_score
 10 | from sklearn.metrics import recall_score
 11 | import src.article_annotations as aa
 12 | import src.annotation as an
 13 | 
 14 | __author__ = "Giovanni Da San Martino"
 15 | __copyright__ = "Copyright 2019"
 16 | __credits__ = ["Giovanni Da San Martino"]
 17 | __license__ = "GPL"
 18 | __version__ = "0.1"
 19 | __maintainer__ = "Giovanni Da San Martino"
 20 | __email__ = "gmartino@hbku.edu.qa"
 21 | __status__ = "Beta"
 22 | 
 23 | logger = logging.getLogger("propaganda_scorer")
 24 | 
 25 | 
 26 | class Annotations(object):
 27 |     """
 28 |     Dictionary of Articles_annotations objects. 
 29 |     (basically a dataset of article_annotations objects)
 30 | 
 31 |     """
 32 | 
 33 |     def __init__(self, annotations:aa.Articles_annotations=None):
 34 | 
 35 |         if annotations is None:
 36 |             self.annotations:Dict[str, aa.Articles_annotations] = {} 
 37 |         else:
 38 |             self.annotations = annotations
 39 | 
 40 | 
 41 |     def __len__(self):
 42 |         """
 43 |         Returns the number of articles in the object
 44 |         """
 45 |         return len(self.get_article_id_list())
 46 | 
 47 | 
 48 |     def add_annotation(self, annotation:an.Annotation, article_id:str):
 49 |         """
 50 |         Add a single annotation to the article with id article_id. 
 51 |         If such article does not exists, the annotation is created. 
 52 |         """
 53 |         if not self.has_article(article_id):
 54 |             self.create_article_annotations_object(article_id)
 55 |         self.annotations[article_id].add_annotation(annotation)
 56 | 
 57 | 
 58 |     def check_annotation_spans_with_category_matching(self, merge_overlapping_spans:bool=False):
 59 |         """
 60 |         Check whether there are overlapping spans for the same technique in the same article.
 61 |         Two spans are overlapping if their associated techniques match (according to category_matching_func)
 62 |         If merge_overlapping_spans==True then the overlapping spans are merged, otherwise an error is raised.
 63 | 
 64 |         :param merge_overlapping_spans: if True merges the overlapping spans
 65 |         :return:
 66 |         """
 67 | 
 68 |         for article_id in self.get_article_id_list():
 69 | 
 70 |             annotation_list = self.get_article_annotations_obj(article_id).groupby_technique()
 71 |             if merge_overlapping_spans:
 72 |                 for technique in annotation_list.keys():
 73 |                     for i in range(1, len(annotation_list[technique])):
 74 |                         annotation_list[technique][i].merge_spans(annotation_list[technique], i-1)
 75 |             if not self.get_article_annotations_obj(article_id):
 76 |                 return False
 77 |             # annotation_list = {}
 78 |             # for annotation in self.annotations.get_article_annotations(article_id):
 79 |             #     technique = annotation.get_label()
 80 |             #     if technique not in annotation_list.keys():
 81 |             #         annotation_list[technique] = [[technique, curr_span]]
 82 |             #     else:
 83 |             #         if merge_overlapping_spans:
 84 |             #             annotation_list[technique].append([technique, curr_span])
 85 |             #             merge_spans(annotation_list[technique], len(annotation_list[technique]) - 1)
 86 |             #         else:
 87 |             #             for matching_technique, span in annotation_list[technique]:
 88 |             #                 if len(curr_span.intersection(span)) > 0:
 89 |             #                     logger.error("In article %s, the span of the annotation %s, [%s,%s] overlap with "
 90 |             #                                  "the following one from the same article:%s, [%s,%s]" % (
 91 |             #                                  article_id, matching_technique,
 92 |             #                                  min(span), max(span), technique, min(curr_span), max(curr_span)))
 93 |             #                     return False
 94 |             #             annotation_list[technique].append([technique, curr_span])
 95 |             # if merge_overlapping_spans:
 96 |             #     annotations[article_id] = []
 97 |             #     for technique in annotation_list.keys():
 98 |             #         annotations[article_id] += annotation_list[technique]
 99 |         return True
100 | 
101 | 
102 |     def compare_annotations_identical_article_lists(self, second_annotations:Annotations):
103 |         """
104 |         Compare if self and <second_annotations> have identical article id lists
105 |         :return: True if the lists are identical and False otherwise. 
106 |         """
107 |         #checking that the number of articles in self and <second_annotations> is the same
108 |         if len(self.get_article_id_list()) != len(second_annotations.get_article_id_list()):
109 |             logger.error("The number of articles in the annotations is different: %d, %d" 
110 |                 % (len(self.get_article_id_list()), len(second_annotations.get_article_id_list())))
111 |             return False
112 |         diff = set(self.get_article_id_list()).difference(set(second_annotations.get_article_id_list()))
113 |         if len(diff) > 0:
114 |             logger.error("The two lists of article ids differ: %s"%(diff))
115 |             return False
116 | 
117 |         logger.debug("OK: the list of article ids in the two sets of annotations is identical")
118 |         return True
119 | 
120 | 
121 |     def compare_annotations_identical(self, second_annotations:Annotations)->bool:
122 |         """
123 |         Compare if self and <second_annotations> have identical annotations (without considering the technique labels)
124 |         :return: True if the lists are identical and False otherwise. 
125 |         """
126 |         for article_id in self.get_article_id_list():
127 |             an1_article_annotations = self.get_article_annotations_list(article_id)
128 |             an2_article_annotations = second_annotations.get_article_annotations_list(article_id)
129 |             if len(an1_article_annotations) != len(an2_article_annotations):
130 |                 logger.error("The number of annotations for article %s differs: %d vs %d"%(article_id, len(an1_article_annotations), len(an2_article_annotations)))
131 |                 return False
132 |             for an1, an2 in zip(an1_article_annotations, an2_article_annotations):
133 |                 if not an1.is_span_equal_to(an2):
134 |                     logger.error("The spans of the annotations of article %s do not match: [%s, %s] vs [%s, %s]"%(article_id, an1.get_start_offset(), an1.get_end_offset(), an2.get_start_offset(), an2.get_end_offset()))
135 |                     return False
136 |         return True
137 | 
138 | 
139 | #    def compute_SI_score(self, second_annotations:anwol.AnnotationWithOutLabel):
140 | #        def compute_score_pr(submission_annotations, gold_annotations, technique_names, prop_vs_non_propaganda=False,
141 |  #                    per_article_evaluation=False):
142 | #        pass
143 |         # prec_denominator = sum([len(annotations) for annotations in submission_annotations.values()])
144 |         # rec_denominator = sum([len(annotations) for annotations in gold_annotations.values()])
145 |         # technique_Spr_prec = {propaganda_technique: 0 for propaganda_technique in technique_names}
146 |         # technique_Spr_rec = {propaganda_technique: 0 for propaganda_technique in technique_names}
147 |         # cumulative_Spr_prec, cumulative_Spr_rec = (0, 0)
148 |         # f1_articles = []
149 | 
150 |         # for article_id in submission_annotations.keys():
151 |         #     gold_data = gold_annotations[article_id]
152 |         #     logger.debug("Computing contribution to the score of article id %s\nand tuples %s\n%s\n"
153 |         #                 % (article_id, str(submission_annotations[article_id]), str(gold_data)))
154 | 
155 |         #     article_cumulative_Spr_prec, article_cumulative_Spr_rec = (0, 0)
156 |         #     for j, sd in enumerate(submission_annotations[article_id]): #submission annotations for article article_id:
157 |         #         s=""
158 |         #         sd_annotation_length = len(sd[1])
159 |         #         for i, gd in enumerate(gold_data):
160 |         #             if prop_vs_non_propaganda or gd[0]==sd[0]:
161 |         #                 #s += "\tmatch %s %s-%s - %s %s-%s"%(sd[0],sd[1], sd[2], gd[0], gd[1], gd[2])
162 |         #                 intersection = len(sd[1].intersection(gd[1]))
163 |         #                 gd_annotation_length = len(gd[1])
164 |         #                 Spr_prec = intersection/sd_annotation_length
165 |         #                 article_cumulative_Spr_prec += Spr_prec
166 |         #                 cumulative_Spr_prec += Spr_prec
167 |         #                 s += "\tmatch %s %s-%s - %s %s-%s: S(p,r)=|intersect(r, p)|/|p| = %d/%d = %f (cumulative S(p,r)=%f)\n"\
168 |         #                     %(sd[0],min(sd[1]), max(sd[1]), gd[0], min(gd[1]), max(gd[1]), intersection, sd_annotation_length, Spr_prec, cumulative_Spr_prec)
169 |         #                 technique_Spr_prec[gd[0]] += Spr_prec
170 | 
171 |         #                 Spr_rec = intersection/gd_annotation_length
172 |         #                 article_cumulative_Spr_rec += Spr_rec
173 |         #                 cumulative_Spr_rec += Spr_rec
174 |         #                 s += "\tmatch %s %s-%s - %s %s-%s: S(p,r)=|intersect(r, p)|/|r| = %d/%d = %f (cumulative S(p,r)=%f)\n"\
175 |         #                     %(sd[0],min(sd[1]), max(sd[1]), gd[0], min(gd[1]), max(gd[1]), intersection, gd_annotation_length, Spr_rec, cumulative_Spr_rec)
176 |         #                 technique_Spr_rec[gd[0]] += Spr_rec
177 |         #         logger.debug("\n%s"%(s))
178 | 
179 |         #     p_article, r_article, f1_article =compute_prec_rec_f1(article_cumulative_Spr_prec,
180 |         #                                                         len(submission_annotations[article_id]),
181 |         #                                                         article_cumulative_Spr_rec,
182 |         #                                                         len(gold_annotations[article_id]), False)
183 |         #     f1_articles.append(f1_article)
184 | 
185 |         # p,r,f1 = compute_prec_rec_f1(cumulative_Spr_prec, prec_denominator, cumulative_Spr_rec, rec_denominator)
186 | 
187 |         # if not prop_vs_non_propaganda:
188 |         #     for technique_name in technique_Spr_prec.keys():
189 |         #         prec_tech, rec_tech, f1_tech = compute_prec_rec_f1(technique_Spr_prec[technique_name],
190 |         #                                     compute_technique_frequency(submission_annotations.values(), technique_name),
191 |         #                                     technique_Spr_prec[technique_name],
192 |         #                                     compute_technique_frequency(gold_annotations.values(), technique_name), False)
193 |         #         logger.info("%s: P=%f R=%f F1=%f" % (technique_name, prec_tech, rec_tech, f1_tech))
194 | 
195 |         # if per_article_evaluation:
196 |         #     logger.info("Per article evaluation F1=%s"%(",".join([ str(f1_value) for f1_value in  f1_articles])))
197 | 
198 |         # return f1
199 | 
200 | 
201 |     def align_annotations(self, second_annotations:Annotations)->None:
202 |         """
203 |         Reorder all annotations such that the matching between annotations' labels
204 |         and the ones from second_annotations is maximised. 
205 |         """
206 |         for article_id in second_annotations.get_article_id_list():
207 |             self.get_article_annotations_obj(article_id).align_annotations(second_annotations.get_article_annotations_obj(article_id))
208 | 
209 | 
210 |     def compute_TC_score(self, second_annotations:Annotations):
211 |         """
212 |         second_annotations: gold labels
213 |         """
214 | 
215 |         self.align_annotations(second_annotations)
216 |         gold_labels = [ x.get_label() for x in second_annotations.get_full_list_of_annotations() ]
217 |         submission_labels = [ x.get_label() for x in  self.get_full_list_of_annotations() ]
218 | 
219 |         precision = precision_score(gold_labels, submission_labels, pos_label=None, average='micro')
220 |         recall = recall_score(gold_labels, submission_labels, pos_label=None, average='micro')
221 |         f1 = f1_score(gold_labels, submission_labels, pos_label=None, average='micro')
222 |         if an.Annotation.propaganda_techniques is not None:
223 |             propaganda_techniques_list = an.Annotation.propaganda_techniques.get_propaganda_techniques_list_sorted()
224 |             f1_per_class = f1_score(gold_labels, submission_labels, average=None, labels=propaganda_techniques_list)
225 |             return precision, recall, f1, f1_per_class
226 |         return precision, recall, f1
227 | 
228 | 
229 |     def create_article_annotations_object(self, article_id:str)->None:
230 |         self.annotations[article_id] = aa.Articles_annotations(article_id=article_id)  
231 | 
232 | 
233 |     def TC_score_to_string(self, second_annotation:Annotations, output_for_script=False):
234 | 
235 |             if an.Annotation.propaganda_techniques is None: #raise an error
236 |                 precision, recall, f1 = self.compute_TC_score(second_annotation)    
237 |                 res = "\nPrecision=%f\nRecall=%f\nF1=%f\n"%(precision, recall, f1)
238 |             else:
239 |                 precision, recall, f1, f1_per_class = self.compute_TC_score(second_annotation)
240 |                 res_for_screen = "\nF1=%f\nPrecision=%f\nRecall=%f\n%s\n" % (precision, recall, f1, "\n".join([ "F1_"+pr+"="+str(f) for pr, f in zip(an.Annotation.propaganda_techniques.get_propaganda_techniques_list(), f1_per_class)]))
241 |                 if output_for_script:
242 |                     res_for_script = "%f\t%f\t%f\t"%(f1, precision, recall)
243 |                     res_for_script += "\t".join([ str(x) for x in f1_per_class])
244 |                 else:
245 |                     res_for_script = ""
246 |             return res_for_screen, res_for_script
247 | 
248 | 
249 |     def get_full_list_of_annotations(self):
250 |         full_list = []
251 |         for article_id in self.get_article_id_list():
252 |             for an in self.get_article_annotations_list(article_id):
253 |                 full_list.append(an)
254 |         return full_list
255 |             
256 | 
257 |     def has_article(self, article_id:str)->bool:
258 |         """
259 |         Check whether article_id is in the list of articles whose annotations are in the object. 
260 |         """
261 |         return article_id in self.get_article_id_list()
262 | 
263 | 
264 |     def get_article_id_list(self):
265 |         """
266 |         All ids of the article in the object
267 |         """
268 |         return self.annotations.keys()
269 | 
270 | 
271 |     def get_article_annotations_obj(self, article_id:str):
272 |         """
273 |         Returns all annotations of an article as an Article_annotations object.
274 |         """
275 |         return self.annotations[article_id]
276 | 
277 | 
278 |     def get_article_annotations_list(self, article_id:str):
279 |         """
280 |         Returns all annotations of an article as a list of Annotation objects.
281 |         """
282 |         return self.annotations[article_id].get_article_annotations()
283 | 
284 | 
285 |     def _guess_article_id_from_file_name(self, filename:str)->str:
286 |         
287 |         regex = re.compile("article([0-9]+).*")
288 |         article_id = regex.match(os.path.basename(filename)).group(1)
289 |         return article_id
290 | 
291 | 
292 |     def load_annotation_list_from_file(self, filename):
293 |         """
294 |         Loads all annotations in file <filename>. The file is supposed to contain annotations for multiple articles. To load annotations for a single article use the function with the same name from module src.article_annotations. 
295 |         Each annotation is checked according to check_format_of_annotation_in_file()
296 |         """
297 |         with open(filename, "r") as f:
298 |             for i, line in enumerate(f.readlines(), 1):
299 |                 ann, article_id = an.Annotation.load_annotation_from_string(line.rstrip(), i, filename)
300 |                 ann.check_format_of_annotation_in_file()
301 |                 self.add_annotation(ann, article_id)
302 | 
303 | 
304 |     def load_annotation_list_from_folder(self, folder_name, pattern="*.labels"):
305 |         """
306 |         Loads all annotations from all files in folder <folder_name>. 
307 |         Files in the folder are selected according to <pattern>
308 |         """
309 |         if not os.path.exists(folder_name):
310 |             logger.error("trying to load annotations from folder %s, which does not exists"%(folder_name))
311 |             return False
312 |         if not os.path.isdir(folder_name):
313 |             logger.error("trying to load annotations from folder %s, which does not appear to be a valid folder"%(folder_name))
314 |             return False
315 |         file_list = glob.glob(os.path.join(folder_name, pattern))
316 |         if len(file_list) == 0:
317 |             logger.error("Cannot load file list %s/%s"%(folder_name, pattern))
318 |             sys.exit()
319 |         for filename in file_list:
320 |             self.create_article_annotations_object(self._guess_article_id_from_file_name(filename))
321 |             self.load_annotation_list_from_file(filename)
322 |         return True
323 | 
324 | #    def compute_technique_frequency(annotations_list, technique_name):
325 | #        return sum([len([example_annotation for example_annotation in x if example_annotation[0] == technique_name])
326 | #                    for x in self.a])
327 | 
328 | 
329 |  #   def print_annotations(annotation_list):
330 |  #       s = ""
331 |  #       i=0
332 |  #       for technique, span in annotation_list:
333 |  #           s += "%d) %s: %d - %d\n"%(i, technique, min(span), max(span))
334 |  #           i += 1
335 |  #       return s
336 | 


--------------------------------------------------------------------------------
/results/SI_output.txt:
--------------------------------------------------------------------------------
   1 | 111111114	1705	1824
   2 | 111111117	671	753
   3 | 111111131	84	97
   4 | 111111131	102	109
   5 | 111111131	180	190
   6 | 111111131	207	214
   7 | 111111131	326	336
   8 | 111111131	352	365
   9 | 111111131	382	395
  10 | 111111131	398	413
  11 | 111111131	723	731
  12 | 111111131	804	811
  13 | 111111131	823	865
  14 | 111111131	1030	1068
  15 | 111111131	1977	1992
  16 | 111111131	2660	2671
  17 | 111111131	2728	2739
  18 | 111111131	2897	2908
  19 | 111111131	2912	2924
  20 | 111111131	2952	2997
  21 | 111111131	3396	3416
  22 | 111111131	3437	3455
  23 | 111111131	3738	3748
  24 | 111111131	4008	4014
  25 | 111111131	4030	4038
  26 | 111111131	4152	4169
  27 | 111111131	4224	4230
  28 | 111111131	4256	4264
  29 | 111111131	4339	4352
  30 | 111111131	4531	4546
  31 | 111111131	4635	4643
  32 | 111111131	4752	4768
  33 | 111111131	4882	4899
  34 | 111111131	5174	5186
  35 | 111111131	5244	5262
  36 | 111111131	5281	5289
  37 | 111111131	5368	5402
  38 | 111111131	5904	5916
  39 | 111111131	5938	5950
  40 | 111111131	6920	6937
  41 | 111111131	6957	6971
  42 | 111111131	7314	7324
  43 | 111111131	7700	7708
  44 | 111111137	143	183
  45 | 111111137	2058	2064
  46 | 111111137	2320	2333
  47 | 696694316	603	661
  48 | 696694316	1020	1094
  49 | 696694316	3276	3379
  50 | 696694316	3471	3608
  51 | 696694316	3610	4009
  52 | 696694316	4376	4395
  53 | 696694316	4423	4440
  54 | 696694316	4478	4500
  55 | 696694316	7026	7097
  56 | 696694316	7631	7780
  57 | 696694316	7971	8295
  58 | 696694316	8298	8640
  59 | 696694316	9373	9584
  60 | 696694316	9586	9819
  61 | 696694316	10661	10866
  62 | 696694316	11102	11126
  63 | 696694316	12634	12736
  64 | 696694316	12738	12848
  65 | 696694316	13262	13316
  66 | 696694316	13456	13555
  67 | 697444415	512	539
  68 | 697444415	1471	1596
  69 | 697444415	2815	2860
  70 | 697444415	2959	3022
  71 | 698018235	305	335
  72 | 698018235	555	581
  73 | 698018235	641	652
  74 | 698018235	657	687
  75 | 698018235	975	1047
  76 | 698018235	1482	1493
  77 | 698018235	1658	1881
  78 | 698018235	1910	1919
  79 | 698018235	2132	2410
  80 | 698018235	2431	2460
  81 | 698018235	2723	2730
  82 | 698018235	2917	3000
  83 | 698018235	3283	3380
  84 | 698018235	3514	3649
  85 | 698719689	131	144
  86 | 698719689	190	321
  87 | 698719689	1440	1449
  88 | 698719689	1542	1611
  89 | 698719689	2324	2335
  90 | 700461600	717	760
  91 | 700461600	1547	1745
  92 | 700461600	2318	2496
  93 | 700461600	2726	2787
  94 | 700461600	2805	2868
  95 | 700461600	3073	3255
  96 | 700461600	3548	3559
  97 | 700461600	3963	4080
  98 | 700461600	4209	4426
  99 | 700461600	4506	4530
 100 | 700461600	4657	4709
 101 | 701225819	77	99
 102 | 701225819	111	143
 103 | 701225819	177	187
 104 | 701225819	305	313
 105 | 701225819	996	1017
 106 | 701225819	1201	1299
 107 | 701225819	1493	1603
 108 | 701225819	1767	1771
 109 | 701225819	1872	1889
 110 | 701225819	1967	1986
 111 | 701225819	2174	2182
 112 | 701225819	2616	2621
 113 | 701225819	2689	2694
 114 | 701225819	2747	2752
 115 | 701225819	2787	2792
 116 | 701225819	2982	2991
 117 | 701225819	3309	3315
 118 | 701225819	3523	3541
 119 | 701225819	3787	3803
 120 | 701225819	3837	3860
 121 | 701225819	4268	4313
 122 | 701225819	4606	4636
 123 | 701225819	4720	4737
 124 | 701225819	5914	5927
 125 | 701225819	6102	6112
 126 | 701553469	31	41
 127 | 701553469	77	143
 128 | 701553469	205	209
 129 | 701553469	288	302
 130 | 701553469	351	361
 131 | 701553469	1716	1744
 132 | 701553469	1864	1887
 133 | 701553469	1983	2143
 134 | 701553469	2486	2651
 135 | 701553469	2666	2781
 136 | 701553469	3139	3159
 137 | 701553469	3455	3476
 138 | 701553469	3695	3715
 139 | 701553469	3898	3916
 140 | 701553469	4175	4280
 141 | 701837665	761	801
 142 | 701837665	803	833
 143 | 701837665	1364	1449
 144 | 701837665	1806	2008
 145 | 701837665	2456	2470
 146 | 701837665	2534	2575
 147 | 701837665	2618	2726
 148 | 701837665	2740	2762
 149 | 701837665	2792	2940
 150 | 701837665	2942	2991
 151 | 701837665	3251	3423
 152 | 701837665	3510	3534
 153 | 701837665	3824	3883
 154 | 701837665	3958	4067
 155 | 701837665	5040	5084
 156 | 701837665	5821	5840
 157 | 701837665	6697	6873
 158 | 701837665	7193	7357
 159 | 701837665	7401	7455
 160 | 701837665	7529	7666
 161 | 701837665	7727	7746
 162 | 701837665	7785	7841
 163 | 701837665	8115	8155
 164 | 701837665	8276	8312
 165 | 701837665	8453	8556
 166 | 701837665	9299	9329
 167 | 701837665	9331	9371
 168 | 701837665	9425	9433
 169 | 701837665	9471	9500
 170 | 701837665	9741	9756
 171 | 701837665	9780	9814
 172 | 701837665	10090	10206
 173 | 701837665	10208	10227
 174 | 701837665	10802	10972
 175 | 701837665	11844	11915
 176 | 701837665	11964	12007
 177 | 701837665	12031	12047
 178 | 701837665	12178	12197
 179 | 701837665	12381	12476
 180 | 701837665	12594	12665
 181 | 703821117	114	174
 182 | 703821117	179	236
 183 | 703821117	472	532
 184 | 703821117	833	880
 185 | 703821117	2350	2366
 186 | 703821117	2475	2523
 187 | 703821117	3368	3383
 188 | 703821117	3483	3500
 189 | 703821117	3519	3571
 190 | 703821117	3893	3974
 191 | 703821117	5022	5095
 192 | 703821117	5281	5373
 193 | 703821117	6046	6098
 194 | 703821117	6445	6453
 195 | 703821117	6520	6578
 196 | 703821117	6580	6609
 197 | 703821117	6620	6637
 198 | 703821117	7264	7314
 199 | 703821117	7674	7693
 200 | 703821117	10469	10570
 201 | 703821117	10572	10680
 202 | 703821117	10697	10805
 203 | 703821117	10820	10845
 204 | 703821117	10860	10995
 205 | 703821117	11087	11097
 206 | 703821117	11099	11105
 207 | 703821117	11221	11328
 208 | 703821117	11655	11671
 209 | 703821117	12003	12039
 210 | 703821117	12149	12242
 211 | 703821117	12349	12363
 212 | 703821117	13152	13166
 213 | 703821117	13316	13369
 214 | 703821117	13682	13691
 215 | 703821117	13902	13958
 216 | 703821117	13960	14030
 217 | 703821117	14142	14158
 218 | 703821117	14419	14458
 219 | 703821117	14511	14640
 220 | 703821117	14653	14658
 221 | 703821117	14752	14920
 222 | 703821117	15047	15069
 223 | 703821117	15216	15279
 224 | 703821117	15785	15851
 225 | 703821117	16195	16231
 226 | 703821117	16335	16411
 227 | 703821117	16781	16835
 228 | 703821117	16976	17013
 229 | 703821117	17016	17047
 230 | 703821117	17150	17244
 231 | 703821117	17552	17689
 232 | 703821117	18269	18323
 233 | 703821117	18378	18396
 234 | 703821117	18447	18550
 235 | 703821117	18604	18639
 236 | 704591553	71	103
 237 | 704591553	238	265
 238 | 704591553	278	285
 239 | 704591553	697	822
 240 | 704591553	933	954
 241 | 704591553	1016	1029
 242 | 704591553	1125	1141
 243 | 704591553	1280	1292
 244 | 704591553	1696	1718
 245 | 704591553	1805	1832
 246 | 704591553	1868	1883
 247 | 704591553	1999	2059
 248 | 704591553	2180	2228
 249 | 704591553	2765	2780
 250 | 704591553	2802	2816
 251 | 704591553	3153	3163
 252 | 704591553	3221	3359
 253 | 704591553	3827	3881
 254 | 704591553	3883	3911
 255 | 704591553	4058	4124
 256 | 704591553	4398	4405
 257 | 704591553	4454	4480
 258 | 704591553	4493	4508
 259 | 704591553	4965	5082
 260 | 704856340	4007	4185
 261 | 704856340	4187	4324
 262 | 706636401	992	1001
 263 | 706636401	2911	2939
 264 | 706636401	3353	3362
 265 | 706636401	3724	3747
 266 | 706636401	3802	3966
 267 | 709732928	12	21
 268 | 709732928	160	172
 269 | 709732928	251	259
 270 | 709732928	1428	1432
 271 | 709732928	1811	1820
 272 | 709732928	1957	1966
 273 | 709732928	2169	2173
 274 | 709732928	2177	2184
 275 | 709732928	2583	2587
 276 | 709732928	3682	3689
 277 | 709732928	3732	3742
 278 | 709732928	6464	6474
 279 | 709732928	7346	7359
 280 | 709732928	7579	7590
 281 | 709732928	8158	8168
 282 | 709732928	8491	8498
 283 | 709732928	8521	8530
 284 | 709732928	8532	8546
 285 | 709732928	8613	8616
 286 | 709732928	9458	9465
 287 | 709732928	10416	10440
 288 | 709732928	10525	10552
 289 | 709732928	10660	10675
 290 | 709732928	10768	10775
 291 | 709732928	10826	10833
 292 | 709732928	10865	10882
 293 | 709732928	11982	11997
 294 | 710100700	1203	1375
 295 | 711596363	13	32
 296 | 711596363	258	277
 297 | 711596363	1194	1252
 298 | 711596363	1408	1421
 299 | 711596363	1944	1954
 300 | 711596363	3065	3083
 301 | 711596363	3136	3154
 302 | 711596363	3173	3189
 303 | 711596363	3277	3285
 304 | 711596363	3700	3870
 305 | 711596363	3894	4008
 306 | 711596363	4274	4281
 307 | 711596363	4373	4389
 308 | 711596363	4573	4624
 309 | 711596363	4626	4664
 310 | 711596363	4738	4767
 311 | 711596363	4985	5098
 312 | 711596363	5391	5412
 313 | 711596363	5627	5645
 314 | 711596363	5647	5678
 315 | 711596363	5745	5779
 316 | 711622457	457	573
 317 | 711622457	575	597
 318 | 711622457	616	732
 319 | 711622457	734	756
 320 | 711622457	813	847
 321 | 711622457	934	997
 322 | 711622457	1095	1250
 323 | 711622457	1329	1355
 324 | 711622457	1882	1967
 325 | 711622457	2008	2120
 326 | 711622457	2409	2697
 327 | 711622457	2754	2812
 328 | 711622457	3303	3458
 329 | 711716996	30	85
 330 | 711716996	298	304
 331 | 711716996	724	862
 332 | 711716996	957	992
 333 | 711716996	1444	1453
 334 | 711716996	1600	1632
 335 | 711716996	1936	2064
 336 | 711716996	2231	2268
 337 | 711716996	2308	2314
 338 | 711716996	2320	2345
 339 | 711716996	2760	2843
 340 | 711716996	2910	2941
 341 | 711716996	3978	3992
 342 | 711716996	4002	4025
 343 | 711716996	4068	4082
 344 | 711716996	4563	4569
 345 | 711716996	4959	4973
 346 | 711716996	5284	5298
 347 | 715588833	0	17
 348 | 715588833	412	498
 349 | 715588833	624	679
 350 | 715588833	1046	1074
 351 | 715588833	1753	1770
 352 | 715588833	2061	2160
 353 | 715588833	2437	2464
 354 | 715588833	2622	2773
 355 | 715588833	7098	7129
 356 | 715588833	7155	7185
 357 | 715588833	7638	7736
 358 | 715588833	7829	7855
 359 | 715588833	7857	7909
 360 | 715588833	8479	8546
 361 | 715588833	8548	8641
 362 | 715588833	8643	8940
 363 | 715588833	9232	9268
 364 | 715588833	9456	9504
 365 | 715588833	9836	10047
 366 | 715588833	10678	10918
 367 | 715588833	11273	11388
 368 | 715588833	11390	11426
 369 | 715588833	11575	11832
 370 | 715588833	11839	11843
 371 | 715588833	11846	11970
 372 | 715588833	11972	12085
 373 | 722507879	1369	1391
 374 | 722507879	2307	2350
 375 | 722507879	2356	2413
 376 | 722507879	2432	2516
 377 | 722507879	3019	3045
 378 | 722507879	3937	3948
 379 | 722507879	4053	4079
 380 | 723793978	1106	1256
 381 | 723793978	1418	1430
 382 | 727493378	493	502
 383 | 727493378	563	822
 384 | 727493378	1272	1285
 385 | 727493378	1768	1866
 386 | 727493378	1943	1995
 387 | 727493378	1997	2265
 388 | 727493378	3072	3149
 389 | 727736557	85	94
 390 | 727736557	186	212
 391 | 727736557	305	328
 392 | 727736557	650	669
 393 | 727736557	983	1001
 394 | 727736557	1203	1347
 395 | 727736557	1761	1770
 396 | 727736557	1819	1858
 397 | 727736557	2226	2239
 398 | 727736557	2351	2382
 399 | 727736557	2429	2447
 400 | 727736557	2840	2884
 401 | 727736557	4017	4068
 402 | 727736557	4511	4544
 403 | 727736557	4574	4602
 404 | 727736557	4715	4742
 405 | 727736557	5073	5132
 406 | 728169864	0	8
 407 | 728169864	423	440
 408 | 728169864	1628	1641
 409 | 728169864	2632	2640
 410 | 728169864	2644	2651
 411 | 728169864	2655	2662
 412 | 728169864	2666	2674
 413 | 728169864	2678	2684
 414 | 728169864	2688	2694
 415 | 728169864	2698	2712
 416 | 728169864	2720	2744
 417 | 728169864	2747	2878
 418 | 728169864	3161	3184
 419 | 728169864	5313	5330
 420 | 728169864	5517	5525
 421 | 728169864	5753	5772
 422 | 728169864	6198	6259
 423 | 728758697	31	49
 424 | 728758697	51	89
 425 | 728758697	819	1034
 426 | 728758697	1232	1454
 427 | 728758697	1462	1509
 428 | 728758697	1512	1599
 429 | 728758697	1697	1744
 430 | 728758697	1746	1788
 431 | 728758697	1790	1836
 432 | 729410793	29	76
 433 | 729410793	316	395
 434 | 729410793	657	705
 435 | 729410793	708	752
 436 | 729410793	754	1015
 437 | 729410793	1018	1341
 438 | 729410793	1563	1601
 439 | 729410793	3356	3480
 440 | 729410793	3510	3676
 441 | 729410793	4126	4166
 442 | 729410793	4237	4363
 443 | 729410793	4586	4624
 444 | 729410793	4626	4676
 445 | 729561658	39	89
 446 | 729561658	251	312
 447 | 729561658	754	778
 448 | 729561658	809	833
 449 | 729561658	1006	1052
 450 | 729561658	1494	1537
 451 | 729561658	1539	1573
 452 | 729561658	1575	1643
 453 | 729561658	1645	1738
 454 | 730559808	955	992
 455 | 730559808	998	1088
 456 | 730559808	1251	1340
 457 | 730559808	2483	2640
 458 | 730559808	3006	3186
 459 | 730559808	3215	3490
 460 | 730559808	3492	3611
 461 | 730559808	3953	4017
 462 | 730559808	4279	4361
 463 | 730559808	4404	4441
 464 | 730559808	4447	4537
 465 | 730559808	5353	5419
 466 | 730559808	5659	5889
 467 | 730559808	5891	5969
 468 | 730573740	45	66
 469 | 730573740	834	846
 470 | 730573740	983	1125
 471 | 730573740	1302	1322
 472 | 730573740	1338	1505
 473 | 730573740	2177	2227
 474 | 730573740	2240	2345
 475 | 730573740	2479	2572
 476 | 730573740	2682	2751
 477 | 731927633	962	1016
 478 | 731927633	1018	1053
 479 | 731927633	1189	1224
 480 | 731927633	1226	1280
 481 | 731927633	1973	2005
 482 | 731927633	2063	2092
 483 | 731927633	2237	2323
 484 | 731927633	2434	2444
 485 | 731927633	2784	2832
 486 | 731927633	3450	3514
 487 | 732154721	27	61
 488 | 732154721	281	316
 489 | 732154721	397	528
 490 | 732154721	987	1153
 491 | 732154721	1172	1283
 492 | 732154721	1930	1965
 493 | 732154721	2384	2427
 494 | 732154721	2607	2666
 495 | 732154721	2755	2819
 496 | 735855251	2216	2268
 497 | 735855251	2357	2453
 498 | 735855251	2455	2563
 499 | 755814432	166	168
 500 | 755814432	1728	1735
 501 | 755814432	1978	2004
 502 | 755814432	2418	2564
 503 | 755814432	3178	3201
 504 | 755814432	3304	3321
 505 | 755814432	3846	3960
 506 | 757243988	339	350
 507 | 757243988	461	467
 508 | 757243988	1447	1457
 509 | 757243988	1658	1671
 510 | 757243988	2267	2280
 511 | 757243988	2473	2483
 512 | 757243988	2635	2644
 513 | 761969038	119	141
 514 | 761969038	305	321
 515 | 761969038	974	988
 516 | 761969038	1755	1783
 517 | 761969038	1971	2046
 518 | 761969038	4048	4053
 519 | 761969038	4149	4170
 520 | 761969038	4271	4292
 521 | 761969038	4391	4412
 522 | 761969038	4521	4541
 523 | 761969038	5552	5571
 524 | 761969038	5614	5663
 525 | 761969692	393	404
 526 | 761969692	482	492
 527 | 761969692	601	725
 528 | 761969692	1557	1574
 529 | 761969692	1894	1907
 530 | 761969692	2005	2043
 531 | 761969692	2057	2089
 532 | 761969692	2276	2289
 533 | 761969692	2291	2489
 534 | 761969692	2858	2885
 535 | 761969692	2993	3013
 536 | 761969692	3017	3033
 537 | 761969692	4010	4090
 538 | 761969692	5796	5816
 539 | 761969692	5915	5935
 540 | 761969692	6064	6084
 541 | 761969692	6222	6242
 542 | 761969692	6522	6541
 543 | 761969692	6636	6662
 544 | 763260610	0	17
 545 | 763260610	254	290
 546 | 763260610	705	719
 547 | 763260610	970	981
 548 | 763260610	1053	1105
 549 | 763260610	1284	1363
 550 | 763260610	1365	1375
 551 | 763260610	1411	1424
 552 | 763260610	1470	1483
 553 | 763260610	1516	1529
 554 | 763260610	1593	1619
 555 | 763260610	1621	1675
 556 | 763260610	2132	2150
 557 | 763260610	2205	2238
 558 | 763260610	2311	2359
 559 | 763260610	2430	2443
 560 | 763260610	2812	2868
 561 | 763260610	3031	3086
 562 | 763260610	3134	3156
 563 | 763260610	3559	3589
 564 | 763260610	3782	3801
 565 | 763260610	3811	3868
 566 | 763260610	3911	3931
 567 | 763260610	3956	3996
 568 | 763260610	4015	4037
 569 | 763260610	4166	4190
 570 | 763260610	4192	4309
 571 | 763260610	4369	4390
 572 | 763260610	4500	4518
 573 | 763260610	4639	4664
 574 | 763260610	4724	4742
 575 | 763260610	4819	4851
 576 | 763260610	5066	5120
 577 | 763260610	5391	5423
 578 | 763260610	5495	5554
 579 | 763260610	5881	5892
 580 | 763260610	6160	6182
 581 | 763260610	6230	6271
 582 | 763260610	6409	6450
 583 | 763260610	6660	6740
 584 | 763260610	6845	6907
 585 | 763260610	7459	7495
 586 | 763260610	7606	7639
 587 | 763260610	7661	7680
 588 | 763260610	7890	7931
 589 | 763260610	8032	8048
 590 | 763260610	8129	8166
 591 | 763260610	8247	8265
 592 | 763260610	8777	8790
 593 | 763260610	8957	8999
 594 | 763260610	9160	9180
 595 | 763260610	9186	9203
 596 | 763260610	9302	9329
 597 | 763260610	9362	9378
 598 | 763260610	9405	9471
 599 | 763260610	9689	9714
 600 | 763260610	9818	9846
 601 | 763260610	9916	9946
 602 | 763260610	9967	10009
 603 | 763260610	10201	10274
 604 | 763260610	10457	10469
 605 | 763260610	10568	10629
 606 | 763260610	11055	11168
 607 | 763260610	11280	11303
 608 | 763260610	12230	12261
 609 | 763260610	12529	12547
 610 | 763260610	13164	13180
 611 | 763260610	13644	13664
 612 | 763260610	13771	13837
 613 | 763260610	14075	14095
 614 | 763260610	14289	14314
 615 | 763260610	14328	14403
 616 | 763260610	14424	14441
 617 | 763761219	116	140
 618 | 763761219	1452	1465
 619 | 763761219	1645	1669
 620 | 763761219	1875	1890
 621 | 763761219	2068	2129
 622 | 763761219	2295	2298
 623 | 763761219	2343	2352
 624 | 764609985	288	296
 625 | 764609985	500	507
 626 | 764609985	675	694
 627 | 764609985	1681	1685
 628 | 764609985	2352	2362
 629 | 764609985	2463	2471
 630 | 764609985	2550	2764
 631 | 764609985	2992	3003
 632 | 764609985	3574	3584
 633 | 764609985	4675	4681
 634 | 764609985	4924	4933
 635 | 764609985	5496	5541
 636 | 764609985	5891	6090
 637 | 764609985	6332	6366
 638 | 764609985	7112	7118
 639 | 764609985	7224	7235
 640 | 764609985	7247	7251
 641 | 764609985	8497	8514
 642 | 764609985	8701	8718
 643 | 764609985	8747	8765
 644 | 764609985	10188	10203
 645 | 764609985	10403	10413
 646 | 764609985	11038	11048
 647 | 764715911	132	142
 648 | 764715911	254	348
 649 | 764715911	573	584
 650 | 764715911	937	1001
 651 | 764715911	1667	1713
 652 | 764715911	1728	1734
 653 | 764715911	2558	2561
 654 | 764715911	2859	2916
 655 | 764715911	3722	3734
 656 | 764715911	4015	4028
 657 | 764715911	4380	4391
 658 | 764715911	5779	5788
 659 | 764715911	6379	6427
 660 | 764715911	6646	6649
 661 | 764715911	6832	6852
 662 | 764715911	6958	6981
 663 | 764715911	7265	7459
 664 | 764715911	7479	7596
 665 | 764715911	7652	7700
 666 | 765953146	787	816
 667 | 765953146	982	1011
 668 | 765953146	1099	1110
 669 | 765953146	2168	2186
 670 | 765953146	2193	2209
 671 | 765953146	2497	2507
 672 | 765953146	5320	5335
 673 | 767129999	59	78
 674 | 767129999	1498	1513
 675 | 767129999	1739	1803
 676 | 767129999	1943	1951
 677 | 767129999	2058	2092
 678 | 767129999	2478	2499
 679 | 767129999	2522	2531
 680 | 770156173	0	18
 681 | 770156173	34	47
 682 | 770156173	1021	1029
 683 | 770156173	1106	1184
 684 | 770156173	1556	1738
 685 | 770156173	1740	1836
 686 | 770156173	1919	1938
 687 | 770156173	2094	2117
 688 | 770156173	2158	2164
 689 | 770156173	2330	2348
 690 | 770156173	2469	2498
 691 | 770156173	2819	2827
 692 | 770156173	2934	2989
 693 | 770156173	3012	3018
 694 | 770156173	3266	3280
 695 | 770156173	3692	3737
 696 | 770156173	3924	3946
 697 | 770156173	3970	3985
 698 | 770156173	3991	4004
 699 | 770156173	4523	4531
 700 | 770156173	4682	4692
 701 | 770156173	4758	4781
 702 | 770156173	4814	4929
 703 | 770156173	5254	5261
 704 | 770156173	5330	5340
 705 | 770156173	5645	5652
 706 | 770156173	5920	5936
 707 | 770156173	6056	6072
 708 | 770877978	900	905
 709 | 770877978	1000	1018
 710 | 770877978	1020	1067
 711 | 770877978	1252	1270
 712 | 770877978	3117	3146
 713 | 770877978	3244	3292
 714 | 770956434	482	487
 715 | 770956434	1360	1380
 716 | 770956434	1390	1430
 717 | 770956434	1560	1571
 718 | 770956434	1659	1665
 719 | 770956434	1925	1943
 720 | 770956434	1945	1992
 721 | 770956434	2176	2194
 722 | 770956434	2583	2592
 723 | 770956434	2884	2891
 724 | 776368676	45	70
 725 | 776368676	276	297
 726 | 776368676	314	343
 727 | 776368676	378	408
 728 | 776368676	424	457
 729 | 776368676	542	559
 730 | 776368676	571	623
 731 | 776368676	640	658
 732 | 776368676	664	738
 733 | 776368676	871	893
 734 | 776368676	905	957
 735 | 776368676	974	992
 736 | 776368676	998	1078
 737 | 776368676	1164	1176
 738 | 776368676	1461	1494
 739 | 776368676	1596	1633
 740 | 776368676	2017	2033
 741 | 776368676	2050	2074
 742 | 776368676	2085	2118
 743 | 776368676	2132	2147
 744 | 776368676	2164	2174
 745 | 776368676	2178	2210
 746 | 776368676	3803	3823
 747 | 780619695	44	62
 748 | 780619695	120	160
 749 | 780619695	162	168
 750 | 780619695	174	183
 751 | 780619695	1245	1262
 752 | 780619695	1321	1337
 753 | 780619695	1538	1554
 754 | 780619695	1728	1744
 755 | 780619695	1770	1794
 756 | 780619695	1935	1959
 757 | 780619695	2018	2034
 758 | 780619695	2245	2261
 759 | 780619695	2329	2338
 760 | 780619695	2343	2359
 761 | 780619695	2838	2854
 762 | 780619695	3066	3082
 763 | 780619695	3207	3224
 764 | 780619695	3735	3740
 765 | 780619695	3933	3941
 766 | 780619695	4123	4129
 767 | 780619695	4212	4229
 768 | 780619695	4298	4309
 769 | 780619695	4504	4513
 770 | 780619695	4696	4736
 771 | 780619695	4856	4874
 772 | 780619695	5547	5553
 773 | 780619695	5985	6015
 774 | 780619695	6232	6246
 775 | 780619695	6301	6314
 776 | 781577820	14	25
 777 | 781577820	163	190
 778 | 781577820	215	246
 779 | 781577820	274	302
 780 | 781577820	470	494
 781 | 781577820	554	561
 782 | 781577820	589	661
 783 | 781577820	731	741
 784 | 781577820	968	1028
 785 | 781577820	1031	1218
 786 | 781577820	1476	1608
 787 | 781577820	1642	1679
 788 | 786527921	1	16
 789 | 786527921	259	300
 790 | 786527921	729	736
 791 | 786527921	827	898
 792 | 786527921	1281	1329
 793 | 786527921	1465	1480
 794 | 786527921	1881	1904
 795 | 786527921	1955	2170
 796 | 786527921	2176	2238
 797 | 786527921	2415	2440
 798 | 786527921	2753	2765
 799 | 786527921	3033	3065
 800 | 786527921	5134	5376
 801 | 786527921	5622	5643
 802 | 786527921	5787	5810
 803 | 786527921	6132	6149
 804 | 786527921	6683	6695
 805 | 786527921	6880	6908
 806 | 786527921	7032	7051
 807 | 786527921	7482	7583
 808 | 786527921	7598	7633
 809 | 786527921	7747	7771
 810 | 786527921	8099	8114
 811 | 786527921	8403	8449
 812 | 786527921	8548	8577
 813 | 786527921	8838	8856
 814 | 786527921	8903	8923
 815 | 786527921	9063	9078
 816 | 786527921	9859	9869
 817 | 786527921	9942	10004
 818 | 786527921	10254	10267
 819 | 786527921	10496	10512
 820 | 786527921	10665	10684
 821 | 786527921	10713	10737
 822 | 786527921	10920	10973
 823 | 786527921	11260	11284
 824 | 786527921	11294	11311
 825 | 786527921	11506	11530
 826 | 786527921	12164	12267
 827 | 786527921	12385	12404
 828 | 786527921	12856	12878
 829 | 786527921	12974	13022
 830 | 786527921	13029	13062
 831 | 786527921	13191	13235
 832 | 786527921	13589	13733
 833 | 786527921	13931	13982
 834 | 786527921	14062	14097
 835 | 786527921	14149	14328
 836 | 786527921	14488	14500
 837 | 786527921	14505	14550
 838 | 786527921	14918	14942
 839 | 786527921	15077	15112
 840 | 786527921	16017	16053
 841 | 786527921	16104	16121
 842 | 786527921	16130	16177
 843 | 786527921	16221	16253
 844 | 786527921	16530	16544
 845 | 786527921	16804	16827
 846 | 786527921	16846	16868
 847 | 786527921	16919	16949
 848 | 787142429	1106	1123
 849 | 787142429	1150	1204
 850 | 787142429	2684	2815
 851 | 787529309	16	39
 852 | 787529309	70	87
 853 | 787529309	288	319
 854 | 787529309	323	350
 855 | 787529309	921	960
 856 | 787529309	2338	2356
 857 | 787529309	2405	2430
 858 | 787529309	2694	2708
 859 | 787529309	3158	3173
 860 | 787529309	5159	5195
 861 | 787529309	5394	5414
 862 | 787529309	6056	6347
 863 | 787529309	6678	6784
 864 | 787529309	7626	7677
 865 | 787529309	7814	7853
 866 | 787529309	8200	8310
 867 | 787529309	8337	8365
 868 | 787529309	8567	8585
 869 | 787529309	8686	8710
 870 | 787529309	9173	9282
 871 | 787529309	9303	9384
 872 | 787529309	9661	9697
 873 | 787759779	331	343
 874 | 787759779	495	507
 875 | 787759779	590	623
 876 | 787759779	670	679
 877 | 787759779	681	696
 878 | 787759779	1011	1032
 879 | 788900262	0	91
 880 | 788900262	93	269
 881 | 788900262	1484	1576
 882 | 788900262	1722	1749
 883 | 788900262	1817	1830
 884 | 788900262	1858	1891
 885 | 788900262	2218	2270
 886 | 788900262	2272	2383
 887 | 788900262	2401	2428
 888 | 788900262	2696	2711
 889 | 788900262	4190	4224
 890 | 788900262	4478	4561
 891 | 788900262	4877	5017
 892 | 788900262	5064	5083
 893 | 788900262	5112	5137
 894 | 788900262	5263	5313
 895 | 788900262	6020	6109
 896 | 788900262	6331	6408
 897 | 788900262	6490	6583
 898 | 788900262	6601	6675
 899 | 789370909	53	78
 900 | 789370909	459	491
 901 | 789370909	1894	1919
 902 | 789370909	2009	2030
 903 | 789370909	2148	2171
 904 | 789370909	3212	3231
 905 | 789370909	4056	4071
 906 | 789370909	4416	4439
 907 | 789370909	4736	4756
 908 | 789370909	5762	5843
 909 | 789370909	5939	6056
 910 | 789370909	6861	6881
 911 | 789370909	8063	8083
 912 | 789370909	8517	8597
 913 | 789370909	8727	8772
 914 | 789370909	8829	8884
 915 | 789370909	8985	9015
 916 | 789370909	9134	9210
 917 | 789370909	9255	9271
 918 | 789370909	9306	9433
 919 | 789370909	9840	9870
 920 | 789370909	10444	10485
 921 | 789370909	10867	10940
 922 | 789370909	11058	11070
 923 | 789370909	11625	11705
 924 | 789370909	11722	11746
 925 | 789370909	11837	11843
 926 | 789370909	11848	11871
 927 | 795703371	647	654
 928 | 795703371	813	829
 929 | 795703371	1202	1264
 930 | 795703371	1268	1271
 931 | 795703371	1774	1811
 932 | 795703371	2464	2475
 933 | 795703371	2479	2490
 934 | 795703371	2572	2575
 935 | 795703371	2889	2951
 936 | 795703371	3759	3901
 937 | 795703371	3904	4050
 938 | 795703371	4103	4111
 939 | 795703371	4126	4135
 940 | 795703371	4268	4327
 941 | 795703371	4398	4442
 942 | 999000149	113	127
 943 | 999000149	243	295
 944 | 999000149	300	326
 945 | 999000149	813	830
 946 | 999000149	1303	1374
 947 | 999000149	1602	1689
 948 | 999000149	2952	2969
 949 | 999000149	2974	2987
 950 | 999000149	2989	3077
 951 | 999000149	3119	3140
 952 | 999000159	422	433
 953 | 999000159	499	510
 954 | 999000159	949	966
 955 | 999000159	1287	1303
 956 | 999000159	1701	1735
 957 | 999000159	1905	1993
 958 | 999000565	25	50
 959 | 999000565	127	177
 960 | 999000565	276	296
 961 | 999000565	344	378
 962 | 999000565	419	449
 963 | 999000565	575	607
 964 | 999000894	74	82
 965 | 999000894	148	151
 966 | 999000894	411	436
 967 | 999000894	501	512
 968 | 999000894	1393	1409
 969 | 999000894	3391	3409
 970 | 999000894	4245	4276
 971 | 999000894	4927	4940
 972 | 999000894	5185	5200
 973 | 999000894	5314	5317
 974 | 999001033	38	55
 975 | 999001033	134	151
 976 | 999001033	400	406
 977 | 999001033	687	739
 978 | 999001033	826	841
 979 | 999001033	875	926
 980 | 999001033	941	968
 981 | 999001033	1160	1225
 982 | 999001033	1245	1274
 983 | 999001033	1417	1426
 984 | 999001033	1445	1466
 985 | 999001033	1468	1488
 986 | 999001033	1519	1551
 987 | 999001033	1564	1575
 988 | 999001033	1879	1901
 989 | 999001033	1935	1957
 990 | 999001033	1985	2025
 991 | 999001033	2382	2408
 992 | 999001033	2532	2563
 993 | 999001033	2595	2603
 994 | 999001033	3206	3219
 995 | 999001033	3224	3245
 996 | 999001033	3362	3422
 997 | 999001033	3560	3581
 998 | 999001033	3595	3607
 999 | 999001033	3713	3724
1000 | 999001297	773	789
1001 | 999001297	1496	1515
1002 | 999001297	2633	2670
1003 | 999001297	2676	2715
1004 | 999001297	2883	2895
1005 | 999001621	0	68
1006 | 999001621	88	156
1007 | 999001621	325	338
1008 | 999001621	382	401
1009 | 999001621	467	473
1010 | 999001621	612	621
1011 | 999001621	769	783
1012 | 999001621	1628	1655
1013 | 999001621	3471	3495
1014 | 999001621	3976	4062
1015 | 999001621	4085	4137
1016 | 999001621	4208	4225
1017 | 999001621	4485	4527
1018 | 999001621	5047	5061
1019 | 999001621	5088	5132
1020 | 999001621	5159	5170
1021 | 999001621	5890	5902
1022 | 999001621	7085	7091
1023 | 999001621	7988	7999
1024 | 999001621	8473	8481
1025 | 999001621	8493	8500
1026 | 999001621	8550	8570
1027 | 999001621	8600	8611
1028 | 999001621	8627	8694
1029 | 999001621	8892	8897
1030 | 999001621	9108	9121
1031 | 999001621	9801	9859
1032 | 999001621	10035	10062
1033 | 999001621	10151	10169
1034 | 999001621	10180	10185
1035 | 999001621	10536	10541
1036 | 999001621	10779	10800
1037 | 999001621	10931	11012
1038 | 999001621	11213	11537
1039 | 999001621	12106	12112
1040 | 999001621	12143	12184
1041 | 


--------------------------------------------------------------------------------
/span_identification/ner/run_ner.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | """ Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
 17 | 
 18 | from __future__ import absolute_import, division, print_function
 19 | 
 20 | import argparse
 21 | import glob
 22 | import logging
 23 | import os
 24 | import random
 25 | 
 26 | from unidecode import unidecode
 27 | 
 28 | import pickle
 29 | import numpy as np
 30 | import torch
 31 | from seqeval.metrics import precision_score, recall_score, f1_score
 32 | from sklearn_crfsuite import metrics
 33 | from tensorboardX import SummaryWriter
 34 | from torch.nn import CrossEntropyLoss
 35 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 36 | from torch.utils.data.distributed import DistributedSampler
 37 | from tqdm import tqdm, trange
 38 | from .utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
 39 | 
 40 | from transformers import AdamW, get_linear_schedule_with_warmup
 41 | from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
 42 | from transformers import RobertaConfig, RobertaTokenizer
 43 | from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
 44 | from transformers import XLNetConfig, XLNetForTokenClassification, XLNetTokenizer
 45 | from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
 46 | from scipy.special import softmax
 47 | 
 48 | from .modeling_roberta import RobertaForTokenClassification
 49 | 
 50 | logger = logging.getLogger(__name__)
 51 | 
 52 | ALL_MODELS = sum(
 53 |     (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
 54 |     ())
 55 | 
 56 | MODEL_CLASSES = {
 57 |     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
 58 |     "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
 59 |     "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
 60 |     "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
 61 |     "xlnet": (XLNetConfig, XLNetForTokenClassification, XLNetTokenizer)
 62 | }
 63 | 
 64 | 
 65 | def set_seed(args):
 66 |     random.seed(args.seed)
 67 |     np.random.seed(args.seed)
 68 |     torch.manual_seed(args.seed)
 69 |     if args.n_gpu > 0:
 70 |         torch.cuda.manual_seed_all(args.seed)
 71 | 
 72 | 
 73 | def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
 74 |     """ Train the model """
 75 |     if args.local_rank in [-1, 0]:
 76 |         tb_writer = SummaryWriter()
 77 | 
 78 |     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
 79 |     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
 80 |     train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
 81 | 
 82 |     if args.max_steps > 0:
 83 |         t_total = args.max_steps
 84 |         args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
 85 |     else:
 86 |         t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
 87 | 
 88 |     # Prepare optimizer and schedule (linear warmup and decay)
 89 |     no_decay = ["bias", "LayerNorm.weight"]
 90 |     optimizer_grouped_parameters = [
 91 |         {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
 92 |          "weight_decay": args.weight_decay},
 93 |         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
 94 |     ]
 95 |     optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
 96 |     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)
 97 |     if args.fp16:
 98 |         try:
 99 |             from apex import amp
100 |         except ImportError:
101 |             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
102 |         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
103 | 
104 |     # multi-gpu training (should be after apex fp16 initialization)
105 |     if args.n_gpu > 1:
106 |         model = torch.nn.DataParallel(model)
107 | 
108 |     # Distributed training (should be after apex fp16 initialization)
109 |     if args.local_rank != -1:
110 |         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
111 |                                                           output_device=args.local_rank,
112 |                                                           find_unused_parameters=True)
113 | 
114 |     # Train!
115 |     logger.info("***** Running training *****")
116 |     logger.info("  Num examples = %d", len(train_dataset))
117 |     logger.info("  Num Epochs = %d", args.num_train_epochs)
118 |     logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
119 |     logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
120 |                 args.train_batch_size * args.gradient_accumulation_steps * (
121 |                     torch.distributed.get_world_size() if args.local_rank != -1 else 1))
122 |     logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
123 |     logger.info("  Total optimization steps = %d", t_total)
124 | 
125 |     global_step = 0
126 |     tr_loss, logging_loss = 0.0, 0.0
127 |     model.zero_grad()
128 |     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
129 |     set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
130 |     for _ in train_iterator:
131 |         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0], position=0, leave=True)
132 |         for step, batch in enumerate(epoch_iterator):
133 |             model.train()
134 |             batch = tuple(t.to(args.device) for t in batch)
135 |             inputs = {"input_ids": batch[0],
136 |                       "attention_mask": batch[1],
137 |                       "labels": batch[3]}
138 |             if args.model_type != "distilbert":
139 |                 inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None  # XLM and RoBERTa don"t use segment_ids
140 |             if args.use_quotes:
141 |                 inputs['quotes'] = batch[4]
142 |             outputs = model(**inputs)
143 |             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
144 | 
145 |             if args.n_gpu > 1:
146 |                 loss = loss.mean()  # mean() to average on multi-gpu parallel training
147 |             if args.gradient_accumulation_steps > 1:
148 |                 loss = loss / args.gradient_accumulation_steps
149 | 
150 |             if args.fp16:
151 |                 with amp.scale_loss(loss, optimizer) as scaled_loss:
152 |                     scaled_loss.backward()
153 |             else:
154 |                 loss.backward()
155 | 
156 |             tr_loss += loss.item()
157 |             if (step + 1) % args.gradient_accumulation_steps == 0:
158 |                 if args.fp16:
159 |                     torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
160 |                 else:
161 |                     torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
162 | 
163 |                 scheduler.step()  # Update learning rate schedule
164 |                 optimizer.step()
165 |                 model.zero_grad()
166 |                 global_step += 1
167 | 
168 |                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
169 |                     # Log metrics
170 |                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
171 |                         results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev")
172 |                         for key, value in results.items():
173 |                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
174 |                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
175 |                     tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
176 |                     logging_loss = tr_loss
177 | 
178 |                 if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
179 |                     # Save model checkpoint
180 |                     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
181 |                     if not os.path.exists(output_dir):
182 |                         os.makedirs(output_dir)
183 |                     model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
184 |                     model_to_save.save_pretrained(output_dir)
185 |                     torch.save(args, os.path.join(output_dir, "training_args.bin"))
186 |                     logger.info("Saving model checkpoint to %s", output_dir)
187 | 
188 |             if args.max_steps > 0 and global_step > args.max_steps:
189 |                 epoch_iterator.close()
190 |                 break
191 |         if args.max_steps > 0 and global_step > args.max_steps:
192 |             train_iterator.close()
193 |             break
194 | 
195 |     if args.local_rank in [-1, 0]:
196 |         tb_writer.close()
197 | 
198 |     return global_step, tr_loss / global_step
199 | 
200 | 
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    """Run ``model`` over the ``mode`` split and compute sequence-labelling metrics.

    Args:
        args: Run configuration namespace; ``args.eval_batch_size`` is written here.
        model: Model to evaluate; expected to return ``(loss, logits, ...)``.
        tokenizer: Tokenizer forwarded to ``load_and_cache_examples``.
        labels: Label names; the index of a name is its label id.
        pad_token_label_id: Label id marking positions that must be ignored.
        mode: Name of the split to load (forwarded to ``load_and_cache_examples``).
        prefix: Free-form text added to the log headers.

    Returns:
        ``(results, preds_list)``: ``results`` maps ``loss``, ``precision``, ``recall``, ``f1``
        and ``flat_f1`` to their values; ``preds_list`` holds, per example, the predicted label
        names at the non-ignored positions.

    Raises:
        ValueError: If the split yields no batch at all.
    """
    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate; train() may pass a model that is already wrapped, so never wrap twice
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    # Per-batch arrays are collected and concatenated once instead of re-copying on every batch.
    logit_batches = []
    label_batches = []
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "labels": batch[3]}
            if args.model_type != "distilbert":
                # Only BERT and XLNet receive segment ids; other model types get None.
                inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None
            if args.use_quotes:
                inputs['quotes'] = batch[4]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            if args.n_gpu > 1:
                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        logit_batches.append(logits.detach().cpu().numpy())
        label_batches.append(inputs["labels"].detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("No evaluation batches were produced for mode '{}'.".format(mode))

    eval_loss = eval_loss / nb_eval_steps
    preds = np.concatenate(logit_batches, axis=0)
    out_label_ids = np.concatenate(label_batches, axis=0)
    preds_logits = softmax(preds, axis=2)
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(labels)}

    num_examples, seq_length = out_label_ids.shape[0], out_label_ids.shape[1]
    out_label_list = [[] for _ in range(num_examples)]
    preds_list = [[] for _ in range(num_examples)]

    for i in range(num_examples):
        for j in range(seq_length):
            if out_label_ids[i, j] == pad_token_label_id:
                continue
            out_label_list[i].append(label_map[out_label_ids[i][j]])
            # Softmax probabilities are positive, so the 'O' fallback is only reached when the
            # comparison fails (e.g. NaN logits).
            if np.max(preds_logits[i][j]) > 0:
                preds_list[i].append(label_map[preds[i][j]])
            else:
                preds_list[i].append('O')

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
        "flat_f1": metrics.flat_f1_score(out_label_list, preds_list, average='micro', labels=["B-PROP", "I-PROP"])
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list
279 | 
280 | 
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    """Build the tensor dataset for one data split.

    Args:
        args: Run configuration namespace (data directory, file names, model type,
            ``max_seq_length``, ``use_quotes``, ``local_rank``, ...).
        tokenizer: Tokenizer used to convert examples to features and to locate quote tokens.
        labels: Label names passed to ``convert_examples_to_features``.
        pad_token_label_id: Label id assigned to padded positions.
        mode: ``"train"``, ``"dev"`` or ``"test"``; selects ``args.train_file``,
            ``args.dev_file`` or ``args.test_file``.

    Returns:
        A ``TensorDataset`` of (input ids, input mask, segment ids, label ids), followed by a
        ``quotes`` tensor when ``args.use_quotes`` is set.
    """
    # Synchronise ranks only for the training split. The guard must depend on ``mode``:
    # a truthiness test on the module-level ``evaluate`` function would never fire.
    # NOTE(review): assumes non-training splits are only loaded by a single process, as in the
    # callers visible in this module - confirm before adding distributed evaluation.
    is_training = mode == "train"
    if args.local_rank not in [-1, 0] and is_training:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
        list(filter(None, args.model_name_or_path.split("/"))).pop(),
        str(args.max_seq_length)))
    # Reading the cache is switched off by the leading ``False``; features are always rebuilt.
    if False and os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        files = {'train': args.train_file, 'dev': args.dev_file, 'test': args.test_file}
        examples = read_examples_from_file(os.path.join(args.data_dir, files[mode]), mode)
        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
                                                cls_token_at_end=bool(args.model_type in ["xlnet"]),
                                                # xlnet has a cls token at the end
                                                cls_token=tokenizer.cls_token,
                                                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
                                                sep_token=tokenizer.sep_token,
                                                sep_token_extra=bool(args.model_type in ["roberta"]),
                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                                pad_on_left=bool(args.model_type in ["xlnet"]),
                                                # pad on the left for xlnet
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
                                                pad_token_label_id=pad_token_label_id
                                                )
        if args.use_quotes:
            assert len(features) == len(examples)
            for feature, example in zip(features, examples):
                # A placeholder occupies index 0 so quote positions are shifted by one
                # (presumably to line up with a leading CLS token - TODO confirm for xlnet).
                tokens = ['cls_token']
                for word in example.words:
                    tokens.extend(tokenizer.tokenize(word))
                quotes = np.zeros(args.max_seq_length, dtype=np.float32)
                for j in range(1, min(len(tokens), args.max_seq_length)):
                    if unidecode(tokens[j]) == '"':
                        quotes[j] = 1
                # Stored as a column vector of shape (max_seq_length, 1).
                feature.quotes = quotes[:, None]

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and is_training:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    if args.use_quotes:
        all_quotes = torch.tensor([f.quotes for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_quotes)
    else:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
342 | 
343 | 
344 | def transformers_ner(args):
345 |     if os.path.exists(args.output_dir) and os.listdir(
346 |             args.output_dir) and args.do_train and not args.overwrite_output_dir:
347 |         raise ValueError(
348 |             "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
349 |                 args.output_dir))
350 | 
351 |     # Setup distant debugging if needed
352 |     if args.server_ip and args.server_port:
353 |         # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
354 |         import ptvsd
355 |         print("Waiting for debugger attach")
356 |         ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
357 |         ptvsd.wait_for_attach()
358 | 
359 |     # Setup CUDA, GPU & distributed training
360 |     if args.local_rank == -1 or args.no_cuda:
361 |         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
362 |         args.n_gpu = torch.cuda.device_count()
363 |     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
364 |         torch.cuda.set_device(args.local_rank)
365 |         device = torch.device("cuda", args.local_rank)
366 |         torch.distributed.init_process_group(backend="nccl")
367 |         args.n_gpu = 1
368 |     args.device = device
369 | 
370 |     # Setup logging
371 |     logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
372 |                         datefmt="%m/%d/%Y %H:%M:%S",
373 |                         level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
374 |     logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
375 |                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
376 | 
377 |     # Set seed
378 |     set_seed(args)
379 | 
380 |     # Prepare CONLL-2003 task
381 |     labels = get_labels(args.labels)
382 |     num_labels = len(labels)
383 |     # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
384 |     pad_token_label_id = CrossEntropyLoss().ignore_index
385 | 
386 |     # Load pretrained model and tokenizer
387 |     if args.local_rank not in [-1, 0]:
388 |         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
389 | 
390 |     args.model_type = args.model_type.lower()
391 |     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
392 |     
393 |     config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
394 |                                           num_labels=num_labels,
395 |                                           cache_dir=args.cache_dir if args.cache_dir else None)
396 |     config.use_quotes = args.use_quotes
397 |     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
398 |                                                 do_lower_case=args.do_lower_case,
399 |                                                 cache_dir=args.cache_dir if args.cache_dir else None)
400 |     model = model_class.from_pretrained(args.model_name_or_path,
401 |                                         from_tf=bool(".ckpt" in args.model_name_or_path),
402 |                                         config=config,
403 |                                         cache_dir=args.cache_dir if args.cache_dir else None)
404 | 
405 |     if args.local_rank == 0:
406 |         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
407 | 
408 |     model.to(args.device)
409 | 
410 |     logger.info("Training/evaluation parameters %s", args)
411 | 
412 |     # Training
413 |     if args.do_train:
414 |         train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
415 |         global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
416 |         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
417 | 
418 |     # Saving best practices: if you use the default names for the model, you can reload it using from_pretrained()
419 |     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
420 |         # Create output directory if needed
421 |         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
422 |             os.makedirs(args.output_dir)
423 | 
424 |         logger.info("Saving model checkpoint to %s", args.output_dir)
425 |         # Save a trained model, configuration and tokenizer using `save_pretrained()`.
426 |         # They can then be reloaded using `from_pretrained()`
427 |         model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
428 |         #model_to_save.save_pretrained(args.output_dir)
429 |         model_save_path_ = os.path.join(args.output_dir, "pytorch_model.bin")
430 |         torch.save(model_to_save.state_dict(), model_save_path_)
431 |         tokenizer.save_pretrained(args.output_dir)
432 | 
433 |         # Good practice: save your training arguments together with the trained model
434 |         torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
435 | 
436 |     # Evaluation
437 |     results = {}
438 |     if args.do_eval and args.local_rank in [-1, 0]:
439 |         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
440 |         checkpoints = [args.output_dir]
441 |         if args.eval_all_checkpoints:
442 |             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
443 |             logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
444 |         logger.info("Evaluate the following checkpoints: %s", checkpoints)
445 |         for checkpoint in checkpoints:
446 |             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
447 |             
448 |             model = model_class.from_pretrained(checkpoint)
449 |             model.to(args.device)
450 |             result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
451 |             if global_step:
452 |                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
453 |             results.update(result)
454 |         output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
455 |         with open(output_eval_file, "w") as writer:
456 |             for key in sorted(results.keys()):
457 |                 writer.write("{} = {}\n".format(key, str(results[key])))
458 | 
459 |     if args.do_predict and args.local_rank in [-1, 0]:
460 |         tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
461 |         checkpoints = [args.output_dir]
462 |         if args.eval_all_checkpoints:
463 |             checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
464 |             logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
465 |         logger.info("Evaluate the following checkpoints: %s", checkpoints)
466 |         for checkpoint in checkpoints:
467 |             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
468 |             
469 |             model = model_class.from_pretrained(checkpoint)
470 |             model.to(args.device)
471 |             result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
472 |             if global_step:
473 |                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
474 |             results.update(result)
475 |             # Save results
476 |             output_test_results_file = os.path.join(checkpoint, "test_results.txt")
477 |             with open(output_test_results_file, "w") as writer:
478 |                 for key in sorted(result.keys()):
479 |                     writer.write("{} = {}\n".format(key, str(result[key])))
480 |             # Save predictions
481 |             output_test_predictions_file = os.path.join(checkpoint, "test_predictions.txt")
482 |             with open(output_test_predictions_file, "w") as writer:
483 |                 with open(os.path.join(args.data_dir, args.test_file), "r") as f:
484 |                     example_id = 0
485 |                     for line in f:
486 |                         if line.startswith("-DOCSTART-") or line == "" or line == "\n":
487 |                             writer.write(line)
488 |                             if not predictions[example_id]:
489 |                                 example_id += 1
490 |                         elif predictions[example_id]:
491 |                             output_line = line.split('\t')[0] + "\t" + predictions[example_id].pop(0) + "\n"
492 |                             writer.write(output_line)
493 |                         else:
494 |                             logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
495 | 
496 |     return results
497 | 


--------------------------------------------------------------------------------