├── fasttext_baseline
    ├── processed_data
    │   └── .empty
    ├── requirements.txt
    ├── tests.py
    └── run.py
├── mbert_baseline
    ├── allennlp_xstance
    │   ├── __init__.py
    │   ├── xstance_predictor.py
    │   └── xstance_reader.py
    ├── requirements.txt
    └── mbert.jsonnet
├── example.png
├── data
    └── xstance-data-v1.0.zip
├── .travis.yml
├── LICENSE
├── evaluate.py
├── .gitignore
└── README.md


/fasttext_baseline/processed_data/.empty:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/mbert_baseline/allennlp_xstance/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZurichNLP/xstance/HEAD/example.png


--------------------------------------------------------------------------------
/fasttext_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | jsonlines==1.2.0
2 | fasttext==0.9.2
3 | 


--------------------------------------------------------------------------------
/mbert_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | jsonlines==1.2.0
2 | allennlp==0.9.0
3 | scikit-learn==0.22.2
4 | 


--------------------------------------------------------------------------------
/data/xstance-data-v1.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZurichNLP/xstance/HEAD/data/xstance-data-v1.0.zip


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.6"
 4 |   - "3.7"
 5 |   - "3.8"
 6 | install:
 7 |   - unzip data/xstance-data-v1.0.zip -d data
 8 |   - pip install scikit-learn
 9 |   - pip install -r fasttext_baseline/requirements.txt
10 |   - pip install -r mbert_baseline/requirements.txt
11 |   - cd fasttext_baseline && wget http://www.statmt.org/europarl/v7/tools.tgz && tar -xvf tools.tgz && cd ..
12 | script: python evaluate.py --gold data/test.jsonl --pred predictions/mbert_pred.jsonl
13 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 University of Zurich, Department of Computational Linguistics, Jannis Vamvas
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/mbert_baseline/mbert.jsonnet:
--------------------------------------------------------------------------------
// AllenNLP training configuration for the multilingual BERT (M-BERT) baseline.
// The same pretrained model name is shared by the tokenizer, the token indexer and the model.
local bert_model = "bert-base-multilingual-cased";

{
    "dataset_reader": {
        "lazy": false,
        // Registered in allennlp_xstance/xstance_reader.py
        "type": "xstance_reader",
        "max_sequence_length": 512,
        // Labels are strings ("FAVOR" / "AGAINST"), so they are indexed via the vocabulary
        "skip_label_indexing": false,
        "tokenizer": {
            "type": "pretrained_transformer",
            "do_lowercase": false,
            "model_name": bert_model
        },
        "token_indexers": {
            "bert": {
                "type": "bert-pretrained",
                "do_lowercase": false,
                "pretrained_model": bert_model
            }
        }
    },
    // Relative paths; presumably resolved against the working directory (mbert_baseline) — TODO confirm
    "train_data_path": "../data/train.jsonl",
    "validation_data_path": "../data/valid.jsonl",
    "model": {
        "type": "bert_for_classification",
        "bert_model": bert_model,
        "dropout": 0.1
    },
    "iterator": {
        "type": "basic",
        "batch_size": 16,
    },
    "trainer": {
        "optimizer": {
            "type": "bert_adam",
            "warmup": 0.1,
            // NOTE(review): presumably the total number of optimizer steps
            // (batches per epoch x num_epochs); re-check when changing batch_size, num_epochs or the data
            "t_total": 8580,
            "lr": 0.00002
        },
        "validation_metric": "+accuracy",
        "num_serialized_models_to_keep": 1,
        "num_epochs": 3,
        // NOTE(review): patience is larger than num_epochs, so early stopping presumably never triggers
        "patience": 5,
        // GPU 0; NOTE(review): presumably needs to be changed for CPU-only training — confirm in the AllenNLP docs
        "cuda_device": 0
    }
}
47 | 


--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | from typing import List
 4 | 
 5 | from sklearn.metrics import f1_score
 6 | 
# Language codes for which a separate score is printed within each test partition.
LANGUAGES = [
    "de",
    "fr",
    "it",
]

# Test partitions to evaluate; compared against the "test_set" field of the gold instances.
# NOTE(review): the two commented-out Italian partitions are not evaluated; the reason is not
# visible in this file.
TEST_SETS = [
    "new_comments_defr",
    "new_questions_defr",
    "new_topics_defr",
    "new_comments_it",
    # "new_questions_it",
    # "new_topics_it",
]
21 | 
22 | 
def evaluate_file(gold_file, pred_file):
    """
    Evaluate a predictions file against a gold file and print the scores.

    Both arguments are open, line-iterable text files in JSON lines format
    (one JSON object per line). The files are closed before this function
    returns, even if parsing fails.

    :param gold_file: open file with the gold instances
    :param pred_file: open file with the predictions, line-aligned with ``gold_file``
    """
    try:
        gold_list = [json.loads(line) for line in gold_file]
        pred_list = [json.loads(line) for line in pred_file]
    finally:
        # Close the handles that were passed in instead of relying on a
        # module-level ``args`` object, which only exists when the module is
        # run as a script.
        gold_file.close()
        pred_file.close()
    evaluate_json(gold_list, pred_list)
29 | 
30 | 
def evaluate_json(gold: List, pred: List):
    """
    Print the macro-averaged F1 score (in percent) per test partition and language.

    ``gold`` and ``pred`` are index-aligned lists of dicts. Gold instances are
    selected by their "test_set" and "language" fields; the "label" fields of
    the selected gold instances and of the predictions at the same positions
    are compared. Combinations without any gold instance are skipped.
    """
    for partition in TEST_SETS:
        print(partition)
        for lang in LANGUAGES:
            selected = [
                idx for idx, item in enumerate(gold)
                if item["test_set"] == partition and item["language"] == lang
            ]
            if not selected:
                continue
            y_true = [gold[idx]["label"] for idx in selected]
            y_pred = [pred[idx]["label"] for idx in selected]
            macro_f1 = f1_score(y_true, y_pred, average="macro")
            print(lang.upper(), 100 * macro_f1)
        print()
44 | 
45 | 
if __name__ == "__main__":
    # Command-line entry point: parse the paths of the gold and prediction
    # files (both opened as UTF-8 text) and print the evaluation.
    parser = argparse.ArgumentParser(
        description="Evaluate predictions on the x-stance test sets",
    )
    for file_option in ('--gold', '--pred'):
        parser.add_argument(
            file_option,
            type=argparse.FileType('r', encoding='UTF-8'),
            required=True,
        )
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()
    evaluate_file(args.gold, args.pred)
53 | 


--------------------------------------------------------------------------------
/mbert_baseline/allennlp_xstance/xstance_predictor.py:
--------------------------------------------------------------------------------
 1 | from copy import deepcopy
 2 | from typing import List, Dict
 3 | 
 4 | from overrides import overrides
 5 | import numpy
 6 | 
 7 | from allennlp.common.util import JsonDict
 8 | from allennlp.data import Instance
 9 | from allennlp.predictors.predictor import Predictor
10 | from allennlp.data.fields import LabelField
11 | 
12 | 
@Predictor.register('xstance_predictor')
class XStancePredictor(Predictor):
    """
    Predictor for x-stance instances.

    An input consists of a question and a comment on that question; both are
    passed to the ``text_to_instance`` method of the dataset reader.
    """
    def predict(self, sentence: str, comment: str = "") -> JsonDict:
        """
        Predict the stance for a single question/comment pair.

        :param sentence: the question text (the parameter name is kept for
            backward compatibility)
        :param comment: the comment on the question
        :return: the JSON output of the model
        """
        # ``_json_to_instance`` reads the keys "question" and "comment"; a
        # {"sentence": ...} input would always fail with a KeyError.
        return self.predict_json({"question": sentence, "comment": comment})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"question": "...", "comment": "..."}``
        and converts it into an unlabeled instance using the dataset reader.
        """
        question = json_dict["question"]
        comment = json_dict["comment"]
        return self._dataset_reader.text_to_instance(question, comment)

    @overrides
    def predictions_to_labeled_instances(self,
                                         instance: Instance,
                                         outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        Return a copy of ``instance`` with the predicted label attached.

        The label is the argmax of ``outputs["probs"]`` if present, otherwise
        ``outputs["score"]``.

        :raises ValueError: if ``outputs`` contains neither "probs" nor "score"
        """
        new_instance = deepcopy(instance)
        if "probs" in outputs:
            label = numpy.argmax(outputs['probs'])
        elif "score" in outputs:
            # Test for the key that is actually read below
            label = outputs["score"]
        else:
            raise ValueError("probs or score not found in prediction outputs")
        # The label is an integer index, so vocabulary indexing must be skipped
        # in both cases (LabelField rejects integer labels otherwise).
        # NOTE(review): the field is named 'prediction', not 'label' — confirm
        # that this is the name expected by the consumer of these instances.
        new_instance.add_field('prediction', LabelField(int(label), skip_indexing=True))
        return [new_instance]
47 | 


--------------------------------------------------------------------------------
/fasttext_baseline/tests.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from unittest import TestCase
 3 | 
 4 | import jsonlines
 5 | import numpy as np
 6 | 
 7 | from fasttext_baseline import run as run_baseline
 8 | 
 9 | 
class FastTextBaselineTestCase(TestCase):
    """
    Tests for the fastText baseline.

    The tests read ``../data/valid.jsonl`` and write their artifacts to the
    ``test_output`` directory and to ``test_model.bin``, all relative to the
    current working directory.
    """

    def setUp(self) -> None:
        self.data_dir = "../data"
        self.output_dir = "test_output"
        # The output directory is not part of the repository, so create it on demand
        os.makedirs(self.output_dir, exist_ok=True)
        self.model_path = "test_model.bin"
        input_path = os.path.join(self.data_dir, "valid.jsonl")
        self.valid_path = os.path.join(self.output_dir, "valid.txt")
        run_baseline._jsonl_to_fasttext_format(input_path, self.valid_path)

    def test_jsonl_to_fasttext_format(self):
        # The expected text contains non-ASCII characters, so read explicitly as UTF-8
        with open(self.valid_path, encoding="utf-8") as f:
            first_line = next(f)
        label, *tokens = first_line.split()
        self.assertEqual("__label__FAVOR", label)
        self.assertEqual(
            """\
Sollen Ausländer / -innen , die seit mindestens zehn Jahren \
in der Schweiz leben , das Stimm- und Wahlrecht auf Gemeindeebene \
erhalten ? Ich bin finde das geht zu wenig weit . Alle \
Menschen die hier leben sollen das Recht auf Mitsprache haben .""",
            " ".join(tokens)
        )
        with open(self.valid_path, encoding="utf-8") as f:
            num_lines = sum(1 for line in f if line.strip())
        self.assertEqual(3926, num_lines)

    def test_predictions_to_jsonl(self):
        predictions = (((u'__label__AGAINST',), np.array([0.15613931]),),)
        predictions_path = os.path.join(self.output_dir, "pred.jsonl")
        run_baseline._predictions_to_jsonl(predictions, predictions_path)
        with jsonlines.open(predictions_path) as f:
            first_line = next(iter(f))
        self.assertDictEqual({"label": "AGAINST"}, first_line)

    def test_train(self):
        run_baseline.train(self.model_path, self.valid_path)

    def test_predict(self):
        # Train first so that this test does not depend on test_train having
        # been executed before (unittest runs test methods in alphabetical order).
        run_baseline.train(self.model_path, self.valid_path)
        predictions = run_baseline.predict(self.model_path, self.valid_path)
        self.assertEqual(3926, len(predictions))
54 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | .idea
132 | 


--------------------------------------------------------------------------------
/mbert_baseline/allennlp_xstance/xstance_reader.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from typing import Dict, Union
 3 | import logging
 4 | 
 5 | import jsonlines
 6 | from overrides import overrides
 7 | from allennlp.common.file_utils import cached_path
 8 | from allennlp.data.dataset_readers.dataset_reader import DatasetReader
 9 | from allennlp.data.fields import LabelField, TextField, Field
10 | from allennlp.data.instance import Instance
11 | from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
12 | from allennlp.data.tokenizers import Tokenizer, WordTokenizer
13 | 
14 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
15 | 
16 | 
@DatasetReader.register("xstance_reader")
class XStanceReader(DatasetReader):
    """
    Dataset reader for x-stance JSON lines files.

    Every line is a JSON object with the keys "question" and "comment" and an
    optional "label". Question and comment are tokenized separately and
    concatenated into a single ``tokens`` text field.

    :param token_indexers: indexers for the text field
        (default: a ``SingleIdTokenIndexer`` under the name "tokens")
    :param tokenizer: tokenizer for questions and comments (default: ``WordTokenizer``)
    :param max_sequence_length: if set, the question is shortened so that the
        comment still fits into this length
    :param skip_label_indexing: if true, labels must be integers and are not
        indexed via the vocabulary
    :param ignore_questions: if true, the question is left out of the instance
    :param ignore_comments: if true, the comment is left out of the instance
    :param lazy: passed on to ``DatasetReader``
    """
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 skip_label_indexing: bool = False,
                 ignore_questions: bool = False,
                 ignore_comments: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self.ignore_questions = ignore_questions
        self.ignore_comments = ignore_comments
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        """
        Yield one instance per line of the JSON lines file at ``file_path``.

        :raises ValueError: if ``skip_label_indexing`` is set and a label
            cannot be converted to an integer
        """
        with jsonlines.open(cached_path(file_path), "r") as f:
            for i, answer in enumerate(f):
                question = answer["question"]
                comment = answer["comment"]
                label = answer.get("label", None)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError as err:
                            # Keep the original error as the cause for easier debugging
                            raise ValueError('Labels must be integers if skip_label_indexing is True.') from err
                    else:
                        label = str(label)
                instance = self.text_to_instance(question=question, comment=comment, label=label)
                # Log the first few instances for inspection
                if i < 4:
                    logger.debug(instance)
                if instance is not None:
                    yield instance

    def _truncate(self, n, tokens):
        """
        Return at most the first ``n`` tokens.

        A negative ``n`` (no room left at all) yields an empty list instead of
        being interpreted as a negative slice index, which would only cut
        ``|n|`` tokens off the end.
        """
        n = max(n, 0)
        if len(tokens) > n:
            tokens = tokens[:n]
        return tokens

    @overrides
    def text_to_instance(self, question: str, comment: str, label: Union[str, int] = None) -> Instance:
        """
        Build an instance from a question, a comment and an optional label.

        The question tokens come first, followed by the comment tokens without
        their first token. If ``max_sequence_length`` is set, only the question
        is shortened; the comment is always kept in full.
        """
        fields: Dict[str, Field] = {}
        if self.ignore_questions:
            question_tokens = []
        else:
            question_tokens = self._tokenizer.tokenize(question)
        if self.ignore_comments:
            comment_tokens = []
        else:
            comment_tokens = self._tokenizer.tokenize(comment)
            # NOTE(review): this assumes that the tokenizer prepends a special
            # token; with a tokenizer that does not, the first real token is lost.
            comment_tokens = comment_tokens[1:]  # Do not need [CLS] in second segment
        if self._max_sequence_length is not None:
            # Two positions are kept free in addition to the comment
            # (presumably for special tokens — TODO confirm)
            question_budget = self._max_sequence_length - len(comment_tokens) - 2
            question_tokens = self._truncate(question_budget, question_tokens)
        tokens = question_tokens + comment_tokens
        fields['tokens'] = TextField(tokens, self._token_indexers, )
        if label is not None:
            fields['label'] = LabelField(label,
                                         skip_indexing=self._skip_label_indexing)
        return Instance(fields)
84 | 


--------------------------------------------------------------------------------
/fasttext_baseline/run.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import subprocess
  4 | from typing import Tuple
  5 | 
  6 | import jsonlines
  7 | import fasttext
  8 | from numpy.core.multiarray import ndarray
  9 | 
 10 | 
 11 | def _jsonl_to_fasttext_format(input_path: str, output_path: str) -> None:
 12 |     # Basic preprocessing and split into language files
 13 |     language_order = []
 14 |     with jsonlines.open(input_path) as f_in, \
 15 |             open(output_path + ".de", "w") as f_out_de, \
 16 |             open(output_path + ".fr", "w") as f_out_fr, \
 17 |             open(output_path + ".it", "w") as f_out_it:
 18 |         for line in f_in:
 19 |             # Concatenate question and comment
 20 |             text = " ".join([line["question"], line["comment"]])
 21 |             text = text.replace("\n", " ")
 22 |             language = line["language"]
 23 |             language_order.append(language)
 24 |             if language == "de":
 25 |                 output_file = f_out_de
 26 |             elif language == "fr":
 27 |                 output_file = f_out_fr
 28 |             elif language == "it":
 29 |                 output_file = f_out_it
 30 |             else:
 31 |                 raise NotImplementedError()
 32 |             output_file.write("__label__{} {}\n".format(line["label"], text))
 33 |     # Language-specific tokenization
 34 |     for language in ["de", "fr", "it"]:
 35 |         with open(output_path + ".{}".format(language)) as f_in, \
 36 |                 open(output_path + ".tokenized.{}".format(language), "w") as f_out:
 37 |             subprocess.call([
 38 |                     "./tools/tokenizer.perl",
 39 |                     "-l", language,
 40 |                     "-q",
 41 |                 ], stdin=f_in, stdout=f_out,
 42 |             )
 43 |     # Merge language files
 44 |     with open(output_path, "w") as f_out, \
 45 |             open(output_path + ".tokenized.de") as f_in_de, \
 46 |             open(output_path + ".tokenized.fr") as f_in_fr, \
 47 |             open(output_path + ".tokenized.it") as f_in_it:
 48 |         for language in language_order:
 49 |             if language == "de":
 50 |                 output_file = f_in_de
 51 |             elif language == "fr":
 52 |                 output_file = f_in_fr
 53 |             else:
 54 |                 output_file = f_in_it
 55 |             line = next(output_file)
 56 |             line = line.replace("_ _ label _ _ ", "__label__")
 57 |             f_out.write(line)
 58 | 
 59 | 
 60 | def _predictions_to_jsonl(predictions: Tuple[Tuple[Tuple[str], ndarray]], output_path: str) -> None:
 61 |     with jsonlines.open(output_path, "w") as f:
 62 |         for labels, _ in predictions:
 63 |             label = labels[0].replace("__label__", "")
 64 |             f.write({"label": label})
 65 | 
 66 | 
 67 | def train(model_path: str, train_dataset_path: str, pretrained_vectors: str = "", lr: float = 0.1,
 68 |           epochs: int = 5) -> fasttext.FastText:
 69 |     model = fasttext.train_supervised(
 70 |         input=train_dataset_path,
 71 |         pretrained_vectors=pretrained_vectors,
 72 |         dim=300,
 73 |         lr=lr,
 74 |         epoch=epochs,
 75 |         wordNgrams=3,
 76 |     )
 77 |     model.save_model(model_path)
 78 |     return model
 79 | 
 80 | 
 81 | def predict(model_path: str, dataset_path) -> Tuple[Tuple[Tuple[str], ndarray]]:
 82 |     model = fasttext.load_model(model_path)
 83 |     predictions = []
 84 |     with open(dataset_path) as f:
 85 |         for line in f:
 86 |             if not line.strip():
 87 |                 continue
 88 |             _, *tokens = line.split()
 89 |             text = " ".join(tokens)
 90 |             prediction: Tuple[Tuple[str], ndarray] = model.predict(text)
 91 |             predictions.append(prediction)
 92 |     return tuple(predictions)
 93 | 
 94 | 
 95 | def main():
 96 |     parser = argparse.ArgumentParser(description="Train a fastText baseline for X-Stance")
 97 |     parser.add_argument('--data-dir', type=str, required=True)
 98 |     parser.add_argument('--pred', type=str, required=True)
 99 |     parser.add_argument('--pretrained-vectors', type=str, default="")
100 |     parser.add_argument('--lr', type=float, default=0.1)
101 |     parser.add_argument('--epochs', type=int, default=5)
102 |     args = parser.parse_args()
103 | 
104 |     FASTTEXT_DATA_DIR = "processed_data"
105 |     for dataset_path in [
106 |         "train.jsonl",
107 |         "valid.jsonl",
108 |         "test.jsonl",
109 |     ]:
110 |         _jsonl_to_fasttext_format(
111 |             input_path=os.path.join(args.data_dir, dataset_path),
112 |             output_path=os.path.join(FASTTEXT_DATA_DIR, dataset_path.replace(".jsonl", ".txt"))
113 |         )
114 | 
115 |     model_path = "model.bin"
116 |     model = train(
117 |         model_path,
118 |         train_dataset_path=os.path.join(FASTTEXT_DATA_DIR, "train.txt"),
119 |         pretrained_vectors=args.pretrained_vectors,
120 |         lr=args.lr,
121 |         epochs=args.epochs,
122 |     )
123 |     _, valid_precision, valid_recall = model.test(os.path.join(FASTTEXT_DATA_DIR, "valid.txt"))
124 |     print("Validation precision: ", valid_precision)
125 |     print("Validation recall: ", valid_recall)
126 |     valid_f1 = 2 * valid_precision * valid_recall / (valid_precision + valid_recall)
127 |     print("Validation F1: ", valid_f1)
128 | 
129 |     predictions = predict(model_path, os.path.join(FASTTEXT_DATA_DIR, "test.txt"))
130 |     _predictions_to_jsonl(predictions, args.pred)
131 |     print("Saved test predictions in", args.pred)
132 | 
133 | 
134 | if __name__ == "__main__":
135 |     main()
136 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Live Viewer](https://img.shields.io/badge/Browse-nlp%2Fviewer-brightgreen)](https://huggingface.co/nlp/viewer/?dataset=x_stance)
  2 | [![Build Status](https://travis-ci.org/ZurichNLP/xstance.svg?branch=master)](https://travis-ci.org/ZurichNLP/xstance)
  3 | 
  4 | # <span style="font-variant:small-caps;">x</span>-stance
  5 | 
  6 | Data and code accompanying the paper ["X-Stance: A Multilingual Multi-Target Dataset for Stance Detection"](http://ceur-ws.org/Vol-2624/paper9.pdf).
  7 | 
  8 | A high-level description can be found in the [**blog post**](https://vamvas.ch/more-general-stance-detection-with-x-stance), and a more detailed description in the [**paper**](http://ceur-ws.org/Vol-2624/paper9.pdf).
  9 | 
 10 | 
 11 | ## Summary
 12 | 
 13 | The <span style="font-variant:small-caps;">x</span>-stance dataset contains more than **150 political questions**, and **67k comments** written by candidates on those questions.
 14 | 
 15 | It can be used to train and evaluate stance detection systems.
 16 | 
 17 | The comments are partly **German**, partly **French** and **Italian**. The questions are available in all the three languages plus **English**.
 18 | 
 19 | The data have been extracted from the Swiss voting advice platform [Smartvote](https://smartvote.ch/).
 20 | 
 21 | Data example:
 22 | 
 23 | <img alt="Data Example" src="example.png" width="700">
 24 | 
 25 | ## Structure
 26 | 
 27 | The dataset contains the following files:
 28 | - *train.jsonl*
 29 | - *valid.jsonl*
 30 | - *test.jsonl*
 31 | - *questions.{en,de,fr,it}.jsonl*
 32 | 
 33 | Example for a train, valid or test instance:
 34 | 
 35 | ```json
 36 | {
 37 |    "id": 20475,
 38 |    "language": "de",
 39 |    "question_id": 3469,
 40 |    "question": "Soll der Bundesrat ein Freihandelsabkommen mit den USA anstreben?",
 41 |    "comment": "Nicht unter einem Präsidenten, welcher die Rechte anderer mit Füssen tritt und Respektlos gegenüber ändern ist.",
 42 |    "label": "AGAINST",
 43 |    "numerical_label": 0,
 44 |    "author": "8aa829c3b86f",
 45 |    "topic": "Foreign Policy",
 46 |    "test_set": "new_comments_defr"
 47 | }
 48 | ```
 49 | 
 50 | Details:
 51 | - Languages: The files *train.jsonl* and *valid.jsonl* contain about 75% German data and 25% French data. The file *test.jsonl* also contains some Italian samples to test zero-shot cross-lingual transfer.
 52 | - `"label"` can be `"FAVOR"` or `"AGAINST"`.
 53 | - `"numerical_label"` provides a more fine-grained label (not used in our baseline). Range of values: {0, 25, 75, 100}, where 0 means "no" and 100 means "yes".
 54 | - `"test_set"`: Only *test.jsonl* has this field. Specifies the test partition (new comments / new questions / new topics; German+French / Italian). For details on the test partitions please refer to Table 2 in the [paper](http://ceur-ws.org/Vol-2624/paper9.pdf).
 55 | 
 56 | In the train, valid and test files, the comments are paired with a version of the question in the same language (e.g. German comment + German version of the question). The *questions.xx.jsonl* files provide complete translations of all the questions.
 57 | 
 58 | ## Evaluation
 59 | 
 60 | Dependencies: Python 3; `scikit-learn`
 61 | 
 62 | Unpacking the data:
 63 | ```bash
 64 | unzip data/xstance-data-v1.0.zip -d data
 65 | ```
 66 | 
 67 | Usage:
 68 | ```bash
 69 | python evaluate.py \
 70 |   --gold data/test.jsonl \
 71 |   --pred predictions/mbert_pred.jsonl 
 72 | ```
 73 | 
 74 | The predictions file should be a JSON lines file (http://jsonlines.org/). The lines in the file should correspond to the lines in the gold file (*test.jsonl*).
 75 | 
 76 | Example prediction:
 77 | ```json
 78 | {"label": "AGAINST"}
 79 | ```
 80 | 
 81 | The evaluation script outputs the macro-average of the F1 score for each label, per test partition and per language:
 82 | 
 83 | ```
 84 | new_comments_defr
 85 | DE 76.83541377429334
 86 | FR 76.61281705054353
 87 | 
 88 | new_questions_defr
 89 | DE 68.46881591336131
 90 | FR 68.3831150794995
 91 | 
 92 | new_topics_defr
 93 | DE 68.90323152487849
 94 | FR 70.8982523359103
 95 | 
 96 | new_comments_it
 97 | IT 70.19234360410832
 98 | ```
 99 | 
100 | ## fastText Baseline
101 | Dependencies:
102 | - Python >= 3.6
103 | - Perl
104 | - `pip install -r fasttext_baseline/requirements.txt`
105 | 
106 | Unpacking the data (if not done in the previous section):
107 | ```bash
108 | unzip data/xstance-data-v1.0.zip -d data
109 | ```
110 | 
111 | Downloading the Europarl preprocessing tools:
112 | ```bash
113 | cd fasttext_baseline
114 | wget http://www.statmt.org/europarl/v7/tools.tgz
115 | tar -xvf tools.tgz
116 | ```
117 | 
118 | Training and predicting:
119 | ```bash
120 | python run.py --data-dir ../data --pred ../predictions/mypred.jsonl
121 | ```
122 | 
123 | Evaluating:
124 | ```bash
125 | cd ..
126 | python evaluate.py \
127 |   --gold data/test.jsonl \
128 |   --pred predictions/mypred.jsonl 
129 | ```
130 | 
131 | 
132 | ## M-BERT Baseline
133 | Dependencies:
134 | - Python >= 3.6
135 | - AllenNLP 0.9.0 (http://docs.allennlp.org/master/)
136 | - `pip install -r mbert_baseline/requirements.txt`
137 | - The commands below assume GPU computation. They can be adapted for CPU, however.
138 | 
139 | Unpacking the data (if not done in the previous section):
140 | ```bash
141 | unzip data/xstance-data-v1.0.zip -d data
142 | ```
143 | 
144 | Training:
145 | ```bash
146 | cd mbert_baseline
147 | allennlp train mbert.jsonnet \
148 |     --include-package allennlp_xstance \
149 |     -s mymodel
150 | ```
151 | 
152 | Predicting:
153 | ```bash
154 | cd mbert_baseline
155 | allennlp predict mymodel ../data/test.jsonl \
156 |     --include-package allennlp_xstance \
157 |     --predictor xstance_predictor \
158 |     --cuda-device 0 \
159 |     --output-file ../predictions/mypred.jsonl
160 | ```
161 | 
162 | Evaluating:
163 | ```bash
164 | cd ..
165 | python evaluate.py \
166 |   --gold data/test.jsonl \
167 |   --pred predictions/mypred.jsonl 
168 | ```
169 | 
170 | 
171 | ## Licenses
172 | - Dataset: CC BY-NC 4.0 (© www.smartvote.ch)
173 | - Rest of repository: MIT License
174 | 
175 | ## References
176 | The dataset and baseline model are described in:
177 | 
178 | ```bibtex
179 | @inproceedings{vamvas2020xstance,
180 |     author    = "Vamvas, Jannis and Sennrich, Rico",
181 |     title     = "{X-Stance}: A Multilingual Multi-Target Dataset for Stance Detection",
182 |     booktitle = "Proceedings of the 5th Swiss Text Analytics Conference (SwissText) \& 16th Conference on Natural Language Processing (KONVENS)",
183 |     address   = "Zurich, Switzerland",
184 |     year      = "2020",
185 |     month     = "jun",
186 |     url       = "http://ceur-ws.org/Vol-2624/paper9.pdf"
187 | }
188 | ```
189 | 
190 | ## Metadata
191 | The metadata are used by search engines such as Google Dataset Search.
192 | <div itemscope itemtype="http://schema.org/Dataset">
193 |     <table>
194 |         <tr>
195 |             <th>property</th>
196 |             <th>value</th>
197 |         </tr>
198 |         <tr>
199 |             <td>name</td>
200 |             <td><code itemprop="name"><span style="font-variant:small-caps;">x</span>-Stance: A Multilingual
201 |                 Multi-Target Dataset for Stance Detection</code></td>
202 |         </tr>
203 |         <tr>
204 |             <td>description</td>
205 |             <td><code itemprop="description">The <span style="font-variant:small-caps;">x</span>-stance dataset contains
206 |                 more than 150 political questions, and 67k comments written by candidates on those questions. It can be
207 |                 used to train and evaluate stance detection systems. The comments are partly German, partly French and
208 |                 Italian. The questions are available in all the three languages plus English. The data have been
209 |                 extracted from the Swiss voting advice platform Smartvote.ch.</code></td>
210 |         </tr>
211 |         <tr>
212 |             <td>url</td>
213 |             <td><code itemprop="url">https://github.com/ZurichNLP/xstance</code></td>
214 |         </tr>
215 |         <tr>
216 |             <td>sameAs</td>
217 |             <td><code itemprop="sameAs">https://doi.org/10.5281/zenodo.3831317</code></td>
218 |         </tr>
219 |         <tr>
220 |             <td>license</td>
221 |             <td>
222 |                 <div itemscope itemtype="http://schema.org/CreativeWork" itemprop="license">
223 |                     <table>
224 |                         <tr>
225 |                             <th>property</th>
226 |                             <th>value</th>
227 |                         </tr>
228 |                         <tr>
229 |                             <td>name</td>
230 |                             <td><code itemprop="name">CC BY-NC 4.0</code></td>
231 |                         </tr>
232 |                         <tr>
233 |                             <td>url</td>
234 |                             <td><code itemprop="url">https://creativecommons.org/licenses/by-nc/4.0/</code></td>
235 |                         </tr>
236 |                     </table>
237 |                 </div>
238 |             </td>
239 |         </tr>
240 |         <tr>
241 |             <td>distribution</td>
242 |             <td>
243 |                 <div itemscope itemtype="http://schema.org/DataDownload" itemprop="distribution">
244 |                     <table>
245 |                         <tr>
246 |                             <th>property</th>
247 |                             <th>value</th>
248 |                         </tr>
249 |                         <tr>
250 |                             <td>contentUrl</td>
251 |                             <td><code itemprop="contentUrl">https://raw.githubusercontent.com/ZurichNLP/xstance/master/data/xstance-data-v1.0.zip</code>
252 |                             </td>
253 |                         </tr>
254 |                         <tr>
255 |                             <td>encodingFormat</td>
256 |                             <td><code itemprop="encodingFormat">application/zip</code>
257 |                             </td>
258 |                         </tr>
259 |                     </table>
260 |                 </div>
261 |             </td>
262 |         </tr>
263 |         <tr>
264 |             <td>citation</td>
265 |             <td><code itemprop="citation">http://doi.org/10.5281/zenodo.3831317</code></td>
266 |         </tr>
267 |     </table>
268 | </div>
269 | 


--------------------------------------------------------------------------------