├── fasttext_baseline
├── processed_data
│ └── .empty
├── requirements.txt
├── tests.py
└── run.py
├── mbert_baseline
├── allennlp_xstance
│ ├── __init__.py
│ ├── xstance_predictor.py
│ └── xstance_reader.py
├── requirements.txt
└── mbert.jsonnet
├── example.png
├── data
└── xstance-data-v1.0.zip
├── .travis.yml
├── LICENSE
├── evaluate.py
├── .gitignore
└── README.md
/fasttext_baseline/processed_data/.empty:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbert_baseline/allennlp_xstance/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZurichNLP/xstance/HEAD/example.png
--------------------------------------------------------------------------------
/fasttext_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | jsonlines==1.2.0
2 | fasttext==0.9.2
3 |
--------------------------------------------------------------------------------
/mbert_baseline/requirements.txt:
--------------------------------------------------------------------------------
1 | jsonlines==1.2.0
2 | allennlp==0.9.0
3 | scikit-learn==0.22.2
4 |
--------------------------------------------------------------------------------
/data/xstance-data-v1.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZurichNLP/xstance/HEAD/data/xstance-data-v1.0.zip
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | - "3.7"
5 | - "3.8"
6 | install:
7 | - unzip data/xstance-data-v1.0.zip -d data
8 | - pip install scikit-learn
9 | - pip install -r fasttext_baseline/requirements.txt
10 | - pip install -r mbert_baseline/requirements.txt
11 | - cd fasttext_baseline && wget http://www.statmt.org/europarl/v7/tools.tgz && tar -xvf tools.tgz && cd ..
12 | script: python evaluate.py --gold data/test.jsonl --pred predictions/mbert_pred.jsonl
13 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 University of Zurich, Department of Computational Linguistics, Jannis Vamvas
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/mbert_baseline/mbert.jsonnet:
--------------------------------------------------------------------------------
1 | local bert_model = "bert-base-multilingual-cased";
2 |
3 | {
4 | "dataset_reader": {
5 | "lazy": false,
6 | "type": "xstance_reader",
7 | "max_sequence_length": 512,
8 | "skip_label_indexing": false,
9 | "tokenizer": {
10 | "type": "pretrained_transformer",
11 | "do_lowercase": false,
12 | "model_name": bert_model
13 | },
14 | "token_indexers": {
15 | "bert": {
16 | "type": "bert-pretrained",
17 | "do_lowercase": false,
18 | "pretrained_model": bert_model
19 | }
20 | }
21 | },
22 | "train_data_path": "../data/train.jsonl",
23 | "validation_data_path": "../data/valid.jsonl",
24 | "model": {
25 | "type": "bert_for_classification",
26 | "bert_model": bert_model,
27 | "dropout": 0.1
28 | },
29 | "iterator": {
30 | "type": "basic",
31 | "batch_size": 16,
32 | },
33 | "trainer": {
34 | "optimizer": {
35 | "type": "bert_adam",
36 | "warmup": 0.1,
37 | "t_total": 8580,
38 | "lr": 0.00002
39 | },
40 | "validation_metric": "+accuracy",
41 | "num_serialized_models_to_keep": 1,
42 | "num_epochs": 3,
43 | "patience": 5,
44 | "cuda_device": 0
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | from typing import List
4 |
5 | from sklearn.metrics import f1_score
6 |
7 | LANGUAGES = [
8 | "de",
9 | "fr",
10 | "it",
11 | ]
12 |
13 | TEST_SETS = [
14 | "new_comments_defr",
15 | "new_questions_defr",
16 | "new_topics_defr",
17 | "new_comments_it",
18 | # "new_questions_it",
19 | # "new_topics_it",
20 | ]
21 |
22 |
def evaluate_file(gold_file, pred_file):
    """Score a JSON lines prediction file against a JSON lines gold file.

    Both arguments are open text file objects holding one JSON object per
    line. They are read completely, closed, and the parsed records are then
    handed to ``evaluate_json``, which prints the scores.
    """
    # Close the handles that were actually passed in (also when parsing fails).
    # Closing the module-level ``args`` here would only work when this file is
    # run as a script, not when the function is imported and called directly.
    with gold_file, pred_file:
        gold_list = [json.loads(line) for line in gold_file]
        pred_list = [json.loads(line) for line in pred_file]
    evaluate_json(gold_list, pred_list)
29 |
30 |
def evaluate_json(gold: List, pred: List):
    """Print the macro-averaged F1 score for each test partition and language.

    ``gold`` and ``pred`` are parallel lists of records: the prediction for
    ``gold[i]`` is ``pred[i]``. Gold records are grouped by their
    ``"test_set"`` and ``"language"`` values; combinations without any gold
    record are not printed.
    """
    for partition in TEST_SETS:
        print(partition)
        for lang in LANGUAGES:
            # Positions of the gold records belonging to this partition/language.
            selected = []
            for position, record in enumerate(gold):
                if record["test_set"] == partition and record["language"] == lang:
                    selected.append(position)
            if not selected:
                continue
            expected = [gold[position]["label"] for position in selected]
            predicted = [pred[position]["label"] for position in selected]
            macro_f1 = f1_score(expected, predicted, average="macro")
            print(lang.upper(), 100 * macro_f1)
        print()
44 |
45 |
if __name__ == "__main__":
    # Command-line entry point: both inputs are opened as UTF-8 text files by
    # argparse and passed on as open file objects.
    cli = argparse.ArgumentParser(description="Evaluate predictions on the x-stance test sets")
    utf8_reader = argparse.FileType('r', encoding='UTF-8')
    cli.add_argument('--gold', type=utf8_reader, required=True)
    cli.add_argument('--pred', type=utf8_reader, required=True)
    cli.add_argument("-v", "--verbose", action="store_true")
    args = cli.parse_args()
    evaluate_file(args.gold, args.pred)
53 |
--------------------------------------------------------------------------------
/mbert_baseline/allennlp_xstance/xstance_predictor.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import List, Dict
3 |
4 | from overrides import overrides
5 | import numpy
6 |
7 | from allennlp.common.util import JsonDict
8 | from allennlp.data import Instance
9 | from allennlp.predictors.predictor import Predictor
10 | from allennlp.data.fields import LabelField
11 |
12 |
@Predictor.register('xstance_predictor')
class XStancePredictor(Predictor):
    """
    Predictor for x-stance inputs.

    Each JSON input must provide a ``"question"`` and a ``"comment"`` string;
    both are passed to the dataset reader's ``text_to_instance``.
    """
    def predict(self, sentence: str) -> JsonDict:
        # NOTE(review): this sends ``{"sentence": ...}``, but
        # ``_json_to_instance`` below reads ``"question"`` and ``"comment"``,
        # so this helper looks unusable as written. The signature is left
        # untouched here; calling ``predict_json`` with a question/comment
        # dict is presumably the intended entry point -- TODO confirm.
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"question": "...", "comment": "..."}``
        and builds an unlabeled instance from it with the dataset reader.
        """
        question = json_dict["question"]
        comment = json_dict["comment"]
        return self._dataset_reader.text_to_instance(question, comment)

    @overrides
    def predictions_to_labeled_instances(self,
                                         instance: Instance,
                                         outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        Return a copy of ``instance`` with a ``'prediction'`` label field added.

        The label is the argmax of ``outputs['probs']`` if present, otherwise
        ``outputs['score']``. Raises ``ValueError`` if neither key is present.
        """
        new_instance = deepcopy(instance)
        if "probs" in outputs:
            label = numpy.argmax(outputs['probs'])
        elif "score" in outputs:
            # Test the key that is actually read, matching the error message below.
            label = outputs["score"]
        else:
            raise ValueError("probs or score not found in prediction outputs")
        # The label is already an integer id, so indexing must be skipped in
        # both cases (an int label without skip_indexing is rejected by LabelField).
        new_instance.add_field('prediction', LabelField(int(label), skip_indexing=True))
        return [new_instance]
47 |
--------------------------------------------------------------------------------
/fasttext_baseline/tests.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 |
4 | import jsonlines
5 | import numpy as np
6 |
7 | from fasttext_baseline import run as run_baseline
8 |
9 |
class FastTextBaselineTestCase(TestCase):
    """Tests for the helper functions of the fastText baseline (``run.py``).

    The paths are relative, so the tests expect to be run from a directory
    where ``../data/valid.jsonl`` exists.
    """

    def setUp(self) -> None:
        self.data_dir = "../data"
        self.model_path = "test_model.bin"
        input_path = os.path.join(self.data_dir, "valid.jsonl")
        self.valid_path = os.path.join("test_output", "valid.txt")
        # The output directory is not created by the code under test.
        os.makedirs("test_output", exist_ok=True)
        run_baseline._jsonl_to_fasttext_format(input_path, self.valid_path)

    def test_jsonl_to_fasttext_format(self):
        with open(self.valid_path) as f:
            first_line = next(f)
        label = first_line.split()[0]
        self.assertEqual("__label__FAVOR", label)
        expected_text = (
            "Sollen Ausländer / -innen , die seit mindestens zehn Jahren "
            "in der Schweiz leben , das Stimm- und Wahlrecht auf Gemeindeebene "
            "erhalten ? Ich bin finde das geht zu wenig weit . Alle "
            "Menschen die hier leben sollen das Recht auf Mitsprache haben ."
        )
        self.assertEqual(expected_text, first_line.replace(label, "").strip())
        # Count the non-empty lines of the converted file.
        with open(self.valid_path) as f:
            num_lines = sum(1 for line in f if line.strip())
        self.assertEqual(3926, num_lines)

    def test_predictions_to_jsonl(self):
        predictions = (((u'__label__AGAINST',), np.array([0.15613931]),),)
        predictions_path = os.path.join("test_output", "pred.jsonl")
        run_baseline._predictions_to_jsonl(predictions, predictions_path)
        with jsonlines.open(predictions_path) as f:
            first_line = next(iter(f))
        self.assertDictEqual({"label": "AGAINST"}, first_line)

    def test_train(self):
        run_baseline.train(self.model_path, self.valid_path)

    def test_predict(self):
        # Train here so this test does not depend on test_train having run
        # first: unittest orders test methods by name, and "test_predict"
        # sorts before "test_train".
        run_baseline.train(self.model_path, self.valid_path)
        predictions = run_baseline.predict(self.model_path, self.valid_path)
        self.assertEqual(3926, len(predictions))
54 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | .idea
132 |
--------------------------------------------------------------------------------
/mbert_baseline/allennlp_xstance/xstance_reader.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from typing import Dict, Union
3 | import logging
4 |
5 | import jsonlines
6 | from overrides import overrides
7 | from allennlp.common.file_utils import cached_path
8 | from allennlp.data.dataset_readers.dataset_reader import DatasetReader
9 | from allennlp.data.fields import LabelField, TextField, Field
10 | from allennlp.data.instance import Instance
11 | from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
12 | from allennlp.data.tokenizers import Tokenizer, WordTokenizer
13 |
14 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name
15 |
16 |
@DatasetReader.register("xstance_reader")
class XStanceReader(DatasetReader):
    """
    Dataset reader for x-stance JSON lines files.

    Every line is read as a record with a ``"question"``, a ``"comment"`` and
    an optional ``"label"``. Question and comment are tokenized separately and
    concatenated into a single ``tokens`` text field.
    """
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 skip_label_indexing: bool = False,
                 ignore_questions: bool = False,
                 ignore_comments: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self.ignore_questions = ignore_questions
        self.ignore_comments = ignore_comments
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        """Yield one instance per record of the JSON lines file at ``file_path``."""
        with jsonlines.open(cached_path(file_path), "r") as f:
            for i, answer in enumerate(f):
                question = answer["question"]
                comment = answer["comment"]
                label = answer.get("label", None)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError('Labels must be integers if skip_label_indexing is True.')
                    else:
                        label = str(label)
                instance = self.text_to_instance(question=question, comment=comment, label=label)
                # Log the first few instances for debugging.
                if i < 4:
                    logger.debug(instance)
                if instance is not None:
                    yield instance

    def _truncate(self, n, tokens):
        """
        Return at most the first ``n`` tokens; a negative ``n`` is treated as 0.
        """
        # Clamp so that a negative budget cannot turn into a slice from the end.
        n = max(n, 0)
        if len(tokens) > n:
            tokens = tokens[:n]
        return tokens

    @overrides
    def text_to_instance(self, question: str, comment: str, label: Union[str, int] = None) -> Instance:
        """
        Build an instance from a question/comment pair and an optional label.

        If ``max_sequence_length`` is set, the comment is kept as far as the
        budget allows and the question is truncated to the remaining space.
        """
        fields: Dict[str, Field] = {}
        if self.ignore_questions:
            question_tokens = []
        else:
            question_tokens = self._tokenizer.tokenize(question)
        if self.ignore_comments:
            comment_tokens = []
        else:
            comment_tokens = self._tokenizer.tokenize(comment)
            comment_tokens = comment_tokens[1:]  # Do not need [CLS] in second segment
        if self._max_sequence_length is not None:
            # Two positions are held back (presumably for special tokens added
            # later -- TODO confirm).
            budget = self._max_sequence_length - 2
            # A comment longer than the whole budget is cut as well; otherwise
            # the remaining question budget would become negative and the
            # combined sequence would still exceed the limit.
            comment_tokens = self._truncate(budget, comment_tokens)
            question_tokens = self._truncate(budget - len(comment_tokens), question_tokens)
        tokens = question_tokens + comment_tokens
        fields['tokens'] = TextField(tokens, self._token_indexers, )
        if label is not None:
            fields['label'] = LabelField(label,
                                         skip_indexing=self._skip_label_indexing)
        return Instance(fields)
84 |
--------------------------------------------------------------------------------
/fasttext_baseline/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | from typing import Tuple
5 |
6 | import jsonlines
7 | import fasttext
8 | from numpy.core.multiarray import ndarray
9 |
10 |
def _jsonl_to_fasttext_format(input_path: str, output_path: str) -> None:
    """Convert an x-stance JSON lines file into a tokenized fastText file.

    Each record becomes one line ``__label__<label> <question> <comment>``.
    The lines are split into per-language files, tokenized with
    ``./tools/tokenizer.perl`` and merged back into ``output_path`` in the
    original record order. The intermediate files are written next to
    ``output_path`` with ``.<lang>`` / ``.tokenized.<lang>`` suffixes.

    Raises:
        NotImplementedError: if a record has a language other than de/fr/it.
        subprocess.CalledProcessError: if the tokenizer exits with an error.
    """
    # Basic preprocessing and split into language files
    language_order = []
    with jsonlines.open(input_path) as f_in, \
            open(output_path + ".de", "w", encoding="utf-8") as f_out_de, \
            open(output_path + ".fr", "w", encoding="utf-8") as f_out_fr, \
            open(output_path + ".it", "w", encoding="utf-8") as f_out_it:
        language_files = {"de": f_out_de, "fr": f_out_fr, "it": f_out_it}
        for line in f_in:
            # Concatenate question and comment
            text = " ".join([line["question"], line["comment"]])
            text = text.replace("\n", " ")
            language = line["language"]
            if language not in language_files:
                raise NotImplementedError()
            language_order.append(language)
            language_files[language].write("__label__{} {}\n".format(line["label"], text))
    # Language-specific tokenization
    for language in ["de", "fr", "it"]:
        with open(output_path + ".{}".format(language), encoding="utf-8") as f_in, \
                open(output_path + ".tokenized.{}".format(language), "w", encoding="utf-8") as f_out:
            # check_call instead of call: a failing tokenizer would otherwise
            # leave a truncated file that only surfaces later as StopIteration.
            subprocess.check_call([
                "./tools/tokenizer.perl",
                "-l", language,
                "-q",
            ], stdin=f_in, stdout=f_out,
            )
    # Merge language files
    with open(output_path, "w", encoding="utf-8") as f_out, \
            open(output_path + ".tokenized.de", encoding="utf-8") as f_in_de, \
            open(output_path + ".tokenized.fr", encoding="utf-8") as f_in_fr, \
            open(output_path + ".tokenized.it", encoding="utf-8") as f_in_it:
        tokenized_files = {"de": f_in_de, "fr": f_in_fr, "it": f_in_it}
        for language in language_order:
            # Take the next tokenized line of that language to restore the input order.
            line = next(tokenized_files[language])
            # Undo the tokenizer's splitting of the label prefix.
            line = line.replace("_ _ label _ _ ", "__label__")
            f_out.write(line)
58 |
59 |
def _predictions_to_jsonl(predictions: Tuple[Tuple[Tuple[str], ndarray]], output_path: str) -> None:
    """Write the first label of every prediction to a JSON lines file.

    Each prediction is a ``(labels, second_item)`` pair; only ``labels[0]`` is
    used. The ``__label__`` prefix is removed and the result is written as
    ``{"label": ...}``, one object per line.
    """
    label_prefix = "__label__"
    with jsonlines.open(output_path, "w") as writer:
        for prediction in predictions:
            predicted_labels, _unused = prediction
            first_label = predicted_labels[0]
            writer.write({"label": first_label.replace(label_prefix, "")})
65 |
66 |
def train(model_path: str, train_dataset_path: str, pretrained_vectors: str = "", lr: float = 0.1,
          epochs: int = 5) -> fasttext.FastText:
    """Train a supervised fastText model, save it to ``model_path`` and return it.

    The embedding dimension (300) and the word n-gram size (3) are fixed;
    the learning rate, the number of epochs and the optional pretrained
    vectors file come from the arguments.
    """
    hyperparameters = dict(dim=300, lr=lr, epoch=epochs, wordNgrams=3)
    classifier = fasttext.train_supervised(
        input=train_dataset_path,
        pretrained_vectors=pretrained_vectors,
        **hyperparameters
    )
    classifier.save_model(model_path)
    return classifier
79 |
80 |
def predict(model_path: str, dataset_path) -> Tuple[Tuple[Tuple[str], ndarray]]:
    """Load the model at ``model_path`` and predict every line of a fastText file.

    Blank lines are skipped. The first whitespace-separated token of each
    line (the label column) is dropped and the remaining tokens are joined
    with single spaces before being passed to ``model.predict``.

    Returns a tuple with one ``model.predict`` result per non-blank line.
    """
    model = fasttext.load_model(model_path)
    predictions = []
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(dataset_path, encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if not columns:
                continue
            text = " ".join(columns[1:])
            prediction: Tuple[Tuple[str], ndarray] = model.predict(text)
            predictions.append(prediction)
    return tuple(predictions)
93 |
94 |
def main():
    """Convert the data, train the fastText baseline and write test predictions.

    Reads ``train.jsonl``, ``valid.jsonl`` and ``test.jsonl`` from
    ``--data-dir``, writes the converted files to ``processed_data``, trains
    and saves the model as ``model.bin``, prints the validation precision,
    recall and F1, and stores the test predictions in the ``--pred`` file.
    """
    parser = argparse.ArgumentParser(description="Train a fastText baseline for X-Stance")
    parser.add_argument('--data-dir', type=str, required=True)
    parser.add_argument('--pred', type=str, required=True)
    parser.add_argument('--pretrained-vectors', type=str, default="")
    parser.add_argument('--lr', type=float, default=0.1)
    parser.add_argument('--epochs', type=int, default=5)
    args = parser.parse_args()

    FASTTEXT_DATA_DIR = "processed_data"
    # Create the output directories up front, so that a missing directory is
    # not discovered only after training has finished.
    os.makedirs(FASTTEXT_DATA_DIR, exist_ok=True)
    pred_dir = os.path.dirname(args.pred)
    if pred_dir:
        os.makedirs(pred_dir, exist_ok=True)

    for dataset_path in [
        "train.jsonl",
        "valid.jsonl",
        "test.jsonl",
    ]:
        _jsonl_to_fasttext_format(
            input_path=os.path.join(args.data_dir, dataset_path),
            output_path=os.path.join(FASTTEXT_DATA_DIR, dataset_path.replace(".jsonl", ".txt"))
        )

    model_path = "model.bin"
    model = train(
        model_path,
        train_dataset_path=os.path.join(FASTTEXT_DATA_DIR, "train.txt"),
        pretrained_vectors=args.pretrained_vectors,
        lr=args.lr,
        epochs=args.epochs,
    )
    _, valid_precision, valid_recall = model.test(os.path.join(FASTTEXT_DATA_DIR, "valid.txt"))
    print("Validation precision: ", valid_precision)
    print("Validation recall: ", valid_recall)
    # Report an F1 of 0 instead of dividing by zero when both values are 0.
    precision_plus_recall = valid_precision + valid_recall
    if precision_plus_recall:
        valid_f1 = 2 * valid_precision * valid_recall / precision_plus_recall
    else:
        valid_f1 = 0.0
    print("Validation F1: ", valid_f1)

    predictions = predict(model_path, os.path.join(FASTTEXT_DATA_DIR, "test.txt"))
    _predictions_to_jsonl(predictions, args.pred)
    print("Saved test predictions in", args.pred)
132 |
133 |
134 | if __name__ == "__main__":
135 | main()
136 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Dataset viewer](https://huggingface.co/nlp/viewer/?dataset=x_stance)
2 | [Build status](https://travis-ci.org/ZurichNLP/xstance)
3 |
4 | # x-stance
5 |
6 | Data and code accompanying the paper ["X-Stance: A Multilingual Multi-Target Dataset for Stance Detection"](http://ceur-ws.org/Vol-2624/paper9.pdf).
7 |
8 | A high-level description can be found in the [**blog post**](https://vamvas.ch/more-general-stance-detection-with-x-stance), and a more detailed description in the [**paper**](http://ceur-ws.org/Vol-2624/paper9.pdf).
9 |
10 |
11 | ## Summary
12 |
13 | The x-stance dataset contains more than **150 political questions**, and **67k comments** written by candidates on those questions.
14 |
15 | It can be used to train and evaluate stance detection systems.
16 |
17 | The comments are partly **German**, partly **French** and **Italian**. The questions are available in all three languages plus **English**.
18 |
19 | The data have been extracted from the Swiss voting advice platform [Smartvote](https://smartvote.ch/).
20 |
21 | Data example:
22 |
23 | ![Data example](example.png)
24 |
25 | ## Structure
26 |
27 | The dataset contains the following files:
28 | - *train.jsonl*
29 | - *valid.jsonl*
30 | - *test.jsonl*
31 | - *questions.{en,de,fr,it}.jsonl*
32 |
33 | Example for a train, valid or test instance:
34 |
35 | ```json
36 | {
37 | "id": 20475,
38 | "language": "de",
39 | "question_id": 3469,
40 | "question": "Soll der Bundesrat ein Freihandelsabkommen mit den USA anstreben?",
41 | "comment": "Nicht unter einem Präsidenten, welcher die Rechte anderer mit Füssen tritt und Respektlos gegenüber ändern ist.",
42 | "label": "AGAINST",
43 | "numerical_label": 0,
44 | "author": "8aa829c3b86f",
45 | "topic": "Foreign Policy",
46 | "test_set": "new_comments_defr"
47 | }
48 | ```
49 |
50 | Details:
51 | - Languages: The files *train.jsonl* and *valid.jsonl* contain about 75% German data and 25% French data. The file *test.jsonl* also contains some Italian samples to test zero-shot cross-lingual transfer.
52 | - `"label"` can be `"FAVOR"` or `"AGAINST"`.
53 | - `"numerical_label"` provides a more fine-grained label (not used in our baseline). Range of values: {0, 25, 75, 100}, where 0 means "no" and 100 means "yes".
54 | - `"test_set"`: Only *test.jsonl* has this field. Specifies the test partition (new comments / new questions / new topics; German+French / Italian). For details on the test partitions please refer to Table 2 in the [paper](http://ceur-ws.org/Vol-2624/paper9.pdf).
55 |
56 | In the train, valid and test files, the comments are paired with a version of the question in the same language (e.g. German comment + German version of the question). The *questions.xx.jsonl* files provide complete translations of all the questions.
57 |
58 | ## Evaluation
59 |
60 | Dependencies: Python 3; `scikit-learn`
61 |
62 | Unpacking the data:
63 | ```bash
64 | unzip data/xstance-data-v1.0.zip -d data
65 | ```
66 |
67 | Usage:
68 | ```bash
69 | python evaluate.py \
70 | --gold data/test.jsonl \
71 | --pred predictions/mbert_pred.jsonl
72 | ```
73 |
74 | The predictions file should be a JSON lines file (http://jsonlines.org/). The lines in the file should correspond to the lines in the gold file (*test.jsonl*).
75 |
76 | Example prediction:
77 | ```json
78 | {"label": "AGAINST"}
79 | ```
80 |
81 | The evaluation script outputs the macro-average of the F1 score for each label, per test partition and per language:
82 |
83 | ```
84 | new_comments_defr
85 | DE 76.83541377429334
86 | FR 76.61281705054353
87 |
88 | new_questions_defr
89 | DE 68.46881591336131
90 | FR 68.3831150794995
91 |
92 | new_topics_defr
93 | DE 68.90323152487849
94 | FR 70.8982523359103
95 |
96 | new_comments_it
97 | IT 70.19234360410832
98 | ```
99 |
100 | ## fastText Baseline
101 | Dependencies:
102 | - Python >= 3.6
103 | - Perl
104 | - `pip install -r fasttext_baseline/requirements.txt`
105 |
106 | Unpacking the data (if not done in the previous section):
107 | ```bash
108 | unzip data/xstance-data-v1.0.zip -d data
109 | ```
110 |
111 | Downloading the Europarl preprocessing tools:
112 | ```bash
113 | cd fasttext_baseline
114 | wget http://www.statmt.org/europarl/v7/tools.tgz
115 | tar -xvf tools.tgz
116 | ```
117 |
118 | Training and predicting:
119 | ```bash
120 | python run.py --data-dir ../data --pred ../predictions/mypred.jsonl
121 | ```
122 |
123 | Evaluating:
124 | ```bash
125 | cd ..
126 | python evaluate.py \
127 | --gold data/test.jsonl \
128 | --pred predictions/mypred.jsonl
129 | ```
130 |
131 |
132 | ## M-BERT Baseline
133 | Dependencies:
134 | - Python >= 3.6
135 | - AllenNLP 0.9.0 (http://docs.allennlp.org/master/)
136 | - `pip install -r mbert_baseline/requirements.txt`
137 | - The commands below assume GPU computation. They can be adapted for CPU, however.
138 |
139 | Unpacking the data (if not done in the previous section):
140 | ```bash
141 | unzip data/xstance-data-v1.0.zip -d data
142 | ```
143 |
144 | Training:
145 | ```bash
146 | cd mbert_baseline
147 | allennlp train mbert.jsonnet \
148 | --include-package allennlp_xstance \
149 | -s mymodel
150 | ```
151 |
152 | Predicting:
153 | ```bash
154 | cd mbert_baseline
155 | allennlp predict mymodel ../data/test.jsonl \
156 | --include-package allennlp_xstance \
157 | --predictor xstance_predictor \
158 | --cuda-device 0 \
159 | --output-file ../predictions/mypred.jsonl
160 | ```
161 |
162 | Evaluating:
163 | ```bash
164 | cd ..
165 | python evaluate.py \
166 | --gold data/test.jsonl \
167 | --pred predictions/mypred.jsonl
168 | ```
169 |
170 |
171 | ## Licenses
172 | - Dataset: CC BY-NC 4.0 (© www.smartvote.ch)
173 | - Rest of repository: MIT License
174 |
175 | ## References
176 | The dataset and baseline model are described in:
177 |
178 | ```bibtex
179 | @inproceedings{vamvas2020xstance,
180 | author = "Vamvas, Jannis and Sennrich, Rico",
181 | title = "{X-Stance}: A Multilingual Multi-Target Dataset for Stance Detection",
182 | booktitle = "Proceedings of the 5th Swiss Text Analytics Conference (SwissText) \& 16th Conference on Natural Language Processing (KONVENS)",
183 | address = "Zurich, Switzerland",
184 | year = "2020",
185 | month = "jun",
186 | url = "http://ceur-ws.org/Vol-2624/paper9.pdf"
187 | }
188 | ```
189 |
190 | ## Metadata
191 | The metadata are used by search engines such as Google Dataset Search.
192 |
| property | value |
|---|---|
| name | x-Stance: A Multilingual Multi-Target Dataset for Stance Detection |
| description | The x-stance dataset contains more than 150 political questions, and 67k comments written by candidates on those questions. It can be used to train and evaluate stance detection systems. The comments are partly German, partly French and Italian. The questions are available in all the three languages plus English. The data have been extracted from the Swiss voting advice platform Smartvote.ch. |
| url | https://github.com/ZurichNLP/xstance |
| sameAs | https://doi.org/10.5281/zenodo.3831317 |
| license | CC BY-NC 4.0 |
| distribution | xstance-data-v1.0.zip |
| citation | http://doi.org/10.5281/zenodo.3831317 |