├── fasttext_baseline ├── processed_data │ └── .empty ├── requirements.txt ├── tests.py └── run.py ├── mbert_baseline ├── allennlp_xstance │ ├── __init__.py │ ├── xstance_predictor.py │ └── xstance_reader.py ├── requirements.txt └── mbert.jsonnet ├── example.png ├── data └── xstance-data-v1.0.zip ├── .travis.yml ├── LICENSE ├── evaluate.py ├── .gitignore └── README.md /fasttext_baseline/processed_data/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbert_baseline/allennlp_xstance/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZurichNLP/xstance/HEAD/example.png -------------------------------------------------------------------------------- /fasttext_baseline/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines==1.2.0 2 | fasttext==0.9.2 3 | -------------------------------------------------------------------------------- /mbert_baseline/requirements.txt: -------------------------------------------------------------------------------- 1 | jsonlines==1.2.0 2 | allennlp==0.9.0 3 | scikit-learn==0.22.2 4 | -------------------------------------------------------------------------------- /data/xstance-data-v1.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZurichNLP/xstance/HEAD/data/xstance-data-v1.0.zip -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | - "3.8" 6 | install: 7 
| - unzip data/xstance-data-v1.0.zip -d data 8 | - pip install scikit-learn 9 | - pip install -r fasttext_baseline/requirements.txt 10 | - pip install -r mbert_baseline/requirements.txt 11 | - cd fasttext_baseline && wget http://www.statmt.org/europarl/v7/tools.tgz && tar -xvf tools.tgz && cd .. 12 | script: python evaluate.py --gold data/test.jsonl --pred predictions/mbert_pred.jsonl 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 University of Zurich, Department of Computational Linguistics, Jannis Vamvas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def evaluate_file(gold_file, pred_file):
    """Parse a gold and a prediction JSON lines file and evaluate them.

    Args:
        gold_file: Open text file with one gold JSON object per line.
        pred_file: Open text file with one predicted JSON object per line,
            in the same order as ``gold_file``.

    Both file objects are closed before the function returns, even if a line
    cannot be parsed. The parsed lists are passed on to ``evaluate_json``.
    """
    try:
        gold_list = [json.loads(line) for line in gold_file]
        pred_list = [json.loads(line) for line in pred_file]
    finally:
        # Close the handles that were actually passed in. The previous code
        # closed the global ``args.gold`` / ``args.pred``, which only exist when
        # this module is run as a script.
        gold_file.close()
        pred_file.close()
    evaluate_json(gold_list, pred_list)
@overrides
def predictions_to_labeled_instances(self,
                                     instance: Instance,
                                     outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
    """
    Return a copy of ``instance`` with the model's predicted label attached
    as a ``'prediction'`` field.

    The label is the argmax of ``outputs['probs']`` if that key is present,
    otherwise ``outputs['score']`` converted to ``int``.

    Raises ``ValueError`` if ``outputs`` contains neither ``'probs'`` nor ``'score'``.
    """
    new_instance = deepcopy(instance)
    if "probs" in outputs:
        label = numpy.argmax(outputs['probs'])
    elif "score" in outputs:
        # The previous check tested for "prediction" but then read "score".
        label = outputs["score"]
    else:
        raise ValueError("probs or score not found in prediction outputs")
    # The label is already an integer index, so it must bypass vocabulary
    # indexing; a LabelField built from an int without skip_indexing is rejected.
    new_instance.add_field('prediction', LabelField(int(label), skip_indexing=True))
    return [new_instance]
def test_predict(self):
    """Predicting on the validation file yields one prediction per non-empty line."""
    # unittest runs test methods in alphabetical order, so ``test_train`` has
    # not run yet when this test starts. Train the model here instead of
    # relying on a model file left behind by another test or an earlier run.
    run_baseline.train(self.model_path, self.valid_path)
    predictions = run_baseline.predict(self.model_path, self.valid_path)
    self.assertEqual(3926, len(predictions))
PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
@DatasetReader.register("xstance_reader")
class XStanceReader(DatasetReader):
    """
    Reads x-stance JSON lines files and turns every line into an ``Instance``.

    Every line must be a JSON object with a ``"question"`` and a ``"comment"``; ``"label"`` is
    optional. Question and comment are tokenized separately and concatenated (question first)
    into a single ``tokens`` text field.

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers for the ``tokens`` field. Defaults to ``{'tokens': SingleIdTokenIndexer()}``.
    tokenizer : ``Tokenizer``, optional
        Defaults to ``WordTokenizer()``.
    max_sequence_length : ``int``, optional
        If given, the question is shortened so that it has at most
        ``max_sequence_length - len(comment_tokens) - 2`` tokens (never fewer than zero).
        The comment itself is not shortened.
    skip_label_indexing : ``bool``
        If true, labels are converted to ``int`` and stored without vocabulary indexing.
    ignore_questions : ``bool``
        If true, the question contributes no tokens.
    ignore_comments : ``bool``
        If true, the comment contributes no tokens.
    lazy : ``bool``
        Passed on to ``DatasetReader``.
    """
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 skip_label_indexing: bool = False,
                 ignore_questions: bool = False,
                 ignore_comments: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self.ignore_questions = ignore_questions
        self.ignore_comments = ignore_comments
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        """
        Yield one instance per line of the JSON lines file at ``file_path``.

        Raises ``ValueError`` if ``skip_label_indexing`` is set and a label cannot be
        converted to ``int``.
        """
        with jsonlines.open(cached_path(file_path), "r") as f:
            for i, answer in enumerate(f):
                question = answer["question"]
                comment = answer["comment"]
                label = answer.get("label")
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError as error:
                            # Keep the original conversion error attached as the cause.
                            raise ValueError('Labels must be integers if skip_label_indexing is True.') from error
                    else:
                        label = str(label)
                instance = self.text_to_instance(question=question, comment=comment, label=label)
                # Log the first few instances to help with debugging
                if i < 4:
                    logger.debug(instance)
                if instance is not None:
                    yield instance

    def _truncate(self, n, tokens):
        """
        truncate a list of tokens to at most ``n`` tokens; a negative ``n`` is treated as zero
        """
        # Without the clamp a negative n would slice from the end (tokens[:-k])
        # and keep a prefix of the tokens instead of dropping all of them.
        n = max(n, 0)
        if len(tokens) > n:
            tokens = tokens[:n]
        return tokens

    @overrides
    def text_to_instance(self, question: str, comment: str, label: Union[str, int] = None) -> Instance:
        """
        Build an instance with a ``tokens`` field (question tokens followed by comment tokens)
        and, if ``label`` is given, a ``label`` field.
        """
        fields: Dict[str, Field] = {}
        if self.ignore_questions:
            question_tokens = []
        else:
            question_tokens = self._tokenizer.tokenize(question)
        if self.ignore_comments:
            comment_tokens = []
        else:
            comment_tokens = self._tokenizer.tokenize(comment)
        comment_tokens = comment_tokens[1:]  # Do not need [CLS] in second segment
        if self._max_sequence_length is not None:
            # Only the question is shortened; the comment is kept in full.
            # NOTE(review): the "- 2" presumably reserves room for special tokens -- confirm.
            question_budget = self._max_sequence_length - len(comment_tokens) - 2
            question_tokens = self._truncate(question_budget, question_tokens)
        tokens = question_tokens + comment_tokens
        fields['tokens'] = TextField(tokens, self._token_indexers, )
        if label is not None:
            fields['label'] = LabelField(label,
                                         skip_indexing=self._skip_label_indexing)
        return Instance(fields)
def _jsonl_to_fasttext_format(input_path: str, output_path: str) -> None:
    """Convert an x-stance JSON lines file into a tokenized fastText file.

    For every input instance, one line ``__label__<label> <question> <comment>``
    is written to ``output_path``, in the order of the input file. The text is
    tokenized per language with ``./tools/tokenizer.perl``, so the function
    must be run from a directory that contains the Europarl ``tools`` folder.

    The intermediate files ``<output_path>.<lang>`` and
    ``<output_path>.tokenized.<lang>`` are left on disk.

    Args:
        input_path: Path of the JSON lines file; every instance needs the keys
            ``question``, ``comment``, ``language`` and ``label``.
        output_path: Path of the fastText file to write.

    Raises:
        NotImplementedError: If an instance has a language other than de/fr/it.
        subprocess.CalledProcessError: If the tokenizer exits with an error.
        RuntimeError: If the tokenizer returned fewer lines than it was given.
    """
    languages = ("de", "fr", "it")
    label_prefix = "__label__"

    # Basic preprocessing and split into language files.
    # All files are written as UTF-8 explicitly, independent of the locale.
    language_order = []
    with jsonlines.open(input_path) as f_in, \
            open(output_path + ".de", "w", encoding="utf-8") as f_out_de, \
            open(output_path + ".fr", "w", encoding="utf-8") as f_out_fr, \
            open(output_path + ".it", "w", encoding="utf-8") as f_out_it:
        raw_files = {"de": f_out_de, "fr": f_out_fr, "it": f_out_it}
        for line in f_in:
            # Concatenate question and comment
            text = " ".join([line["question"], line["comment"]])
            text = text.replace("\n", " ")
            language = line["language"]
            if language not in raw_files:
                raise NotImplementedError("Unsupported language: {}".format(language))
            language_order.append(language)
            raw_files[language].write("{}{} {}\n".format(label_prefix, line["label"], text))

    # Language-specific tokenization
    for language in languages:
        with open(output_path + ".{}".format(language), encoding="utf-8") as f_in, \
                open(output_path + ".tokenized.{}".format(language), "w", encoding="utf-8") as f_out:
            # check=True: fail here instead of silently producing an empty or
            # truncated file when the tokenizer is missing or crashes.
            subprocess.run([
                "./tools/tokenizer.perl",
                "-l", language,
                "-q",
            ], stdin=f_in, stdout=f_out, check=True)

    # Merge language files, restoring the original order of the instances
    with open(output_path, "w", encoding="utf-8") as f_out, \
            open(output_path + ".tokenized.de", encoding="utf-8") as f_in_de, \
            open(output_path + ".tokenized.fr", encoding="utf-8") as f_in_fr, \
            open(output_path + ".tokenized.it", encoding="utf-8") as f_in_it:
        tokenized_files = {"de": f_in_de, "fr": f_in_fr, "it": f_in_it}
        for language in language_order:
            line = next(tokenized_files[language], None)
            if line is None:
                raise RuntimeError(
                    "Tokenized file for language '{}' has fewer lines than the input".format(language))
            # The tokenizer splits the label prefix into separate tokens; repair
            # only the first occurrence so that the comment text stays untouched.
            line = line.replace("_ _ label _ _ ", label_prefix, 1)
            f_out.write(line)
dataset_path.replace(".jsonl", ".txt")) 113 | ) 114 | 115 | model_path = "model.bin" 116 | model = train( 117 | model_path, 118 | train_dataset_path=os.path.join(FASTTEXT_DATA_DIR, "train.txt"), 119 | pretrained_vectors=args.pretrained_vectors, 120 | lr=args.lr, 121 | epochs=args.epochs, 122 | ) 123 | _, valid_precision, valid_recall = model.test(os.path.join(FASTTEXT_DATA_DIR, "valid.txt")) 124 | print("Validation precision: ", valid_precision) 125 | print("Validation recall: ", valid_recall) 126 | valid_f1 = 2 * valid_precision * valid_recall / (valid_precision + valid_recall) 127 | print("Validation F1: ", valid_f1) 128 | 129 | predictions = predict(model_path, os.path.join(FASTTEXT_DATA_DIR, "test.txt")) 130 | _predictions_to_jsonl(predictions, args.pred) 131 | print("Saved test predictions in", args.pred) 132 | 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Live Viewer](https://img.shields.io/badge/Browse-nlp%2Fviewer-brightgreen)](https://huggingface.co/nlp/viewer/?dataset=x_stance) 2 | [![Build Status](https://travis-ci.org/ZurichNLP/xstance.svg?branch=master)](https://travis-ci.org/ZurichNLP/xstance) 3 | 4 | # x-stance 5 | 6 | Data and code accompanying the paper ["X-Stance: A Multilingual Multi-Target Dataset for Stance Detection"](http://ceur-ws.org/Vol-2624/paper9.pdf). 7 | 8 | A high-level description can be found in the [**blog post**](https://vamvas.ch/more-general-stance-detection-with-x-stance), and a more detailed description in the [**paper**](http://ceur-ws.org/Vol-2624/paper9.pdf). 9 | 10 | 11 | ## Summary 12 | 13 | The x-stance dataset contains more than **150 political questions**, and **67k comments** written by candidates on those questions. 14 | 15 | It can be used to train and evaluate stance detection systems. 
 16 | 17 | The comments are partly **German**, partly **French** and **Italian**. The questions are available in all the three languages plus **English**. 18 | 19 | The data have been extracted from the Swiss voting advice platform [Smartvote](https://smartvote.ch/). 20 | 21 | Data example: 22 | 23 | Data Example 24 | 25 | ## Structure 26 | 27 | The dataset contains the following files: 28 | - *train.jsonl* 29 | - *valid.jsonl* 30 | - *test.jsonl* 31 | - *questions.{en,de,fr,it}.jsonl* 32 | 33 | Example for a train, valid or test instance: 34 | 35 | ```json 36 | { 37 | "id": 20475, 38 | "language": "de", 39 | "question_id": 3469, 40 | "question": "Soll der Bundesrat ein Freihandelsabkommen mit den USA anstreben?", 41 | "comment": "Nicht unter einem Präsidenten, welcher die Rechte anderer mit Füssen tritt und Respektlos gegenüber ändern ist.", 42 | "label": "AGAINST", 43 | "numerical_label": 0, 44 | "author": "8aa829c3b86f", 45 | "topic": "Foreign Policy", 46 | "test_set": "new_comments_defr" 47 | } 48 | ``` 49 | 50 | Details: 51 | - Languages: The files *train.jsonl* and *valid.jsonl* contain about 75% German data and 25% French data. The file *test.jsonl* also contains some Italian samples to test zero-shot cross-lingual transfer. 52 | - `"label"` can be `"FAVOR"` or `"AGAINST"`. 53 | - `"numerical_label"` provides a more fine-grained label (not used in our baseline). Range of values: {0, 25, 75, 100}, where 0 means "no" and 100 means "yes". 54 | - `"test_set"`: Only *test.jsonl* has this field. Specifies the test partition (new comments / new questions / new topics; German+French / Italian). For details on the test partitions please refer to Table 2 in the [paper](http://ceur-ws.org/Vol-2624/paper9.pdf). 55 | 56 | In the train, valid and test files, the comments are paired with a version of the question in the same language (e.g. German comment + German version of the question). The *questions.xx.jsonl* files provide complete translations of all the questions. 
57 | 58 | ## Evaluation 59 | 60 | Dependencies: Python 3; `scikit-learn` 61 | 62 | Unpacking the data: 63 | ```bash 64 | unzip data/xstance-data-v1.0.zip -d data 65 | ``` 66 | 67 | Usage: 68 | ```bash 69 | python evaluate.py \ 70 | --gold data/test.jsonl \ 71 | --pred predictions/mbert_pred.jsonl 72 | ``` 73 | 74 | The predictions file should be a JSON lines file (http://jsonlines.org/). The lines in the file should correspond to the lines in the gold file (*test.jsonl*). 75 | 76 | Example prediction: 77 | ```json 78 | {"label": "AGAINST"} 79 | ``` 80 | 81 | The evaluation script outputs the macro-average of the F1 score for each label, per test partition and per language: 82 | 83 | ``` 84 | new_comments_defr 85 | DE 76.83541377429334 86 | FR 76.61281705054353 87 | 88 | new_questions_defr 89 | DE 68.46881591336131 90 | FR 68.3831150794995 91 | 92 | new_topics_defr 93 | DE 68.90323152487849 94 | FR 70.8982523359103 95 | 96 | new_comments_it 97 | IT 70.19234360410832 98 | ``` 99 | 100 | ## fastText Baseline 101 | Dependencies: 102 | - Python >= 3.6 103 | - Perl 104 | - `pip install -r fasttext_baseline/requirements.txt` 105 | 106 | Unpacking the data (if not done in the previous section): 107 | ```bash 108 | unzip data/xstance-data-v1.0.zip -d data 109 | ``` 110 | 111 | Downloading the Europarl preprocessing tools: 112 | ```bash 113 | cd fasttext_baseline 114 | wget http://www.statmt.org/europarl/v7/tools.tgz 115 | tar -xvf tools.tgz 116 | ``` 117 | 118 | Training and predicting: 119 | ```bash 120 | python run.py --data-dir ../data --pred ../predictions/mypred.jsonl 121 | ``` 122 | 123 | Evaluating: 124 | ```bash 125 | cd .. 
126 | python evaluate.py \ 127 | --gold data/test.jsonl \ 128 | --pred predictions/mypred.jsonl 129 | ``` 130 | 131 | 132 | ## M-BERT Baseline 133 | Dependencies: 134 | - Python >= 3.6 135 | - AllenNLP 0.9.0 (http://docs.allennlp.org/master/) 136 | - `pip install -r mbert_baseline/requirements.txt` 137 | - The commands below assume GPU computation. They can be adapted for CPU, however. 138 | 139 | Unpacking the data (if not done in the previous section): 140 | ```bash 141 | unzip data/xstance-data-v1.0.zip -d data 142 | ``` 143 | 144 | Training: 145 | ```bash 146 | cd mbert_baseline 147 | allennlp train mbert.jsonnet \ 148 | --include-package allennlp_xstance \ 149 | -s mymodel 150 | ``` 151 | 152 | Predicting: 153 | ```bash 154 | cd mbert_baseline 155 | allennlp predict mymodel ../data/test.jsonl \ 156 | --include-package allennlp_xstance \ 157 | --predictor xstance_predictor \ 158 | --cuda-device 0 \ 159 | --output-file ../predictions/mypred.jsonl 160 | ``` 161 | 162 | Evaluating: 163 | ```bash 164 | cd .. 165 | python evaluate.py \ 166 | --gold data/test.jsonl \ 167 | --pred predictions/mypred.jsonl 168 | ``` 169 | 170 | 171 | ## Licenses 172 | - Dataset: CC BY-NC 4.0 (© www.smartvote.ch) 173 | - Rest of repository: MIT License 174 | 175 | ## References 176 | The dataset and baseline model are described in: 177 | 178 | ```bibtex 179 | @inproceedings{vamvas2020xstance, 180 | author = "Vamvas, Jannis and Sennrich, Rico", 181 | title = "{X-Stance}: A Multilingual Multi-Target Dataset for Stance Detection", 182 | booktitle = "Proceedings of the 5th Swiss Text Analytics Conference (SwissText) \& 16th Conference on Natural Language Processing (KONVENS)", 183 | address = "Zurich, Switzerland", 184 | year = "2020", 185 | month = "jun", 186 | url = "http://ceur-ws.org/Vol-2624/paper9.pdf" 187 | } 188 | ``` 189 | 190 | ## Metadata 191 | The metadata are used by search engines such as Google Dataset Search. 192 |
193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 202 | 203 | 204 | 205 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 239 | 240 | 241 | 242 | 262 | 263 | 264 | 265 | 266 | 267 |
propertyvalue
namex-Stance: A Multilingual 201 | Multi-Target Dataset for Stance Detection
descriptionThe x-stance dataset contains 206 | more than 150 political questions, and 67k comments written by candidates on those questions. It can be 207 | used to train and evaluate stance detection systems. The comments are partly German, partly French and 208 | Italian. The questions are available in all the three languages plus English. The data have been 209 | extracted from the Swiss voting advice platform Smartvote.ch.
url
sameAshttps://doi.org/10.5281/zenodo.3831317
license 222 |
223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 |
propertyvalue
nameCC BY-NC 4.0
url
237 |
238 |
distribution 243 |
244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 253 | 254 | 255 | 256 | 258 | 259 |
propertyvalue
contentUrlhttps://raw.githubusercontent.com/ZurichNLP/xstance/master/data/xstance-data-v1.0.zip 252 |
encodingFormatapplication/zip 257 |
260 |
261 |
citationhttp://doi.org/10.5281/zenodo.3831317
268 |
269 | --------------------------------------------------------------------------------