├── fig
│   ├── bot.png
│   ├── demo.png
│   ├── logo.png
│   ├── robot.png
│   ├── extract.png
│   ├── pipeline.png
│   ├── generation.png
│   ├── eval_metrics.png
│   ├── spider_bias.png
│   ├── auto_annotated.png
│   └── disparity_diff.png
├── materials
│   └── AnnotationGuideline.pdf
├── download_dataset.sh
├── download_tagger.sh
├── tagger
│   ├── prepare.sh
│   ├── post_process.sh
│   ├── labels.txt
│   ├── helper
│   │   ├── jsonlize.py
│   │   ├── annotator_utils.py
│   │   ├── split.py
│   │   ├── utils.py
│   │   ├── heuristics.py
│   │   └── utils_batch.py
│   ├── config.json
│   ├── requirements.txt
│   ├── README.md
│   ├── sample.txt
│   ├── annotator.py
│   ├── tasks.py
│   └── run_tagger.py
├── extractor
│   ├── README.md
│   ├── parameters.txt
│   ├── extractor.py
│   ├── keywords.txt
│   ├── extractor_utils.py
│   ├── example.ipynb
│   └── paper.json
├── scripts
│   └── gdown.pl
├── README.md
└── LICENSE
/fig/bot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/bot.png
--------------------------------------------------------------------------------
/fig/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/demo.png
--------------------------------------------------------------------------------
/fig/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/logo.png
--------------------------------------------------------------------------------
/fig/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/robot.png
--------------------------------------------------------------------------------
/fig/extract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/extract.png
--------------------------------------------------------------------------------
/fig/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/pipeline.png
--------------------------------------------------------------------------------
/fig/generation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/generation.png
--------------------------------------------------------------------------------
/fig/eval_metrics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/eval_metrics.png
--------------------------------------------------------------------------------
/fig/spider_bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/spider_bias.png
--------------------------------------------------------------------------------
/fig/auto_annotated.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/auto_annotated.png
--------------------------------------------------------------------------------
/fig/disparity_diff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/fig/disparity_diff.png
--------------------------------------------------------------------------------
/materials/AnnotationGuideline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neulab/ReviewAdvisor/HEAD/materials/AnnotationGuideline.pdf
--------------------------------------------------------------------------------
/download_dataset.sh:
--------------------------------------------------------------------------------
1 | perl scripts/gdown.pl https://drive.google.com/file/d/1nJdljy468roUcKLbVwWUhMs7teirah75/view?usp=sharing dataset.zip
2 | unzip dataset.zip
3 | rm dataset.zip
--------------------------------------------------------------------------------
/download_tagger.sh:
--------------------------------------------------------------------------------
1 | perl scripts/gdown.pl https://drive.google.com/file/d/1RbOhDblCcErrfnoRV8WWlESFGK1LnEDW/view?usp=sharing seqlab_final.zip
2 | unzip seqlab_final.zip
3 | rm seqlab_final.zip
--------------------------------------------------------------------------------
/tagger/prepare.sh:
--------------------------------------------------------------------------------
1 | # Usage: sh prepare.sh <reviews_file>, e.g. sh prepare.sh sample.txt; produces test.txt and id.txt
2 | python helper/jsonlize.py "$1" out.jsonl
3 | python helper/split.py out.jsonl 1 test.txt id.txt
4 | rm out.jsonl
--------------------------------------------------------------------------------
/tagger/post_process.sh:
--------------------------------------------------------------------------------
1 | # Usage: sh post_process.sh; reads id.txt and seqlab_final/test_predictions.txt, writes result.jsonl
2 | rm cached_test_BertTokenizer_512 cached_test_BertTokenizer_512.lock
3 | mv seqlab_final/test_predictions.txt ./test.txt
4 | python helper/heuristics.py id.txt test.txt result.jsonl
5 | rm test.txt id.txt
--------------------------------------------------------------------------------
/tagger/labels.txt:
--------------------------------------------------------------------------------
1 | clarity_negative
2 | clarity_positive
3 | meaningful_comparison_negative
4 | meaningful_comparison_positive
5 | motivation_negative
6 | motivation_positive
7 | O
8 | originality_negative
9 | originality_positive
10 | replicability_negative
11 | replicability_positive
12 | soundness_negative
13 | soundness_positive
14 | substance_negative
15 | substance_positive
16 | summary
17 |
--------------------------------------------------------------------------------
/extractor/README.md:
--------------------------------------------------------------------------------
1 | # Extractor
2 |
3 | To use the Cross-entropy extractor, see the example in [`example.ipynb`](example.ipynb). Note that we reset the random seed at the start of every extraction (see [`extractor.py`](extractor.py)). This reset is not required; without it, the extractions for the same paper will differ slightly from run to run, but typically in no more than 3 of the 30 extracted sentences.
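
For quick programmatic use, here is a minimal sketch based on the interfaces in [`extractor.py`](extractor.py) and [`extractor_utils.py`](extractor_utils.py); treat it as illustrative and see `example.ipynb` for the reference walkthrough:

```python
from extractor import Extractor
from extractor_utils import get_full_text

# Build the extractor from the keyword list and cross-entropy-method parameters in this folder.
extractor = Extractor('keywords.txt', 'parameters.txt')

# paper.json ships in this folder; we assume it is a parsed paper in the
# format that get_full_text expects (metadata -> sections -> heading/text).
full_text = get_full_text('paper.json')

# extract() returns the selected sentences joined into a single string.
extracted = extractor.extract(full_text)
print(extracted)
```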
--------------------------------------------------------------------------------
/extractor/parameters.txt:
--------------------------------------------------------------------------------
1 | 0 0 0
2 | 1 0 0
3 | 2 0 0
4 | 3 0.5 1000
5 | 4 0.5 1000
6 | 5 0.5 1000
7 | 6 0.5 1000
8 | 7 0.5 1000
9 | 8 0.5 1000
10 | 9 0.4 1000
11 | 10 0.4 1000
12 | 11 0.4 10000
13 | 12 0.4 10000
14 | 13 0.35 10000
15 | 14 0.3 10000
16 | 15 0.3 10000
17 | 16 0.25 10000
18 | 17 0.25 10000
19 | 18 0.25 10000
20 | 19 0.2 10000
21 | 20 0.2 10000
22 | 21 0.2 10000
23 | 22 0.2 10000
24 | 23 0.2 10000
25 | 24 0.2 10000
26 | 25 0.2 10000
--------------------------------------------------------------------------------
/tagger/helper/jsonlize.py:
--------------------------------------------------------------------------------
1 | import jsonlines
2 | from fire import Fire
3 |
4 |
5 | def main(input_file, output_file):
6 | lines = []
7 | with open(input_file, 'r', encoding='utf8') as f:
8 | for i, line in enumerate(f.readlines()):
9 | lines.append({'id': i, 'text': line.strip(), 'labels': []})
10 | with jsonlines.open(output_file, 'w') as writer:
11 | writer.write_all(lines)
12 |
13 |
14 | if __name__ == '__main__':
15 | Fire(main)
16 |
--------------------------------------------------------------------------------
/tagger/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_dir": ".",
3 | "labels": "./labels.txt",
4 | "model_name_or_path": "seqlab_final",
5 | "evaluate_during_training": true,
6 | "output_dir": "seqlab_final",
7 | "max_seq_length": 512,
8 | "num_train_epochs": 3,
9 | "per_device_train_batch_size": 2,
10 | "per_device_eval_batch_size": 6,
11 | "save_steps": 20680,
12 | "eval_steps": 20680,
13 | "seed": 1,
14 | "do_train": false,
15 | "do_eval": false,
16 | "do_predict": true
17 | }
18 |
--------------------------------------------------------------------------------
/tagger/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi==2020.12.5
2 | chardet==4.0.0
3 | click==7.1.2
4 | conllu==4.4
5 | filelock==3.0.12
6 | fire==0.4.0
7 | future==0.18.2
8 | idna==2.10
9 | joblib==1.0.1
10 | jsonlines==2.0.0
11 | nltk==3.5
12 | numpy==1.20.1
13 | packaging==20.9
14 | pyparsing==2.4.7
15 | regex==2020.11.13
16 | requests==2.25.1
17 | sacremoses==0.0.43
18 | scikit-learn==0.24.1
19 | scipy==1.6.1
20 | sentencepiece==0.1.95
21 | seqeval==1.2.2
22 | six==1.15.0
23 | termcolor==1.1.0
24 | threadpoolctl==2.1.0
25 | tokenizers==0.8.1rc1
26 | torch==1.6.0
27 | tqdm==4.59.0
28 | transformers==3.0.2
29 | urllib3==1.26.3
30 |
--------------------------------------------------------------------------------
/extractor/extractor.py:
--------------------------------------------------------------------------------
1 | # %%
2 | from extractor_utils import *
3 |
4 |
5 | class Extractor:
6 | def __init__(self, keywords_file, parameters_file):
7 | self.keywords = read_keywords(keywords_file)
8 | self.parameters = read_parameters(parameters_file)
9 |
10 | def extract(self, text):
11 | np.random.seed(666)
12 | filtered_sents, cleaned_filtered_sents = keywords_filtering(text, self.keywords)
13 | if len(filtered_sents) <= 30:
14 | out_p = np.array([1] * len(filtered_sents))
15 | else:
16 | group = len(filtered_sents) // 10
17 | init_p, init_n = self.parameters[group]
18 | out_p = CEmethod(cleaned_filtered_sents, N=init_n, init_p=init_p)
19 | samples = [np.random.binomial(1, p=out_p) for j in range(1)]
20 | extracted = get_text(samples[0], filtered_sents)
21 | return extracted
22 |
--------------------------------------------------------------------------------
/extractor/keywords.txt:
--------------------------------------------------------------------------------
1 | propose proposed proposing
2 | present presented
3 | develop developed
4 | study studies studied studying
5 | investigate investigated
6 | examine examined
7 | introduce introduces introduction
8 | understand understood understanding
9 | explore explored
10 | design designed
11 | address addressed
12 | prove proved
13 | discover discovered
14 | optimize optimization optimal optimum
15 | efficient efficiently
16 | effective effectively effectiveness
17 | increase increases increasing
18 | decrease decreases decreasing
19 | reduce reducing reduction
20 | maximize maximizes maximizing maximum
21 | minimize minimizes minimizing minimal
22 | observe observed observations observing
23 | find found finding
24 | show shows showed showing
25 | imply implies implied implying
26 | suggest suggests suggested suggesting
27 | demonstrate demonstrates demonstrated demonstrating
28 | achieve achieves achieved
29 | perform performs performed performing performance
30 | outperform outperforms out-performs outperformed outperforming
31 | improve improves improved
32 | dataset datasets
33 | state-of-the-art
34 | metric metrics
35 | measure measures
36 | baseline baselines
37 | compare comparable compared comparison comparative compares comparing
38 | experiment experiments
39 | evaluate evaluated evaluating evaluation
40 | generalize generalization generalizations generalizability
41 | better best
42 | result results
43 | analyze analyzed analysis
44 | explain explained explanation explanations
45 | interpret interpretability interpretations interpretable
46 | bound bounds bounded
47 | benefit benefited
48 | apply applied
49 |
--------------------------------------------------------------------------------
/tagger/helper/annotator_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Input is a string, we split it into sentences
3 | """
4 | from typing import List
5 |
6 | import nltk
7 |
8 |
9 | def endswith(sent: str, extensions: List[str]):
10 | for extension in extensions:
11 | if sent.endswith(extension):
12 | return True
13 | return False
14 |
15 |
16 | def contain_open_bracket(text: str):
17 | has_open_bracket = False
18 | for c in text:
19 | if c == '(':
20 | has_open_bracket = True
21 | if has_open_bracket and c == ')':
22 | has_open_bracket = False
23 | return has_open_bracket
24 |
25 |
26 | def get_sents(text: str) -> List[str]:
27 | """ Give a text string, return the sentence list """
28 | # Here are some heuristics that we use to get appropriate sentence splitter.
29 | # 1. combine sentences with its successor when certain conditions satisfied
30 | sent_list: List[str] = nltk.tokenize.sent_tokenize(text)
31 | new_sent_list = [sent.replace("\n", "") for sent in sent_list]
32 | postprocessed = []
33 | buff = ""
34 | for sent in new_sent_list:
35 | if endswith(sent, ['i.e.', 'i.e .', 'e.g.', 'e.g .', 'resp.', 'resp .',
36 | 'et al.', 'et al .', 'i.i.d.', 'i.i.d .', 'Eq.',
37 | 'Eq .', 'eq.', 'eq .', 'incl.', 'incl .', 'Fig.',
38 | 'Fig .', 'w.r.t.', 'w.r.t .', 'sec.', 'sec .',
39 | 'Sec.', 'Sec .']) or len(sent) < 10 \
40 | or contain_open_bracket(sent):
41 | buff += sent
42 | else:
43 | postprocessed.append(buff + sent)
44 | buff = ""
45 | if len(buff) > 0:
46 | postprocessed.append(buff)
47 | return postprocessed
48 |
--------------------------------------------------------------------------------
/tagger/README.md:
--------------------------------------------------------------------------------
1 | # Aspect Tagger
2 | We define 8 aspects: **Summary**, **Motivation/Impact**, **Originality**, **Soundness/Correctness**, **Substance**, **Replicability**, **Meaningful Comparison**, and **Clarity**. Our tagger labels the spans that indicate these aspects, together with sentiment polarity (e.g. positive originality). Library requirements are listed in **requirements.txt**; please install the exact versions in order to use our trained tagger.
3 |
4 |
5 |
6 | ## Batch Annotation
7 | For batch annotation, please follow the format shown in **sample.txt** to prepare your data: one line per review. Batch annotation supports both CPU and GPU, but we strongly recommend a GPU for efficiency.
8 |
9 | As an example, to prepare the input data for the tagger, run
10 | ```bash
11 | sh prepare.sh sample.txt
12 | ```
13 |
14 | This will produce a **test.txt** file and an **id.txt** file, which are fed into our tagger and also used later for alignment.
15 |
16 | To tag the prepared file **test.txt**, run
17 | ```bash
18 | python run_tagger.py config.json
19 | ```
20 |
21 | This will write the results to **seqlab_final/test_predictions.txt**. To further clean the results and apply our heuristic rules, run
22 | ```bash
23 | sh post_process.sh
24 | ```
25 |
26 | The results will be written to **result.jsonl**, one line per review.
27 |
28 |
29 |
30 | ## Direct Annotation
31 | In addition to batch annotation, we also provide an interface to conveniently annotate a single review. See the usage below.
32 |
33 | ```python
34 | from annotator import Annotator
35 | annotator = Annotator('labels.txt', 'seqlab_final', 'cpu') # The last argument can be 'cpu' or 'gpu'.
36 | annotator.annotate('The paper is well written and easy to follow.') # the input is plain text.
37 | ```
38 |
39 | You should get the following output:
40 | ```python
41 | >>> annotator.annotate('The paper is well written and easy to follow.')
42 | [('The', 'clarity_positive'), ('paper', 'clarity_positive'), ('is', 'clarity_positive'), ('well', 'clarity_positive'), ('written', 'clarity_positive'), ('and', 'clarity_positive'), ('easy', 'clarity_positive'), ('to', 'clarity_positive'), ('follow', 'clarity_positive'), ('.', 'clarity_positive')]
43 |
44 | ```
45 |
46 |
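If you only have a handful of reviews, you can also loop the single-review interface over a file like **sample.txt**. The snippet below is a small illustrative sketch (not part of the pipeline above); for large inputs, prefer the batch pipeline:

```python
from annotator import Annotator

annotator = Annotator('labels.txt', 'seqlab_final', 'cpu')  # use 'gpu' if one is available

with open('sample.txt', 'r', encoding='utf8') as f:
    for line in f:
        review = line.strip()
        if not review:
            continue
        tagged = annotator.annotate(review)  # list of (token, label) tuples
        print(tagged[:10])
```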
--------------------------------------------------------------------------------
/scripts/gdown.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # Google Drive direct download of big files
4 | # ./gdown.pl 'gdrive file url' ['desired file name']
5 | #
6 | # v1.0 by circulosmeos 04-2014.
7 | # v1.1 by circulosmeos 01-2017.
8 | # v1.2, v1.3, v1.4 by circulosmeos 01-2019, 02-2019.
9 | # //circulosmeos.wordpress.com/2014/04/12/google-drive-direct-download-of-big-files
10 | # Distributed under GPL 3 (//www.gnu.org/licenses/gpl-3.0.html)
11 | #
12 | use strict;
13 | use POSIX;
14 |
15 | my $TEMP='gdown.cookie.temp';
16 | my $COMMAND;
17 | my $confirm;
18 | my $check;
19 | sub execute_command();
20 |
21 | my $URL=shift;
22 | die "\n./gdown.pl 'gdrive file url' [desired file name]\n\n" if $URL eq '';
23 |
24 | my $FILENAME=shift;
25 | $FILENAME='gdown.'.strftime("%Y%m%d%H%M%S", localtime).'.'.substr(rand,2) if $FILENAME eq '';
26 |
27 | if ($URL=~m#^https?://drive.google.com/file/d/([^/]+)#) {
28 | $URL="https://docs.google.com/uc?id=$1&export=download";
29 | }
30 | elsif ($URL=~m#^https?://drive.google.com/open\?id=([^/]+)#) {
31 | $URL="https://docs.google.com/uc?id=$1&export=download";
32 | }
33 |
34 | execute_command();
35 |
36 | while (-s $FILENAME < 100000) { # only if the file isn't the download yet
37 | open fFILENAME, '<', $FILENAME;
38 | $check=0;
39 | foreach (<fFILENAME>) {
40 | if (/href="(\/uc\?export=download[^"]+)/) {
41 | $URL='https://docs.google.com'.$1;
42 | $URL=~s/&amp;/&/g;
43 | $confirm='';
44 | $check=1;
45 | last;
46 | }
47 | if (/confirm=([^;&]+)/) {
48 | $confirm=$1;
49 | $check=1;
50 | last;
51 | }
52 | if (/"downloadUrl":"([^"]+)/) {
53 | $URL=$1;
54 | $URL=~s/\\u003d/=/g;
55 | $URL=~s/\\u0026/&/g;
56 | $confirm='';
57 | $check=1;
58 | last;
59 | }
60 | }
61 | close fFILENAME;
62 | die "Couldn't download the file :-(\n" if ($check==0);
63 | $URL=~s/confirm=([^;&]+)/confirm=$confirm/ if $confirm ne '';
64 |
65 | execute_command();
66 | }
67 |
68 | unlink $TEMP;
69 |
70 | sub execute_command() {
71 | $COMMAND="wget --progress=dot:giga --no-check-certificate --load-cookie $TEMP --save-cookie $TEMP \"$URL\"";
72 | $COMMAND.=" -O \"$FILENAME\"" if $FILENAME ne '';
73 | system ( $COMMAND );
74 | return 1;
75 | }
76 |
--------------------------------------------------------------------------------
/tagger/sample.txt:
--------------------------------------------------------------------------------
1 | The paper proposes a new memory access scheme based on Lie group actions for NTMs. Pros: * Well written * Novel addressing scheme as an extension to NTM. * Seems to work slightly better than normal NTMs. * Some interesting theory about the novel addressing scheme based on Lie groups. Cons: * In the results, the LANTM only seems to be slightly better than the normal NTM. * The result tables are a bit confusing. * No source code available. * The difference to the properties of normal NTM doesn't become too clear. Esp it is said that LANTM are better than NTM because they are differentiable end-to-end and provide a robust relative indexing scheme but NTM are also differentiable end-to-end and also provide a robust indexing scheme. * It is said that the head is discrete in NTM but actually it is in space R^n, i.e. it is already continuous. It doesn't become clear what is meant here. * No tests on real-world tasks, only some toy tasks. * No comparisons to some of the other NTM extensions such as D-NTM or Sparse Access Memory (SAM) (https://arxiv.org/abs/1610.09027). Although the motivations of other NTM extensions might be different, such comparisons still would have been interesting.
2 | The paper makes an interesting and timely contribution in investigating controlled dataset collection, and the impact of different axes of variation on object detection. In general, the community agrees on the importance of these questions, but there is very little work done to provide answers. As such, the originality and significance of the work is high. Clarity of the paper is also good, and release of the dataset and code should help with reproducibility. *** Post-rebuttal comments After reading the other reviews and the rebuttal, I am more convinced that this paper should be accepted. The rebuttal addressed concerns in a thoughtful and concrete manner.
3 | This paper conducts an empirical analysis of the effect of training data size on the model robustness to adversarial examples. The authors compared four different NN architectures using four different datasets for the task of image classification. Overall, the paper is easy to follow and clearly written. However, since Su et al., 2018, already presented similar findings, I do not see any major contribution in this paper. Additionally, I would expect the authors to conduct some more analysis of their results besides acc. and distortion levels. For examples, investigate the type of mistakes the models have made, compare models with the same test acc. but different amount of training data used to get there, some analysis/experiments to explain these findings (monitor models parameters/grads during training, etc.)
--------------------------------------------------------------------------------
/tagger/annotator.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | from transformers import (
4 | AutoConfig,
5 | AutoModelForTokenClassification,
6 | AutoTokenizer
7 | )
8 |
9 | from helper.annotator_utils import *
10 | from helper.utils import TokenClassifier, align_predictions
11 | import nltk
12 | from helper.heuristics import heuristics
13 |
14 | import logging
15 |
16 | logging.disable(logging.WARNING)
17 |
18 |
19 | class Annotator:
20 | def __init__(self, label_file, model_file_path, device):
21 | # get labels
22 | labels = []
23 | with open(label_file, 'r', encoding='utf8') as f:
24 | for line in f.readlines():
25 | labels.append(line.strip())
26 | self.labels = labels
27 |
28 | label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
29 | self.label_map = label_map
30 | num_labels = len(label_map)
31 |
32 | # get tokenizer
33 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_file_path)
34 |
35 | # get model config
36 | config = AutoConfig.from_pretrained(
37 | pretrained_model_name_or_path=model_file_path,
38 | num_labels=num_labels,
39 | id2label=label_map,
40 | label2id={label: i for i, label in enumerate(labels)}
41 | )
42 |
43 | # get model
44 | self.model = AutoModelForTokenClassification.from_pretrained(
45 | pretrained_model_name_or_path=model_file_path,
46 | from_tf=False,
47 | config=config
48 | )
49 |
50 | if device == 'gpu':
51 | self.model.cuda()
52 |
53 | # init the token classifier
54 | self.token_classifier = TokenClassifier(
55 | tokenizer=self.tokenizer,
56 | model=self.model,
57 | labels=self.labels
58 | )
59 |
60 | def prepare_inputs(self, text: str) -> List[List[str]]:
61 | sents = get_sents(text)
62 | new_sents = [nltk.word_tokenize(sent) for sent in sents]
63 | return new_sents
64 |
65 | def annotate(self, text):
66 | inputs = self.prepare_inputs(text)
67 | preds, label_ids = self.token_classifier.classify_token(inputs)
68 | preds_list, _ = align_predictions(preds, label_ids, self.label_map)
69 | assert len(inputs) == len(preds_list)
70 | output = []
71 | for words, labels, label_id in zip(inputs, preds_list, label_ids):
72 | assert len(words) == len(labels) or len(label_id) == 512
73 | if len(words) != len(labels):
74 | max_len = len(words)
75 | while len(labels) < max_len:
76 | labels.append('O')
77 | assert len(words) == len(labels)
78 | for word, label in zip(words, labels):
79 | output.append((word, label))
80 | output = heuristics(output)
81 | return output
82 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | 
3 |
4 | ### Authors: [Weizhe Yuan](https://yyy-apple.github.io/), [Pengfei Liu](http://pfliu.com/), [Graham Neubig](http://www.phontron.com/)
5 |
6 |
7 |
8 | ## Updates
9 | * 2021-10-22: Updated extractor.
10 | * 2021-3-11: Updated tagger.
11 |
12 |
13 | ## Outline
14 | * #### [Motivation](https://github.com/neulab/ReviewAdvisor#motivation)
15 | * #### [Online Demo](https://github.com/neulab/ReviewAdvisor#demo)
16 | * #### [Dataset: ASAP-Review](https://github.com/neulab/ReviewAdvisor#dataset)
17 | * #### [Evaluation](https://github.com/neulab/ReviewAdvisor#evaluation)
18 | * #### [Model: ReviewAdvisor](https://github.com/neulab/ReviewAdvisor#model)
19 | * #### [Fairness: Bias Analysis](https://github.com/neulab/ReviewAdvisor#bias)
20 | * #### [Rethinking: Challenges](https://github.com/neulab/ReviewAdvisor#challenges)
21 | * #### [Acknowledgement](https://github.com/neulab/ReviewAdvisor#acknowledgement-1)
22 | * #### [Bib](https://github.com/neulab/ReviewAdvisor#bib-1)
23 |
24 |
25 | ## Motivation
26 |
27 | * #### Can We Automate Scientific Reviewing?
28 | * #### How will we know if we succeed?
29 | * #### Heavy workload on reviewing papers? ReviewAdvisor helps out!
30 |
31 |
32 | ## Demo
33 |
34 | (The demo is currently offline due to computational constraints. We will restart it once sufficient computational resources are available.)
35 |
36 | #### Have a [TRY](http://review.nlpedia.ai/)
37 |
38 |
39 |
40 |
41 | ## Dataset
42 | (The dataset is under Apache-2.0 license)
43 |
44 | To download our dataset, run
45 | ```bash
46 | sh download_dataset.sh
47 | ```
48 | This creates a **dataset** folder that contains the data. Please read the README.md in the **dataset** folder for more details.
49 |
50 | To use our trained tagger, run
51 | ```bash
52 | sh download_tagger.sh
53 | ```
54 | This creates a **seqlab_final** folder that contains our trained tagger. Move it into the **tagger** folder of this repo. Detailed usage of the tagger is described [here](https://github.com/neulab/ReviewAdvisor/tree/main/tagger).
55 |
56 |
57 | ## Evaluation
58 | We first define two high-level objectives of scientific peer review, following [Jefferson et al. (2002a)](https://jamanetwork.com/journals/jama/fullarticle/194989) and [Smith (2006)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1420798/):
59 |
60 | 1. Selecting high-quality submissions for publication.
61 | 2. Improving different aspects of a paper by providing detailed comments.
62 |
63 | We then take a first step towards review evaluation for scientific papers by proposing seven quantifiable metrics that characterize a review from multiple perspectives. The evaluation metrics are shown in Fig.1, where red denotes absolute metrics and blue denotes relative metrics.
64 |
65 |
66 |
67 | ## Model
68 | We decompose review generation into two steps, adopting an *extract-then-generate* paradigm.
69 |
70 | We consider three extraction strategies, visualized in Fig.2:
71 | 1. Section-based Extraction
72 | 2. Cross-entropy Method Extraction (see [`extractor`](extractor) folder for details)
73 | 3. Hybrid Extraction
74 |
75 |
76 |
77 | We consider two generation frameworks, visualized in Fig.3:
78 | 1. Vanilla Sequence-to-Sequence
79 | 2. Joint Sequence-to-Sequence and Sequence Labeling
80 |
81 |
82 |
83 |
84 |
85 | ## Bias
86 | To characterize potential bias in reviews, we define the aspect score of an aspect as the percentage of that aspect's occurrences in a review that are positive.
87 |
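As an illustration of this definition (our example, using the tagger's label names): if a review contains four spans tagged for clarity and three of them are `clarity_positive`, its clarity aspect score is 3/4 = 0.75. A small sketch over the tagger's token-level output, assuming a hypothetical helper of our own:

```python
from collections import Counter

def aspect_score(tags, aspect):
    """Fraction of an aspect's tagged tokens that are positive.
    `tags` is the (token, label) list returned by the tagger's Annotator.annotate()."""
    counts = Counter(label for _, label in tags if label.startswith(aspect))
    total = sum(counts.values())
    return counts[f"{aspect}_positive"] / total if total else None
```
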
88 | ### Absolute bias
89 | Absolute bias can be visualized directly by plotting the aspect scores of different groups with respect to each aspect. An example is shown in Fig.4.
90 |
91 |
92 |
93 |
94 |
95 | ### Relative bias
96 | Relative bias quantifies how differently human reviewers and system reviewers favor certain groups. We define a metric called disparity difference to measure relative bias; its calculation is visualized in Fig.5.
97 |
98 |
99 |
100 |
101 | ## Challenges
102 |
103 | We summarize eight challenges in the pursuit of a good review generation system.
104 | 1. Model
105 | - Long Document Modeling
106 | - Pre-trained Model for Scientific Domain
107 | - Structure Information
108 | - External Knowledge
109 | 2. Dataset
110 | - More Open, Fine-grained Review Data
111 | - More Accurate and Powerful Scientific Paper Parser
112 | 3. Evaluation
113 | - Fairness and Bias in Generated Text
114 | - Reliability
115 |
116 |
117 |
118 | ## Acknowledgement
119 |
120 | #### This work could not have been accomplished without the help of many researchers.
121 | We would like to thank people for their generous support, especially,
122 | Gábor Berend, Zhouhan Lin, William W. Cohen, Pengcheng Yin, Tiange Luo, Yuki M. Asano, Junjie Yan, Tuomas Haarnoja, Dandan Guo, Jie Fu, Lei Chen, Jinlan Fu, Jiapeng Wu, Yiran Chen, Wenshan Wang, Ziyi Dou, Yixin Liu, Junxian He, Bahetiyaer Bare, Saizheng Zhang, Jiateng Xie, Spyros Gidaris, Marco Federici, Junji Dai, Zihuiwen Ye, Jie Zhou, Yufang Liu, Yue Zhang, Ruifeng Xu, Zhenghua Li, Chunting Zhou, Yang Wei, Jiahao Wang, Bowen Tan, Anda Zhou.
123 |
124 |
125 | ## Bib
126 | ```
127 | @misc{yuan2021automate,
128 | title={Can We Automate Scientific Reviewing?},
129 | author={Weizhe Yuan and Pengfei Liu and Graham Neubig},
130 | year={2021},
131 | eprint={2102.00176},
132 | archivePrefix={arXiv},
133 | primaryClass={cs.CL}
134 | }
135 | ```
136 |
--------------------------------------------------------------------------------
/tagger/tasks.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from typing import List, TextIO, Union
4 |
5 | from conllu import parse_incr
6 |
7 | from helper.utils_batch import InputExample, Split, TokenClassificationTask
8 |
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | class NER(TokenClassificationTask):
14 | def __init__(self, label_idx=-1):
15 | # in NER datasets, the last column is usually reserved for NER label
16 | self.label_idx = label_idx
17 |
18 | def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
19 | if isinstance(mode, Split):
20 | mode = mode.value
21 | file_path = os.path.join(data_dir, f"{mode}.txt")
22 | guid_index = 1
23 | examples = []
24 | with open(file_path, encoding="utf-8") as f:
25 | words = []
26 | labels = []
27 | for line in f:
28 | if line.startswith("-DOCSTART-") or line == "" or line == "\n":
29 | if words:
30 | examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
31 | guid_index += 1
32 | words = []
33 | labels = []
34 | else:
35 | splits = line.split(" ")
36 | words.append(splits[0])
37 | if len(splits) > 1:
38 | labels.append(splits[self.label_idx].replace("\n", ""))
39 | else:
40 | # Examples could have no label for mode = "test"
41 | labels.append("O")
42 | if words:
43 | examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
44 | return examples
45 |
46 | def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
47 | example_id = 0
48 | for line in test_input_reader:
49 | if line.startswith("-DOCSTART-") or line == "" or line == "\n":
50 | writer.write(line)
51 | if not preds_list[example_id]:
52 | example_id += 1
53 | elif preds_list[example_id]:
54 | output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
55 | writer.write(output_line)
56 | else:
57 | logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
58 |
59 | def get_labels(self, path: str) -> List[str]:
60 | if path:
61 | with open(path, "r") as f:
62 | labels = f.read().splitlines()
63 | if "O" not in labels:
64 | labels = ["O"] + labels
65 | return labels
66 | else:
67 | return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
68 |
69 |
70 | class Chunk(NER):
71 | def __init__(self):
72 | # in CONLL2003 dataset chunk column is second-to-last
73 | super().__init__(label_idx=-2)
74 |
75 | def get_labels(self, path: str) -> List[str]:
76 | if path:
77 | with open(path, "r") as f:
78 | labels = f.read().splitlines()
79 | if "O" not in labels:
80 | labels = ["O"] + labels
81 | return labels
82 | else:
83 | return [
84 | "O",
85 | "B-ADVP",
86 | "B-INTJ",
87 | "B-LST",
88 | "B-PRT",
89 | "B-NP",
90 | "B-SBAR",
91 | "B-VP",
92 | "B-ADJP",
93 | "B-CONJP",
94 | "B-PP",
95 | "I-ADVP",
96 | "I-INTJ",
97 | "I-LST",
98 | "I-PRT",
99 | "I-NP",
100 | "I-SBAR",
101 | "I-VP",
102 | "I-ADJP",
103 | "I-CONJP",
104 | "I-PP",
105 | ]
106 |
107 |
108 | class POS(TokenClassificationTask):
109 | def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
110 | if isinstance(mode, Split):
111 | mode = mode.value
112 | file_path = os.path.join(data_dir, f"{mode}.txt")
113 | guid_index = 1
114 | examples = []
115 |
116 | with open(file_path, encoding="utf-8") as f:
117 | for sentence in parse_incr(f):
118 | words = []
119 | labels = []
120 | for token in sentence:
121 | words.append(token["form"])
122 | labels.append(token["upos"])
123 | assert len(words) == len(labels)
124 | if words:
125 | examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
126 | guid_index += 1
127 | return examples
128 |
129 | def write_predictions_to_file(self, writer: TextIO, test_input_reader: TextIO, preds_list: List):
130 | example_id = 0
131 | for sentence in parse_incr(test_input_reader):
132 | s_p = preds_list[example_id]
133 | out = ""
134 | for token in sentence:
135 | out += f'{token["form"]} ({token["upos"]}|{s_p.pop(0)}) '
136 | out += "\n"
137 | writer.write(out)
138 | example_id += 1
139 |
140 | def get_labels(self, path: str) -> List[str]:
141 | if path:
142 | with open(path, "r") as f:
143 | return f.read().splitlines()
144 | else:
145 | return [
146 | "ADJ",
147 | "ADP",
148 | "ADV",
149 | "AUX",
150 | "CCONJ",
151 | "DET",
152 | "INTJ",
153 | "NOUN",
154 | "NUM",
155 | "PART",
156 | "PRON",
157 | "PROPN",
158 | "PUNCT",
159 | "SCONJ",
160 | "SYM",
161 | "VERB",
162 | "X",
163 | ]
164 |
--------------------------------------------------------------------------------
/tagger/helper/split.py:
--------------------------------------------------------------------------------
1 | # %% This script automatically converts a jsonl file to sequence-labeling format.
2 | # How many sentences are concatenated together is specified via a command-line parameter.
3 |
4 | import fire
5 | import jsonlines
6 | from transformers import AutoTokenizer
7 | from typing import List
8 | import nltk
9 |
10 |
11 | def endswith(sent: str, extensions: List[str]):
12 | for extension in extensions:
13 | if sent.endswith(extension):
14 | return True
15 | return False
16 |
17 |
18 | def contain_open_bracket(text: str):
19 | has_open_bracket = False
20 | for c in text:
21 | if c == '(':
22 | has_open_bracket = True
23 | if has_open_bracket and c == ')':
24 | has_open_bracket = False
25 | return has_open_bracket
26 |
27 |
28 | def get_sents(text: str) -> List[str]:
29 | """ Give a text string, return the sentence list """
30 | # Here are some heuristics that we use to get appropriate sentence splitter.
31 | # 1. combine sentences with its successor when certain conditions satisfied
32 | sent_list: List[str] = nltk.tokenize.sent_tokenize(text)
33 | new_sent_list = [sent.replace("\n", "") for sent in sent_list]
34 | postprocessed = []
35 | buff = ""
36 | for sent in new_sent_list:
37 | if endswith(sent, ['i.e.', 'i.e .', 'e.g.', 'e.g .', 'resp.', 'resp .',
38 | 'et al.', 'et al .', 'i.i.d.', 'i.i.d .', 'Eq.',
39 | 'Eq .', 'eq.', 'eq .', 'incl.', 'incl .', 'Fig.',
40 | 'Fig .', 'w.r.t.', 'w.r.t .', 'sec.', 'sec .',
41 | 'Sec.', 'Sec .']) or len(sent) < 10 \
42 | or contain_open_bracket(sent):
43 | buff += sent
44 | else:
45 | postprocessed.append(buff + sent)
46 | buff = ""
47 | if len(buff) > 0:
48 | postprocessed.append(buff)
49 | return postprocessed
50 |
51 |
52 | def get_aligned_data(json_line, tokenizer: AutoTokenizer):
53 | """ Get sentence tokens with its corresponding aspect tag.
54 | Return sth like this:
55 | [['ICLR_2017_1',
56 | ('this', 'clarity_positive'),
57 | ('paper','clarity_positive'),
58 | ('is', 'clarity_positive'),
59 | ('well', 'clarity_positive'),
60 | ('written', 'clarity_positive'),
61 | ('and', 'clarity_positive'),
62 | ('easy', 'clarity_positive'),
63 | ('to', 'clarity_positive'),
64 | ('follow', 'clarity_positive')],
65 | ...
66 | ]
67 | """
68 | paper_id: str = json_line.get('id')
69 | text: str = json_line.get('text')
70 | labels: List = json_line.get('labels')
71 |
72 | sents = get_sents(text)
73 | split_sent_list = []
74 | for sent in sents:
75 | split_sent_list.append(nltk.word_tokenize(sent))
76 |
77 | pointer = 0
78 | aligned_review = []
79 | for sent in split_sent_list:
80 | align_list = [paper_id]
81 |
82 | for token in sent:
83 | # We substitute the token if it cannot be tokenized by Bert
84 | current_subwords_len = len(tokenizer(token).get('input_ids'))
85 | if current_subwords_len == 0:
86 | token = 'sp_tok'
87 |
88 | start = pointer + text[pointer:].find(token) # start of a token
89 | pointer = start + len(token) # end of a token
90 | has_aspect = False
91 | for label_list in labels:
92 | label_start = label_list[0]
93 | label_end = label_list[1]
94 | label_text = label_list[2]
95 |
96 | if label_start <= start and pointer <= label_end:
97 | align_list.append((token, label_text))
98 | has_aspect = True
99 | break
100 |
101 | if not has_aspect:
102 | align_list.append((token, 'O'))
103 |
104 | if len(align_list) > 1:
105 | aligned_review.append(align_list)
106 |
107 | return aligned_review
108 |
109 |
110 | def concate_sentences(aligned_review: List, num: int):
111 | new_aligned_review = []
112 | paper_id = aligned_review[0][0]
113 | for i in range(0, len(aligned_review), num):
114 | align_list = [paper_id]
115 | sents = aligned_review[i: i + num]
116 | for sent in sents:
117 | align_list += sent[1:]
118 | new_aligned_review.append(align_list)
119 | return new_aligned_review
120 |
121 |
122 | def split(json_line, tokenizer: AutoTokenizer, num: int):
123 | aligned_review = get_aligned_data(json_line, tokenizer)
124 | new_aligned_review = concate_sentences(aligned_review, num)
125 | return new_aligned_review
126 |
127 |
128 | def read_jsonlines(jsonl_file):
129 | out = []
130 | with jsonlines.open(jsonl_file, 'r') as reader:
131 | for obj in reader:
132 | out.append(obj)
133 | return out
134 |
135 |
136 | def reformat(sent_file):
137 | # replace 2 consecutive blank lines with 1 blank line
138 | with open(sent_file, "r", encoding="utf8") as f:
139 | data = f.read()
140 | data = data.replace("\n\n\n", "\n\n")
141 | with open(sent_file, "w", encoding="utf8") as f:
142 | f.write(data)
143 |
144 |
145 | def main(jsonl_file, num: int, seqlab_file_name, id_file_name):
146 | """
147 | Given a jsonl file, format into seqlab file with id file.
148 | :param jsonl_file: input jsonl file
149 | :param num: the number of sentences to concatenate together
150 | :param seqlab_file_name: the output seqlab format file
151 | :param id_file_name: the output id file
152 | """
153 | tokenizer = AutoTokenizer.from_pretrained("bert-large-cased")
154 | out = []
155 | json_lines = read_jsonlines(jsonl_file)
156 | for json_line in json_lines:
157 | out += split(json_line, tokenizer, num)
158 |
159 | # write file
160 | # token and label is separated by white space
161 | id_list = []
162 | sent_list = []
163 | for data in out:
164 | # data: [id, (token, aspect), (token, aspect), ...]
165 | paper_id = data[0]
166 | id_list.append(paper_id)
167 | for elem in data[1:]:
168 | line = elem[0] + " " + elem[1]
169 | sent_list.append(line)
170 | sent_list.append('\n')
171 |
172 | id_file = open(id_file_name, 'w')
173 | for elem in id_list:
174 | print(elem, file=id_file)
175 | id_file.flush()
176 |
177 | seqlab_file = open(seqlab_file_name, 'w')
178 | for elem in sent_list:
179 | print(elem, file=seqlab_file)
180 | seqlab_file.flush()
181 |
182 | reformat(seqlab_file_name)
183 |
184 |
185 | if __name__ == '__main__':
186 | # python split.py human_labeled.jsonl 1 review_with_aspect.txt id.txt
187 | fire.Fire(main)
188 |
--------------------------------------------------------------------------------
/extractor/extractor_utils.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import json
3 | import sys
4 | from collections import Counter
5 | from typing import List
6 |
7 | import nltk
8 | import numpy as np
9 | import traceback
10 | from nltk.corpus import stopwords
11 | from nltk.stem import PorterStemmer
12 |
13 | sys.setrecursionlimit(1000000)
14 | nltk.download('stopwords')
15 | nltk.download('punkt')
16 | stemming = PorterStemmer()
17 | stops = set(stopwords.words("english"))
18 |
19 |
20 | # first read keywords table
21 | def read_keywords(keywords_file):
22 | keywords = []
23 | with open(keywords_file, 'r', encoding='utf8') as f:
24 | for line in f.readlines():
25 | line = line.strip()
26 | keywords += line.split(" ")
27 | return keywords
28 |
29 |
30 | # then read parameters table
31 | def read_parameters(parameters_file):
32 | parameters = []
33 | with open(parameters_file, 'r', encoding='utf8') as f:
34 | for line in f.readlines():
35 | line = line.strip()
36 | _, init_p, init_n = line.split(' ')
37 | parameters.append((float(init_p), int(init_n)))
38 | return parameters
39 |
40 |
41 | def apply_cleaning_function_to_list(X):
42 | cleaned_X = []
43 | for element in X:
44 | cleaned_X.append(clean_text(element))
45 | return cleaned_X
46 |
47 |
48 | def clean_text(raw_text):
49 | """This function works on a raw text string, and:
50 | 1) changes to lower case
51 | 2) tokenizes (breaks down into words)
52 | 3) removes punctuation and non-word text
53 | 4) finds word stems
54 | 5) removes stop words
55 | 6) rejoins meaningful stem words"""
56 |
57 | # Convert to lower case
58 | text = raw_text.lower()
59 | # Tokenize
60 | tokens = nltk.word_tokenize(text)
61 | # Keep only words (removes punctuation + numbers)
62 | # use .isalnum to keep also numbers
63 | token_words = [w for w in tokens if w.isalpha()]
64 | # Stemming
65 | stemmed_words = [stemming.stem(w) for w in token_words]
66 | # Remove stop words
67 | meaningful_words = [w for w in stemmed_words if not w in stops]
68 | # Rejoin meaningful stemmed words
69 | joined_words = (" ".join(meaningful_words))
70 | # Return cleaned data
71 | return joined_words
72 |
73 |
74 | def get_full_text(paper_json):
75 | full_text = ""
76 | with open(paper_json, 'r', encoding='utf8') as f:
77 | content_dict = json.loads(f.read())
78 | sections = content_dict.get('metadata').get('sections')
79 | for section in sections:
80 | heading: str = section.get('heading')
81 | text: str = section.get('text')
82 | if heading is not None:
83 | if heading.upper().__contains__('ACKNOW') or heading.upper().__contains__('APPEN'):
84 | break
85 | if text is not None and len(text) > 0:
86 | full_text += text + " "
87 | full_text = full_text.replace("\n", " ").encode("utf-8", "ignore").decode("utf-8").strip()
88 | return full_text
89 |
90 |
91 | # look how the filtering works
92 | def get_sents(text: str) -> (List, List):
93 | """ give a text string, return the sentence list """
94 | # Here are some heuristics that we use to get appropriate sentence splitter.
95 | # 1. Delete sentences that are fewer than 25 characters.
96 | # 2. If a sentence ends in et al. Then concate with the sentence behind it.
97 | sent_list: List[str] = nltk.tokenize.sent_tokenize(text)
98 | new_sent_list = [sent.replace("\n", "") for sent in sent_list]
99 | postprocessed = []
100 | buff = ""
101 | for sent in new_sent_list:
102 | if sent.endswith('et al.') or sent.endswith('Eq.') \
103 | or sent.endswith('i.e.') or sent.endswith('e.g.'):
104 | buff += sent
105 | else:
106 | if len(buff + sent) > 25 and \
107 | not (buff + sent).__contains__('arxiv') and \
108 | not (buff + sent).__contains__('http'):
109 | postprocessed.append(buff + sent)
110 | buff = ""
111 | if len(buff) > 0:
112 | postprocessed.append(buff)
113 | cleaned_sent_list = apply_cleaning_function_to_list(postprocessed[:250])
114 | return postprocessed[:250], cleaned_sent_list
115 |
116 |
117 | def keywords_filtering(text: str, keywords: List[str]) -> (List[str], List[str]):
118 | sents, cleaned_sents = get_sents(text)
119 | filtered_sents = []
120 | cleaned_filtered_sents = []
121 | for sent, clean_sent in zip(sents, cleaned_sents):
122 | words = nltk.word_tokenize(sent)
123 | for word in words:
124 | if word in keywords:
125 | filtered_sents.append(sent)
126 | cleaned_filtered_sents.append(clean_sent)
127 | break
128 | return filtered_sents, cleaned_filtered_sents
129 |
130 |
131 | def score(sample: np.array, sent_list: List[str]) -> float:
132 | final_text = get_text(sample, sent_list)
133 | return get_score(final_text)
134 |
135 |
136 | def get_text(sample: np.array, sent_list: List[str]) -> str:
137 | final_text = ""
138 | for idx in range(0, len(sample)):
139 | if sample[idx] == 1:
140 | final_text += sent_list[idx] + " "
141 | final_text = final_text.strip()
142 | return final_text
143 |
144 |
145 | def get_score(text: str) -> float:
146 | words = nltk.word_tokenize(text)
147 | summ_len = len(words)
148 | counter = Counter(words)
149 | v = np.array(list(counter.values())) / summ_len
150 | return float(np.matmul(-v, np.log2(v)))
151 |
152 |
153 | def isAllZeroOrOne(array):
154 | """ Use to check convergence """
155 | for elem in array:
156 | if elem != 1.0 and elem != 0.0:
157 | return False
158 | return True
159 |
160 |
161 | def CEmethod(sent_list: List[str], N=10000, init_p=0.5, rho=0.05, alpha=0.7, iter=100) -> np.array:
162 | try:
163 | p = np.array([init_p] * len(sent_list))
164 | early_stop_step = 0
165 | gamma_old = 0.0
166 | for i in range(iter):
167 | if i >= 1:
168 | N = 1000
169 | samples = [np.random.binomial(1, p=p) for j in range(N)]
170 | scored_samples = [(sample, score(sample, sent_list)) for sample in samples if sample.sum() <= 30]
171 |
172 | while len(scored_samples) == 0:
173 | samples = [np.random.binomial(1, p=p) for j in range(N)]
174 | scored_samples = [(sample, score(sample, sent_list)) for sample in samples if sample.sum() <= 30]
175 |
176 | # np.quantile does not require a sorted input
177 | gamma = np.quantile([x[1] for x in scored_samples], 1 - rho)
178 |
179 | valid_samples = [sample[0] for sample in scored_samples if sample[1] >= gamma]
180 |
181 | # Relax the gamma a little bit due to floating point precision issue
182 | closeness = 0.0000000000001
183 | while len(valid_samples) == 0:
184 | valid_samples = [sample[0] for sample in scored_samples if sample[1] >= gamma - closeness]
185 | closeness *= 10
186 |
187 | new_p = sum(valid_samples) / len(valid_samples)
188 |
189 | if gamma == gamma_old:
190 | early_stop_step += 1
191 |
192 | p = alpha * p + (1 - alpha) * new_p
193 | gamma_old = gamma
194 |
195 | if early_stop_step >= 3 or isAllZeroOrOne(p):
196 | break
197 | return p
198 |
199 | except Exception:
200 | return np.array([0] * len(sent_list))
201 |
--------------------------------------------------------------------------------
/tagger/helper/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Adapted from Huggingface token-classification task
3 |
4 | from dataclasses import dataclass
5 | from typing import List, Optional
6 |
7 | import numpy as np
8 | import torch
9 | from torch import nn
10 | from transformers import PreTrainedTokenizer, AutoModelForTokenClassification
11 |
12 |
13 | @dataclass
14 | class InputExample:
15 | """
16 | A single training/test example for token classification.
17 |
18 | Args:
19 | guid: Unique id for the example.
20 | words: list. The words of the sequence.
21 | labels: (Optional) list. The labels for each word of the sequence. This should be
22 | specified for train and dev examples, but not for test examples.
23 | """
24 |
25 | guid: str
26 | words: List[str]
27 | labels: Optional[List[str]]
28 |
29 |
30 | def convert_examples_to_features(
31 | examples: List[InputExample],
32 | label_list: List[str],
33 | max_seq_length: int,
34 | tokenizer: PreTrainedTokenizer,
35 | cls_token="[CLS]",
36 | cls_token_segment_id=1,
37 | sep_token="[SEP]",
38 | pad_token=0,
39 | pad_token_segment_id=0,
40 | pad_token_label_id=-100,
41 | sequence_a_segment_id=0,
42 | ):
43 | """Loads a data file into a list of `InputFeatures`
44 | `cls_token_at_end` define the location of the CLS token:
45 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
46 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
47 | `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
48 | """
49 | label_map = {label: i for i, label in enumerate(label_list)}
50 |
51 | features = []
52 | for (ex_index, example) in enumerate(examples):
53 |
54 | tokens = []
55 | label_ids = []
56 | for word, label in zip(example.words, example.labels):
57 | word_tokens = tokenizer.tokenize(word)
58 |
59 | # bert-base-multilingual-cased sometimes outputs "nothing" ([]) when calling tokenize with just a space.
60 | if len(word_tokens) > 0:
61 | tokens.extend(word_tokens)
62 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens
63 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
64 |
65 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
66 | special_tokens_count = tokenizer.num_special_tokens_to_add()
67 |
68 | # Truncate long sequence
69 | if len(tokens) > max_seq_length - special_tokens_count:
70 | tokens = tokens[: (max_seq_length - special_tokens_count)]
71 | label_ids = label_ids[: (max_seq_length - special_tokens_count)]
72 |
73 | # The convention in BERT is:
74 | # (a) For sequence pairs:
75 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
76 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
77 | # (b) For single sequences:
78 | # tokens: [CLS] the dog is hairy . [SEP]
79 | # type_ids: 0 0 0 0 0 0 0
80 | #
81 | # Where "type_ids" are used to indicate whether this is the first
82 | # sequence or the second sequence. The embedding vectors for `type=0` and
83 | # `type=1` were learned during pre-training and are added to the wordpiece
84 | # embedding vector (and position vector). This is not *strictly* necessary
85 | # since the [SEP] token unambiguously separates the sequences, but it makes
86 | # it easier for the model to learn the concept of sequences.
87 | #
88 | # For classification tasks, the first vector (corresponding to [CLS]) is
89 | # used as the "sentence vector". Note that this only makes sense because
90 | # the entire model is fine-tuned.
91 | tokens += [sep_token]
92 | label_ids += [pad_token_label_id]
93 |
94 | segment_ids = [sequence_a_segment_id] * len(tokens)
95 |
96 | tokens = [cls_token] + tokens
97 | label_ids = [pad_token_label_id] + label_ids
98 | segment_ids = [cls_token_segment_id] + segment_ids
99 |
100 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
101 |
102 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
103 | # tokens are attended to.
104 | input_mask = [1] * len(input_ids)
105 |
106 | # Zero-pad up to the sequence length.
107 | padding_length = max_seq_length - len(input_ids)
108 |
109 | # pad on the right
110 | input_ids += [pad_token] * padding_length
111 | input_mask += [0] * padding_length
112 | segment_ids += [pad_token_segment_id] * padding_length
113 | label_ids += [pad_token_label_id] * padding_length
114 |
115 | assert len(input_ids) == max_seq_length
116 | assert len(input_mask) == max_seq_length
117 | assert len(segment_ids) == max_seq_length
118 | assert len(label_ids) == max_seq_length
119 |
120 | features.append(
121 | {'input_ids': torch.tensor(input_ids, dtype=torch.long).unsqueeze(0),
122 | 'attention_mask': torch.tensor(input_mask, dtype=torch.long).unsqueeze(0),
123 | 'token_type_ids': torch.tensor(segment_ids, dtype=torch.long).unsqueeze(0),
124 | 'labels': torch.tensor(label_ids, dtype=torch.long).unsqueeze(0)}
125 | )
126 | return features
127 |
128 |
129 | def align_predictions(predictions: np.ndarray, label_ids: np.ndarray, label_map):
130 | preds = np.argmax(predictions, axis=2)
131 |
132 | batch_size, seq_len = preds.shape
133 |
134 | out_label_list = [[] for _ in range(batch_size)]
135 | preds_list = [[] for _ in range(batch_size)]
136 |
137 | for i in range(batch_size):
138 | for j in range(seq_len):
139 | if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
140 | out_label_list[i].append(label_map[label_ids[i][j]])
141 | preds_list[i].append(label_map[preds[i][j]])
142 |
143 | return preds_list, out_label_list
144 |
145 |
146 | class TokenClassifier:
147 | def __init__(
148 | self,
149 | tokenizer: PreTrainedTokenizer,
150 | model: AutoModelForTokenClassification,
151 | labels: List[str],
152 | max_seq_length=512
153 | ):
154 | self.tokenizer = tokenizer
155 | self.model = model
156 | self.model.eval()
157 | self.labels = labels
158 | self.max_seq_length = max_seq_length
159 | self.pad_token_label_id = nn.CrossEntropyLoss().ignore_index
160 |
161 | def prepare_features(self, sents: List[List[str]]):
162 | input_examples = []
163 | for i, sent in enumerate(sents):
164 | labels = ['O'] * len(sent)
165 | input_examples.append(InputExample(guid=f'{i}', words=sent, labels=labels))
166 |
167 | features = convert_examples_to_features(
168 | examples=input_examples,
169 | label_list=self.labels,
170 | max_seq_length=self.max_seq_length,
171 | tokenizer=self.tokenizer,
172 | cls_token=self.tokenizer.cls_token,
173 | cls_token_segment_id=0,
174 | sep_token=self.tokenizer.sep_token,
175 | pad_token=self.tokenizer.pad_token_id,
176 | pad_token_segment_id=self.tokenizer.pad_token_type_id,
177 | pad_token_label_id=self.pad_token_label_id
178 | )
179 | return features
180 |
181 | def classify_token(self, sents: List[List[str]]):
182 | features = self.prepare_features(sents)
183 |
184 | preds: torch.Tensor = None
185 | label_ids: torch.Tensor = None
186 |
187 | for inputs in features:
188 | for k, v in inputs.items():
189 | if isinstance(v, torch.Tensor):
190 | inputs[k] = v.to(self.model.device)
191 |
192 | with torch.no_grad():
193 | outputs = self.model(**inputs)
194 | step_eval_loss, logits = outputs[:2]
195 |
196 | # output predictions
197 | if preds is None:
198 | preds = logits.detach()
199 | else:
200 | preds = torch.cat((preds, logits.detach()), dim=0)
201 |
202 | if inputs.get('labels') is not None:
203 | if label_ids is None:
204 | label_ids = inputs['labels'].detach()
205 | else:
206 | label_ids = torch.cat((label_ids, inputs['labels'].detach()))
207 |
208 | # Finally, turn the aggregated tensors into numpy arrays.
209 | if preds is not None:
210 | preds = preds.cpu().numpy()
211 | if label_ids is not None:
212 | label_ids = label_ids.cpu().numpy()
213 | return preds, label_ids
214 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/tagger/run_tagger.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Fine-tuning the library models for named entity recognition on CoNLL-2003. """
17 | import logging
18 | import os
19 | import sys
20 | from dataclasses import dataclass, field
21 | from importlib import import_module
22 | from typing import Dict, List, Optional, Tuple
23 |
24 | import numpy as np
25 | from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
26 | from torch import nn
27 |
28 | from transformers import (
29 | AutoConfig,
30 | AutoModelForTokenClassification,
31 | AutoTokenizer,
32 | EvalPrediction,
33 | HfArgumentParser,
34 | Trainer,
35 | TrainingArguments,
36 | set_seed,
37 | )
38 | from helper.utils_batch import Split, TokenClassificationDataset, TokenClassificationTask
39 |
40 |
41 | logger = logging.getLogger(__name__)
42 |
43 |
44 | @dataclass
45 | class ModelArguments:
46 | """
47 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
48 | """
49 |
50 | model_name_or_path: str = field(
51 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
52 | )
53 | config_name: Optional[str] = field(
54 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
55 | )
56 | task_type: Optional[str] = field(
57 | default="NER", metadata={"help": "Task type to fine-tune in training (e.g. NER, POS, etc.)"}
58 | )
59 | tokenizer_name: Optional[str] = field(
60 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
61 | )
62 | use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
63 | # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
64 | # or just modify its tokenizer_config.json.
65 | cache_dir: Optional[str] = field(
66 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
67 | )
68 |
69 |
70 | @dataclass
71 | class DataTrainingArguments:
72 | """
73 | Arguments pertaining to what data we are going to input our model for training and eval.
74 | """
75 |
76 | data_dir: str = field(
77 | metadata={"help": "The input data dir. Should contain the .txt files for a CoNLL-2003-formatted task."}
78 | )
79 | labels: Optional[str] = field(
80 | default=None,
81 | metadata={"help": "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."},
82 | )
83 | max_seq_length: int = field(
84 | default=128,
85 | metadata={
86 | "help": "The maximum total input sequence length after tokenization. Sequences longer "
87 | "than this will be truncated, sequences shorter will be padded."
88 | },
89 | )
90 | overwrite_cache: bool = field(
91 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
92 | )
93 |
94 |
95 | def main():
96 | # See all possible arguments in src/transformers/training_args.py
97 | # or by passing the --help flag to this script.
98 | # We now keep distinct sets of args, for a cleaner separation of concerns.
99 |
100 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
101 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
102 | # If we pass only one argument to the script and it's the path to a json file,
103 | # let's parse it to get our arguments.
104 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
105 | else:
106 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
107 |
108 | if (
109 | os.path.exists(training_args.output_dir)
110 | and os.listdir(training_args.output_dir)
111 | and training_args.do_train
112 | and not training_args.overwrite_output_dir
113 | ):
114 | raise ValueError(
115 | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
116 | )
117 |
118 | module = import_module("tasks")
119 | try:
120 | token_classification_task_clazz = getattr(module, model_args.task_type)
121 | token_classification_task: TokenClassificationTask = token_classification_task_clazz()
122 | except AttributeError:
123 | raise ValueError(
124 | f"Task {model_args.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
125 | f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
126 | )
127 |
128 | # Setup logging
129 | logging.basicConfig(
130 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
131 | datefmt="%m/%d/%Y %H:%M:%S",
132 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
133 | )
134 | logger.warning(
135 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
136 | training_args.local_rank,
137 | training_args.device,
138 | training_args.n_gpu,
139 | bool(training_args.local_rank != -1),
140 | training_args.fp16,
141 | )
142 | logger.info("Training/evaluation parameters %s", training_args)
143 |
144 | # Set seed
145 | set_seed(training_args.seed)
146 |
147 | # Prepare the CoNLL-2003 task
148 | labels = token_classification_task.get_labels(data_args.labels)
149 | label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
150 | num_labels = len(labels)
151 |
152 | # Load pretrained model and tokenizer
153 | #
154 | # Distributed training:
155 | # The .from_pretrained methods guarantee that only one local process can concurrently
156 | # download model & vocab.
157 |
158 | config = AutoConfig.from_pretrained(
159 | model_args.config_name if model_args.config_name else model_args.model_name_or_path,
160 | num_labels=num_labels,
161 | id2label=label_map,
162 | label2id={label: i for i, label in enumerate(labels)},
163 | cache_dir=model_args.cache_dir,
164 | )
165 | tokenizer = AutoTokenizer.from_pretrained(
166 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
167 | cache_dir=model_args.cache_dir,
168 | use_fast=model_args.use_fast,
169 | )
170 | model = AutoModelForTokenClassification.from_pretrained(
171 | model_args.model_name_or_path,
172 | from_tf=bool(".ckpt" in model_args.model_name_or_path),
173 | config=config,
174 | cache_dir=model_args.cache_dir,
175 | )
176 |
177 | # Get datasets
178 | train_dataset = (
179 | TokenClassificationDataset(
180 | token_classification_task=token_classification_task,
181 | data_dir=data_args.data_dir,
182 | tokenizer=tokenizer,
183 | labels=labels,
184 | model_type=config.model_type,
185 | max_seq_length=data_args.max_seq_length,
186 | overwrite_cache=data_args.overwrite_cache,
187 | mode=Split.train,
188 | )
189 | if training_args.do_train
190 | else None
191 | )
192 | eval_dataset = (
193 | TokenClassificationDataset(
194 | token_classification_task=token_classification_task,
195 | data_dir=data_args.data_dir,
196 | tokenizer=tokenizer,
197 | labels=labels,
198 | model_type=config.model_type,
199 | max_seq_length=data_args.max_seq_length,
200 | overwrite_cache=data_args.overwrite_cache,
201 | mode=Split.dev,
202 | )
203 | if training_args.do_eval
204 | else None
205 | )
206 |
207 | def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
208 | preds = np.argmax(predictions, axis=2)
209 |
210 | batch_size, seq_len = preds.shape
211 |
212 | out_label_list = [[] for _ in range(batch_size)]
213 | preds_list = [[] for _ in range(batch_size)]
214 |
215 | for i in range(batch_size):
216 | for j in range(seq_len):
217 | if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
218 | out_label_list[i].append(label_map[label_ids[i][j]])
219 | preds_list[i].append(label_map[preds[i][j]])
220 |
221 | return preds_list, out_label_list
222 |
223 | def compute_metrics(p: EvalPrediction) -> Dict:
224 | preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
225 | return {
226 | "accuracy_score": accuracy_score(out_label_list, preds_list),
227 | "precision": precision_score(out_label_list, preds_list),
228 | "recall": recall_score(out_label_list, preds_list),
229 | "f1": f1_score(out_label_list, preds_list),
230 | }
231 |
232 | # Initialize our Trainer
233 | trainer = Trainer(
234 | model=model,
235 | args=training_args,
236 | train_dataset=train_dataset,
237 | eval_dataset=eval_dataset,
238 | compute_metrics=compute_metrics,
239 | )
240 |
241 | # Training
242 | if training_args.do_train:
243 | trainer.train(
244 | model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
245 | )
246 | trainer.save_model()
247 | # For convenience, we also re-save the tokenizer to the same directory,
248 | # so that you can share your model easily on huggingface.co/models =)
249 | if trainer.is_world_master():
250 | tokenizer.save_pretrained(training_args.output_dir)
251 |
252 | # Evaluation
253 | results = {}
254 | if training_args.do_eval:
255 | logger.info("*** Evaluate ***")
256 |
257 | result = trainer.evaluate()
258 |
259 | output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
260 | if trainer.is_world_master():
261 | with open(output_eval_file, "w") as writer:
262 | logger.info("***** Eval results *****")
263 | for key, value in result.items():
264 | logger.info(" %s = %s", key, value)
265 | writer.write("%s = %s\n" % (key, value))
266 |
267 | results.update(result)
268 |
269 | # Predict
270 | if training_args.do_predict:
271 | test_dataset = TokenClassificationDataset(
272 | token_classification_task=token_classification_task,
273 | data_dir=data_args.data_dir,
274 | tokenizer=tokenizer,
275 | labels=labels,
276 | model_type=config.model_type,
277 | max_seq_length=data_args.max_seq_length,
278 | overwrite_cache=data_args.overwrite_cache,
279 | mode=Split.test,
280 | )
281 |
282 | predictions, label_ids, metrics = trainer.predict(test_dataset)
283 | preds_list, _ = align_predictions(predictions, label_ids)
284 |
285 | output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
286 | if trainer.is_world_master():
287 | with open(output_test_results_file, "w") as writer:
288 | for key, value in metrics.items():
289 | logger.info(" %s = %s", key, value)
290 | writer.write("%s = %s\n" % (key, value))
291 |
292 | # Save predictions
293 | output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
294 | if trainer.is_world_master():
295 | with open(output_test_predictions_file, "w") as writer:
296 | with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
297 | token_classification_task.write_predictions_to_file(writer, f, preds_list)
298 |
299 | return results
300 |
301 |
302 | def _mp_fn(index):
303 | # For xla_spawn (TPUs)
304 | main()
305 |
306 |
307 | if __name__ == "__main__":
308 | main()
309 |
--------------------------------------------------------------------------------
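
As a quick illustration of the align_predictions logic above: the logits are reduced with argmax over the label dimension, and positions whose gold label id equals the cross-entropy ignore_index (-100, used for padding and word-piece continuations) are dropped before scoring with seqeval. The arrays and label map below are invented for the example.

```python
# Toy illustration of align_predictions: argmax over logits, then drop
# positions labelled with ignore_index (-100). All values are invented.
import numpy as np

label_map = {0: "O", 1: "summary"}
IGNORE = -100

logits = np.array([[[2.0, 0.1],    # -> O
                    [0.2, 1.5],    # -> summary
                    [0.0, 0.0]]])  # padded position
gold = np.array([[0, 1, IGNORE]])

preds = np.argmax(logits, axis=2)
preds_list = [[label_map[p] for p, g in zip(pr, gl) if g != IGNORE]
              for pr, gl in zip(preds, gold)]
out_label_list = [[label_map[g] for g in gl if g != IGNORE] for gl in gold]
print(preds_list, out_label_list)  # [['O', 'summary']] [['O', 'summary']]
```
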
/tagger/helper/heuristics.py:
--------------------------------------------------------------------------------
1 | # %%
2 | from typing import List
3 | import jsonlines
4 | import fire
5 |
6 |
7 | def pair_sent_with_id(id_file, sent_file):
8 | """ pair sentences with their corresponding id """
9 | paper_id_list = []
10 | with open(id_file, 'r', encoding='utf8') as f:
11 | for line in f.readlines():
12 | paper_id = line.strip()
13 | paper_id_list.append(paper_id)
14 |
15 | sent_list = [] # List[List[Tuple]]
16 | sent = []
17 | with open(sent_file, 'r', encoding='utf8') as f:
18 | for line in f.readlines():
19 | line = line.strip()
20 | if len(line) == 0:
21 | sent_list.append(sent)
22 | sent = []
23 | else:
24 | token, aspect = line.split(" ")
25 | sent.append((token, aspect))
26 | if len(sent) > 0:
27 | sent_list.append(sent)
28 |
29 | assert len(paper_id_list) == len(sent_list)
30 |
31 | # prepend the paper id to each sentence's token list
32 | sent_with_id = []
33 | for id, sent_elem in zip(paper_id_list, sent_list):
34 | list_to_add = [id] + sent_elem
35 | sent_with_id.append(list_to_add)
36 |
37 | return sent_with_id
38 |
39 |
40 | def reconstruct_doc(sent_with_id: List):
41 | # concatenate all sentences that belong to the same document
42 | doc_with_id = []
43 | current_id = sent_with_id[0][0]
44 | current_sent = sent_with_id[0]
45 | for l in sent_with_id[1:]:
46 | id = l[0]
47 | if id == current_id:
48 | current_sent += l[1:]
49 | else:
50 | doc_with_id.append(current_sent)
51 | current_id = id
52 | current_sent = l
53 |
54 | if len(current_sent) > 0:
55 | doc_with_id.append(current_sent)
56 | return doc_with_id
57 |
58 |
59 | def combine(tag_list):
60 | # merge consecutive spans that carry the same tag
61 | new_tag_list = []
62 | current_tag = tag_list[0][0]
63 | start = tag_list[0][1]
64 | end = tag_list[0][2]
65 | for elem in tag_list:
66 | if elem[0] == current_tag:
67 | end = elem[2]
68 | else:
69 | new_tag_list.append([current_tag, start, end])
70 | current_tag = elem[0]
71 | start = elem[1]
72 | end = elem[2]
73 | new_tag_list.append([current_tag, start, end])
74 | return new_tag_list
75 |
76 |
77 | def endsWithPunctuation(token: str) -> bool:
78 | if token.endswith('.') or token.endswith(','):
79 | return True
80 | else:
81 | return False
82 |
83 |
84 | def is_special_symbol(token: str) -> bool:
85 | if len(token) == 1:
86 | if not token.isdigit():
87 | if not token.isalpha():
88 | if token != "'":
89 | return True
90 | return False
91 |
92 |
93 | def heuristics(doc_with_id_elem):
94 | """ take a List[id, Tuple], return with a same format """
95 | paper_id = doc_with_id_elem[0]
96 | words_with_labels = doc_with_id_elem[1:]
97 | words = [x[0] for x in words_with_labels]
98 | labels = [x[1] for x in words_with_labels]
99 |
100 | # convert to [tag, start, end] format
101 | current_label = labels[0]
102 | label_list = []
103 | start = 0
104 | end = 0
105 |
106 | for i, label in enumerate(labels):
107 | if label == current_label:
108 | end = i
109 | else:
110 | span = [current_label, start, end]
111 | label_list.append(span)
112 | current_label = label
113 | start = i
114 | end = i
115 |
116 | span = [current_label, start, end]
117 | label_list.append(span)
118 |
119 | # Heuristic 1: If all tags between two summary spans are O (which
120 | # stands for Outside), then replace every tag between them with the
121 | # summary tag.
122 | if len(label_list) >= 3:
123 | for i in range(len(label_list) - 2):
124 | if label_list[i][0] == 'summary' \
125 | and label_list[i + 2][0] == 'summary' \
126 | and label_list[i + 1][0] == 'O':
127 | label_list[i + 1][0] = 'summary'
128 |
129 | label_list = combine(label_list)
130 |
131 | # Heuristic 2: If there are multiple text spans tagged as summary,
132 | # keep the first one and discard others.
133 | summary_appear = False
134 | for i in range(len(label_list)):
135 | if label_list[i][0] == 'summary' and not summary_appear:
136 | summary_appear = True
137 | continue
138 | if summary_appear:
139 | if label_list[i][0] == 'summary':
140 | label_list[i][0] = 'O'
141 |
142 | label_list = combine(label_list)
143 |
144 | # Heuristic 3: If a punctuation token is tagged on its own and its
145 | # tag differs from its neighbors', we change its tag to O.
146 | for i in range(len(label_list)):
147 | if label_list[i][1] == label_list[i][2]:
148 | current_word = words[label_list[i][1]]
149 | if current_word == ',' or current_word == '.':
150 | label_list[i][0] = 'O'
151 |
152 | label_list = combine(label_list)
153 |
154 | # Heuristic 4 & 5
155 | if len(label_list) >= 3:
156 | # Heuristic 4: If two spans with the same tag are separated by a single
157 | # span with a different tag, replace the middle tag with its right neighbor's tag.
158 | for i in range(len(label_list) - 2):
159 | if label_list[i][0] != 'summary' and label_list[i][0] != 'O' \
160 | and label_list[i][0] == label_list[i + 2][0] \
161 | and label_list[i + 1][0] != label_list[i][0]:
162 | if label_list[i + 1][0] != 'O' or label_list[i + 1][1] == label_list[i + 1][2]:
163 | label_list[i + 1][0] = label_list[i + 2][0]
164 | label_list = combine(label_list)
165 |
166 | # Heuristic 5: If a single token carries a non-O tag and both of its
167 | # neighbors are O, then change its tag to O.
168 | for i in range(1, len(label_list) - 1):
169 | if label_list[i][0] != 'O' and label_list[i][1] == label_list[i][2] \
170 | and label_list[i - 1][0] == 'O' and label_list[i + 1][0] == 'O':
171 | label_list[i][0] = 'O'
172 | label_list = combine(label_list)
173 |
174 | # Heuristic 6: For a non-summary, non-O span whose neighbors are O,
175 | # if the token just before its start / just after its end is not a
176 | # special symbol (e.g. punctuation or another single-character symbol),
177 | # expand the span outward until another non-O tag or a special symbol is met.
178 | new_labels = []
179 | for elem in label_list:
180 | new_labels += [elem[0]] * (elem[2] - elem[1] + 1)
181 |
182 | if len(label_list) >= 3:
183 | for i in range(1, len(label_list) - 1):
184 | if label_list[i][0] != 'O' and label_list[i][0] != 'summary':
185 | start = label_list[i][1]
186 | end = label_list[i][2]
187 | # from start
188 | if label_list[i - 1][0] == 'O' and start > 0 \
189 | and not is_special_symbol(words[start - 1]) \
190 | and not endsWithPunctuation(words[start - 1]):
191 | new_start = start
192 | while new_start > 0 and not is_special_symbol(words[new_start - 1]) \
193 | and not endsWithPunctuation(words[new_start - 1]) \
194 | and new_labels[new_start - 1] == 'O':
195 | new_start -= 1
196 | label_list[i - 1][2] = new_start - 1
197 | label_list[i][1] = new_start
198 | # spans with end < start may appear; they are deleted later
199 | # from end
200 | if label_list[i + 1][0] == 'O' and end < len(words) - 1 \
201 | and not is_special_symbol(words[end]) \
202 | and not endsWithPunctuation(words[end]):
203 | new_end = end
204 | while new_end < len(words) - 1 and not is_special_symbol(words[new_end + 1]) \
205 | and not endsWithPunctuation(words[new_end]) \
206 | and new_labels[new_end + 1] == 'O':
207 | new_end += 1
208 | label_list[i + 1][1] = new_end + 1
209 | label_list[i][2] = new_end
210 | # spans with end < start may appear; they are deleted later
211 | # Since new_labels was not updated, conflicts where the end of the previous
212 | # span is greater than the start of the current span are resolved later.
213 |
214 | new_label_list = []
215 | for elem in label_list:
216 | if elem[1] <= elem[2]:
217 | # Only keep those with start <= end
218 | new_label_list.append(elem)
219 |
220 | # Make corrections to the boundaries
221 | if len(new_label_list) >= 2:
222 | for i in range(len(new_label_list) - 1):
223 | new_label_list[i][2] = new_label_list[i + 1][1] - 1
224 |
225 | labels = []
226 | for elem in new_label_list:
227 | labels += [elem[0]] * (elem[2] - elem[1] + 1)
228 |
229 | # Heuristic 7: If the summary span does not end with a period, truncate or
230 | # extend it by at most five words so that it ends with a period.
231 | summary_end = None
232 | for i in range(1, len(words) - 1):
233 | current_label = labels[i]
234 | next_label = labels[i + 1]
235 | if current_label == 'summary' and next_label != 'summary':
236 | summary_end = i
237 |
238 | if summary_end is not None:
239 | if words[summary_end].endswith('.') or words[summary_end + 1] == '.':
240 | # The boundary is correct
241 | pass
242 | else:
243 | i = summary_end
244 | new_summary_end = None
245 | left_count = 0
246 | right_count = 0
247 | while i >= 0 and not words[i].endswith('.'):
248 | # Search to the left first
249 | i -= 1
250 | left_count += 1
251 | if left_count == 6:
252 | break
253 | if i > summary_end - 6 and i != -1:
254 | new_summary_end = i
255 |
256 | if new_summary_end is None:
257 | i = summary_end
258 | while i < len(words) - 1 and not words[i].endswith('.'):
259 | # Then search to the right
260 | i += 1
261 | right_count += 1
262 | if right_count == 6:
263 | break
264 | if i < summary_end + 6:
265 | new_summary_end = i
266 | if new_summary_end is not None and new_summary_end != summary_end:
267 | if new_summary_end < summary_end:
268 | for idx in range(new_summary_end + 1, summary_end + 1):
269 | labels[idx] = 'O'
270 | else:
271 | for idx in range(summary_end, new_summary_end + 1):
272 | labels[idx] = 'summary'
273 |
274 | return_list = [(x, y) for x, y in zip(words, labels)]
275 | return_list = [paper_id] + return_list
276 |
277 | return return_list
278 |
279 |
280 | def postprocess(doc_with_id):
281 | new_doc_with_id = []
282 | for elem in doc_with_id:
283 | new_doc_with_id.append(heuristics(elem))
284 | return new_doc_with_id
285 |
286 |
287 | def get_jsonlines(new_doc_with_id):
288 | lines = []
289 | for doc in new_doc_with_id:
290 | id = doc[0]
291 | text_list = []
292 | text_list += [t[0] for t in doc[1:]]
293 | text = " ".join(text_list)
294 | # text = detokenizer.detokenize(text_list)
295 | # given the text, find the character span for each label
296 | labels = []
297 | current_label = doc[1][1]
298 | current_token = doc[1][0]
299 | pointer = 0
300 | label_start = pointer + text[pointer:].find(current_token)
301 | label_end = label_start + len(current_token)
302 | for token, label in doc[1:]:
303 | token_start = pointer + text[pointer:].find(token)
304 | pointer = token_start + len(token)  # pointer marks the end of the current token
305 | if label == current_label:
306 | label_end = pointer
307 | else:
308 | if label_start < label_end and current_label != "O":
309 | labels.append([label_start, label_end, current_label])
310 | current_label = label
311 | label_start = pointer - len(token)
312 | label_end = pointer
313 | if label_start < label_end and current_label != "O":
314 | labels.append([label_start, label_end, current_label])
315 | # build one jsonl record
316 | line = {"id": id, "text": text, "labels": labels}
317 | lines.append(line)
318 | return lines
319 |
320 |
321 | def write_jsonlines(jsonl, jsonl_file):
322 | with jsonlines.open(jsonl_file, 'w') as writer:
323 | writer.write_all(jsonl)
324 |
325 |
326 | def main(id_file, sent_file, jsonl_file):
327 | sent_with_id = pair_sent_with_id(id_file, sent_file)
328 | doc_with_id = reconstruct_doc(sent_with_id)
329 | new_doc_with_id = postprocess(doc_with_id)
330 | lines = get_jsonlines(new_doc_with_id)
331 | write_jsonlines(lines, jsonl_file)
332 |
333 |
334 | if __name__ == '__main__':
335 | """ Convert sequence labeling format file into josnl file """
336 | # python heuristics.py test_aspect.id test_aspect.sent test_aspect.jsonl
337 | fire.Fire(main)
338 |
--------------------------------------------------------------------------------
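
For reference, a small hypothetical usage example of the combine() helper above, which merges consecutive [tag, start, end] spans that carry the same tag (assuming it is run from the tagger/helper directory; the tag names are only examples).

```python
# Hypothetical example: combine() merges consecutive spans with the same tag.
# Run from tagger/helper/; the tag names are only illustrative.
from heuristics import combine

spans = [["O", 0, 3], ["O", 4, 6], ["clarity", 7, 12], ["clarity", 13, 15], ["O", 16, 20]]
print(combine(spans))
# -> [['O', 0, 6], ['clarity', 7, 15], ['O', 16, 20]]
```
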
/tagger/helper/utils_batch.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
17 |
18 |
19 | import logging
20 | import os
21 | from dataclasses import dataclass
22 | from enum import Enum
23 | from typing import List, Optional, Union
24 |
25 | from filelock import FileLock
26 | from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
27 |
28 |
29 | logger = logging.getLogger(__name__)
30 |
31 |
32 | @dataclass
33 | class InputExample:
34 | """
35 | A single training/test example for token classification.
36 |
37 | Args:
38 | guid: Unique id for the example.
39 | words: list. The words of the sequence.
40 | labels: (Optional) list. The labels for each word of the sequence. This should be
41 | specified for train and dev examples, but not for test examples.
42 | """
43 |
44 | guid: str
45 | words: List[str]
46 | labels: Optional[List[str]]
47 |
48 |
49 | @dataclass
50 | class InputFeatures:
51 | """
52 | A single set of features of data.
53 | Property names are the same names as the corresponding inputs to a model.
54 | """
55 |
56 | input_ids: List[int]
57 | attention_mask: List[int]
58 | token_type_ids: Optional[List[int]] = None
59 | label_ids: Optional[List[int]] = None
60 |
61 |
62 | class Split(Enum):
63 | train = "train"
64 | dev = "dev"
65 | test = "test"
66 |
67 |
68 | class TokenClassificationTask:
69 | def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
70 | raise NotImplementedError
71 |
72 | def get_labels(self, path: str) -> List[str]:
73 | raise NotImplementedError
74 |
75 | def convert_examples_to_features(
76 | self,
77 | examples: List[InputExample],
78 | label_list: List[str],
79 | max_seq_length: int,
80 | tokenizer: PreTrainedTokenizer,
81 | cls_token_at_end=False,
82 | cls_token="[CLS]",
83 | cls_token_segment_id=1,
84 | sep_token="[SEP]",
85 | sep_token_extra=False,
86 | pad_on_left=False,
87 | pad_token=0,
88 | pad_token_segment_id=0,
89 | pad_token_label_id=-100,
90 | sequence_a_segment_id=0,
91 | mask_padding_with_zero=True,
92 | ) -> List[InputFeatures]:
93 | """Loads a data file into a list of `InputFeatures`
94 | `cls_token_at_end` defines the location of the CLS token:
95 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
96 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
97 | `cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet)
98 | """
99 | # TODO clean up all this to leverage built-in features of tokenizers
100 |
101 | label_map = {label: i for i, label in enumerate(label_list)}
102 |
103 | features = []
104 | for (ex_index, example) in enumerate(examples):
105 | if ex_index % 10_000 == 0:
106 | logger.info("Writing example %d of %d", ex_index, len(examples))
107 |
108 | tokens = []
109 | label_ids = []
110 | for word, label in zip(example.words, example.labels):
111 | word_tokens = tokenizer.tokenize(word)
112 |
113 | # bert-base-multilingual-cased sometimes outputs "nothing" ([]) when calling tokenize with just a space.
114 | if len(word_tokens) > 0:
115 | tokens.extend(word_tokens)
116 | # Use the real label id for the first token of the word, and padding ids for the remaining tokens
117 | label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
118 |
119 | # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
120 | special_tokens_count = tokenizer.num_special_tokens_to_add()
121 | if len(tokens) > max_seq_length - special_tokens_count:
122 | tokens = tokens[: (max_seq_length - special_tokens_count)]
123 | label_ids = label_ids[: (max_seq_length - special_tokens_count)]
124 |
125 | # The convention in BERT is:
126 | # (a) For sequence pairs:
127 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
128 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
129 | # (b) For single sequences:
130 | # tokens: [CLS] the dog is hairy . [SEP]
131 | # type_ids: 0 0 0 0 0 0 0
132 | #
133 | # Where "type_ids" are used to indicate whether this is the first
134 | # sequence or the second sequence. The embedding vectors for `type=0` and
135 | # `type=1` were learned during pre-training and are added to the wordpiece
136 | # embedding vector (and position vector). This is not *strictly* necessary
137 | # since the [SEP] token unambiguously separates the sequences, but it makes
138 | # it easier for the model to learn the concept of sequences.
139 | #
140 | # For classification tasks, the first vector (corresponding to [CLS]) is
141 | # used as the "sentence vector". Note that this only makes sense because
142 | # the entire model is fine-tuned.
143 | tokens += [sep_token]
144 | label_ids += [pad_token_label_id]
145 | if sep_token_extra:
146 | # roberta uses an extra separator b/w pairs of sentences
147 | tokens += [sep_token]
148 | label_ids += [pad_token_label_id]
149 | segment_ids = [sequence_a_segment_id] * len(tokens)
150 |
151 | if cls_token_at_end:
152 | tokens += [cls_token]
153 | label_ids += [pad_token_label_id]
154 | segment_ids += [cls_token_segment_id]
155 | else:
156 | tokens = [cls_token] + tokens
157 | label_ids = [pad_token_label_id] + label_ids
158 | segment_ids = [cls_token_segment_id] + segment_ids
159 |
160 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
161 |
162 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
163 | # tokens are attended to.
164 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
165 |
166 | # Zero-pad up to the sequence length.
167 | padding_length = max_seq_length - len(input_ids)
168 | if pad_on_left:
169 | input_ids = ([pad_token] * padding_length) + input_ids
170 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
171 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
172 | label_ids = ([pad_token_label_id] * padding_length) + label_ids
173 | else:
174 | input_ids += [pad_token] * padding_length
175 | input_mask += [0 if mask_padding_with_zero else 1] * padding_length
176 | segment_ids += [pad_token_segment_id] * padding_length
177 | label_ids += [pad_token_label_id] * padding_length
178 |
179 | assert len(input_ids) == max_seq_length
180 | assert len(input_mask) == max_seq_length
181 | assert len(segment_ids) == max_seq_length
182 | assert len(label_ids) == max_seq_length
183 |
184 | if ex_index < 5:
185 | logger.info("*** Example ***")
186 | logger.info("guid: %s", example.guid)
187 | logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
188 | logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
189 | logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
190 | logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
191 | logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
192 |
193 | if "token_type_ids" not in tokenizer.model_input_names:
194 | segment_ids = None
195 |
196 | features.append(
197 | InputFeatures(
198 | input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, label_ids=label_ids
199 | )
200 | )
201 | return features
202 |
203 |
204 | if is_torch_available():
205 | import torch
206 | from torch import nn
207 | from torch.utils.data.dataset import Dataset
208 |
209 | class TokenClassificationDataset(Dataset):
210 | """
211 | This will be superseded by a framework-agnostic approach
212 | soon.
213 | """
214 |
215 | features: List[InputFeatures]
216 | pad_token_label_id: int = nn.CrossEntropyLoss().ignore_index
217 | # Use cross entropy ignore_index as padding label id so that only
218 | # real label ids contribute to the loss later.
219 |
220 | def __init__(
221 | self,
222 | token_classification_task: TokenClassificationTask,
223 | data_dir: str,
224 | tokenizer: PreTrainedTokenizer,
225 | labels: List[str],
226 | model_type: str,
227 | max_seq_length: Optional[int] = None,
228 | overwrite_cache=False,
229 | mode: Split = Split.train,
230 | ):
231 | # Load data features from cache or dataset file
232 | cached_features_file = os.path.join(
233 | data_dir,
234 | "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
235 | )
236 |
237 | # Make sure only the first process in distributed training processes the dataset,
238 | # and the others will use the cache.
239 | lock_path = cached_features_file + ".lock"
240 | with FileLock(lock_path):
241 |
242 | if os.path.exists(cached_features_file) and not overwrite_cache:
243 | logger.info(f"Loading features from cached file {cached_features_file}")
244 | self.features = torch.load(cached_features_file)
245 | else:
246 | logger.info(f"Creating features from dataset file at {data_dir}")
247 | examples = token_classification_task.read_examples_from_file(data_dir, mode)
248 | # TODO clean up all this to leverage built-in features of tokenizers
249 | self.features = token_classification_task.convert_examples_to_features(
250 | examples,
251 | labels,
252 | max_seq_length,
253 | tokenizer,
254 | cls_token_at_end=bool(model_type in ["xlnet"]),
255 | # xlnet has a cls token at the end
256 | cls_token=tokenizer.cls_token,
257 | cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
258 | sep_token=tokenizer.sep_token,
259 | sep_token_extra=False,
260 | # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
261 | pad_on_left=bool(tokenizer.padding_side == "left"),
262 | pad_token=tokenizer.pad_token_id,
263 | pad_token_segment_id=tokenizer.pad_token_type_id,
264 | pad_token_label_id=self.pad_token_label_id,
265 | )
266 | logger.info(f"Saving features into cached file {cached_features_file}")
267 | torch.save(self.features, cached_features_file)
268 |
269 | def __len__(self):
270 | return len(self.features)
271 |
272 | def __getitem__(self, i) -> InputFeatures:
273 | return self.features[i]
274 |
275 |
276 | if is_tf_available():
277 | import tensorflow as tf
278 |
279 | class TFTokenClassificationDataset:
280 | """
281 | This will be superseded by a framework-agnostic approach
282 | soon.
283 | """
284 |
285 | features: List[InputFeatures]
286 | pad_token_label_id: int = -100
287 | # Use cross entropy ignore_index as padding label id so that only
288 | # real label ids contribute to the loss later.
289 |
290 | def __init__(
291 | self,
292 | token_classification_task: TokenClassificationTask,
293 | data_dir: str,
294 | tokenizer: PreTrainedTokenizer,
295 | labels: List[str],
296 | model_type: str,
297 | max_seq_length: Optional[int] = None,
298 | overwrite_cache=False,
299 | mode: Split = Split.train,
300 | ):
301 | examples = token_classification_task.read_examples_from_file(data_dir, mode)
302 | # TODO clean up all this to leverage built-in features of tokenizers
303 | self.features = token_classification_task.convert_examples_to_features(
304 | examples,
305 | labels,
306 | max_seq_length,
307 | tokenizer,
308 | cls_token_at_end=bool(model_type in ["xlnet"]),
309 | # xlnet has a cls token at the end
310 | cls_token=tokenizer.cls_token,
311 | cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
312 | sep_token=tokenizer.sep_token,
313 | sep_token_extra=False,
314 | # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
315 | pad_on_left=bool(tokenizer.padding_side == "left"),
316 | pad_token=tokenizer.pad_token_id,
317 | pad_token_segment_id=tokenizer.pad_token_type_id,
318 | pad_token_label_id=self.pad_token_label_id,
319 | )
320 |
321 | def gen():
322 | for ex in self.features:
323 | if ex.token_type_ids is None:
324 | yield (
325 | {"input_ids": ex.input_ids, "attention_mask": ex.attention_mask},
326 | ex.label_ids,
327 | )
328 | else:
329 | yield (
330 | {
331 | "input_ids": ex.input_ids,
332 | "attention_mask": ex.attention_mask,
333 | "token_type_ids": ex.token_type_ids,
334 | },
335 | ex.label_ids,
336 | )
337 |
338 | if "token_type_ids" not in tokenizer.model_input_names:
339 | self.dataset = tf.data.Dataset.from_generator(
340 | gen,
341 | ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
342 | (
343 | {"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
344 | tf.TensorShape([None]),
345 | ),
346 | )
347 | else:
348 | self.dataset = tf.data.Dataset.from_generator(
349 | gen,
350 | ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
351 | (
352 | {
353 | "input_ids": tf.TensorShape([None]),
354 | "attention_mask": tf.TensorShape([None]),
355 | "token_type_ids": tf.TensorShape([None]),
356 | },
357 | tf.TensorShape([None]),
358 | ),
359 | )
360 |
361 | def get_dataset(self):
362 | self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))
363 |
364 | return self.dataset
365 |
366 | def __len__(self):
367 | return len(self.features)
368 |
369 | def __getitem__(self, i) -> InputFeatures:
370 | return self.features[i]
371 |
--------------------------------------------------------------------------------
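
A minimal, hypothetical sketch (not from the repository) of calling convert_examples_to_features directly: the base TokenClassificationTask above already implements the conversion, so it can be instantiated as-is. It assumes the script is launched from the tagger directory with transformers installed; the label list and example sentence are illustrative only.

```python
# Hypothetical sketch: running one example through convert_examples_to_features.
# Assumes it is launched from the tagger/ directory and that transformers is
# installed; the label list and sentence are illustrative only.
from transformers import AutoTokenizer
from helper.utils_batch import InputExample, TokenClassificationTask

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
task = TokenClassificationTask()  # the base class already implements the conversion

example = InputExample(
    guid="0",
    words=["The", "paper", "is", "well", "written", "."],
    labels=["O", "O", "O", "clarity", "clarity", "O"],
)

features = task.convert_examples_to_features(
    examples=[example],
    label_list=["O", "summary", "clarity"],
    max_seq_length=32,
    tokenizer=tokenizer,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,
    sep_token=tokenizer.sep_token,
    pad_token=tokenizer.pad_token_id,
    pad_token_segment_id=tokenizer.pad_token_type_id,
    pad_token_label_id=-100,
)
print(len(features[0].input_ids))  # padded to max_seq_length (32)
```
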
/extractor/example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Imports"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {
14 | "pycharm": {
15 | "is_executing": false
16 | }
17 | },
18 | "outputs": [],
19 | "source": [
20 | "# Imports\n",
21 | "from extractor import Extractor\n",
22 | "from extractor_utils import get_full_text"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Prepare text"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "Prepare the plain text that we want to extract from ( in our example, we use `paper.json` ). We should concatenate all the content texts together."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {
43 | "pycharm": {
44 | "is_executing": false
45 | }
46 | },
47 | "outputs": [],
48 | "source": [
49 | "fulltext = get_full_text(\"paper.json\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 4,
55 | "metadata": {
56 | "pycharm": {
57 | "is_executing": false
58 | }
59 | },
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/plain": [
64 | "'Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5]. Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13]. ∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research. †Work performed while at Google Brain. ‡Work performed while at Google Research. 31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA. Recurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states ht, as a function of the previous hidden state ht−1 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through factorization tricks [18] and conditional computation [26], while also improving model performance in case of the latter. The fundamental constraint of sequential computation, however, remains. Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 16]. In all but a few cases [22], however, such attention mechanisms are used in conjunction with a recurrent network. In this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs. The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. 
This makes it more difficult to learn dependencies between distant positions [11]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2. Self-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19]. End-to-end memory networks are based on a recurrent attention mechanism instead of sequencealigned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [28]. To the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequencealigned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [14, 15] and [8]. Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29]. Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive [9], consuming the previously generated symbols as additional input when generating the next. The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively. Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- wise fully connected feed-forward network. We employ a residual connection [10] around each of the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel = 512. Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i. An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. 
The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key. We call our particular attention \"Scaled Dot-Product Attention\" (Figure 2). The input consists of queries and keys of dimension dk, and values of dimension dv . We compute the dot products of the query with all keys, divide each by √ dk, and apply a softmax function to obtain the weights on the values. In practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q. The keys and values are also packed together into matrices K and V . We compute the matrix of outputs as: Attention(Q,K, V ) = softmax( QKT√ dk )V (1) The two most commonly used attention functions are additive attention [2], and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of 1√ dk . Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code. While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3]. We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients 4. To counteract this effect, we scale the dot products by 1√ dk . Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2. Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this. 4To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1. Then their dot product, q · k = ∑dk i=1 qiki, has mean 0 and variance dk. MultiHead(Q,K, V ) = Concat(head1, ...,headh)W O where headi = Attention(QW Q i ,KW K i , V W V i ) Where the projections are parameter matricesWQi ∈ Rdmodel×dk ,WKi ∈ Rdmodel×dk ,WVi ∈ Rdmodel×dv and WO ∈ Rhdv×dmodel . In this work we employ h = 8 parallel attention layers, or heads. For each of these we use dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality. The Transformer uses multi-head attention in three different ways: • In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [31, 2, 8]. 
The Transformer uses multi-head attention in three different ways:

• In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [31, 2, 8].

• The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.

• Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input of the softmax which correspond to illegal connections. See Figure 2.

In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between:

FFN(x) = max(0, xW_1 + b_1) W_2 + b_2    (2)

While the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is d_model = 512, and the inner layer has dimensionality d_ff = 2048.

Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension d_model. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [24]. In the embedding layers, we multiply those weights by √d_model.

Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension d_model as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [8]. In this work, we use sine and cosine functions of different frequencies:

PE(pos, 2i) = sin(pos / 10000^(2i/d_model))
PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset k, PE(pos+k) can be represented as a linear function of PE(pos).

We also experimented with using learned positional embeddings [8] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.
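A compact NumPy sketch of the sinusoidal encoding defined above; this is illustrative only, and the `max_len` parameter and the way the table is materialized are assumptions rather than anything prescribed by the paper.

```python
import numpy as np

def sinusoidal_positional_encoding(max_len, d_model):
    """PE[pos, 2i] = sin(pos / 10000^(2i/d_model)); PE[pos, 2i+1] = cos(pos / 10000^(2i/d_model))."""
    positions = np.arange(max_len)[:, None]                 # (max_len, 1)
    dims = np.arange(0, d_model, 2)[None, :]                # (1, d_model/2): the 2i indices
    angles = positions / np.power(10000.0, dims / d_model)  # (max_len, d_model/2)
    pe = np.zeros((max_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    pe[:, 1::2] = np.cos(angles)
    return pe

# The resulting rows are added to the (scaled) token embeddings before the first layer.
pe = sinusoidal_positional_encoding(max_len=50, d_model=512)
assert pe.shape == (50, 512)
```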
In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x_1, ..., x_n) to another sequence of equal length (z_1, ..., z_n), with x_i, z_i ∈ R^d, such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.

One is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required. The third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [11]. Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types.

As noted in Table 1, a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translation, such as word-piece [31] and byte-pair [25] representations. To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work.

A single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(log_k(n)) in the case of dilated convolutions [15], increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [6], however, decrease the complexity considerably, to O(k·n·d + n·d²). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model.

As a side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences.
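The restricted self-attention mentioned a few paragraphs above, in which each position attends only to a neighborhood of size r, can be pictured as a banded mask over the attention scores. This is a hypothetical sketch of an idea the paper leaves to future work, not an implemented component; it reuses the `scaled_dot_product_attention` sketch shown earlier.

```python
import numpy as np

def neighborhood_mask(n, r):
    """True where key position j lies within distance r of query position i."""
    idx = np.arange(n)
    return np.abs(idx[:, None] - idx[None, :]) <= r

# Passing a mask like this to the scaled_dot_product_attention sketch above would
# limit each query to a window of 2r + 1 keys, raising the maximum path length
# between distant positions to roughly O(n / r).
mask = neighborhood_mask(n=10, r=2)
assert mask.shape == (10, 10)
```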
This section describes the training regime for our models.

We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared source-target vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [31]. Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.

We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of 100,000 steps or 12 hours. For our big models (described on the bottom line of Table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days).

We used the Adam optimizer [17] with β_1 = 0.9, β_2 = 0.98 and ε = 10^−9. We varied the learning rate over the course of training, according to the formula:

lrate = d_model^(−0.5) · min(step_num^(−0.5), step_num · warmup_steps^(−1.5))    (3)

This corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup_steps = 4000.
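Equation (3) is easy to mirror in plain Python; a minimal sketch, assuming the base model's d_model = 512 and warmup_steps = 4000:

```python
def transformer_lr(step, d_model=512, warmup_steps=4000):
    """Eq. (3): linear warmup followed by inverse-square-root decay."""
    step = max(step, 1)  # avoid 0 ** -0.5 at the very first step
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# The rate peaks at step == warmup_steps and decays as 1/sqrt(step) afterwards.
peak = transformer_lr(4000)     # ~7.0e-4 for d_model = 512
later = transformer_lr(100000)  # much smaller, per the inverse-square-root schedule
```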
We employ three types of regularization during training:

Residual Dropout: We apply dropout [27] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of P_drop = 0.1.

Label Smoothing: During training, we employed label smoothing of value ε_ls = 0.1 [30]. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.

On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model surpasses all previously published models and ensembles, at a fraction of the training cost of any of the competitive models.

On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0, outperforming all of the previously published single models, at less than 1/4 the training cost of the previous state-of-the-art model. The Transformer (big) model trained for English-to-French used dropout rate P_drop = 0.1, instead of 0.3.

For the base models, we used a single model obtained by averaging the last 5 checkpoints, which were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We used beam search with a beam size of 4 and length penalty α = 0.6 [31]. These hyperparameters were chosen after experimentation on the development set. We set the maximum output length during inference to input length + 50, but terminate early when possible [31].

Table 2 summarizes our results and compares our translation quality and training costs to other model architectures from the literature. We estimate the number of floating point operations used to train a model by multiplying the training time, the number of GPUs used, and an estimate of the sustained single-precision floating-point capacity of each GPU.⁵

(⁵ We used values of 2.8, 3.7, 6.0 and 9.5 TFLOPS for K80, K40, M40 and P100, respectively.)

To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table 3.

In Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section 3.2.2. While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.

In Table 3 rows (B), we observe that reducing the attention key size d_k hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [8], and observe nearly identical results to the base model.

In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles.

We are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goal of ours.

The code we used to train and evaluate our models is available at https://github.com/tensorflow/tensor2tensor.

Acknowledgements: We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration.'"
65 | ]
66 | },
67 | "execution_count": 4,
68 | "metadata": {},
69 | "output_type": "execute_result"
70 | }
71 | ],
72 | "source": [
73 | "fulltext"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "## Prepare extractor"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "Initialize the extractor with `keywords.txt` and `parameters.txt` files specified."
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 5,
93 | "metadata": {
94 | "pycharm": {
95 | "is_executing": false
96 | }
97 | },
98 | "outputs": [],
99 | "source": [
100 | "extractor = Extractor('keywords.txt', 'parameters.txt')"
101 | ]
102 | },
103 | {
104 | "cell_type": "markdown",
105 | "metadata": {},
106 | "source": [
107 | "Extract text"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 6,
113 | "metadata": {
114 | "pycharm": {
115 | "is_executing": true
116 | }
117 | },
118 | "outputs": [],
119 | "source": [
120 | "extraction = extractor.extract(fulltext)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 7,
126 | "metadata": {
127 | "pycharm": {
128 | "is_executing": true
129 | }
130 | },
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "'Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research. Recent work has achieved significant improvements in computational efficiency through factorization tricks [18] and conditional computation [26], while also improving model performance in case of the latter. The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2. End-to-end memory networks are based on a recurrent attention mechanism instead of sequencealigned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [28]. We compute the dot products of the query with all keys, divide each by √ dk, and apply a softmax function to obtain the weights on the values. Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. We also experimented with using learned positional embeddings [8] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)). In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x1, ..., xn) to another sequence of equal length (z1, ..., zn), with xi, zi ∈ Rd, such as a hidden layer in a typical sequence transduction encoder or decoder. To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in the input sequence centered around the respective output position. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [15], increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [6], however, decrease the complexity considerably, to O(k · n · d + n · d2). As side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. 
Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences. We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [31]. We varied the learning rate over the course of training, according to the formula: lrate = d−0.5model ·min(step_num −0.5, step_num · warmup_steps−1.5) (3) This corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We employ three types of regularization during training: Residual Dropout We apply dropout [27] to the output of each sub-layer, before it is added to the sub-layer input and normalized. On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. We set the maximum output length during inference to input length + 50, but terminate early when possible [31]. Table 2 summarizes our results and compares our translation quality and training costs to other model architectures from the literature. To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the development set, newstest2013. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video.'"
136 | ]
137 | },
138 | "execution_count": 7,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "extraction"
145 | ]
146 | }
147 | ],
148 | "metadata": {
149 | "kernelspec": {
150 | "display_name": "Python 3",
151 | "language": "python",
152 | "name": "python3"
153 | },
154 | "language_info": {
155 | "codemirror_mode": {
156 | "name": "ipython",
157 | "version": 3
158 | },
159 | "file_extension": ".py",
160 | "mimetype": "text/x-python",
161 | "name": "python",
162 | "nbconvert_exporter": "python",
163 | "pygments_lexer": "ipython3",
164 | "version": "3.7.4"
165 | },
166 | "pycharm": {
167 | "stem_cell": {
168 | "cell_type": "raw",
169 | "metadata": {
170 | "collapsed": false
171 | },
172 | "source": []
173 | }
174 | }
175 | },
176 | "nbformat": 4,
177 | "nbformat_minor": 1
178 | }
179 |
--------------------------------------------------------------------------------
/extractor/paper.json:
--------------------------------------------------------------------------------
1 | {"name": "NIPS_2017_575.pdf", "metadata": {"source": "CRF", "title": "Attention Is All You Need", "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Aidan N. Gomez"], "emails": ["avaswani@google.com", "noam@google.com", "nikip@google.com", "usz@google.com", "llion@google.com", "aidan@cs.toronto.edu", "lukaszkaiser@google.com", "illia.polosukhin@gmail.com"], "sections": [{"heading": "1 Introduction", "text": "Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5]. Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13]. \u2217Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head attention and the parameter-free position representation and became the other person involved in nearly every detail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and tensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and efficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating our research. \u2020Work performed while at Google Brain. \u2021Work performed while at Google Research.\n31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA.\nRecurrent models typically factor computation along the symbol positions of the input and output sequences. Aligning the positions to steps in computation time, they generate a sequence of hidden states ht, as a function of the previous hidden state ht\u22121 and the input for position t. This inherently sequential nature precludes parallelization within training examples, which becomes critical at longer sequence lengths, as memory constraints limit batching across examples. Recent work has achieved significant improvements in computational efficiency through factorization tricks [18] and conditional computation [26], while also improving model performance in case of the latter. The fundamental constraint of sequential computation, however, remains.\nAttention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 16]. In all but a few cases [22], however, such attention mechanisms are used in conjunction with a recurrent network.\nIn this work we propose the Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. 
The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality after being trained for as little as twelve hours on eight P100 GPUs."}, {"heading": "2 Background", "text": "The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions. In these models, the number of operations required to relate signals from two arbitrary input or output positions grows in the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes it more difficult to learn dependencies between distant positions [11]. In the Transformer this is reduced to a constant number of operations, albeit at the cost of reduced effective resolution due to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as described in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions of a single sequence in order to compute a representation of the sequence. Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19].\nEnd-to-end memory networks are based on a recurrent attention mechanism instead of sequencealigned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks [28].\nTo the best of our knowledge, however, the Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output without using sequencealigned RNNs or convolution. In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [14, 15] and [8]."}, {"heading": "3 Model Architecture", "text": "Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29]. Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive [9], consuming the previously generated symbols as additional input when generating the next.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively."}, {"heading": "3.1 Encoder and Decoder Stacks", "text": "Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\nwise fully connected feed-forward network. We employ a residual connection [10] around each of the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer itself. 
To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension dmodel = 512.\nDecoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack. Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization. We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i."}, {"heading": "3.2 Attention", "text": "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key."}, {"heading": "3.2.1 Scaled Dot-Product Attention", "text": "We call our particular attention \"Scaled Dot-Product Attention\" (Figure 2). The input consists of queries and keys of dimension dk, and values of dimension dv . We compute the dot products of the\nquery with all keys, divide each by \u221a dk, and apply a softmax function to obtain the weights on the values.\nIn practice, we compute the attention function on a set of queries simultaneously, packed together into a matrix Q. The keys and values are also packed together into matrices K and V . We compute the matrix of outputs as:\nAttention(Q,K, V ) = softmax( QKT\u221a dk )V (1)\nThe two most commonly used attention functions are additive attention [2], and dot-product (multiplicative) attention. Dot-product attention is identical to our algorithm, except for the scaling factor of 1\u221a\ndk . Additive attention computes the compatibility function using a feed-forward network with a single hidden layer. While the two are similar in theoretical complexity, dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code.\nWhile for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3]. We suspect that for large values of dk, the dot products grow large in magnitude, pushing the softmax function into regions where it has extremely small gradients 4. To counteract this effect, we scale the dot products by 1\u221a\ndk ."}, {"heading": "3.2.2 Multi-Head Attention", "text": "Instead of performing a single attention function with dmodel-dimensional keys, values and queries, we found it beneficial to linearly project the queries, keys and values h times with different, learned linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of queries, keys and values we then perform the attention function in parallel, yielding dv-dimensional output values. These are concatenated and once again projected, resulting in the final values, as depicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation subspaces at different positions. 
With a single attention head, averaging inhibits this.\n4To illustrate why the dot products get large, assume that the components of q and k are independent random variables with mean 0 and variance 1. Then their dot product, q \u00b7 k = \u2211dk i=1 qiki, has mean 0 and variance dk.\nMultiHead(Q,K, V ) = Concat(head1, ...,headh)W O\nwhere headi = Attention(QW Q i ,KW K i , V W V i )\nWhere the projections are parameter matricesWQi \u2208 Rdmodel\u00d7dk ,WKi \u2208 Rdmodel\u00d7dk ,WVi \u2208 Rdmodel\u00d7dv and WO \u2208 Rhdv\u00d7dmodel . In this work we employ h = 8 parallel attention layers, or heads. For each of these we use dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality."}, {"heading": "3.2.3 Applications of Attention in our Model", "text": "The Transformer uses multi-head attention in three different ways:\n\u2022 In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence. This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [31, 2, 8].\n\u2022 The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.\n\u2022 Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position. We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to \u2212\u221e) all values in the input of the softmax which correspond to illegal connections. See Figure 2."}, {"heading": "3.3 Position-wise Feed-Forward Networks", "text": "In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully connected feed-forward network, which is applied to each position separately and identically. This consists of two linear transformations with a ReLU activation in between.\nFFN(x) = max(0, xW1 + b1)W2 + b2 (2)\nWhile the linear transformations are the same across different positions, they use different parameters from layer to layer. Another way of describing this is as two convolutions with kernel size 1. The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality dff = 2048."}, {"heading": "3.4 Embeddings and Softmax", "text": "Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities. In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [24]. 
In the embedding layers, we multiply those weights by \u221a dmodel."}, {"heading": "3.5 Positional Encoding", "text": "Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the\nbottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [8].\nIn this work, we use sine and cosine functions of different frequencies:\nPE(pos,2i) = sin(pos/10000 2i/dmodel)\nPE(pos,2i+1) = cos(pos/10000 2i/dmodel)\nwhere pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2\u03c0 to 10000 \u00b7 2\u03c0. We chose this function because we hypothesized it would allow the model to easily learn to attend by relative positions, since for any fixed offset k, PEpos+k can be represented as a linear function of PEpos.\nWe also experimented with using learned positional embeddings [8] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)). We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training."}, {"heading": "4 Why Self-Attention", "text": "In this section we compare various aspects of self-attention layers to the recurrent and convolutional layers commonly used for mapping one variable-length sequence of symbol representations (x1, ..., xn) to another sequence of equal length (z1, ..., zn), with xi, zi \u2208 Rd, such as a hidden layer in a typical sequence transduction encoder or decoder. Motivating our use of self-attention we consider three desiderata.\nOne is the total computational complexity per layer. Another is the amount of computation that can be parallelized, as measured by the minimum number of sequential operations required.\nThe third is the path length between long-range dependencies in the network. Learning long-range dependencies is a key challenge in many sequence transduction tasks. One key factor affecting the ability to learn such dependencies is the length of the paths forward and backward signals have to traverse in the network. The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [11]. Hence we also compare the maximum path length between any two input and output positions in networks composed of the different layer types.\nAs noted in Table 1, a self-attention layer connects all positions with a constant number of sequentially executed operations, whereas a recurrent layer requires O(n) sequential operations. In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [31] and byte-pair [25] representations. 
To improve computational performance for tasks involving very long sequences, self-attention could be restricted to considering only a neighborhood of size r in\nthe input sequence centered around the respective output position. This would increase the maximum path length to O(n/r). We plan to investigate this approach further in future work.\nA single convolutional layer with kernel width k < n does not connect all pairs of input and output positions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [15], increasing the length of the longest paths between any two positions in the network. Convolutional layers are generally more expensive than recurrent layers, by a factor of k. Separable convolutions [6], however, decrease the complexity considerably, to O(k \u00b7 n \u00b7 d + n \u00b7 d2). Even with k = n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer, the approach we take in our model.\nAs side benefit, self-attention could yield more interpretable models. We inspect attention distributions from our models and present and discuss examples in the appendix. Not only do individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences."}, {"heading": "5 Training", "text": "This section describes the training regime for our models."}, {"heading": "5.1 Training Data and Batching", "text": "We trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared sourcetarget vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [31]. Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens."}, {"heading": "5.2 Hardware and Schedule", "text": "We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the bottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days)."}, {"heading": "5.3 Optimizer", "text": "We used the Adam optimizer [17] with \u03b21 = 0.9, \u03b22 = 0.98 and = 10\u22129. We varied the learning rate over the course of training, according to the formula:\nlrate = d\u22120.5model \u00b7min(step_num \u22120.5, step_num \u00b7 warmup_steps\u22121.5) (3)\nThis corresponds to increasing the learning rate linearly for the first warmup_steps training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used warmup_steps = 4000."}, {"heading": "5.4 Regularization", "text": "We employ three types of regularization during training:\nResidual Dropout We apply dropout [27] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. 
For the base model, we use a rate of Pdrop = 0.1.\nLabel Smoothing During training, we employed label smoothing of value ls = 0.1 [30]. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score."}, {"heading": "6 Results", "text": ""}, {"heading": "6.1 Machine Translation", "text": "On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big) in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0 BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model surpasses all previously published models and ensembles, at a fraction of the training cost of any of the competitive models.\nOn the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0, outperforming all of the previously published single models, at less than 1/4 the training cost of the previous state-of-the-art model. The Transformer (big) model trained for English-to-French used dropout rate Pdrop = 0.1, instead of 0.3.\nFor the base models, we used a single model obtained by averaging the last 5 checkpoints, which were written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We used beam search with a beam size of 4 and length penalty \u03b1 = 0.6 [31]. These hyperparameters were chosen after experimentation on the development set. We set the maximum output length during inference to input length + 50, but terminate early when possible [31].\nTable 2 summarizes our results and compares our translation quality and training costs to other model architectures from the literature. We estimate the number of floating point operations used to train a model by multiplying the training time, the number of GPUs used, and an estimate of the sustained single-precision floating-point capacity of each GPU 5."}, {"heading": "6.2 Model Variations", "text": "To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table 3.\nIn Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions, keeping the amount of computation constant, as described in Section 3.2.2. While single-head attention is 0.9 BLEU worse than the best setting, quality also drops off with too many heads.\n5We used values of 2.8, 3.7, 6.0 and 9.5 TFLOPS for K80, K40, M40 and P100, respectively.\nIn Table 3 rows (B), we observe that reducing the attention key size dk hurts model quality. This suggests that determining compatibility is not easy and that a more sophisticated compatibility function than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected, bigger models are better, and dropout is very helpful in avoiding over-fitting. 
In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [8], and observe nearly identical results to the base model."}, {"heading": "7 Conclusion", "text": "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention.\nFor translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles.\nWe are excited about the future of attention-based models and plan to apply them to other tasks. We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video. Making generation less sequential is another research goals of ours.\nThe code we used to train and evaluate our models is available at https://github.com/ tensorflow/tensor2tensor.\nAcknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful comments, corrections and inspiration."}], "references": [{"title": "Neural machine translation by jointly learning to align and translate", "author": ["Dzmitry Bahdanau", "Kyunghyun Cho", "Yoshua Bengio"], "venue": "CoRR, abs/1409.0473,", "citeRegEx": "2", "shortCiteRegEx": "2", "year": 2014}, {"title": "Massive exploration of neural machine translation", "author": ["Denny Britz", "Anna Goldie", "Minh-Thang Luong", "Quoc V. Le"], "venue": "architectures. CoRR,", "citeRegEx": "3", "shortCiteRegEx": "3", "year": 2017}, {"title": "Long short-term memory-networks for machine reading", "author": ["Jianpeng Cheng", "Li Dong", "Mirella Lapata"], "venue": "arXiv preprint arXiv:1601.06733,", "citeRegEx": "4", "shortCiteRegEx": "4", "year": 2016}, {"title": "Learning phrase representations using rnn encoder-decoder for statistical machine", "author": ["Kyunghyun Cho", "Bart van Merrienboer", "Caglar Gulcehre", "Fethi Bougares", "Holger Schwenk", "Yoshua Bengio"], "venue": "translation. CoRR,", "citeRegEx": "5", "shortCiteRegEx": "5", "year": 2014}, {"title": "Xception: Deep learning with depthwise separable convolutions", "author": ["Francois Chollet"], "venue": "arXiv preprint arXiv:1610.02357,", "citeRegEx": "6", "shortCiteRegEx": "6", "year": 2016}, {"title": "Empirical evaluation of gated recurrent neural networks on sequence modeling", "author": ["Junyoung Chung", "\u00c7aglar G\u00fcl\u00e7ehre", "Kyunghyun Cho", "Yoshua Bengio"], "venue": "CoRR, abs/1412.3555,", "citeRegEx": "7", "shortCiteRegEx": "7", "year": 2014}, {"title": "Convolutional sequence to sequence learning", "author": ["Jonas Gehring", "Michael Auli", "David Grangier", "Denis Yarats", "Yann N. 
Dauphin"], "venue": "arXiv preprint arXiv:1705.03122v2,", "citeRegEx": "8", "shortCiteRegEx": "8", "year": 2017}, {"title": "Generating sequences with recurrent neural networks", "author": ["Alex Graves"], "venue": "arXiv preprint arXiv:1308.0850,", "citeRegEx": "9", "shortCiteRegEx": "9", "year": 2013}, {"title": "Deep residual learning for image recognition", "author": ["Kaiming He", "Xiangyu Zhang", "Shaoqing Ren", "Jian Sun"], "venue": "In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition,", "citeRegEx": "10", "shortCiteRegEx": "10", "year": 2016}, {"title": "Gradient flow in recurrent nets: the difficulty of learning", "author": ["Sepp Hochreiter", "Yoshua Bengio", "Paolo Frasconi", "J\u00fcrgen Schmidhuber"], "venue": "long-term dependencies,", "citeRegEx": "11", "shortCiteRegEx": "11", "year": 2001}, {"title": "Long short-term memory", "author": ["Sepp Hochreiter", "J\u00fcrgen Schmidhuber"], "venue": "Neural computation,", "citeRegEx": "12", "shortCiteRegEx": "12", "year": 1997}, {"title": "Exploring the limits of language modeling", "author": ["Rafal Jozefowicz", "Oriol Vinyals", "Mike Schuster", "Noam Shazeer", "Yonghui Wu"], "venue": "arXiv preprint arXiv:1602.02410,", "citeRegEx": "13", "shortCiteRegEx": "13", "year": 2016}, {"title": "Neural GPUs learn algorithms", "author": ["\u0141ukasz Kaiser", "Ilya Sutskever"], "venue": "In International Conference on Learning Representations (ICLR),", "citeRegEx": "14", "shortCiteRegEx": "14", "year": 2016}, {"title": "Neural machine translation in linear time", "author": ["Nal Kalchbrenner", "Lasse Espeholt", "Karen Simonyan", "Aaron van den Oord", "Alex Graves", "Koray Kavukcuoglu"], "venue": "arXiv preprint arXiv:1610.10099v2,", "citeRegEx": "15", "shortCiteRegEx": "15", "year": 2017}, {"title": "Structured attention networks", "author": ["Yoon Kim", "Carl Denton", "Luong Hoang", "Alexander M. 
Rush"], "venue": "In International Conference on Learning Representations,", "citeRegEx": "16", "shortCiteRegEx": "16", "year": 2017}, {"title": "Adam: A method for stochastic optimization", "author": ["Diederik Kingma", "Jimmy Ba"], "venue": "In ICLR,", "citeRegEx": "17", "shortCiteRegEx": "17", "year": 2015}, {"title": "Factorization tricks for LSTM networks", "author": ["Oleksii Kuchaiev", "Boris Ginsburg"], "venue": "arXiv preprint arXiv:1703.10722,", "citeRegEx": "18", "shortCiteRegEx": "18", "year": 2017}, {"title": "A structured self-attentive sentence embedding", "author": ["Zhouhan Lin", "Minwei Feng", "Cicero Nogueira dos Santos", "Mo Yu", "Bing Xiang", "Bowen Zhou", "Yoshua Bengio"], "venue": "arXiv preprint arXiv:1703.03130,", "citeRegEx": "19", "shortCiteRegEx": "19", "year": 2017}, {"title": "Can active memory replace attention", "author": ["Samy Bengio \u0141ukasz Kaiser"], "venue": "In Advances in Neural Information Processing Systems, (NIPS),", "citeRegEx": "20", "shortCiteRegEx": "20", "year": 2016}, {"title": "Effective approaches to attentionbased neural machine translation", "author": ["Minh-Thang Luong", "Hieu Pham", "Christopher D Manning"], "venue": "arXiv preprint arXiv:1508.04025,", "citeRegEx": "21", "shortCiteRegEx": "21", "year": 2015}, {"title": "A decomposable attention model", "author": ["Ankur Parikh", "Oscar T\u00e4ckstr\u00f6m", "Dipanjan Das", "Jakob Uszkoreit"], "venue": "In Empirical Methods in Natural Language Processing,", "citeRegEx": "22", "shortCiteRegEx": "22", "year": 2016}, {"title": "A deep reinforced model for abstractive summarization", "author": ["Romain Paulus", "Caiming Xiong", "Richard Socher"], "venue": "arXiv preprint arXiv:1705.04304,", "citeRegEx": "23", "shortCiteRegEx": "23", "year": 2017}, {"title": "Using the output embedding to improve language models", "author": ["Ofir Press", "Lior Wolf"], "venue": "arXiv preprint arXiv:1608.05859,", "citeRegEx": "24", "shortCiteRegEx": "24", "year": 2016}, {"title": "Neural machine translation of rare words with subword units", "author": ["Rico Sennrich", "Barry Haddow", "Alexandra Birch"], "venue": "arXiv preprint arXiv:1508.07909,", "citeRegEx": "25", "shortCiteRegEx": "25", "year": 2015}, {"title": "Outrageously large neural networks: The sparsely-gated mixture-of-experts layer", "author": ["Noam Shazeer", "Azalia Mirhoseini", "Krzysztof Maziarz", "Andy Davis", "Quoc Le", "Geoffrey Hinton", "Jeff Dean"], "venue": "arXiv preprint arXiv:1701.06538,", "citeRegEx": "26", "shortCiteRegEx": "26", "year": 2017}, {"title": "Dropout: a simple way to prevent neural networks from overfitting", "author": ["Nitish Srivastava", "Geoffrey E Hinton", "Alex Krizhevsky", "Ilya Sutskever", "Ruslan Salakhutdinov"], "venue": "Journal of Machine Learning Research,", "citeRegEx": "27", "shortCiteRegEx": "27", "year": 1929}, {"title": "End-to-end memory networks", "author": ["Sainbayar Sukhbaatar", "arthur szlam", "Jason Weston", "Rob Fergus"], "venue": "Advances in Neural Information Processing Systems", "citeRegEx": "28", "shortCiteRegEx": "28", "year": 2015}, {"title": "Sequence to sequence learning with neural networks", "author": ["Ilya Sutskever", "Oriol Vinyals", "Quoc VV Le"], "venue": "In Advances in Neural Information Processing Systems,", "citeRegEx": "29", "shortCiteRegEx": "29", "year": 2014}, {"title": "Rethinking the inception architecture for computer", "author": ["Christian Szegedy", "Vincent Vanhoucke", "Sergey Ioffe", "Jonathon Shlens", "Zbigniew Wojna"], "venue": "vision. 
CoRR,", "citeRegEx": "30", "shortCiteRegEx": "30", "year": 2015}, {"title": "Google\u2019s neural machine translation system: Bridging the gap between human and machine translation", "author": ["Yonghui Wu", "Mike Schuster", "Zhifeng Chen", "Quoc V Le", "Mohammad Norouzi", "Wolfgang Macherey", "Maxim Krikun", "Yuan Cao", "Qin Gao", "Klaus Macherey"], "venue": "arXiv preprint arXiv:1609.08144,", "citeRegEx": "31", "shortCiteRegEx": "31", "year": 2016}, {"title": "Deep recurrent models with fast-forward connections for neural machine translation", "author": ["Jie Zhou", "Ying Cao", "Xuguang Wang", "Peng Li", "Wei Xu"], "venue": "CoRR, abs/1606.04199,", "citeRegEx": "32", "shortCiteRegEx": "32", "year": 2016}], "referenceMentions": [{"referenceID": 10, "context": "Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5].", "startOffset": 50, "endOffset": 54}, {"referenceID": 5, "context": "Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5].", "startOffset": 75, "endOffset": 78}, {"referenceID": 27, "context": "Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5].", "startOffset": 267, "endOffset": 277}, {"referenceID": 0, "context": "Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5].", "startOffset": 267, "endOffset": 277}, {"referenceID": 3, "context": "Recurrent neural networks, long short-term memory [12] and gated recurrent [7] neural networks in particular, have been firmly established as state of the art approaches in sequence modeling and transduction problems such as language modeling and machine translation [29, 2, 5].", "startOffset": 267, "endOffset": 277}, {"referenceID": 29, "context": "Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13].", "startOffset": 124, "endOffset": 136}, {"referenceID": 19, "context": "Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13].", "startOffset": 124, "endOffset": 136}, {"referenceID": 11, "context": "Numerous efforts have since continued to push the boundaries of recurrent language models and encoder-decoder architectures [31, 21, 13].", "startOffset": 124, "endOffset": 136}, {"referenceID": 16, "context": "Recent work has achieved significant improvements in computational efficiency through factorization tricks [18] and conditional computation [26], while also improving model performance in case of the latter.", "startOffset": 107, "endOffset": 111}, {"referenceID": 24, "context": "Recent work has achieved significant improvements in computational efficiency through factorization tricks [18] and 
conditional computation [26], while also improving model performance in case of the latter.", "startOffset": 140, "endOffset": 144}, {"referenceID": 0, "context": "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 16].", "startOffset": 224, "endOffset": 231}, {"referenceID": 14, "context": "Attention mechanisms have become an integral part of compelling sequence modeling and transduction models in various tasks, allowing modeling of dependencies without regard to their distance in the input or output sequences [2, 16].", "startOffset": 224, "endOffset": 231}, {"referenceID": 20, "context": "In all but a few cases [22], however, such attention mechanisms are used in conjunction with a recurrent network.", "startOffset": 23, "endOffset": 27}, {"referenceID": 18, "context": "The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions.", "startOffset": 97, "endOffset": 101}, {"referenceID": 13, "context": "The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions.", "startOffset": 111, "endOffset": 115}, {"referenceID": 6, "context": "The goal of reducing sequential computation also forms the foundation of the Extended Neural GPU [20], ByteNet [15] and ConvS2S [8], all of which use convolutional neural networks as basic building block, computing hidden representations in parallel for all input and output positions.", "startOffset": 128, "endOffset": 131}, {"referenceID": 9, "context": "This makes it more difficult to learn dependencies between distant positions [11].", "startOffset": 77, "endOffset": 81}, {"referenceID": 2, "context": "Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19].", "startOffset": 198, "endOffset": 213}, {"referenceID": 20, "context": "Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19].", "startOffset": 198, "endOffset": 213}, {"referenceID": 21, "context": "Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19].", "startOffset": 198, "endOffset": 213}, {"referenceID": 17, "context": "Self-attention has been used successfully in a variety of tasks including reading comprehension, abstractive summarization, textual entailment and learning task-independent sentence representations [4, 22, 23, 19].", "startOffset": 198, "endOffset": 213}, {"referenceID": 26, "context": "End-to-end memory networks are based on a recurrent attention mechanism instead of sequencealigned recurrence and have been shown to perform well on simple-language question answering and language modeling tasks 
[28].", "startOffset": 212, "endOffset": 216}, {"referenceID": 12, "context": "In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [14, 15] and [8].", "startOffset": 132, "endOffset": 140}, {"referenceID": 13, "context": "In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [14, 15] and [8].", "startOffset": 132, "endOffset": 140}, {"referenceID": 6, "context": "In the following sections, we will describe the Transformer, motivate self-attention and discuss its advantages over models such as [14, 15] and [8].", "startOffset": 145, "endOffset": 148}, {"referenceID": 3, "context": "Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29].", "startOffset": 87, "endOffset": 97}, {"referenceID": 0, "context": "Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29].", "startOffset": 87, "endOffset": 97}, {"referenceID": 27, "context": "Most competitive neural sequence transduction models have an encoder-decoder structure [5, 2, 29].", "startOffset": 87, "endOffset": 97}, {"referenceID": 7, "context": "At each step the model is auto-regressive [9], consuming the previously generated symbols as additional input when generating the next.", "startOffset": 42, "endOffset": 45}, {"referenceID": 8, "context": "We employ a residual connection [10] around each of the two sub-layers, followed by layer normalization [1].", "startOffset": 32, "endOffset": 36}, {"referenceID": 0, "context": "The two most commonly used attention functions are additive attention [2], and dot-product (multiplicative) attention.", "startOffset": 70, "endOffset": 73}, {"referenceID": 1, "context": "While for small values of dk the two mechanisms perform similarly, additive attention outperforms dot product attention without scaling for larger values of dk [3].", "startOffset": 160, "endOffset": 163}, {"referenceID": 29, "context": "This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [31, 2, 8].", "startOffset": 100, "endOffset": 110}, {"referenceID": 0, "context": "This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [31, 2, 8].", "startOffset": 100, "endOffset": 110}, {"referenceID": 6, "context": "This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [31, 2, 8].", "startOffset": 100, "endOffset": 110}, {"referenceID": 22, "context": "In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [24].", "startOffset": 133, "endOffset": 137}, {"referenceID": 6, "context": "There are many choices of positional encodings, learned and fixed [8].", "startOffset": 66, "endOffset": 69}, {"referenceID": 6, "context": "We also experimented with using learned positional embeddings [8] instead, and found that the two versions produced nearly identical results (see Table 3 row (E)).", "startOffset": 62, "endOffset": 65}, {"referenceID": 9, "context": "The shorter these paths between any combination of positions in the input and output sequences, the easier it is to learn long-range dependencies [11].", "startOffset": 146, "endOffset": 150}, {"referenceID": 29, "context": "In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence length n is 
smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [31] and byte-pair [25] representations.", "startOffset": 308, "endOffset": 312}, {"referenceID": 23, "context": "In terms of computational complexity, self-attention layers are faster than recurrent layers when the sequence length n is smaller than the representation dimensionality d, which is most often the case with sentence representations used by state-of-the-art models in machine translations, such as word-piece [31] and byte-pair [25] representations.", "startOffset": 327, "endOffset": 331}, {"referenceID": 13, "context": "Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels, or O(logk(n)) in the case of dilated convolutions [15], increasing the length of the longest paths between any two positions in the network.", "startOffset": 142, "endOffset": 146}, {"referenceID": 4, "context": "Separable convolutions [6], however, decrease the complexity considerably, to O(k \u00b7 n \u00b7 d + n \u00b7 d(2)).", "startOffset": 23, "endOffset": 26}, {"referenceID": 1, "context": "Sentences were encoded using byte-pair encoding [3], which has a shared sourcetarget vocabulary of about 37000 tokens.", "startOffset": 48, "endOffset": 51}, {"referenceID": 29, "context": "For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [31].", "startOffset": 165, "endOffset": 169}, {"referenceID": 15, "context": "3 Optimizer We used the Adam optimizer [17] with \u03b21 = 0.", "startOffset": 39, "endOffset": 43}, {"referenceID": 25, "context": "Residual Dropout We apply dropout [27] to the output of each sub-layer, before it is added to the sub-layer input and normalized.", "startOffset": 34, "endOffset": 38}, {"referenceID": 13, "context": "Model BLEU Training Cost (FLOPs) EN-DE EN-FR EN-DE EN-FR ByteNet [15] 23.", "startOffset": 65, "endOffset": 69}, {"referenceID": 30, "context": "2 \u00b7 10(20) Deep-Att + PosUnk Ensemble [32] 40.", "startOffset": 38, "endOffset": 42}, {"referenceID": 29, "context": "We set the maximum output length during inference to input length + 50, but terminate early when possible [31].", "startOffset": 106, "endOffset": 110}, {"referenceID": 6, "context": "In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [8], and observe nearly identical results to the base model.", "startOffset": 92, "endOffset": 95}], "year": 2017, "abstractText": "The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. 
On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature.", "creator": null}, "id": "NIPS_2017_575"}
--------------------------------------------------------------------------------