├── .github
│   └── workflows
│       ├── ci.yml
│       └── publish-to-pypi.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── compare_mt
│   ├── __init__.py
│   ├── align_utils.py
│   ├── arg_utils.py
│   ├── bucketers.py
│   ├── cache_utils.py
│   ├── compare_ll_main.py
│   ├── compare_mt_main.py
│   ├── corpus_utils.py
│   ├── formatting.py
│   ├── ngram_utils.py
│   ├── print_utils.py
│   ├── reporters.py
│   ├── rouge
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── io.py
│   │   ├── requirements.txt
│   │   ├── rouge.py
│   │   ├── rouge_scorer.py
│   │   ├── run.sh
│   │   ├── scoring.py
│   │   └── tokenize.py
│   ├── scorers.py
│   ├── sign_utils.py
│   ├── stat_utils.py
│   └── version_info.py
├── example
│   ├── ll_test.sys1.likelihood
│   ├── ll_test.sys2.likelihood
│   ├── ll_test.tag
│   ├── ll_test.txt
│   ├── multited.ref.jpn
│   ├── multited.ref.jpn.tag
│   ├── multited.sys1.jpn
│   ├── multited.sys1.jpn.tag
│   ├── multited.sys2.jpn
│   ├── multited.sys2.jpn.tag
│   ├── sum.ref.eng
│   ├── sum.sys1.eng
│   ├── sum.sys2.eng
│   ├── ted.orig.slk
│   ├── ted.ref.align
│   ├── ted.ref.detok.eng
│   ├── ted.ref.eng
│   ├── ted.ref.eng.rptag
│   ├── ted.ref.eng.tag
│   ├── ted.sys1.align
│   ├── ted.sys1.detok.eng
│   ├── ted.sys1.eng
│   ├── ted.sys1.eng.rptag
│   ├── ted.sys1.eng.senttag
│   ├── ted.sys1.eng.tag
│   ├── ted.sys2.align
│   ├── ted.sys2.detok.eng
│   ├── ted.sys2.eng
│   ├── ted.sys2.eng.rptag
│   ├── ted.sys2.eng.senttag
│   ├── ted.sys2.eng.tag
│   ├── ted.train.counts
│   └── ted.train.eng
├── pytest.ini
├── requirements.txt
├── scripts
│   ├── count.py
│   ├── interleave.py
│   ├── postag.py
│   └── relativepositiontag.py
├── setup.py
└── tests
    ├── __init__.py
    ├── test_cache.py
    └── test_scorers.py
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push]
3 |
4 | jobs:
5 | build:
6 | runs-on: ubuntu-latest
7 | steps:
8 | - uses: actions/checkout@v2
9 | - name: Install Python 3
10 | uses: actions/setup-python@v1
11 | with:
12 | python-version: 3.9
13 | - name: Install dependencies
14 | run: |
15 | python -m pip install --upgrade pip
16 | pip install .
17 | - name: Run tests with unittest
18 | run: python -m unittest
19 |
--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI
2 |
3 | on: push
4 |
5 | jobs:
6 | build-n-publish:
7 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI
8 | runs-on: ubuntu-18.04
9 | steps:
10 | - uses: actions/checkout@master
11 | - name: Set up Python 3.9
12 | uses: actions/setup-python@v1
13 | with:
14 | python-version: 3.9
15 | - name: Install pypa/build
16 | run: >-
17 | python -m
18 | pip install
19 | build
20 | --user
21 | - name: Build a binary wheel and a source tarball
22 | run: >-
23 | python -m
24 | build
25 | --sdist
26 | --wheel
27 | --outdir dist/
28 | .
29 | - name: Publish distribution 📦 to Test PyPI
30 | uses: pypa/gh-action-pypi-publish@master
31 | with:
32 | skip_existing: true
33 | password: ${{ secrets.TEST_PYPI_API_KEY }}
34 | repository_url: https://test.pypi.org/legacy/
35 | - name: Publish distribution 📦 to PyPI
36 | if: startsWith(github.ref, 'refs/tags')
37 | uses: pypa/gh-action-pypi-publish@master
38 | with:
39 | password: ${{ secrets.PYPI_API_KEY }}
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # PyCharm
2 | .idea
3 | __pycache__
4 | # vim
5 | *.swp
6 | # VS code
7 | .vscode/
8 | # Mac
9 | .DS_Store
10 | # setup.py build artifacts
11 | *.egg-info
12 | dist/
13 | build/
14 | # Virtualenv for developing
15 | env/
16 | # Outputs
17 | output/
18 | outputs/
19 | .pytest_cache
20 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - '3.6'
4 | - 3.7-dev
5 | install:
6 | - pip install -r requirements.txt
7 | - pip install -U setuptools
8 | - python setup.py install
9 | script:
10 | - pytest
11 | - compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --decimals 2 --output_directory output
12 | - compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --compare_scores score_type=bleu,bootstrap=10,prob_thresh=0.05 --output_directory output
13 | - compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --compare_word_accuracies bucket_type=freq,freq_corpus_file=example/ted.train.eng,bucket_cutoffs=1:2:3:5:10 bucket_type=freq,freq_count_file=example/ted.train.counts,bucket_cutoffs=1:2:3:5:10 --output_directory output
14 | - compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --compare_word_accuracies bucket_type=case bucket_type=label,ref_labels=example/ted.ref.eng.tag,out_labels="example/ted.sys1.eng.tag;example/ted.sys2.eng.tag",label_set=CC+DT+IN+JJ+NN+NNP+NNS+PRP+RB+TO+VB+VBP+VBZ bucket_type=numlabel,ref_labels=example/ted.ref.eng.rptag,out_labels="example/ted.sys1.eng.rptag;example/ted.sys2.eng.rptag" --compare_ngrams compare_type=match,ref_labels=example/ted.ref.eng.tag,out_labels="example/ted.sys1.eng.tag;example/ted.sys2.eng.tag" --output_directory output
15 | - compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --src_file example/ted.orig.slk --compare_src_word_accuracies ref_align_file=example/ted.ref.align --output_directory output
16 | - compare-ll --ref example/ll_test.txt --ll-files example/ll_test.sys1.likelihood example/ll_test.sys2.likelihood --compare-word-likelihoods bucket_type=freq,freq_corpus_file=example/ll_test.txt --decimals 2
17 | - compare-ll --ref example/ll_test.txt --ll-files example/ll_test.sys1.likelihood example/ll_test.sys2.likelihood --compare-word-likelihoods bucket_type=label,label_corpus=example/ll_test.tag,label_set=CC+DT+IN+JJ+NN+NNP+NNS+PRP+RB+TO+VB+VBP+VBZ
18 | - compare-mt example/sum.ref.eng example/sum.sys1.eng example/sum.sys2.eng --compare_scores 'score_type=rouge1' 'score_type=rouge2' 'score_type=rougeL' --output_directory output
19 | - python compare_mt/compare_mt_main.py example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --output_directory output
20 | deploy:
21 | provider: pypi
22 | user: pmichel31415
23 | skip_existing: true
24 | password:
25 | secure: fGKIZDGfu5L2WGiGlIidPI5uBi2P2TIytEIDerK8sJWKdIM6CSLnzVVXHst5VIujIhF2/TP7YMniLvMEflW5HY7Bu5fb2dBMQnyQJiE8SE9ih/Oq35W3fHJCEiAYnWo3CKLYlwUyJC9VZn8w0JrU2MBWfLCIli3Fuh9sbRyVNvjRq4kc2IGIjcxwQvM0Hml9G/89UwWYKUbxi53tFfUr5qu9WyuPdy/i2bcHaYMB6FgXbTn47MmOgVDvLjLjePpMsF+fNQDkkN035ngPRLDfHfBM74ag2ycVUhjT8nsMOfKGMpmbk/CeyKOYT9TW6Fp/MALQ5nJ9qF4q49mOpz7lh0JfogTCxweU76cpPsi9j99BvYULTYy1SnjOP9ZqglobosWq2fUtw8Pf6KE57Y0ultfh+CAgXWhX7rBFGj9PrYW6+P8Y2p5+MQuXRZp+6TOXgpELh0SiXUAFQA5B77Kw8+tPw5DJL1b5oGXBTp94sttHxNXeV9bm9AwKB18rUcKKA0AHFP5FgjvdtfZKnjydSg/hFn82UA/0g0ubcSuqdoSRgk49NT4RasODiqnfqXseJ/q1vWm5eiW60QzXuHZrK6EN8vzKxFH7DYjAZTOQsAdoCgAQvSXABOKum/Pm3HWU+BfD0xZH9cJEn9YvSKD5qMNikmMK1LR2cgRHmbhjUXQ=
26 | on:
27 | branch: master
28 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Graham Neubig
2 |
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 |
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 |
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 |
9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 |
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # compare-mt
2 | by [NeuLab](http://www.cs.cmu.edu/~neulab/) @ [CMU LTI](https://lti.cs.cmu.edu), and other contributors
3 |
4 | [CI](.github/workflows/ci.yml)
5 |
6 | `compare-mt` (for "compare my text") is a program to compare the output of multiple systems for language generation,
7 | including machine translation, summarization, dialog response generation, etc.
8 | To use it you need to have, in text format, a "correct" reference, and the output of two different systems.
9 | Based on this, `compare-mt` will run a number of analyses that attempt to pick out salient differences between
10 | the systems, which will make it easier for you to figure out what things one system is doing better than another.
11 |
12 | ## Basic Usage
13 |
14 | First, you need to install the package:
15 |
16 | ```bash
17 | # Requirements
18 | pip install -r requirements.txt
19 | # Install the package
20 | python setup.py install
21 | ```
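
Alternatively, since releases of this package are published to PyPI (see `.github/workflows/publish-to-pypi.yml`), installing the latest released version directly should also work:

```bash
pip install compare-mt
```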
22 |
23 | Then, as an example, you can run this over two included system outputs.
24 |
25 | ```bash
26 | compare-mt --output_directory output/ example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng
27 | ```
28 |
29 | This will output some statistics to the command line, and also write a formatted HTML report to `output/`.
30 | Here, system 1 and system 2 are the baseline phrase-based and neural Slovak-English systems from our
31 | [EMNLP 2018 paper](http://aclweb.org/anthology/D18-1103). This will print out a number of statistics including:
32 |
33 | * **Aggregate Scores:** A report on overall BLEU scores and length ratios
34 | * **Word Accuracy Analysis:** A report on the F-measure of words by frequency bucket
35 | * **Sentence Bucket Analysis:** Bucket sentences by various statistics (e.g. sentence BLEU, length difference with the
36 | reference, overall length), and calculate statistics by bucket (e.g. number of sentences, BLEU score per bucket)
37 | * **N-gram Difference Analysis:** Calculate which n-grams one system is consistently translating better
38 | * **Sentence Examples:** Find sentences where one system is doing better than the other according to sentence BLEU
39 |
40 | You can see an example of running this analysis (as well as the more advanced analysis below) either through a
41 | [generated HTML report here](http://phontron.com/compare-mt/output/), or in the following narrated video:
42 |
43 | [](https://www.youtube.com/watch?v=K-MNPOGKnDQ)
44 |
45 | To summarize the results that immediately stick out from the basic analysis:
46 |
47 | * From the *aggregate scores* we can see that the BLEU of neural MT is higher, but its sentences are slightly shorter.
48 | * From the *word accuracy analysis* we can see that phrase-based MT is better at low-frequency words.
49 | * From the *sentence bucket analysis* we can see that neural seems to be better at translating shorter sentences.
50 | * From the *n-gram difference analysis* we can see that there are a few words that neural MT is not good at
51 | but phrase-based MT gets right (e.g. "phantom"), while there are a few long phrases that neural MT does better with
52 | (e.g. "going to show you").
53 |
54 | If you run on your own data, you might be able to find more interesting things about your own systems. Try comparing
55 | your modified system with your baseline and seeing what you find!
56 |
57 | ## Other Options
58 |
59 | There are many options that can be used to do different types of analysis.
60 | If you want to find all the different types of analysis supported, the most comprehensive way to do so is by
61 | taking a look at the command-line help for `compare-mt`, which is documented relatively well and gives examples.
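
For instance, the following should print the full list of options along with their documentation:

```bash
compare-mt --help
```
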
62 | We do highlight a few particularly useful and common types of analysis below:
63 |
64 | ### Significance Tests
65 |
66 | The script allows you to perform statistical significance tests for scores based on [bootstrap resampling](https://aclanthology.org/W04-3250.pdf). You can set
67 | the number of samples manually. Here is an example using the example data:
68 |
69 |
70 | ```bash
71 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --compare_scores score_type=bleu,bootstrap=1000,prob_thresh=0.05
72 | ```
73 |
74 | One important thing to note is that bootstrap resampling as implemented in compare-mt only tests for variance due to data sampling, approximately answering the question "if I ran the same system on a different, similarly sampled dataset, would I be likely to get the same result?".
75 | It does not say anything about whether a system will perform better on another dataset in a different domain, and it [does not control for training-time factors](https://aclanthology.org/P11-2031/) such as selection of the random seed, so it cannot say if another training run of the same model would yield the same result.
76 |
77 | ### Using Training Set Frequency
78 |
79 | One useful piece of analysis is the "word accuracy by frequency" analysis. By default this frequency is the frequency
80 | in the *test set*, but arguably it is more informative to know accuracy by frequency in the *training set* as this
81 | demonstrates the models' robustness to words they haven't seen much, or at all, in the training data. To change the
82 | corpus used to calculate word frequency and use the training set (or some other set), you can set the `freq_corpus_file`
83 | option to the appropriate corpus.
84 |
85 |
86 | ```bash
87 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng \
88 |     --compare_word_accuracies bucket_type=freq,freq_corpus_file=example/ted.train.eng
89 | ```
90 |
91 | In addition, because training sets may be very big, you can also calculate the counts on the file beforehand,
92 |
93 | ```bash
94 | python scripts/count.py < example/ted.train.eng > example/ted.train.counts
95 | ```
96 |
97 | and then use these counts directly to improve efficiency.
98 |
99 | ```bash
100 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng \
101 |     --compare_word_accuracies bucket_type=freq,freq_count_file=example/ted.train.counts
102 | ```
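
The counts file itself is plain text with one word and its count per line, separated by a tab (this is the format `freq_count_file` expects; see the `FreqWordBucketer` documentation in `compare_mt/bucketers.py`). A quick way to inspect it:

```bash
head -3 example/ted.train.counts
```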
103 |
104 |
105 | ### Incorporating Word/Sentence Labels
106 |
107 | If you're interested in performing aggregate analysis over labels for each word/sentence instead of the words/sentences themselves, it
108 | is possible to do so. As an example, we've included POS tags for each of the example outputs. You can use these in
109 | aggregate analysis, or n-gram-based analysis. The following gives an example:
110 |
111 |
112 | ```bash
113 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng \
114 |     --compare_word_accuracies bucket_type=label,ref_labels=example/ted.ref.eng.tag,out_labels="example/ted.sys1.eng.tag;example/ted.sys2.eng.tag",label_set=CC+DT+IN+JJ+NN+NNP+NNS+PRP+RB+TO+VB+VBP+VBZ \
115 |     --compare_ngrams compare_type=match,ref_labels=example/ted.ref.eng.tag,out_labels="example/ted.sys1.eng.tag;example/ted.sys2.eng.tag"
116 | ```
117 |
118 | This will calculate word accuracies and n-gram matches by POS bucket, allowing you to see things like the fact
119 | that the phrase-based MT system is better at translating content words such as nouns and verbs, while neural MT
120 | is doing better at translating function words.
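
The `.tag` files are parallel to their corresponding text files, with one label per word. A quick way to eyeball this alignment (assuming a POSIX shell with process substitution):

```bash
paste <(head -1 example/ted.ref.eng) <(head -1 example/ted.ref.eng.tag)
```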
121 |
122 | We also give an example of aggregate analysis with multiple labels per word/sentence, where each word's group of labels is a string separated by '+'s:
123 |
124 | ```bash
125 | compare-mt example/multited.ref.jpn example/multited.sys1.jpn example/multited.sys2.jpn \
126 |     --compare_word_accuracies bucket_type=multilabel,ref_labels=example/multited.ref.jpn.tag,out_labels="example/multited.sys1.jpn.tag;example/multited.sys2.jpn.tag",label_set=lexical+formality+pronouns+ellipsis
127 | ```
128 |
129 | It is also possible to create labels that represent numerical values. For example, `scripts/relativepositiontag.py` calculates the relative position of words in the sentence, where 0 is the first word in the sentence, 0.5 is a word in the middle, and 1.0 is the last word. These numerical values can then be bucketed. Here is an example:
130 |
131 | ```bash
132 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng \
133 |     --compare_word_accuracies bucket_type=numlabel,ref_labels=example/ted.ref.eng.rptag,out_labels="example/ted.sys1.eng.rptag;example/ted.sys2.eng.rptag"
134 | ```
135 |
136 | From this particular analysis we can discover that NMT does worse than PBMT at the end of the sentence, and of course other varieties of numerical labels could be used to measure different properties of words.
137 |
138 | You can also perform analysis over labels for sentences. Here is an example:
139 |
140 | ```bash
141 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng \
142 |     --compare_sentence_buckets 'bucket_type=label,out_labels=example/ted.sys1.eng.senttag;example/ted.sys2.eng.senttag,label_set=0+10+20+30+40+50+60+70+80+90+100,statistic_type=score,score_measure=bleu'
143 | ```
144 |
145 |
146 | ### Analyzing Source Words
147 |
148 | If you have a source corpus that is aligned to the target, you can also analyze accuracies according to features of the
149 | source language words, which would allow you to examine whether, for example, infrequent words on the source side are
150 | hard to output properly. Here is an example using the example data:
151 |
152 | ```bash
153 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --src_file example/ted.orig.slk --compare_src_word_accuracies ref_align_file=example/ted.ref.align
154 | ```
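
The alignment file likely follows the standard Pharaoh-style format, with one line per sentence of whitespace-separated `src-trg` index pairs (the bucketers unpack each entry as a `(src, trg)` pair in `compare_mt/bucketers.py`). To check what the example data looks like:

```bash
head -1 example/ted.ref.align
```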
155 |
156 | ### Analyzing Word Likelihoods
157 |
158 | If you wish to analyze the word log likelihoods produced by two systems on the target corpus, you can use the following:
159 |
160 | ```bash
161 | compare-ll --ref example/ll_test.txt --ll-files example/ll_test.sys1.likelihood example/ll_test.sys2.likelihood --compare-word-likelihoods bucket_type=freq,freq_corpus_file=example/ll_test.txt
162 | ```
163 |
164 | You can analyze the word log likelihoods over labels for each word instead of the words themselves:
165 |
166 | ```bash
167 | compare-ll --ref example/ll_test.txt --ll-files example/ll_test.sys1.likelihood example/ll_test.sys2.likelihood --compare-word-likelihoods bucket_type=label,label_corpus=example/ll_test.tag,label_set=CC+DT+IN+JJ+NN+NNP+NNS+PRP+RB+TO+VB+VBP+VBZ
168 | ```
169 |
170 | NOTE: You can also use the above to analyze the word likelihoods produced by two language models.
171 |
172 | ### Analyzing Other Language Generation Systems
173 |
174 | You can also analyze other language generation systems using the script. Here is an example of comparing two text summarization systems.
175 |
176 | ```bash
177 | compare-mt example/sum.ref.eng example/sum.sys1.eng example/sum.sys2.eng --compare_scores 'score_type=rouge1' 'score_type=rouge2' 'score_type=rougeL'
178 | ```
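
Under the hood, each `score_type` corresponds to a scorer created by `compare_mt.scorers.create_scorer_from_profile`, the same internal API the bucketers use (see `compare_mt/bucketers.py`). If you want to poke at a metric directly from Python, here is a minimal sketch assuming that internal interface (it is not necessarily a stable public API):

```python
from compare_mt import scorers

# Create a scorer from a profile string, as the bucketers do internally.
scorer = scorers.create_scorer_from_profile('bleu')

ref = 'this is a small test'.split()  # reference, as a list of tokens
out = 'this is a tiny test'.split()   # system output, as a list of tokens

# score_sentence returns a tuple whose first element is the score
# (see its use in ScoreSentenceBucketer.calc_bucket).
score = scorer.score_sentence(ref, out)[0]
print(scorer.name(), score)
```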
179 |
180 | ### Evaluating on COMET
181 |
182 | It is possible to use [COMET](https://unbabel.github.io/COMET/html/index.html) as a metric.
183 | To do so, you need to install it first by running
184 |
185 | ```bash
186 | pip install unbabel-comet
187 | ```
188 |
189 | To run it, pass the source file via `--src_file` and select the appropriate score type. Here is an example:
190 | ```bash
191 | compare-mt example/ted.ref.eng example/ted.sys1.eng example/ted.sys2.eng --src_file example/ted.orig.slk \
192 | --compare_scores score_type=comet \
193 | --compare_sentence_buckets bucket_type=score,score_measure=sentcomet
194 | ```
195 |
196 | Note that COMET runs on top of XLM-R, so it's highly recommended you use a GPU with it.
197 |
198 | ## Citation/References
199 |
200 | If you use compare-mt, we'd appreciate it if you cite the [paper](http://arxiv.org/abs/1903.07926) about it!
201 |
202 | @article{DBLP:journals/corr/abs-1903-07926,
203 | author = {Graham Neubig and Zi{-}Yi Dou and Junjie Hu and Paul Michel and Danish Pruthi and Xinyi Wang and John Wieting},
204 | title = {compare-mt: {A} Tool for Holistic Comparison of Language Generation Systems},
205 | journal = {CoRR},
206 | volume = {abs/1903.07926},
207 | year = {2019},
208 | url = {http://arxiv.org/abs/1903.07926},
209 | }
210 |
211 | There is an extensive literature review included in the paper above, but some key papers that it borrows ideas from are below:
212 |
213 | * **Automatic Error Analysis:**
214 | Popovic and Ney "[Towards Automatic Error Analysis of Machine Translation Output](https://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00072)" Computational Linguistics 2011.
215 | * **POS-based Analysis:**
216 | Chiang et al. "[The Hiero Machine Translation System](http://aclweb.org/anthology/H05-1098)" EMNLP 2005.
217 | * **n-gram Difference Analysis:**
218 | Akabe et al. "[Discriminative Language Models as a Tool for Machine Translation Error Analysis](http://www.phontron.com/paper/akabe14coling.pdf)" COLING 2014.
219 |
220 | There is also other good software for automatic comparison or error analysis of MT systems:
221 |
222 | * **[MT-ComparEval](https://github.com/choko/MT-ComparEval):** Very nice for visualization of individual examples, but
223 | not as focused on aggregate analysis as `compare-mt`. Also has more software dependencies and requires using a web
224 | browser, while `compare-mt` can be used as a command-line tool.
225 |
--------------------------------------------------------------------------------
/compare_mt/__init__.py:
--------------------------------------------------------------------------------
1 | import compare_mt.ngram_utils
2 | import compare_mt.stat_utils
3 | import compare_mt.corpus_utils
4 | import compare_mt.sign_utils
5 | import compare_mt.scorers
6 | import compare_mt.bucketers
7 | import compare_mt.reporters
8 | import compare_mt.arg_utils
9 | import compare_mt.print_utils
10 | import compare_mt.version_info
11 |
12 | __version__ = compare_mt.version_info.__version__
13 |
--------------------------------------------------------------------------------
/compare_mt/align_utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from compare_mt import corpus_utils
3 |
4 | def _count_ngram(sent, order):
5 | gram_pos = dict()
6 | for i in range(order):
7 | gram_pos[i+1] = defaultdict(lambda: [])
8 | for i, word in enumerate(sent):
9 | for j in range(min(i+1, order)):
10 | gram_pos[j+1][word].append(i-j)
11 | word = sent[i-j-1] + ' ' + word
12 | return gram_pos
13 |
14 | def ngram_context_align(ref, out, order=-1, case_insensitive=False):
15 | """
16 | Calculate the word alignment between a reference sentence and an output sentence.
17 | Proposed in the following paper:
18 |
19 | Automatic Evaluation of Translation Quality for Distant Language Pairs
20 | Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh, Hajime Tsukada
21 | http://www.anthology.aclweb.org/D/D10/D10-1092.pdf
22 |
23 | Args:
24 | ref: A reference sentence
25 | out: An output sentence
26 | order: The highest order of grams we want to consider (-1=inf)
27 | case_insensitive: A boolean specifying whether to turn on the case insensitive option
28 |
29 | Returns:
30 | The word alignment, represented as a list of integers.
31 | """
32 |
33 | if case_insensitive:
34 | ref = corpus_utils.lower(ref)
35 | out = corpus_utils.lower(out)
36 |
37 | order = len(ref) if order == -1 else order
38 |
39 | ref_gram_pos = _count_ngram(ref, order)
40 | out_gram_pos = _count_ngram(out, order)
41 |
42 | worder = []
43 | for i, word in enumerate(out):
44 | if len(ref_gram_pos[1][word]) == 0:
45 | continue
46 | if len(ref_gram_pos[1][word]) == len(out_gram_pos[1][word]) == 1:
47 | worder.append(ref_gram_pos[1][word][0])
48 | else:
49 | word_forward = word
50 | word_backward = word
51 | for j in range(1, order):
52 | if i - j >= 0:
53 | word_backward = out[i-j] + ' ' + word_backward
54 | if len(ref_gram_pos[j+1][word_backward]) == len(out_gram_pos[j+1][word_backward]) == 1:
55 | worder.append(ref_gram_pos[j+1][word_backward][0]+j)
56 | break
57 |
58 | if i + j < len(out):
59 | word_forward = word_forward + ' ' + out[i+j]
60 | if len(ref_gram_pos[j+1][word_forward]) == len(out_gram_pos[j+1][word_forward]) == 1:
61 | worder.append(ref_gram_pos[j+1][word_forward][0])
62 | break
63 |
64 | return worder
65 |
--------------------------------------------------------------------------------
/compare_mt/arg_utils.py:
--------------------------------------------------------------------------------
1 | def parse_profile(profile):
2 | kargs = {}
3 | try:
4 | for kv in profile.split(','):
5 | k, v = kv.split('=')
6 | kargs[k] = v
7 | except ValueError:
8 | # more informative error message
9 | raise ValueError(
10 | f"Failed to parse profile: {profile}. The expected format is:"
11 | " \"key1=value1,key2=value2,[...]\""
12 | )
13 | return kargs
14 |
15 | def parse_compare_directions(compare_directions):
16 | direcs = []
17 | try:
18 | for direc in compare_directions.split(';'):
19 | left, right = direc.split('-')
20 | left, right = int(left), int(right)
21 | direcs.append((left, right))
22 | except ValueError:
23 | # more informative error message
24 | raise ValueError(
25 | f"Failed to parse directions: {compare_directions}."
26 | " The expected format is: \"left1-right1;left2-right2;[...]\""
27 | )
28 | return direcs
29 |
30 | def parse_files(filenames):
31 | files = []
32 | for f in filenames.split(';'):
33 | files.append(f)
34 | return files
35 |
36 | def parse_intfloat(s):
37 | try:
38 | return int(s)
39 | except ValueError:
40 | return float(s)
--------------------------------------------------------------------------------
/compare_mt/bucketers.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import itertools
3 | import numpy as np
4 | from collections import defaultdict
5 |
6 | from compare_mt import corpus_utils
7 | from compare_mt import scorers
8 | from compare_mt import arg_utils
9 |
10 | class Bucketer:
11 |
12 | def set_bucket_cutoffs(self, bucket_cutoffs, num_type='int'):
13 | self.bucket_cutoffs = bucket_cutoffs
14 | self.bucket_strs = []
15 | for i, x in enumerate(bucket_cutoffs):
16 | if i == 0:
17 | self.bucket_strs.append(f'<{x}')
18 | elif num_type == 'int' and x-1 == bucket_cutoffs[i-1]:
19 | self.bucket_strs.append(f'{x-1}')
20 | else:
21 | self.bucket_strs.append(f'[{bucket_cutoffs[i-1]},{x})')
22 | self.bucket_strs.append(f'>={x}')
23 |
24 | def cutoff_into_bucket(self, value):
25 | for i, v in enumerate(self.bucket_cutoffs):
26 | if value < v:
27 | return i
28 | return len(self.bucket_cutoffs)
29 |
30 | class WordBucketer(Bucketer):
31 |
32 | def calc_bucket(self, val, label=None):
33 | """
34 | Calculate the bucket for a particular word
35 |
36 | Args:
37 | val: The word to calculate the bucket for
38 | label: If there's a label on the target word, add it
39 |
40 | Returns:
41 | An integer ID of the bucket
42 | """
43 | raise NotImplementedError('calc_bucket must be implemented in subclasses of WordBucketer')
44 |
45 | def _calc_trg_matches(self, ref_sent, out_sents):
46 | ref_pos = defaultdict(lambda: [])
47 | out_matches = [[-1 for _ in s] for s in out_sents]
48 | ref_matches = [[-1 for _ in ref_sent] for _ in out_sents]
49 | for ri, ref_word in enumerate(ref_sent):
50 | ref_pos[ref_word].append(ri)
51 | for oai, out_sent in enumerate(out_sents):
52 | out_word_cnts = {}
53 | for oi, out_word in enumerate(out_sent):
54 | ref_poss = ref_pos.get(out_word, None)
55 | if ref_poss:
56 | out_word_cnt = out_word_cnts.get(out_word, 0)
57 | if out_word_cnt < len(ref_poss):
58 | out_matches[oai][oi] = ref_poss[out_word_cnt]
59 | ref_matches[oai][ref_poss[out_word_cnt]] = oi
60 | out_word_cnts[out_word] = out_word_cnt + 1
61 | return out_matches, ref_matches
62 |
63 | def _calc_trg_buckets_and_matches(self, ref_sent, ref_label, out_sents, out_labels):
64 | # Initial setup for special cases
65 | if self.case_insensitive:
66 | ref_sent = [corpus_utils.lower(w) for w in ref_sent]
67 | out_sents = [[corpus_utils.lower(w) for w in out_sent] for out_sent in out_sents]
68 | if not ref_label:
69 | ref_label = []
70 | out_labels = [[] for _ in out_sents]
71 | # Get matches
72 | out_matches, _ = self._calc_trg_matches(ref_sent, out_sents)
73 | # Process the reference, getting the bucket
74 | ref_buckets = [self.calc_bucket(w, label=l) for (w,l) in itertools.zip_longest(ref_sent, ref_label)]
75 | # Process each of the outputs, finding matches
76 | out_buckets = [[] for _ in out_sents]
77 | for oai, (out_sent, out_label, match, out_buck) in \
78 | enumerate(itertools.zip_longest(out_sents, out_labels, out_matches, out_buckets)):
79 | for oi, (w, l, m) in enumerate(itertools.zip_longest(out_sent, out_label, match)):
80 | out_buck.append(self.calc_bucket(w, label=l) if m < 0 else ref_buckets[m])
81 | # Calculate totals for each sentence
82 | num_buckets = len(self.bucket_strs)
83 | num_outs = len(out_sents)
84 |     my_ref_total = np.zeros(num_buckets, dtype=int)
85 |     my_out_totals = np.zeros((num_outs, num_buckets), dtype=int)
86 |     my_out_matches = np.zeros((num_outs, num_buckets), dtype=int)
87 | for b in ref_buckets:
88 | if isinstance(b, list):
89 | for bi in b:
90 | my_ref_total[bi] += 1
91 | else:
92 | my_ref_total[b] += 1
93 | for oi, (obs, ms) in enumerate(zip(out_buckets, out_matches)):
94 | for b, m in zip(obs, ms):
95 | if isinstance(b, list):
96 | for bi in b:
97 | my_out_totals[oi,bi] += 1
98 | if m >= 0:
99 | my_out_matches[oi,bi] += 1
100 | else:
101 | my_out_totals[oi,b] += 1
102 | if m >= 0:
103 | my_out_matches[oi,b] += 1
104 | return my_ref_total, my_out_totals, my_out_matches, ref_buckets, out_buckets, out_matches
105 |
106 | def _calc_src_buckets_and_matches(self, src_sent, src_label, ref_sent, ref_aligns, out_sents):
107 | # Initial setup for special cases
108 | if self.case_insensitive:
109 | src_sent = [corpus_utils.lower(w) for w in src_sent]
110 | ref_sent = [corpus_utils.lower(w) for w in ref_sent]
111 | out_sents = [[corpus_utils.lower(w) for w in out_sent] for out_sent in out_sents]
112 | if not src_label:
113 | src_label = []
114 | # Get matches
115 | _, ref_matches = self._calc_trg_matches(ref_sent, out_sents)
116 | # Process the source, getting the bucket
117 | src_buckets = [self.calc_bucket(w, label=l) for (w,l) in itertools.zip_longest(src_sent, src_label)]
118 | # For each source word, find the reference words that need to be correct
119 | src_aligns = [[] for _ in src_sent]
120 | for src, trg in ref_aligns:
121 | src_aligns[src].append(trg)
122 | # Calculate totals for each sentence
123 | num_buckets = len(self.bucket_strs)
124 | num_outs = len(out_sents)
125 |     my_ref_total = np.zeros(num_buckets, dtype=int)
126 |     my_out_matches = np.zeros((num_outs, num_buckets), dtype=int)
127 | for src_bucket in src_buckets:
128 | my_ref_total[src_bucket] += 1
129 | my_out_totals = np.broadcast_to(np.reshape(my_ref_total, (1, num_buckets)), (num_outs, num_buckets))
130 | for oai, (out_sent, ref_match) in enumerate(zip(out_sents, ref_matches)):
131 | for src_bucket, src_align in zip(src_buckets, src_aligns):
132 | if len(src_align) != 0:
133 | if all([ref_match[x] >= 0 for x in src_align]):
134 | my_out_matches[oai,src_bucket] += 1
135 | return my_ref_total, my_out_totals, my_out_matches, src_buckets, src_aligns, ref_matches
136 |
137 | def calc_statistics(self, ref, outs,
138 | src=None,
139 | ref_labels=None, out_labels=None,
140 | ref_aligns=None, src_labels=None):
141 | """
142 | Calculate match statistics, bucketed by the type of word we have, and IDs of example sentences to show.
143 | This must be used with a subclass that has self.bucket_strs defined, and self.calc_bucket(word) implemented.
144 |
145 | Args:
146 | ref: The reference corpus
147 | outs: A list of output corpora
148 | src: Source sentences.
149 | If src is set, it will use ref_aligns, out_aligns, and src_labels.
150 | Otherwise, it will use ref_labels and out_labels.
151 | ref_labels: Labels of the reference corpus (optional)
152 | out_labels: Labels of the output corpora (should be specified iff ref_labels is)
153 |
154 | Returns:
155 | statistics: containing a list of equal length to out, containing for each system
156 | both_tot: the frequency of a particular bucket appearing in both output and reference
157 | ref_tot: the frequency of a particular bucket appearing in just reference
158 | out_tot: the frequency of a particular bucket appearing in just output
159 | rec: recall of the bucket
160 | prec: precision of the bucket
161 | fmeas: f1-measure of the bucket
162 | my_ref_total_list: containing a list of statistics of the reference
163 | my_out_matches_list: containing a list of statistics of the outputs
164 | """
165 | if not hasattr(self, 'case_insensitive'):
166 | self.case_insensitive = False
167 |
168 | # Dimensions
169 | num_buckets = len(self.bucket_strs)
170 | num_outs = len(outs)
171 |
172 | # Initialize the sufficient statistics for prec/rec/fmeas
173 | ref_total = np.zeros(num_buckets, dtype=int)
174 |     out_totals = np.zeros((num_outs, num_buckets), dtype=int)
175 |     out_matches = np.zeros((num_outs, num_buckets), dtype=int)
176 |
177 | my_ref_total_list = []
178 | my_out_totals_list = []
179 | my_out_matches_list = []
180 |
181 | # Step through the sentences
182 | for rsi, (ref_sent, ref_label) in enumerate(itertools.zip_longest(ref, ref_labels if ref_labels else [])):
183 | if src:
184 | my_ref_total, my_out_totals, my_out_matches, _, _, _ = \
185 | self._calc_src_buckets_and_matches(src[rsi],
186 | src_labels[rsi] if src_labels else None,
187 | ref_sent,
188 | ref_aligns[rsi],
189 | [x[rsi] for x in outs])
190 | else:
191 | my_ref_total, my_out_totals, my_out_matches, _, _, _ = \
192 | self._calc_trg_buckets_and_matches(ref_sent,
193 | ref_label,
194 | [x[rsi] for x in outs],
195 | [x[rsi] for x in out_labels] if out_labels else None)
196 | ref_total += my_ref_total
197 | out_totals += my_out_totals
198 | out_matches += my_out_matches
199 |
200 | my_ref_total_list.append(my_ref_total)
201 | my_out_totals_list.append(my_out_totals)
202 | my_out_matches_list.append(my_out_matches)
203 |
204 | # Calculate statistics
205 | statistics = [[] for _ in range(num_outs)]
206 | for oi, ostatistics in enumerate(statistics):
207 | for bi in range(num_buckets):
208 | mcnt, ocnt, rcnt = out_matches[oi,bi], out_totals[oi,bi], ref_total[bi]
209 | if mcnt == 0:
210 | rec, prec, fmeas = 0.0, 0.0, 0.0
211 | else:
212 | rec = mcnt / float(rcnt)
213 | prec = mcnt / float(ocnt)
214 | fmeas = 2 * prec * rec / (prec + rec)
215 | ostatistics.append( (mcnt, rcnt, ocnt, rec, prec, fmeas) )
216 |
217 | return statistics, my_ref_total_list, my_out_totals_list, my_out_matches_list
218 |
219 | def calc_bucket_details(self, my_ref_total_list, my_out_totals_list, my_out_matches_list, num_samples=1000, sample_ratio=0.5):
220 |
221 | ref_total = np.array(my_ref_total_list).sum(0)
222 |
223 | num_outs, num_buckets = my_out_totals_list[0].shape
224 | n = len(my_ref_total_list)
225 | ids = list(range(n))
226 | sample_size = int(np.ceil(n*sample_ratio))
227 | rt_arr = np.array(my_ref_total_list)
228 | ot_arr = np.array(my_out_totals_list)
229 | om_arr = np.array(my_out_matches_list)
230 | statistics = [[ [] for __ in range(num_buckets) ] for _ in range(num_outs)]
231 | for _ in range(num_samples):
232 | reduced_ids = np.random.choice(ids, size=sample_size, replace=True)
233 |       reduced_ref_total, reduced_out_totals, reduced_out_matches = rt_arr[reduced_ids].sum(0), ot_arr[reduced_ids].sum(0), om_arr[reduced_ids].sum(0)
234 | # Calculate accuracy on the reduced sample and save stats
235 | for oi in range(num_outs):
236 | for bi in range(num_buckets):
237 | mcnt, ocnt, rcnt = reduced_out_matches[oi,bi], reduced_out_totals[oi,bi], reduced_ref_total[bi]
238 | if mcnt == 0:
239 | rec, prec, fmeas = 0.0, 0.0, 0.0
240 | else:
241 | rec = mcnt / float(rcnt)
242 | prec = mcnt / float(ocnt)
243 | fmeas = 2 * prec * rec / (prec + rec)
244 | statistics[oi][bi].append( (mcnt, rcnt, ocnt, rec, prec, fmeas) )
245 |
246 | intervals = [[] for _ in range(num_outs)]
247 | for oi in range(num_outs):
248 | for bi in range(num_buckets):
249 | if len(statistics[oi][bi]) > 0:
250 | _, _, _, recs, precs, fmeas = zip(*statistics[oi][bi])
251 | else:
252 | recs, precs, fmeas = [0.0], [0.0], [0.0]
253 | # The first three elements (intervals of mcnt, ocnt and rcnt) are None
254 | bounds = [None, None, None]
255 | for x in [recs, precs, fmeas]:
256 | x = list(x)
257 | x.sort()
258 | lower_bound = x[int(num_samples * 0.025)]
259 | upper_bound = x[int(num_samples * 0.975)]
260 | bounds.append( (lower_bound, upper_bound) )
261 | intervals[oi].append(bounds)
262 |
263 | return ref_total, intervals
264 |
265 | def calc_examples(self, num_sents, num_outs,
266 | statistics,
267 | my_ref_total_list, my_out_matches_list,
268 | num_examples=5):
269 | """
270 | Calculate examples based the computed statistics.
271 |
272 | Args:
273 | num_sents: number of sentences
274 | num_outs: number of outputs
275 | statistics: containing a list of equal length to out, containing for each system
276 | both_tot: the frequency of a particular bucket appearing in both output and reference
277 | ref_tot: the frequency of a particular bucket appearing in just reference
278 | out_tot: the frequency of a particular bucket appearing in just output
279 | rec: recall of the bucket
280 | prec: precision of the bucket
281 | fmeas: f1-measure of the bucket
282 | my_ref_total_list: containing a list of statistics of the reference
283 | my_out_matches_list: containing a list of statistics of the outputs
284 | num_examples: number of examples to print
285 |
286 | Returns:
287 | example: containing a list of examples to print
288 | """
289 | num_buckets = len(self.bucket_strs)
290 | num_examp_feats = 3
291 | example_scores = np.zeros( (num_sents, num_examp_feats, num_buckets) )
292 |
293 | # Step through the sentences
294 | for rsi, (my_ref_total, my_out_matches) in enumerate(zip(my_ref_total_list, my_out_matches_list)):
295 |
296 | # Scoring of examples across different dimensions:
297 | # 0: overall variance of matches
298 | example_scores[rsi,0] = (my_out_matches / (my_ref_total+1e-10).reshape( (1, num_buckets) )).std(axis=0)
299 | # 1: overall percentage of matches
300 | example_scores[rsi,1] = my_out_matches.sum(axis=0) / (my_ref_total*num_outs+1e-10)
301 | # 2: overall percentage of misses
302 | example_scores[rsi,2] = (my_ref_total*num_outs-my_out_matches.sum(axis=0)) / (my_ref_total*num_outs+1e-10)
303 |
304 | # Calculate statistics
305 | # Find top-5 examples of each class
306 | examples = [[('Examples where some systems were good, some were bad', []),
307 | ('Examples where all systems were good', []),
308 | ('Examples where all systems were bad', [])] for _ in range(num_buckets)]
309 | # NOTE: This could be made faster with argpartition, but the complexity is probably not worth it
310 | topn = np.argsort(-example_scores, axis=0)
311 | for bi, bexamples in enumerate(examples):
312 | for fi, (_, fexamples) in enumerate(bexamples):
313 | for si in topn[:num_examples,fi,bi]:
314 | if example_scores[si,fi,bi] > 0:
315 | fexamples.append(si)
316 |
317 | return examples
318 |
319 | def calc_source_bucketed_matches(self, src, ref, out, ref_aligns, out_aligns, src_labels=None):
320 | """
321 | Calculate the number of matches, bucketed by the type of word we have
322 | This must be used with a subclass that has self.bucket_strs defined, and self.calc_bucket(word) implemented.
323 |
324 | Args:
325 | src: The source corpus
326 | ref: The reference corpus
327 | out: The output corpus
328 | ref_aligns: Alignments of the reference corpus
329 | out_aligns: Alignments of the output corpus
330 | src_labels: Labels of the source corpus (optional)
331 |
332 | Returns:
333 | A tuple containing:
334 | both_tot: the frequency of a particular bucket appearing in both output and reference
335 | ref_tot: the frequency of a particular bucket appearing in just reference
336 | out_tot: the frequency of a particular bucket appearing in just output
337 | rec: recall of the bucket
338 | prec: precision of the bucket
339 | fmeas: f1-measure of the bucket
340 | """
341 | if not hasattr(self, 'case_insensitive'):
342 | self.case_insensitive = False
343 |
344 | src_labels = src_labels if src_labels else []
345 | matches = [[0, 0, 0] for x in self.bucket_strs]
346 | for src_sent, ref_sent, out_sent, ref_align, out_align, src_lab in itertools.zip_longest(src, ref, out, ref_aligns, out_aligns, src_labels):
347 | ref_cnt = defaultdict(lambda: 0)
348 | for i, word in enumerate(ref_sent):
349 | if self.case_insensitive:
350 | word = corpus_utils.lower(word)
351 | ref_cnt[word] += 1
352 | for i, (src_index, trg_index) in enumerate(out_align):
353 | src_word = src_sent[src_index]
354 | word = out_sent[trg_index]
355 | if self.case_insensitive:
356 | word = corpus_utils.lower(word)
357 | bucket = self.calc_bucket(src_word,
358 | label=src_lab[src_index] if src_lab else None)
359 | if ref_cnt[word] > 0:
360 | ref_cnt[word] -= 1
361 | matches[bucket][0] += 1
362 | matches[bucket][2] += 1
363 | for i, (src_index, trg_index) in enumerate(ref_align):
364 | src_word = src_sent[src_index]
365 | bucket = self.calc_bucket(src_word,
366 | label=src_lab[src_index] if src_lab else None)
367 | matches[bucket][1] += 1
368 |
369 | for both_tot, ref_tot, out_tot in matches:
370 | if both_tot == 0:
371 | rec, prec, fmeas = 0.0, 0.0, 0.0
372 | else:
373 | rec = both_tot / float(ref_tot)
374 | prec = both_tot / float(out_tot)
375 | fmeas = 2 * prec * rec / (prec + rec)
376 | yield both_tot, ref_tot, out_tot, rec, prec, fmeas
377 |
378 | def calc_bucketed_likelihoods(self, corpus, likelihoods):
379 | """
380 | Calculate the average of log likelihoods, bucketed by the type of word/label we have
381 | This must be used with a subclass that has self.bucket_strs defined, and self.calc_bucket(word) implemented.
382 |
383 | Args:
384 | corpus: The text/label corpus over which we compute the likelihoods
385 | likelihoods: The log-likelihoods corresponding to each word/label in the corpus
386 |
387 | Returns:
388 | the average log-likelihood bucketed by the type of word/label we have
389 | """
390 | if not hasattr(self, 'case_insensitive'):
391 | self.case_insensitive = False
392 |
393 | if type(corpus) == str:
394 | corpus = corpus_utils.load_tokens(corpus)
395 | bucketed_likelihoods = [[0.0, 0] for _ in self.bucket_strs]
396 | if len(corpus) != len(likelihoods):
397 | raise ValueError("Corpus and likelihoods should have the same size.")
398 | for sent, list_of_likelihoods in zip(corpus, likelihoods):
399 | if len(sent) != len(list_of_likelihoods):
400 | raise ValueError("Each sentence of the corpus should have likelihood value for each word")
401 |
402 | for word, ll in zip(sent, list_of_likelihoods):
403 | if self.case_insensitive:
404 | word = corpus_utils.lower(word)
405 | bucket = self.calc_bucket(word, label=word)
406 | bucketed_likelihoods[bucket][0] += ll
407 | bucketed_likelihoods[bucket][1] += 1
408 |
409 | for ll, count in bucketed_likelihoods:
410 | if count != 0:
411 | yield ll/float(count)
412 | else:
413 | yield "NA" # not applicable
414 |
415 |
416 | class FreqWordBucketer(WordBucketer):
417 |
418 | def __init__(self,
419 | freq_counts=None, freq_count_file=None, freq_corpus_file=None, freq_data=None,
420 | bucket_cutoffs=None,
421 | case_insensitive=False):
422 | """
423 | A bucketer that buckets words by their frequency.
424 |
425 | Args:
426 | freq_counts: A dictionary containing word/count data.
427 | freq_count_file: A file containing counts for each word in tab-separated word, count format.
428 | Ignored if freq_counts exists.
429 | freq_corpus_file: A file with a corpus used for collecting counts. Ignored if freq_count_file exists.
430 | freq_data: A tokenized corpus from which counts can be calculated. Ignored if freq_corpus_file exists.
431 | bucket_cutoffs: Cutoffs for each bucket.
432 |         The first bucket will be range(0, bucket_cutoffs[0]).
433 |         Middle buckets will be range(bucket_cutoffs[i-1], bucket_cutoffs[i]).
434 |         The final bucket will be everything greater than or equal to bucket_cutoffs[-1].
435 | case_insensitive: A boolean specifying whether to turn on the case insensitive option.
436 | """
437 | self.case_insensitive = case_insensitive
438 | if not freq_counts:
439 | freq_counts = defaultdict(lambda: 0)
440 | if freq_count_file != None:
441 | print(f'Reading frequency from "{freq_count_file}"')
442 | with open(freq_count_file, "r") as f:
443 | for line in f:
444 | cols = line.strip().split('\t')
445 | if len(cols) != 2:
446 | print(f'Bad line in counts file {freq_count_file}, ignoring:\n{line}')
447 | else:
448 | word, freq = cols
449 | if self.case_insensitive:
450 | word = corpus_utils.lower(word)
451 | freq_counts[word] = int(freq)
452 | elif freq_corpus_file:
453 | print(f'Reading frequency from "{freq_corpus_file}"')
454 | for words in corpus_utils.iterate_tokens(freq_corpus_file):
455 | for word in words:
456 | if self.case_insensitive:
457 | word = corpus_utils.lower(word)
458 | freq_counts[word] += 1
459 | elif freq_data:
460 | print('Reading frequency from the reference')
461 | for words in freq_data:
462 | for word in words:
463 | if self.case_insensitive:
464 | word = corpus_utils.lower(word)
465 | freq_counts[word] += 1
466 | else:
467 | raise ValueError('Must have at least one source of frequency counts for FreqWordBucketer')
468 | self.freq_counts = freq_counts
469 |
470 | if bucket_cutoffs is None:
471 | bucket_cutoffs = [1, 2, 3, 4, 5, 10, 100, 1000]
472 | self.set_bucket_cutoffs(bucket_cutoffs)
473 |
474 | def calc_bucket(self, word, label=None):
475 | if self.case_insensitive:
476 | word = corpus_utils.lower(word)
477 | return self.cutoff_into_bucket(self.freq_counts.get(word, 0))
478 |
479 | def name(self):
480 | return "frequency"
481 |
482 | def idstr(self):
483 | return "freq"
484 |
485 | class CaseWordBucketer(WordBucketer):
486 |
487 | def __init__(self):
488 | """
489 |     A bucketer that buckets words by whether they're all lower-case (lower), all upper-case (upper),
490 |     title case (title), or other.
491 | """
492 | self.bucket_strs = ['lower', 'upper', 'title', 'other']
493 |
494 | def calc_bucket(self, word, label=None):
495 | if word.islower():
496 | return 0
497 | elif word.isupper():
498 | return 1
499 | elif word.istitle():
500 | return 2
501 | else:
502 | return 3
503 |
504 | def name(self):
505 | return "case"
506 |
507 | def idstr(self):
508 | return "case"
509 |
510 | class LabelWordBucketer(WordBucketer):
511 |
512 | def __init__(self,
513 | label_set=None):
514 | """
515 | A bucketer that buckets words by their labels.
516 |
517 | Args:
518 | label_set: The set of labels to use as buckets. This can be a list, or a string separated by '+'s.
519 | """
520 | if type(label_set) == str:
521 | label_set = label_set.split('+')
522 | self.bucket_strs = label_set + ['other']
523 | label_set_len = len(label_set)
524 | self.bucket_map = defaultdict(lambda: label_set_len)
525 | for i, l in enumerate(label_set):
526 | self.bucket_map[l] = i
527 |
528 | def calc_bucket(self, word, label=None):
529 | if not label:
530 |       raise ValueError('When calculating buckets by label, a label must be provided')
531 | return self.bucket_map[label]
532 |
533 | def name(self):
534 | return "labels"
535 |
536 | def idstr(self):
537 | return "labels"
538 |
539 | class MultiLabelWordBucketer(WordBucketer):
540 |
541 | def __init__(self,
542 | label_set=None):
543 | """
544 | A bucketer that buckets words by one or multiple labels.
545 |
546 | Args:
547 | label_set: The set of labels to use as buckets. This can be a list, or a string separated by '+'s.
548 | """
549 | if type(label_set) == str:
550 | label_set = label_set.split('+')
551 | self.bucket_strs = label_set + ['other']
552 | label_set_len = len(label_set)
553 | self.bucket_map = defaultdict(lambda: label_set_len)
554 | for i, l in enumerate(label_set):
555 | self.bucket_map[l] = i
556 |
557 | def calc_bucket(self, word, label=None):
558 | if not label:
559 |       raise ValueError('When calculating buckets by label, a label must be provided')
560 | label = label.split('+')
561 | return [self.bucket_map[l] for l in label]
562 |
563 | def name(self):
564 | return "multilabels"
565 |
566 | def idstr(self):
567 | return "multilabels"
568 |
569 | class NumericalLabelWordBucketer(WordBucketer):
570 |
571 | def __init__(self,
572 | bucket_cutoffs=None):
573 | """
574 | A bucketer that buckets words by labels that are numerical values.
575 |
576 | Args:
577 | bucket_cutoffs: Cutoffs for each bucket.
578 |         The first bucket will be range(0, bucket_cutoffs[0]).
579 |         Middle buckets will be range(bucket_cutoffs[i-1], bucket_cutoffs[i]).
580 |         The final bucket will be everything greater than or equal to bucket_cutoffs[-1].
581 | """
582 | if bucket_cutoffs is None:
583 | bucket_cutoffs = [0.25, 0.5, 0.75]
584 | self.set_bucket_cutoffs(bucket_cutoffs)
585 |
586 | def calc_bucket(self, word, label=None):
587 | if label:
588 | return self.cutoff_into_bucket(float(label))
589 | else:
590 |       raise ValueError('When calculating buckets by label, a label must be provided')
591 |
592 | def name(self):
593 | return "numerical labels"
594 |
595 | def idstr(self):
596 | return "numlabels"
597 |
598 | class SentenceBucketer(Bucketer):
599 |
600 | def calc_bucket(self, val, ref=None, src=None, out_label=None, ref_label=None):
601 | """
602 | Calculate the bucket for a particular sentence
603 |
604 | Args:
605 | val: The sentence to calculate the bucket for
606 | ref: The reference sentence, if it exists
607 | src: The source sentence, if it exists
608 |       ref_label: The label of the reference sentence, if it exists
609 |       out_label: The label of the output sentence, if it exists
610 |
611 | Returns:
612 | An integer ID of the bucket
613 | """
614 | raise NotImplementedError('calc_bucket must be implemented in subclasses of SentenceBucketer')
615 |
616 | def create_bucketed_corpus(self, out, ref=None, src=None, ref_labels=None, out_labels=None):
617 |     bucketed_corpus = [([], [], []) for _ in self.bucket_strs]  # (out, ref, src) lists per bucket; ref slot must be a list since ref defaults to out below and is always appended
618 | if ref is None:
619 | ref = out
620 |
621 | if ref_labels is None:
622 | ref_labels = out_labels
623 |
624 | src = [None for _ in out] if src is None else src
625 |
626 | for i, (out_words, ref_words, src_words) in enumerate(zip(out, ref, src)):
627 | bucket = self.calc_bucket(out_words, ref_words, src_words, label=(ref_labels[i][0] if ref_labels else None))
628 |
629 | bucketed_corpus[bucket][0].append(out_words)
630 | bucketed_corpus[bucket][1].append(ref_words)
631 | bucketed_corpus[bucket][2].append(src_words)
632 |
633 | return bucketed_corpus
634 |
635 |
636 | class ScoreSentenceBucketer(SentenceBucketer):
637 | """
638 | Bucket sentences by some score (e.g. BLEU)
639 | """
640 |
641 | def __init__(self, score_type, bucket_cutoffs=None, case_insensitive=False):
642 | self.score_type = score_type
643 | self.scorer = scorers.create_scorer_from_profile(score_type)
644 | if bucket_cutoffs is None:
645 | bucket_cutoffs = [x * self.scorer.scale / 10.0 for x in range(1,10)]
646 | self.set_bucket_cutoffs(bucket_cutoffs, num_type='float')
647 | self.case_insensitive = case_insensitive
648 |
649 | def calc_bucket(self, val, ref=None, src=None, label=None):
650 | if self.case_insensitive:
651 |       return self.cutoff_into_bucket(self.scorer.score_sentence(corpus_utils.lower(ref), corpus_utils.lower(val), src)[0])
652 | else:
653 | return self.cutoff_into_bucket(self.scorer.score_sentence(ref, val, src)[0])
654 |
655 | def name(self):
656 | return self.scorer.name()
657 |
658 | def idstr(self):
659 | return self.scorer.idstr()
660 |
661 | class LengthSentenceBucketer(SentenceBucketer):
662 | """
663 | Bucket sentences by length
664 | """
665 |
666 | def __init__(self, bucket_cutoffs=None):
667 | if bucket_cutoffs is None:
668 | bucket_cutoffs = [10, 20, 30, 40, 50, 60]
669 | self.set_bucket_cutoffs(bucket_cutoffs, num_type='int')
670 |
671 | def calc_bucket(self, val, ref=None, src=None, label=None):
672 | return self.cutoff_into_bucket(len(ref))
673 |
674 | def name(self):
675 | return "length"
676 |
677 | def idstr(self):
678 | return "length"
679 |
680 | class LengthDiffSentenceBucketer(SentenceBucketer):
681 | """
682 |   Bucket sentences by the difference in length between output and reference
683 | """
684 |
685 | def __init__(self, bucket_cutoffs=None):
686 | if bucket_cutoffs is None:
687 | bucket_cutoffs = [-20, -10, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 11, 21]
688 | self.set_bucket_cutoffs(bucket_cutoffs, num_type='int')
689 |
690 | def calc_bucket(self, val, ref=None, src=None, label=None):
691 | return self.cutoff_into_bucket(len(val) - len(ref))
692 |
693 | def name(self):
694 | return "len(output)-len(reference)"
695 |
696 | def idstr(self):
697 | return "lengthdiff"
698 |
699 | class LabelSentenceBucketer(SentenceBucketer):
700 |
701 | def __init__(self, label_set=None):
702 | """
703 | A bucketer that buckets sentences by their labels.
704 |
705 | Args:
706 | label_set: The set of labels to use as buckets. This can be a list, or a string separated by '+'s.
707 | """
708 | if type(label_set) == str:
709 | label_set = label_set.split('+')
710 | self.bucket_strs = label_set + ['other']
711 | label_set_len = len(label_set)
712 | self.bucket_map = defaultdict(lambda: label_set_len)
713 | for i, l in enumerate(label_set):
714 | self.bucket_map[l] = i
715 |
716 | def calc_bucket(self, val, ref=None, src=None, label=None):
717 | return self.bucket_map[label]
718 |
719 | def name(self):
720 | return "labels"
721 |
722 | def idstr(self):
723 | return "labels"
724 |
725 | class MultiLabelSentenceBucketer(SentenceBucketer):
726 |
727 | def __init__(self, label_set=None):
728 | """
729 | A bucketer that buckets sentences by their labels.
730 |
731 | Args:
732 | label_set: The set of labels to use as buckets. This can be a list, or a string separated by '+'s.
733 | """
734 | if type(label_set) == str:
735 | label_set = label_set.split('+')
736 | self.bucket_strs = label_set + ['other']
737 | label_set_len = len(label_set)
738 | self.bucket_map = defaultdict(lambda: label_set_len)
739 | for i, l in enumerate(label_set):
740 | self.bucket_map[l] = i
741 |
742 | def calc_bucket(self, val, ref=None, src=None, label=None):
743 | label = label.split('+')
744 | return [self.bucket_map[l] for l in label]
745 |
746 | def name(self):
747 | return "multilabels"
748 |
749 | def idstr(self):
750 | return "multilabels"
751 |
752 | class NumericalLabelSentenceBucketer(SentenceBucketer):
753 |
754 | def __init__(self, bucket_cutoffs=None):
755 | """
756 | A bucketer that buckets sentences by labels that are numerical values.
757 |
758 | Args:
759 | bucket_cutoffs: Cutoffs for each bucket.
760 |         The first bucket will be range(0, bucket_cutoffs[0]).
761 |         Middle buckets will be range(bucket_cutoffs[i-1], bucket_cutoffs[i]).
762 |         The final bucket will be everything greater than or equal to bucket_cutoffs[-1].
763 | """
764 | if bucket_cutoffs is None:
765 | bucket_cutoffs = [0.25, 0.5, 0.75]
766 | self.set_bucket_cutoffs(bucket_cutoffs)
767 |
768 | def calc_bucket(self, val, ref=None, src=None, label=None):
769 | return self.cutoff_into_bucket(float(label))
770 |
771 | def name(self):
772 | return "numerical labels"
773 |
774 | def idstr(self):
775 | return "numlabels"
776 |
777 | def create_word_bucketer_from_profile(bucket_type,
778 | freq_counts=None, freq_count_file=None, freq_corpus_file=None, freq_data=None,
779 | label_set=None,
780 | bucket_cutoffs=None,
781 | case_insensitive=False):
782 | if type(bucket_cutoffs) == str:
783 | bucket_cutoffs = [arg_utils.parse_intfloat(x) for x in bucket_cutoffs.split(':')]
784 | if bucket_type == 'freq':
785 | return FreqWordBucketer(
786 | freq_counts=freq_counts,
787 | freq_count_file=freq_count_file,
788 | freq_corpus_file=freq_corpus_file,
789 | freq_data=freq_data,
790 | bucket_cutoffs=bucket_cutoffs,
791 | case_insensitive=case_insensitive)
792 | if bucket_type == 'case':
793 | return CaseWordBucketer()
794 | elif bucket_type == 'label':
795 | return LabelWordBucketer(
796 | label_set=label_set)
797 | elif bucket_type == 'multilabel':
798 | return MultiLabelWordBucketer(
799 | label_set=label_set)
800 | elif bucket_type == 'numlabel':
801 | return NumericalLabelWordBucketer(
802 | bucket_cutoffs=bucket_cutoffs)
803 | else:
804 | raise ValueError(f'Illegal bucket type {bucket_type}')
805 |
806 | def create_sentence_bucketer_from_profile(bucket_type,
807 | score_type=None,
808 | bucket_cutoffs=None,
809 | label_set=None,
810 | case_insensitive=False):
811 | if type(bucket_cutoffs) == str:
812 | bucket_cutoffs = [arg_utils.parse_intfloat(x) for x in bucket_cutoffs.split(':')]
813 | if bucket_type == 'score':
814 | return ScoreSentenceBucketer(score_type, bucket_cutoffs=bucket_cutoffs, case_insensitive=case_insensitive)
815 | elif bucket_type == 'length':
816 | return LengthSentenceBucketer(bucket_cutoffs=bucket_cutoffs)
817 | elif bucket_type == 'lengthdiff':
818 | return LengthDiffSentenceBucketer(bucket_cutoffs=bucket_cutoffs)
819 | elif bucket_type == 'label':
820 | return LabelSentenceBucketer(label_set=label_set)
821 | elif bucket_type == 'multilabel':
822 | return MultiLabelSentenceBucketer(
823 | label_set=label_set)
824 | elif bucket_type == 'numlabel':
825 | return NumericalLabelSentenceBucketer(bucket_cutoffs=bucket_cutoffs)
826 | else:
827 |     raise ValueError(f'Illegal bucket type {bucket_type}')
828 |
--------------------------------------------------------------------------------
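A quick usage sketch for the two factory functions above (not part of the package; the cutoff string and sentences are invented, and it assumes `cutoff_into_bucket`/`bucket_strs` behave as the bucketer base classes earlier in this file define):

    from compare_mt import bucketers

    # Bucket sentences by output/reference length difference; colon-separated
    # cutoff strings are parsed by the factory exactly as shown above.
    bucketer = bucketers.create_sentence_bucketer_from_profile(
        'lengthdiff', bucket_cutoffs='-5:0:5')
    ref = 'this is the reference sentence'.split()
    out = 'this is a somewhat longer system output'.split()
    bucket_index = bucketer.calc_bucket(out, ref=ref)  # len(out) - len(ref) = 2
    print(bucketer.bucket_strs[bucket_index])
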
/compare_mt/cache_utils.py:
--------------------------------------------------------------------------------
1 | from functools import lru_cache
2 | from nltk.stem.porter import PorterStemmer
3 |
4 | def extract_cache_dicts(cache_dicts, key_list, num_out):
5 | if cache_dicts is not None:
6 | if len(cache_dicts) != num_out:
7 |       raise ValueError(f'Length of cache_dicts ({len(cache_dicts)}) should equal the number of output files ({num_out})!')
8 | if len(key_list) == 1:
9 | return [c[key_list[0]] for c in cache_dicts]
10 | return zip(*[[c[k] for k in key_list] for c in cache_dicts])
11 |
12 | return [None]*len(key_list)
13 |
14 | def return_cache_dict(key_list, value_list):
15 | for v in value_list:
16 | if len(v) != 1:
17 |       raise ValueError('Caching is only supported for one system at a time!')
18 | cache_dict = {k:v[0] for (k, v) in zip(key_list, value_list)}
19 | return cache_dict
20 |
21 | class CachedPorterStemmer(PorterStemmer):
22 | """A wrapper class for PorterStemmer that uses LRU cache to reduce latency"""
23 | def __init__(self, mode=PorterStemmer.NLTK_EXTENSIONS):
24 | super().__init__(mode)
25 |
26 | @lru_cache(maxsize=50000)
27 | def stem(self, word, to_lowercase=True):
28 | return super().stem(word, to_lowercase)
--------------------------------------------------------------------------------
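A round-trip sketch of the two cache helpers above (hypothetical values): `return_cache_dict` packs single-system results under string keys, and `extract_cache_dicts` unpacks them on a later call.

    from compare_mt.cache_utils import extract_cache_dicts, return_cache_dict

    key_list = ['sent_scores']
    # Caching supports one system at a time, so each value is a 1-element list.
    cache = return_cache_dict(key_list, [[[0.1, 0.7, 0.4]]])
    (scores,) = extract_cache_dicts([cache], key_list, num_out=1)
    print(scores)  # [0.1, 0.7, 0.4]
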
/compare_mt/compare_ll_main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # In-package imports
4 | from compare_mt import corpus_utils
5 | from compare_mt import bucketers
6 | from compare_mt import arg_utils
7 | from compare_mt import print_utils
8 | from compare_mt import formatting
9 |
10 | def print_word_likelihood_report(ref, lls, bucket_type='freq', bucket_cutoffs=None,
11 | freq_count_file=None, freq_corpus_file=None,
12 | label_corpus=None, label_set=None,
13 | case_insensitive=False):
14 | """
15 | Print a report comparing the word log likelihood.
16 |
17 | Args:
18 |     ref: the reference corpus of words over which the likelihoods are computed
19 | lls: likelihoods corresponding to each word in ref from the systems
20 | bucket_type: A string specifying the way to bucket words together to calculate average likelihood
21 | bucket_cutoffs: The boundaries between buckets, specified as a colon-separated string.
22 | freq_corpus_file: When using "freq" as a bucketer, which corpus to use to calculate frequency.
23 | freq_count_file: An alternative to freq_corpus that uses a count file in "word\tfreq" format.
24 | label_corpus: When using "label" as bucket type, the corpus containing the labels
25 | corresponding to each word in the corpus
26 | label_set: the permissible set of labels when using "label" as a bucket type
27 | case_insensitive: A boolean specifying whether to turn on the case insensitive option
28 | """
29 |   case_insensitive = (case_insensitive == 'True') if isinstance(case_insensitive, str) else bool(case_insensitive)
30 |
31 | bucketer = bucketers.create_word_bucketer_from_profile(bucket_type=bucket_type,
32 | bucket_cutoffs=bucket_cutoffs,
33 | freq_count_file=freq_count_file,
34 | freq_corpus_file=freq_corpus_file,
35 | label_set=label_set,
36 | case_insensitive=case_insensitive)
37 |
38 | if type(label_corpus) == str:
39 | label_corpus = corpus_utils.load_tokens(label_corpus)
40 |
41 | if label_corpus is not None:
42 | ref = label_corpus
43 |
44 |   lls_out = [list(bucketer.calc_bucketed_likelihoods(ref, ll)) for ll in lls]
45 |
46 | print(f'--- average word log likelihood by {bucketer.name()} bucket')
47 | for i, bucket_str in enumerate(bucketer.bucket_strs):
48 |     print(bucket_str + "\t", end='')
49 | for ll_out in lls_out:
50 | print(f"{formatting.fmt(ll_out[i])}\t", end="")
51 | print()
52 |
53 | def main():
54 | parser = argparse.ArgumentParser(
55 |     description='Program to compare the word log likelihoods assigned by different systems',
56 | )
57 | parser.add_argument('--ref-file', type=str, dest='ref_file',
58 | help='A path to a reference file over which the likelihoods are being computed/compared')
59 | parser.add_argument('--ll-files', type=str, nargs='+', dest='ll_files',
60 | help='A path to file containing log likelihoods for ref-file generated by systems')
61 | parser.add_argument('--compare-word-likelihoods', type=str, dest='compare_word_likelihoods', nargs='*',
62 | default=['bucket_type=freq'],
63 | help="""
64 | Compare word log likelihoods by buckets. Can specify arguments in 'arg1=val1,arg2=val2,...' format.
65 | See documentation for 'print_word_likelihood_report' to see which arguments are available.
66 | """)
67 | parser.add_argument('--decimals', type=int, default=4,
68 | help="Number of decimals to print for floating point numbers")
69 |
70 | args = parser.parse_args()
71 |
72 |   # Set formatting
73 |   formatting.fmt.set_decimals(args.decimals)
74 | 
75 |   ref = corpus_utils.load_tokens(args.ref_file)
76 |   lls = [corpus_utils.load_nums(x) for x in args.ll_files]
77 | 
78 |   # Word likelihood analysis
79 |   if args.compare_word_likelihoods:
80 |     print_utils.print_header('Word Likelihood Analysis')
81 |     for profile in args.compare_word_likelihoods:
82 |       kargs = arg_utils.parse_profile(profile)
83 |       print_word_likelihood_report(ref, lls, **kargs)
84 |       print()
85 | 
86 | 
87 | if __name__ == '__main__':
88 |   main()
89 | 
--------------------------------------------------------------------------------
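A hedged sketch of driving the report function above directly with this repo's example/ data, bypassing argparse; passing freq_corpus_file explicitly is an assumption here, supplying the frequency source the 'freq' bucketer requires.

    from compare_mt import corpus_utils
    from compare_mt.compare_ll_main import print_word_likelihood_report

    ref = corpus_utils.load_tokens('example/ll_test.txt')
    lls = [corpus_utils.load_nums('example/ll_test.sys1.likelihood'),
           corpus_utils.load_nums('example/ll_test.sys2.likelihood')]
    print_word_likelihood_report(ref, lls, bucket_type='freq',
                                 freq_corpus_file='example/ll_test.txt')
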
/compare_mt/corpus_utils.py:
--------------------------------------------------------------------------------
1 | def iterate_tokens(filename):
2 | with open(filename, "r", encoding="utf-8") as f:
3 | for line in f:
4 | yield line.strip().split(' ')
5 |
6 | def load_tokens(filename):
7 | return list(iterate_tokens(filename))
8 |
9 | def iterate_nums(filename):
10 | with open(filename, "r", encoding="utf-8") as f:
11 | for line in f:
12 | yield [float(i) for i in line.strip().split(' ')]
13 |
14 | def load_nums(filename):
15 | return list(iterate_nums(filename))
16 |
17 | def iterate_alignments(filename):
18 | with open(filename, "r", encoding="utf-8") as f:
19 | for line in f:
20 | try:
21 | yield [(int(src),int(trg)) for (src,trg) in [x.split('-') for x in line.strip().split(' ')]]
22 |       except ValueError:
23 | raise ValueError(f'Poorly formed alignment line in {filename}:\n{line}')
24 |
25 | def load_alignments(filename):
26 | return list(iterate_alignments(filename))
27 |
28 | def lower(inp):
29 | return inp.lower() if type(inp) == str else [lower(x) for x in inp]
30 |
31 | def list2str(l):
32 |   return ' '.join(str(s) for s in l)
33 | 
34 | def write_tokens(filename, ls):
35 |   with open(filename, 'w', encoding='utf-8') as f:
36 |     f.write('\n'.join(list2str(l) for l in ls))
37 | 
--------------------------------------------------------------------------------
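The loaders above expect one space-tokenized sentence per line, one number per token for likelihood files, and 'src-trg' integer index pairs for alignment files. A small sketch against this repo's example/ data:

    from compare_mt import corpus_utils

    toks = corpus_utils.load_tokens('example/ted.ref.eng')          # [['word', ...], ...]
    aligns = corpus_utils.load_alignments('example/ted.ref.align')  # [[(src, trg), ...], ...]
    assert all(isinstance(s, int) and isinstance(t, int) for (s, t) in aligns[0])
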
/compare_mt/formatting.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | class Formatter(object):
4 |
5 | latex_substitutions = {
6 |     re.compile(r"\["): "{[}",
7 |     re.compile(r"\]"): "{]}",
8 | re.compile("<"): r"\\textless",
9 | re.compile(">"): r"\\textgreater"
10 | }
11 |
12 | def __init__(self, decimals=4):
13 | self.set_decimals(decimals)
14 |
15 | def set_decimals(self, decimals):
16 | self.decimals = decimals
17 |
18 | def escape_latex(self, x):
19 | """Adds escape sequences wherever needed to make the output
20 |     LaTeX compatible"""
21 | for pat, replace_with in self.latex_substitutions.items():
22 | x = pat.sub(replace_with, x)
23 | return x
24 |
25 | def __call__(self, x, latex=True):
26 | """Convert object to string with controlled decimals"""
27 | if isinstance(x, str):
28 | return self.escape_latex(x) if latex else x
29 | elif isinstance(x, int):
30 | return f"{x:d}"
31 | elif isinstance(x, float):
32 | return f"{x:.{self.decimals}f}"
33 | else:
34 |       return str(x)
35 |
36 | fmt = Formatter(decimals=4)
37 |
--------------------------------------------------------------------------------
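A short demonstration of the shared `fmt` singleton above; note that set_decimals mutates global state, and strings are LaTeX-escaped unless latex=False is passed.

    from compare_mt.formatting import fmt

    fmt.set_decimals(2)
    print(fmt(3.14159))             # '3.14'  (floats use the configured decimals)
    print(fmt(42))                  # '42'    (ints are printed exactly)
    print(fmt('a<b'))               # 'a\textlessb'  (escaped for LaTeX by default)
    print(fmt('a<b', latex=False))  # 'a<b'
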
/compare_mt/ngram_utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import itertools
3 |
4 | def sent_ngrams_list(words, n):
5 | """
6 | Create a list with all the n-grams in a sentence
7 |
8 | Arguments:
9 | words: A list of strings representing a sentence
10 | n: The ngram length to consider
11 |
12 | Returns:
13 | A list of n-grams in the sentence
14 | """
15 | word_ngram = []
16 | for i in range(len(words) - n + 1):
17 | ngram = tuple(words[i:i + n])
18 | word_ngram.append(ngram)
19 | return word_ngram
20 |
21 | def iterate_sent_ngrams(words, labels=None, min_length=1, max_length=4):
22 | """
23 |   Iterate over all the n-grams in a sentence
24 |
25 | Arguments:
26 | words: A list of strings representing a sentence
27 | labels: A list of labels on each word in the sentence, optional (will use `words` if not specified)
28 | min_length: The minimum ngram length to consider
29 | max_length: The maximum ngram length to consider
30 |
31 | Returns:
32 | An iterator over n-grams in the sentence with both words and labels
33 | """
34 | if labels is not None and len(labels) != len(words):
35 | raise ValueError(f'length of labels and sentence must be the same but got'
36 | f' {len(words)} != {len(labels)} at\n{words}\n{labels}')
37 | for n in range(min_length-1, max_length):
38 | for i in range(len(words) - n):
39 | word_ngram = tuple(words[i:i + n + 1])
40 | label_ngram = tuple(labels[i:i + n + 1]) if (labels is not None) else word_ngram
41 | yield word_ngram, label_ngram
42 |
43 | def compare_ngrams(ref, out, ref_labels=None, out_labels=None, min_length=1, max_length=4):
44 | """
45 | Compare n-grams appearing in the reference sentences and output
46 |
47 | Args:
48 | ref: A list of reference sentences
49 | out: A list of output sentences
50 | ref_labels: Alternative labels for reference words (e.g. POS tags) to use when aggregating counts
51 | out_labels: Alternative labels for output words (e.g. POS tags) to use when aggregating counts
52 | min_length: The minimum length of n-grams to consider
53 | max_length: The maximum length of n-grams to consider
54 |
55 | Returns:
56 | A tuple of dictionaries including
57 | total: the total number of n-grams in the output
58 | match: the total number of matched n-grams appearing in both output and reference
59 | over: the total number of over-generated n-grams appearing in output but not reference
60 |       under: the total number of under-generated n-grams appearing in the reference but not the output
61 | """
62 | if (ref_labels is None) != (out_labels is None):
63 | raise ValueError('ref_labels or out_labels must both be either None or not None')
64 | total, match, over, under = [defaultdict(lambda: 0) for _ in range(4)]
65 | if ref_labels is None: ref_labels = []
66 | if out_labels is None: out_labels = []
67 | for ref_sent, out_sent, ref_lab, out_lab in itertools.zip_longest(ref, out, ref_labels, out_labels):
68 | # Find the number of reference n-grams (on a word level)
69 | ref_ngrams = list(iterate_sent_ngrams(ref_sent, labels=ref_lab, min_length=min_length, max_length=max_length))
70 | ref_word_counts = defaultdict(lambda: 0)
71 | for ref_w, ref_l in ref_ngrams:
72 | ref_word_counts[ref_w] += 1
73 | # Step through the output ngrams and find matched and overproduced ones
74 | for out_w, out_l in iterate_sent_ngrams(out_sent, labels=out_lab, min_length=min_length, max_length=max_length):
75 | total[out_l] += 1
76 | if ref_word_counts[out_w] > 0:
77 | match[out_l] += 1
78 | ref_word_counts[out_w] -= 1
79 | else:
80 | over[out_l] += 1
81 | # Remaining ones are underproduced
82 | # (do reverse order just to make ordering consistent for over and under, shouldn't matter much)
83 | for ref_w, ref_l in reversed(ref_ngrams):
84 | if ref_word_counts[ref_w] > 0:
85 | under[ref_l] += 1
86 | ref_word_counts[ref_w] -= 1
87 | return total, match, over, under
88 |
--------------------------------------------------------------------------------
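A tiny worked example of `compare_ngrams` on made-up sentences, showing where each n-gram lands among the four counters (keys are word tuples because no labels are given):

    from compare_mt.ngram_utils import compare_ngrams

    ref = [['the', 'cat', 'sat']]
    out = [['the', 'cat', 'ate']]
    total, match, over, under = compare_ngrams(ref, out, min_length=1, max_length=2)
    print(match[('the', 'cat')])  # 1: bigram appears in both output and reference
    print(over[('ate',)])         # 1: produced by the system, absent from the reference
    print(under[('sat',)])        # 1: in the reference but missed by the system
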
/compare_mt/print_utils.py:
--------------------------------------------------------------------------------
1 | def print_header(header):
2 | print(f'********************** {header} ************************')
--------------------------------------------------------------------------------
/compare_mt/reporters.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('agg')
3 | from matplotlib import pyplot as plt
4 | from cycler import cycler
5 | plt.rcParams['font.family'] = 'sans-serif'
6 | plt.rcParams['axes.prop_cycle'] = cycler(color=["#7293CB", "#E1974C", "#84BA5B", "#D35E60", "#808585", "#9067A7", "#AB6857", "#CCC210"])
7 | import numpy as np
8 | import os
9 | import itertools
10 | from compare_mt.formatting import fmt
11 |
12 | from functools import partial
13 | from http.server import SimpleHTTPRequestHandler, HTTPServer
14 | import socket
15 | from pathlib import Path
16 | import logging as log
17 |
18 | log.basicConfig(level=log.INFO)
19 |
20 | # Global variables used by all reporters. These are set by compare_mt_main.py
21 | sys_names = None
22 | fig_size = None
23 |
24 | # The CSS style file to use
25 | css_style = """
26 | html {
27 | font-family: sans-serif;
28 | }
29 |
30 | table, th, td {
31 | border: 1px solid black;
32 | }
33 |
34 | th, td {
35 | padding: 2px;
36 | }
37 |
38 | tr:hover {background-color: #f5f5f5;}
39 |
40 | tr:nth-child(even) {background-color: #f2f2f2;}
41 |
42 | th {
43 | background-color: #396AB1;
44 | color: white;
45 | }
46 |
47 | em {
48 | font-weight: bold;
49 | }
50 |
51 | caption {
52 | font-size: 14pt;
53 | font-weight: bold;
54 | }
55 |
56 | table {
57 | border-collapse: collapse;
58 | }
59 | """
60 |
61 | # The Javascript header to use
62 | javascript_style = """
63 | function showhide(elem) {
64 | var x = document.getElementById(elem);
65 | if (x.style.display === "none") {
66 | x.style.display = "block";
67 | } else {
68 | x.style.display = "none";
69 | }
70 | }
71 | """
72 |
73 | fig_counter, tab_counter = 0, 0
74 | def next_fig_id():
75 | global fig_counter
76 | fig_counter += 1
77 | return f'{fig_counter:03d}'
78 | def next_tab_id():
79 | global tab_counter
80 | tab_counter += 1
81 | return f'{tab_counter:03d}'
82 |
83 | def make_bar_chart(datas,
84 | output_directory, output_fig_file, output_fig_format='png',
85 | errs=None, title=None, xlabel=None, xticklabels=None, ylabel=None):
86 | fig, ax = plt.subplots(figsize=fig_size)
87 | ind = np.arange(len(datas[0]))
88 | width = 0.7/len(datas)
89 | bars = []
90 | for i, data in enumerate(datas):
91 |     err = errs[i] if errs is not None else None
92 | bars.append(ax.bar(ind+i*width, data, width, bottom=0, yerr=err))
93 | # Set axis/title labels
94 | if title is not None:
95 | ax.set_title(title)
96 | if xlabel is not None:
97 | ax.set_xlabel(xlabel)
98 | if ylabel is not None:
99 | ax.set_ylabel(ylabel)
100 | if xticklabels is not None:
101 | ax.set_xticks(ind + width / 2)
102 | ax.set_xticklabels(xticklabels)
103 | plt.xticks(rotation=70)
104 | else:
105 | ax.xaxis.set_visible(False)
106 |
107 | ax.legend(bars, sys_names)
108 | ax.autoscale_view()
109 |
110 | if not os.path.exists(output_directory):
111 | os.makedirs(output_directory)
112 | out_file = os.path.join(output_directory, f'{output_fig_file}.{output_fig_format}')
113 | plt.savefig(out_file, format=output_fig_format, bbox_inches='tight')
114 |
115 | def html_img_reference(fig_file, title):
116 | latex_code_pieces = [r"\begin{figure}[h]",
117 | r" \centering",
118 | r" \includegraphics{" + fig_file + ".pdf}",
119 | r" \caption{" + title + "}",
120 | r" \label{fig:" + fig_file + "}",
121 | r"\end{figure}"]
122 | latex_code = "\n".join(latex_code_pieces)
123 |   return (f'<img src="{fig_file}.png" alt="{title}"> <br/>' +
124 |           f'<button onclick="showhide(\'{fig_file}_latex\')">Show/Hide LaTeX</button> <br/>' +
125 |           f'<pre id="{fig_file}_latex" style="display:none">{latex_code}</pre>')
126 |
127 | class Report:
128 | # def __init__(self, iterable=(), **kwargs):
129 | # # Initialize a report by a dictionary which contains all the statistics
130 | # self.__dict__.update(iterable, **kwargs)
131 |
132 | def print(self):
133 | raise NotImplementedError('print must be implemented in subclasses of Report')
134 |
135 | def plot(self, output_directory, output_fig_file, output_fig_type):
136 | raise NotImplementedError('plot must be implemented in subclasses of Report')
137 |
138 | def print_header(self, header):
139 | print(f'********************** {header} ************************')
140 |
141 | def print_tabbed_table(self, tab):
142 | for x in tab:
143 |       print('\t'.join([fmt(y, latex=False) if y is not None else '' for y in x]))
144 | print()
145 |
146 | def generate_report(self, output_fig_file=None, output_fig_format=None, output_directory=None):
147 | self.print()
148 |
149 | class ScoreReport(Report):
150 | def __init__(self, scorer, scores, strs,
151 | wins=None, sys_stats=None, prob_thresh=0.05,
152 | title=None):
153 | self.scorer = scorer
154 | self.scores = scores
155 | self.strs = [f'{fmt(x)} ({y})' if y else fmt(x) for (x,y) in zip(scores,strs)]
156 | self.wins = wins
157 | self.sys_stats = sys_stats
158 | self.output_fig_file = f'{next_fig_id()}-score-{scorer.idstr()}'
159 | self.prob_thresh = prob_thresh
160 | self.title = scorer.name() if not title else title
161 |
162 | def winstr_pval(self, my_wins):
163 | if 1-my_wins[0] < self.prob_thresh:
164 | winstr = 's1>s2'
165 | elif 1-my_wins[1] < self.prob_thresh:
166 | winstr = 's2>s1'
167 | else:
168 | winstr = '-'
169 | pval = 1-(my_wins[0] if my_wins[0] > my_wins[1] else my_wins[1])
170 | return winstr, pval
171 |
172 | def scores_to_tables(self):
173 | if self.wins is None:
174 | # Single table with just scores
175 | return [[""]+sys_names, [self.scorer.name()]+self.strs], None
176 | elif len(self.scores) == 1:
177 | # Single table with scores for one system
178 | return [
179 | [""]+sys_names,
180 | [self.scorer.name()]+self.strs,
181 | [""]+[f'[{fmt(x["lower_bound"])},{fmt(x["upper_bound"])}]' for x in self.sys_stats]
182 | ], None
183 | elif len(self.scores) == 2:
184 | # Single table with scores and wins for two systems
185 | winstr, pval = self.winstr_pval(self.wins[0][1])
186 | return [
187 | [""]+sys_names+["Win?"],
188 | [self.scorer.name()]+self.strs+[winstr],
189 | [""]+[f'[{fmt(x["lower_bound"])},{fmt(x["upper_bound"])}]' for x in self.sys_stats]+[f'p={fmt(pval)}']
190 | ], None
191 | else:
192 | # Table with scores, and separate one with wins for multiple systems
193 | wptable = [['v s1 / s2 ->'] + [sys_names[i] for i in range(1,len(self.scores))]]
194 | for i in range(0, len(self.scores)-1):
195 | wptable.append([sys_names[i]] + [""] * (len(self.scores)-1))
196 | for (left,right), my_wins in self.wins:
197 | winstr, pval = self.winstr_pval(my_wins)
198 | wptable[left+1][right] = f'{winstr} (p={fmt(pval)})'
199 | return [[""]+sys_names, [self.scorer.name()]+self.strs], wptable
200 |
201 | def print(self):
202 | aggregate_table, win_table = self.scores_to_tables()
203 | self.print_header('Aggregate Scores')
204 | print(f'{self.title}:')
205 | self.print_tabbed_table(aggregate_table)
206 | if win_table:
207 | self.print_tabbed_table(win_table)
208 |
209 | def plot(self, output_directory, output_fig_file, output_fig_format='pdf'):
210 | sys = [[score] for score in self.scores]
211 | if self.wins:
212 | sys_errs = [np.array([ [score-stat['lower_bound']], [stat['upper_bound']-score] ]) for (score,stat) in zip(self.scores, self.sys_stats)]
213 | else:
214 | sys_errs = None
215 | xticklabels = None
216 |
217 | make_bar_chart(sys,
218 | output_directory, output_fig_file,
219 | output_fig_format=output_fig_format,
220 | errs=sys_errs, ylabel=self.scorer.name(),
221 | xticklabels=xticklabels)
222 |
223 | def html_content(self, output_directory):
224 | aggregate_table, win_table = self.scores_to_tables()
225 | html = html_table(aggregate_table, title=self.title)
226 | if win_table:
227 | html += html_table(win_table, title=f'{self.scorer.name()} Wins')
228 | for ext in ('png', 'pdf'):
229 | self.plot(output_directory, self.output_fig_file, ext)
230 | html += html_img_reference(self.output_fig_file, 'Score Comparison')
231 | return html
232 |
233 | class WordReport(Report):
234 | def __init__(self, bucketer, statistics,
235 | acc_type, header,
236 | examples=None,
237 | bucket_cnts=None,
238 | bucket_intervals=None,
239 | src_sents=None,
240 | ref_sents=None, ref_labels=None,
241 | out_sents=None, out_labels=None,
242 | src_labels=None, ref_aligns=None,
243 | title=None):
244 | self.bucketer = bucketer
245 | self.statistics = [[s for s in stat] for stat in statistics]
246 | self.examples = examples
247 | self.bucket_cnts = bucket_cnts
248 | self.bucket_intervals = bucket_intervals
249 | self.src_sents = src_sents
250 | self.ref_sents = ref_sents
251 | self.ref_labels = ref_labels
252 | self.out_sents = out_sents
253 | self.out_labels = out_labels
254 | self.src_labels = src_labels
255 | self.ref_aligns = ref_aligns
256 | self.acc_type = acc_type
257 | self.header = header
258 | self.acc_type_map = {'prec': 3, 'rec': 4, 'fmeas': 5}
259 | self.output_fig_file = f'{next_fig_id()}-wordacc-{bucketer.name()}'
260 | self.title = title if title else f'word {acc_type} by {bucketer.name()} bucket'
261 |
262 | def print(self):
263 | acc_type_map = self.acc_type_map
264 | bucketer, statistics, acc_type, header = self.bucketer, self.statistics, self.acc_type, self.header
265 | self.print_header(header)
266 | acc_types = acc_type.split('+')
267 | for at in acc_types:
268 | if at not in acc_type_map:
269 | raise ValueError(f'Unknown accuracy type {at}')
270 | aid = acc_type_map[at]
271 | print(f'--- {self.title}')
272 | # first line
273 | print(f'{bucketer.name()}', end='')
274 | if self.bucket_cnts is not None:
275 | print(f'\t# words', end='')
276 | for sn in sys_names:
277 | print(f'\t{sn}', end='')
278 | print()
279 | # stats
280 | for i, bucket_str in enumerate(bucketer.bucket_strs):
281 | print(f'{bucket_str}', end='')
282 | if self.bucket_cnts is not None:
283 | print(f'\t{self.bucket_cnts[i]}', end='')
284 | for j, match in enumerate(statistics):
285 | print(f'\t{fmt(match[i][aid])}', end='')
286 | if self.bucket_intervals is not None:
287 | low, up = self.bucket_intervals[j][i][aid]
288 | print(f' [{fmt(low)}, {fmt(up)}]', end='')
289 | print()
290 | print()
291 |
292 | def plot(self, output_directory, output_fig_file, output_fig_format='pdf'):
293 | acc_types = self.acc_type.split('+')
294 | for at in acc_types:
295 | if at not in self.acc_type_map:
296 | raise ValueError(f'Unknown accuracy type {at}')
297 | aid = self.acc_type_map[at]
298 | sys = [[m[aid] for m in match] for match in self.statistics]
299 | xticklabels = [s for s in self.bucketer.bucket_strs]
300 |
301 | if self.bucket_intervals:
302 | errs = []
303 | for i, match in enumerate(sys):
304 | lows, ups = [], []
305 | for j, score in enumerate(match):
306 | low, up = self.bucket_intervals[i][j][aid]
307 | lows.append(score-low)
308 | ups.append(up-score)
309 | errs.append(np.array([lows, ups]) )
310 | else:
311 | errs = None
312 |
313 | make_bar_chart(sys,
314 | output_directory, output_fig_file,
315 | output_fig_format=output_fig_format,
316 | errs=errs,
317 | xlabel=self.bucketer.name(), ylabel=at,
318 | xticklabels=xticklabels)
319 |
320 | def highlight_words(self, sent, hls=None):
321 | if not hls:
322 | return ' '.join(sent)
323 |     return ' '.join([f'<em>{w}</em>' if hl else w for (w,hl) in zip(sent, hls)])
324 |
325 | def write_examples(self, title, output_directory):
326 | # Create separate examples HTML file
327 | html = ''
328 | for bi, bucket_examples in enumerate(self.examples):
329 | html += f''
330 | html += tag_str('h3', f'Examples for Bucket {self.bucketer.bucket_strs[bi]}')
331 | for tag, examp_ids in bucket_examples:
332 | # Skip ones with no examples
333 | if len(examp_ids) == 0:
334 | continue
335 | html += tag_str('h4', tag)
336 | for eid in examp_ids:
337 | table = [['', 'Output']]
338 | # Find buckets for the examples if it's on the source side (will have alignments in this case)
339 | if self.ref_aligns:
340 | _, _, _, src_buckets, ref_aligns, ref_matches = \
341 | self.bucketer._calc_src_buckets_and_matches(self.src_sents[eid],
342 | self.src_labels[eid] if self.src_labels else None,
343 | self.ref_sents[eid],
344 | self.ref_aligns[eid],
345 | [x[eid] for x in self.out_sents])
346 | src_hls = [x == bi for x in src_buckets]
347 | table.append(['Src', self.highlight_words(self.src_sents[eid], src_hls)])
348 | ref_hls = [False for _ in self.ref_sents[eid]]
349 | out_hls = [[False for _ in x[eid]] for x in self.out_sents]
350 | for sid, tid in self.ref_aligns[eid]:
351 | if src_hls[sid]:
352 | ref_hls[tid] = True
353 | for rm, ohls in zip(ref_matches, out_hls):
354 | if rm[tid] >= 0:
355 | ohls[rm[tid]] = True
356 | # Find buckets for the examples if it's on the target side
357 | else:
358 | _, _, _, ref_buckets, out_buckets, out_matches = \
359 | self.bucketer._calc_trg_buckets_and_matches(self.ref_sents[eid],
360 | self.ref_labels[eid] if self.ref_labels else None,
361 | [x[eid] for x in self.out_sents],
362 | [x[eid] for x in self.out_labels] if self.out_labels else None)
363 | ref_hls = [x == bi for x in ref_buckets]
364 | out_hls = [[(b == bi and m >= 0) for (b,m) in zip(ob, om)] for (ob, om) in zip(out_buckets, out_matches)]
365 | table.append(['Ref', self.highlight_words(self.ref_sents[eid], ref_hls)])
366 | for sn, oss, ohl in itertools.zip_longest(sys_names, self.out_sents, out_hls):
367 | table.append([sn, self.highlight_words(oss[eid], ohl)])
368 | html += html_table(table, None)
369 | with open(f'{output_directory}/{self.output_fig_file}.html', 'w') as example_stream:
370 | example_stream.write(styled_html_message(title, html))
371 |
372 | def html_content(self, output_directory):
373 | acc_type_map = self.acc_type_map
374 | bucketer, matches, acc_type, header = self.bucketer, self.statistics, self.acc_type, self.header
375 | acc_types = acc_type.split('+')
376 |
377 | title = f'Word {acc_type} by {bucketer.name()} bucket' if not self.title else self.title
378 |
379 | if self.examples:
380 | self.write_examples(title, output_directory)
381 |
382 | # Create main HTML content
383 | html = ''
384 | for at in acc_types:
385 | if at not in acc_type_map:
386 | raise ValueError(f'Unknown accuracy type {at}')
387 | aid = acc_type_map[at]
388 | line = [bucketer.name()]
389 | if self.bucket_cnts is not None:
390 | line.append('# words')
391 | line += sys_names
392 | table = [line]
393 | if self.examples:
394 | table[0].append('Examples')
395 | for i, bs in enumerate(bucketer.bucket_strs):
396 | line = [bs]
397 | if self.bucket_cnts is not None:
398 | line.append(f'{self.bucket_cnts[i]}')
399 | for j, match in enumerate(matches):
400 | line.append(f'{fmt(match[i][aid])}')
401 | if self.bucket_intervals is not None:
402 | low, up = self.bucket_intervals[j][i][aid]
403 | line[-1] += f' [{fmt(low)}, {fmt(up)}]'
404 | if self.examples:
405 |         line.append(f'<a href="{self.output_fig_file}.html">Examples</a>')
406 | table += [line]
407 | html += html_table(table, title, latex_ignore_cols={3})
408 | img_name = f'{self.output_fig_file}-{at}'
409 | for ext in ('png', 'pdf'):
410 | self.plot(output_directory, img_name, ext)
411 | html += html_img_reference(img_name, self.header)
412 | return html
413 |
414 | class NgramReport(Report):
415 | def __init__(self, scorelist, report_length, min_ngram_length, max_ngram_length,
416 | matches, compare_type, alpha, compare_directions=[(0, 1)], label_files=None, title=None):
417 | self.scorelist = scorelist
418 | self.report_length = report_length
419 | self.min_ngram_length = min_ngram_length
420 | self.max_ngram_length = max_ngram_length
421 | self.matches = matches
422 | self.compare_type = compare_type
423 | self.label_files = label_files
424 | self.alpha = alpha
425 | self.compare_directions = compare_directions
426 | self.title = title
427 |
428 | def print(self):
429 | report_length = self.report_length
430 | self.print_header('N-gram Difference Analysis')
431 | if self.title:
432 | print(f'--- {self.title}')
433 | else:
434 | print(f'--- min_ngram_length={self.min_ngram_length}, max_ngram_length={self.max_ngram_length}')
435 | print(f' report_length={report_length}, alpha={self.alpha}, compare_type={self.compare_type}')
436 |
437 | if self.label_files is not None:
438 | print(self.label_files)
439 |
440 | for i, (left, right) in enumerate(self.compare_directions):
441 | print(f'--- {report_length} n-grams where {sys_names[left]}>{sys_names[right]} in {self.compare_type}')
442 | for k, v in self.scorelist[i][:report_length]:
443 | print(f"{' '.join(k)}\t{fmt(v)} (sys{left+1}={self.matches[left][k]}, sys{right+1}={self.matches[right][k]})")
444 | print()
445 | print(f'--- {report_length} n-grams where {sys_names[right]}>{sys_names[left]} in {self.compare_type}')
446 | for k, v in reversed(self.scorelist[i][-report_length:]):
447 | print(f"{' '.join(k)}\t{fmt(v)} (sys{left+1}={self.matches[left][k]}, sys{right+1}={self.matches[right][k]})")
448 | print()
449 |
450 | def plot(self, output_directory, output_fig_file, output_fig_format='pdf'):
451 | raise NotImplementedError('Plotting is not implemented for n-gram reports')
452 |
453 | def html_content(self, output_directory=None):
454 | report_length = self.report_length
455 | if self.title:
456 | html = tag_str('p', self.title)
457 | else:
458 | html = tag_str('p', f'min_ngram_length={self.min_ngram_length}, max_ngram_length={self.max_ngram_length}')
459 | html += tag_str('p', f'report_length={report_length}, alpha={self.alpha}, compare_type={self.compare_type}')
460 | if self.label_files is not None:
461 | html += tag_str('p', self.label_files)
462 |
463 | for i, (left, right) in enumerate(self.compare_directions):
464 | title = f'{report_length} n-grams where {sys_names[left]}>{sys_names[right]} in {self.compare_type}'
465 | table = [['n-gram', self.compare_type, f'{sys_names[left]}', f'{sys_names[right]}']]
466 | table.extend([[' '.join(k), fmt(v), self.matches[left][k], self.matches[right][k]] for k, v in self.scorelist[i][:report_length]])
467 | html += html_table(table, title)
468 |
469 | title = f'{report_length} n-grams where {sys_names[right]}>{sys_names[left]} in {self.compare_type}'
470 | table = [['n-gram', self.compare_type, f'{sys_names[left]}', f'{sys_names[right]}']]
471 | table.extend([[' '.join(k), fmt(v), self.matches[left][k], self.matches[right][k]] for k, v in reversed(self.scorelist[i][-report_length:])])
472 | html += html_table(table, title)
473 | return html
474 |
475 | class SentenceReport(Report):
476 |
477 | def __init__(self, bucketer=None, sys_stats=None, statistic_type=None, scorer=None, bucket_cnts=None, bucket_intervals=None, title=None):
478 | self.bucketer = bucketer
479 | self.sys_stats = [[s for s in stat] for stat in sys_stats]
480 | self.statistic_type = statistic_type
481 | self.scorer = scorer
482 | self.bucket_cnts = bucket_cnts
483 | self.bucket_intervals = bucket_intervals
484 | self.yname = scorer.name() if statistic_type == 'score' else statistic_type
485 | self.yidstr = scorer.idstr() if statistic_type == 'score' else statistic_type
486 | self.output_fig_file = f'{next_fig_id()}-sent-{bucketer.idstr()}-{self.yidstr}'
487 | if title:
488 | self.title = title
489 | elif scorer:
490 | self.title = f'bucket type: {bucketer.name()}, statistic type: {scorer.name()}'
491 | else:
492 | self.title = f'bucket type: {bucketer.name()}, statistic type: {statistic_type}'
493 |
494 | def print(self):
495 | self.print_header('Sentence Bucket Analysis')
496 | print(f'--- {self.title}')
497 | # first line
498 | print(f'{self.bucketer.idstr()}', end='')
499 | if self.bucket_cnts is not None:
500 | print(f'\t# sents', end='')
501 | for sn in sys_names:
502 | print(f'\t{sn}', end='')
503 | print()
504 | for i, bs in enumerate(self.bucketer.bucket_strs):
505 | print(f'{bs}', end='')
506 | if self.bucket_cnts is not None:
507 | print(f'\t{self.bucket_cnts[i]}', end='')
508 | for j, stat in enumerate(self.sys_stats):
509 | print(f'\t{fmt(stat[i])}', end='')
510 | if self.bucket_intervals is not None:
511 | interval = self.bucket_intervals[j][i]
512 | low, up = interval['lower_bound'], interval['upper_bound']
513 | print(f' [{fmt(low)}, {fmt(up)}]', end='')
514 | print()
515 | print()
516 |
517 | def plot(self, output_directory='outputs', output_fig_file='word-acc', output_fig_format='pdf'):
518 | sys = self.sys_stats
519 | xticklabels = [s for s in self.bucketer.bucket_strs]
520 |
521 | if self.bucket_intervals:
522 | errs = []
523 | for i, stat in enumerate(sys):
524 | lows, ups = [], []
525 | for j, score in enumerate(stat):
526 | interval = self.bucket_intervals[i][j]
527 | low, up = interval['lower_bound'], interval['upper_bound']
528 | lows.append(score-low)
529 | ups.append(up-score)
530 | errs.append(np.array([lows, ups]) )
531 | else:
532 | errs = None
533 |
534 | make_bar_chart(sys,
535 | output_directory, output_fig_file,
536 | output_fig_format=output_fig_format,
537 | errs=errs,
538 | xlabel=self.bucketer.name(), ylabel=self.yname,
539 | xticklabels=xticklabels)
540 |
541 | def html_content(self, output_directory=None):
542 | line = [self.bucketer.idstr()]
543 | if self.bucket_cnts is not None:
544 | line.append('# sents')
545 | line += sys_names
546 | table = [ line ]
547 | for i, bs in enumerate(self.bucketer.bucket_strs):
548 | line = [bs]
549 | if self.bucket_cnts is not None:
550 |         line.append(f'{self.bucket_cnts[i]}')
551 | for j, stat in enumerate(self.sys_stats):
552 | line.append(fmt(stat[i]))
553 | if self.bucket_intervals is not None:
554 | interval = self.bucket_intervals[j][i]
555 | low, up = interval['lower_bound'], interval['upper_bound']
556 | line[-1] += f' [{fmt(low)}, {fmt(up)}]'
557 | table.extend([line])
558 | html = html_table(table, self.title)
559 | for ext in ('png', 'pdf'):
560 | self.plot(output_directory, self.output_fig_file, ext)
561 | html += html_img_reference(self.output_fig_file, 'Sentence Bucket Analysis')
562 | return html
563 |
564 | class SentenceExampleReport(Report):
565 |
566 | def __init__(self, report_length=None, scorediff_lists=None, scorer=None, ref=None, outs=None, src=None, compare_directions=[(0, 1)], title=None):
567 | self.report_length = report_length
568 | self.scorediff_lists = scorediff_lists
569 | self.scorer = scorer
570 | self.ref = ref
571 | self.outs = outs
572 | self.src = src
573 | self.compare_directions = compare_directions
574 | self.title = title
575 |
576 | def print(self):
577 | self.print_header('Sentence Examples Analysis')
578 | report_length = self.report_length
579 | for cnt, (left, right) in enumerate(self.compare_directions):
580 | ref, out1, out2 = self.ref, self.outs[left], self.outs[right]
581 | sleft, sright = sys_names[left], sys_names[right]
582 | print(f'--- {report_length} sentences where {sleft}>{sright} at {self.scorer.name()}')
583 | for bdiff, s1, s2, str1, str2, i in self.scorediff_lists[cnt][:report_length]:
584 | print(f"{sleft}-{sright}={fmt(-bdiff)}, {sleft}={fmt(s1)}, {sright}={fmt(s2)}")
585 | if self.src and self.src[i]:
586 | print(f"Src: {' '.join(self.src[i])}")
587 |         print(
588 | f"Ref: {' '.join(ref[i])}\n"
589 | f"{sleft}: {' '.join(out1[i])}\n"
590 | f"{sright}: {' '.join(out2[i])}\n"
591 | )
592 |
593 | print(f'--- {report_length} sentences where {sright}>{sleft} at {self.scorer.name()}')
594 | for bdiff, s1, s2, str1, str2, i in self.scorediff_lists[cnt][-report_length:]:
595 | print(f"{sleft}-{sright}={fmt(-bdiff)}, {sleft}={fmt(s1)}, {sright}={fmt(s2)}")
596 | if self.src and self.src[i]:
597 | print(f"Src: {' '.join(self.src[i])}")
598 |         print(
599 | f"Ref: {' '.join(ref[i])}\n"
600 | f"{sleft}: {' '.join(out1[i])}\n"
601 | f"{sright}: {' '.join(out2[i])}\n"
602 | )
603 |
604 | def plot(self, output_directory, output_fig_file, output_fig_format='pdf'):
605 | pass
606 |
607 | def html_content(self, output_directory=None):
608 | report_length = self.report_length
609 | for cnt, (left, right) in enumerate(self.compare_directions):
610 | sleft, sright = sys_names[left], sys_names[right]
611 | ref, out1, out2 = self.ref, self.outs[left], self.outs[right]
612 | html = tag_str('h4', f'{report_length} sentences where {sleft}>{sright} at {self.scorer.name()}')
613 | for bdiff, s1, s2, str1, str2, i in self.scorediff_lists[cnt][:report_length]:
614 | table = [['', 'Output', f'{self.scorer.idstr()}']]
615 | if self.src and self.src[i]:
616 | table.append(['Src', ' '.join(self.src[i]), ''])
617 | table += [
618 | ['Ref', ' '.join(ref[i]), ''],
619 | [f'{sleft}', ' '.join(out1[i]), fmt(s1)],
620 | [f'{sright}', ' '.join(out2[i]), fmt(s2)]
621 | ]
622 |
623 | html += html_table(table, None)
624 |
625 | html += tag_str('h4', f'{report_length} sentences where {sright}>{sleft} at {self.scorer.name()}')
626 | for bdiff, s1, s2, str1, str2, i in self.scorediff_lists[cnt][-report_length:]:
627 | table = [['', 'Output', f'{self.scorer.idstr()}']]
628 | if self.src and self.src[i]:
629 | table.append(['Src', ' '.join(self.src[i]), ''])
630 | table += [
631 | ['Ref', ' '.join(ref[i]), ''],
632 | [f'{sleft}', ' '.join(out1[i]), fmt(s1)],
633 | [f'{sright}', ' '.join(out2[i]), fmt(s2)]
634 | ]
635 |
636 | html += html_table(table, None)
637 |
638 | return html
639 |
640 |
641 | def tag_str(tag, text, new_line=''):
642 |   return f'<{tag}>{new_line} {text} {new_line}</{tag}>'
643 |
644 | def html_table(table, title=None, bold_rows=1, bold_cols=1, latex_ignore_cols={}):
645 |   html = '<table>\n'
646 | if title is not None:
647 | html += tag_str('caption', title)
648 | for i, row in enumerate(table):
649 | tag_type = 'th' if (i < bold_rows) else 'td'
650 | table_row = '\n '.join(tag_str('th' if j < bold_cols else tag_type, rdata) for (j, rdata) in enumerate(row))
651 | html += tag_str('tr', table_row)
652 |   html += '\n</table>\n'
653 |
654 | tab_id = next_tab_id()
655 | latex_code = "\\begin{table}[t]\n \\centering\n"
656 | cs = ['c'] * len(table[0])
657 | if bold_cols != 0:
658 | cs[bold_cols-1] = 'c||'
659 | latex_code += " \\begin{tabular}{"+''.join(cs)+"}\n"
660 | for i, row in enumerate(table):
661 | latex_code += ' & '.join([fmt(x) for c_i, x in enumerate(row) if c_i not in latex_ignore_cols]) + (' \\\\\n' if i != bold_rows-1 else ' \\\\ \\hline \\hline\n')
662 | latex_code += " \\end{tabular}\n \\caption{Caption}\n \\label{tab:table"+tab_id+"}\n\\end{table}"
663 |
664 |   html += (f'<button onclick="showhide(\'{tab_id}_latex\')">Show/Hide LaTeX</button> <br/>' +
665 |            f'<pre id="{tab_id}_latex" style="display:none">