├── .gitignore ├── LICENSE ├── README.md ├── tag-and-generate-data-prep ├── .gitignore ├── README.md ├── data │ └── catcher │ │ └── data.tsv ├── requirements.txt ├── scripts │ ├── prep_generator.sh │ └── prep_tagger.sh └── src │ ├── run.py │ └── style_tags.py └── tag-and-generate-train ├── README.md ├── data └── catcher │ └── data.tsv ├── eval ├── context_eval.py ├── nlg_eval │ ├── LICENSE.md │ ├── MANIFEST.in │ ├── README.md │ ├── __init__.py │ ├── bin │ │ └── nlg-eval │ ├── examples │ │ ├── hyp.txt │ │ ├── ref1.txt │ │ └── ref2.txt │ ├── multi-bleu.perl │ ├── nlgeval │ │ ├── __init__.py │ │ ├── pycocoevalcap │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bleu │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── bleu.py │ │ │ │ └── bleu_scorer.py │ │ │ ├── cider │ │ │ │ ├── __init__.py │ │ │ │ ├── cider.py │ │ │ │ └── cider_scorer.py │ │ │ ├── license.txt │ │ │ ├── meteor │ │ │ │ ├── __init__.py │ │ │ │ ├── data │ │ │ │ │ └── paraphrase-en.gz │ │ │ │ ├── meteor-1.5.jar │ │ │ │ ├── meteor.py │ │ │ │ └── tests │ │ │ │ │ └── test_meteor.py │ │ │ └── rouge │ │ │ │ ├── __init__.py │ │ │ │ └── rouge.py │ │ ├── skipthoughts │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── skipthoughts.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── test_nlgeval.py │ │ ├── utils.py │ │ └── word2vec │ │ │ ├── __init__.py │ │ │ ├── evaluate.py │ │ │ └── generate_w2v_files.py │ ├── requirements.txt │ ├── requirements_py2.txt │ ├── setup.py │ └── test │ │ ├── __init__.py │ │ └── api.py └── run_context_eval.sh ├── requirements.txt ├── scripts ├── inference.sh ├── prepare_bpe.sh ├── train_generator.sh └── train_tagger.sh └── src ├── data.py ├── decoding.py ├── noisy.py ├── subwords.py ├── training.py ├── transformer.py └── translate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Politeness Transfer: A Tag and Generate Approach 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tagger and Generator 2 | 3 | ## Dataset preparation: [tag-and-generate/tagger-generator/tag-and-generate-data-prep](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-data-prep) 4 | ## Training, inference, evaluation: [tag-and-generate/tagger-generator/tag-and-generate-train](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-train) 5 | 6 | --- 7 | 8 | ## Walkthrough 9 | We will now present an example of training the politeness transfer system from scratch. 
10 | The process has five steps: 11 | * [Step 1: Getting the code](#step-1-getting-the-code) 12 | * [Step 2: Getting the training data](#step-2-getting-the-training-data) 13 | * [Step 3: Preparing parallel data for training](#step-3-preparing-parallel-data-for-training) 14 | * [Step 4: Training the tagger and generator](#step-4-training-the-tagger-and-generator) 15 | * [Step 5: Running inference](#step-5-running-inference) 16 | 17 | ### Step 1: Getting the code 18 | 19 | We begin by cloning this repo: 20 | 21 | ```sh 22 | git clone https://github.com/tag-and-generate/tagger-generator.git 23 | ``` 24 | The cloned folder contains: i) ``tag-and-generate-data-prep`` the codebase used for creating the parallel tag and generate dataset, and ii) ``tag-and-generate-train``, the training code. 25 | 26 | Each of these folders has a ``requirements.txt`` file that can be used to download the dependencies. 27 | 28 | Next, let's create a folder inside ``tagger-generator`` to save all the datasets/tags: 29 | 30 | ```sh 31 | cd tagger-generator 32 | mkdir data 33 | ``` 34 | 35 | 36 | ### Step 2: Getting the training data. 37 | 38 | The training data in a ready to use format is located [here](https://drive.google.com/file/d/1E9GHwmVM9DL9-KiaIaG5lm_oagLWe908/view?usp=sharing). 39 | 40 | Download the zip file to the ``data`` folder created above and extract ```politeness.tsv```. 41 | 42 | ```sh 43 | unzip politeness_processed.zip 44 | head politeness.tsv 45 | ``` 46 | **txt**|**style**|**split** 47 | -----|-----|----- 48 | forwarded by tana jones / hou / ect on 09/28/2000|P\_2|train 49 | the clickpaper approvals for 9/27/00 are attached below .|P\_7|train 50 | "hello everyone : please let me know if you have a subscription to "" telerate "" ?"|P\_7|train 51 | we are being billed for this service and i do not know who is using it .|P\_0|train 52 | 53 | As we can see, the data is in the tsv format and has the right header. 54 | 55 | 56 | You can also use ``gdown`` to directly download the file: 57 | 58 | ```sh 59 | gdown --id 1E9GHwmVM9DL9-KiaIaG5lm_oagLWe908 60 | ``` 61 | 62 | 63 | 64 | 65 | 66 | 67 | Now that we have the codebase and the dataset, let's start by creating the parallel data required for training the models. Let's do a listing of the folder so far to make sure we are on the same page: 68 | 69 | ```sh 70 | (dl) tutorial@sa:~/tagger-generator$ ls 71 | data LICENSE README.md tag-and-generate-data-prep tag-and-generate-train 72 | ``` 73 | So, we are in the repo (tagger-generator), and see the two code folders (``tag-and-generate-data-prep`` and ``tag-and-generate-train``), as well as the data folder (``data``). 74 | Further, the data folder has the ``politeness.tsv`` file that we just downloaded: 75 | ```sh 76 | (dl) tutorial@sa:~/tagger-generator$ ls data/ 77 | politeness_processed.zip politeness.tsv 78 | ``` 79 | 80 | ### Step 3: Preparing parallel data for training 81 | 82 | We prepare the parallel data using ``tag-and-generate-data-prep``: 83 | 84 | ```sh 85 | cd tag-and-generate-data-prep 86 | python src/run.py --data_pth ../data/politeness.tsv --outpath ../data/ --style_0_label P_9 --style_1_label P_0 --is_unimodal True 87 | ``` 88 | More details on these options are located in [tag-and-generate/tagger-generator/tag-and-generate-data-prep](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-data-prep). In summary, we specify the input file, the label for the style of interest (``P_9``) and a neutral/contrastive style (``P_0``). 
Importantly, we specify ``--is_unimodal True``. This option ensures that the parallel data is created as per the unimodal style setting (Figure 3 in [the paper](https://arxiv.org/pdf/2004.14257.pdf)). 89 | 90 | After data-prep finishes, we see several files in ``../data/``. 91 | The important files are described below: 92 | 93 | * ``P_9_tags.json``: the phrases identified as politeness markers (attribute tags): 94 | 95 | ```"thank you" 96 | "thank" 97 | "looking forward" 98 | "glad" 99 | "be interested" 100 | ``` 101 | 102 | * The data prep code creates two sets of training files: one for the ``tagger`` and another for the ``generator``. 103 | To understand these, let's take a sample sentence ```please get back to me if you have any additional concerns .``` and look at how it is represented in different files: 104 | 105 | - ``entagged_parallel.train.en`` (input to the tagger): 106 | - ``back to me have concerns .`` 107 | - ``entagged_parallel.train.tagged`` (output of the tagger): 108 | - ``[P_90] back to me [P_91] have [P_92] concerns .`` 109 | - ``engenerated_parallel.train.en`` (input to the generator): 110 | - ``[P_90] back to me [P_91] have [P_92] concerns .`` 111 | - ``engenerated_parallel.train.generated`` (output of the generator): 112 | - ``please get back to me if you have any additional concerns .`` 113 | 114 | Here, ``P_9`` is the style tag, and the number after the style tag captures the position of the tag in the sentence. 115 | 116 | With the data files in place, we are ready to run training. 117 | 118 | 119 | ### Step 4: Training the tagger and generator 120 | 121 | All the training- and inference-related scripts and code are present in ``tag-and-generate-train``, so let's ``cd`` to it. 122 | 123 | ```sh 124 | cd tag-and-generate-train 125 | ``` 126 | 127 | To prepare the files for training, we first process them using ``BPE``: 128 | 129 | ```sh 130 | bash scripts/prepare_bpe.sh tagged ../data/ 131 | bash scripts/prepare_bpe.sh generated ../data/ 132 | ``` 133 | 134 | We can now start training the tagger and generator: 135 | 136 | ```sh 137 | nohup bash scripts/train_tagger.sh tagged politeness ../data/ > tagger.log & 138 | nohup bash scripts/train_generator.sh generated politeness ../data/ > generator.log & 139 | ``` 140 | 141 | ```politeness``` is a user-defined handle that we will use during inference. 142 | 143 | After the training finishes, the best models (selected by validation perplexity) are stored in ``models``: 144 | 145 | ```sh 146 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ ls models/politeness/bpe/ 147 | en-generated-generator.pt en-tagged-tagger.pt 148 | ``` 149 | 150 | For our run, at the end of 5 epochs, the validation perplexity was 1.26 for the tagger and 1.76 for the generator. 151 | 152 | ### Step 5: Running inference 153 | 154 | Let's test the trained models on some sample sentences: 155 | 156 | ```sh 157 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ cat > input.txt 158 | send me the text files. 159 | look into this issue. 160 | 161 | bash scripts/inference.sh input.txt sample tagged generated politeness P_9 P_9 ../data/ 3 162 | ``` 163 | 164 | Here ``sample`` is a unique identifier for the inference job, and ``politeness`` is the identifier we used for the training job. ``P_9`` is the style tag (kept the same for unimodal jobs).
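For reference, the positional arguments of ``scripts/inference.sh`` in the command above line up with the parameters documented in the training README as follows (a commented copy of the same command):

```sh
# scripts/inference.sh takes, in order:
#   input_file jobname <tagger data> <generator data> handle style_0_label style_1_label base_folder device
#   input.txt         -> raw input file, one sentence per line
#   sample            -> jobname, a unique identifier for this inference run
#   tagged, generated -> the data handles used while training the tagger and generator
#   politeness        -> the training handle from Step 4
#   P_9 P_9           -> style_0_label and style_1_label (identical in the unimodal setting)
#   ../data/          -> base_folder holding the prepared data
#   3                 -> GPU id
bash scripts/inference.sh input.txt sample tagged generated politeness P_9 P_9 ../data/ 3
```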
(Please see the README at [tag-and-generate/tagger-generator/tag-and-generate-train](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-train) for more details). 165 | 166 | The final and intermediate outputs are located in experiments folder: 167 | 168 | ```sh 169 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ ls experiments/sample_* 170 | experiments/sample_generator_input experiments/sample_tagged 171 | experiments/sample_output experiments/sample_tagger_input 172 | ``` 173 | 174 | Let's look at the final output: 175 | 176 | ```sh 177 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ cat experiments/sample_output 178 | please send me the text files. 179 | we would like to look into this issue. 180 | ``` 181 | Not bad! 182 | 183 | We hope this walkthrough is helpful in understanding and using the codebase. Here are some additional helpful links: 184 | 185 | - [Trained Models](https://drive.google.com/drive/folders/1tXLC4WbXc_WLgvQu2mTa3jDe0efZ3dz1?usp=sharing). 186 | - [Outputs](https://github.com/tag-and-generate/outputs) 187 | - [Datasets](https://github.com/tag-and-generate/politeness-dataset) 188 | 189 | 190 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/README.md: -------------------------------------------------------------------------------- 1 | ## Data Preparation 2 | 3 | - This repository contains the code for creating the parallel data used to train the ``tagger`` and ``generator`` modules ([paper](https://arxiv.org/abs/2004.14257)) on your dataset. 4 | 5 | - The `data/catcher/` directory contains some sample text that can be used to test the codebase. 6 | 7 | ## Usage 8 | 9 | 10 | ```py 11 | python src/run.py --data_pth PTH\ 12 | --outpath OUTPATH\ 13 | --style_0_label label0\ 14 | --style_1_label label1\ 15 | --is_unimodal True | False 16 | ``` 17 | 18 | Where: 19 | - `PTH` should point to a tab-separated file (tsv) that contains the corpus. We assume that the corpus is made up of a set of sentences. The `tsv` is expected to have three fields: 1) txt: the sentence, 2) split: train/test/val, and 3) style: a label that identifies the style of the sentence (one of `label0` or `label1`). Sample: 20 | 21 | | txt | style | split | 22 | |- |- |- | 23 | | How've you been, Mrs. Spencer? | catcher | test | 24 | | C'mon, c'mon | catcher | train | 25 | | And the place death, considering who thou art, | romeo-juliet | train | 26 | | He's got a lot of dough, now. | catcher | test | 27 | 28 | 29 | 30 | - `OUTPATH` is the location of the output 31 | 32 | - `label0` and `label1` are tags that identify the individual styles. This explicit assignment is important for unimodal cases, such as politeness and captions (please see the paper for more details) 33 | 34 | - `is_unimodal` should be set to `True` for datasets that contain only one style of interest. `style_0_label` should be used to specify the style of interest, and `style_1_label` should be the tag for a neutral/style-free corpus. In the case of politeness transfer, you can use `style_0_label` as `P_9` and `style_1_label` as `P_0` or `P_1`. 35 | 36 | Please see `run.py` for details on the other options. 37 | 38 | ## Outputs 39 | 40 | While the program creates a number of files in the `OUTPATH` dir, only a subset of them are required for training the `tagger` and `generator`. All of the files are named according to the following format: 41 | 42 | `en{target}\_parallel.{split}.[en | {target}]` 43 | 44 | Where `split` is either `train`, `test`, or `val`, and `target` is either `tagged` (for the tagger) or `generated` (for the generator). We always use `en` to refer to the source files. 45 | 46 | Further, the attribute tags can also be found under the name `{style_label}_tags.json` 47 | 48 | ## Walkthrough 49 | 50 | We walk through the usage of the data prep codebase by creating parallel data for our tag and generate system using the sample data present in `data/catcher`. The (toy) data consists of a few lines from The Catcher in the Rye and Romeo & Juliet.
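If you want to build a similar `tsv` for your own corpora, a minimal sketch along the following lines should work (the input files `style0.txt` and `style1.txt` are hypothetical, one sentence per line; `run.py` only needs the `txt`, `style`, and `split` columns):

```py
import numpy as np
import pandas as pd

rows = []
# hypothetical raw corpora, one sentence per line, one file per style
for path, style in [("style0.txt", "romeo-juliet"), ("style1.txt", "catcher")]:
    with open(path) as f:
        rows.extend({"txt": line.strip(), "style": style} for line in f if line.strip())

df = pd.DataFrame(rows)
# assign a random train/val/test split, as expected by src/run.py
df["split"] = np.random.choice(["train", "val", "test"], size=len(df), p=[0.8, 0.1, 0.1])
df.to_csv("data.tsv", sep="\t", index=False)
```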
51 | 52 | Some sample rows from the dataset are shown below: 53 | 54 | | txt | style | split | 55 | |- |- |- | 56 | | How've you been, Mrs. Spencer? | catcher | test | 57 | | C'mon, c'mon | catcher | train | 58 | | And the place death, considering who thou art, | romeo-juliet | train | 59 | | He's got a lot of dough, now. | catcher | test | 60 | | My life were better ended by their hate, | romeo-juliet | train | 61 | | He lent me counsel and I lent him eyes. | romeo-juliet | train | 62 | | It wasn't all my fault. | catcher | test | 63 | | If you don't, you feel even worse. | catcher | test | 64 | 65 | 66 | Using the defaults specified in src/run.py, we can generate the parallel data for training tag and generator using the following command: 67 | 68 | ```py 69 | python3 src/run.py --data_pth data/catcher/data.tsv\ 70 | --outpath data/tmp/\ 71 | --style_0_label romeo-juliet\ 72 | --style_1_label catcher\ 73 | --is_unimodal False 74 | ``` 75 | 76 | After running this command, the specified output directory `data/tmp` will contain a number of files. The important ones are listed below. 77 | 78 | - Style attribute tags: 79 | - `romeo-juliet_tags.json`: The style tags for style 0 (romeo-juliet) 80 | - `catcher_tags.json`: The style tags for style 1 (catcher) 81 | 82 | - Tagger training files: 83 | - `entagged_parallel.[train|test|val].en`: Source files for the tagger 84 | - `entagged_parallel.[train|test|val].tagged`: Target files for the tagger 85 | 86 | - Generator training files: 87 | - `engenerated_parallel.[train|test|val].en`: Source files for the generator 88 | - `engenerated_parallel.[train|test|val].generated`: Target files for the generator 89 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.0 2 | adjustText==0.7.3 3 | altair==3.2.0 4 | appnope==0.1.0 5 | argh==0.26.2 6 | astroid==2.2.5 7 | astropy==3.2.1 8 | atomicwrites==1.3.0 9 | attrs==19.1.0 10 | autopep8==1.4.4 11 | backcall==0.1.0 12 | backports.functools-lru-cache==1.5 13 | base58==1.0.3 14 | beautifulsoup4==4.8.0 15 | bleach==3.1.4 16 | blis==0.2.4 17 | boto3==1.9.243 18 | botocore==1.12.243 19 | certifi==2019.6.16 20 | chardet==3.0.4 21 | Click==7.0 22 | colorama==0.4.1 23 | compare-mt==0.2.6 24 | confuse==1.0.0 25 | cycler==0.10.0 26 | cymem==2.0.2 27 | decorator==4.4.0 28 | defusedxml==0.6.0 29 | docopt==0.6.2 30 | docutils==0.15.2 31 | editdistance==0.5.3 32 | en-core-web-sm==2.1.0 33 | entrypoints==0.3 34 | enum-compat==0.0.2 35 | epitran==1.1 36 | flake8==3.7.9 37 | future==0.17.1 38 | htmlmin==0.1.12 39 | idna==2.8 40 | importlib-metadata==0.23 41 | indic-transliteration==1.8.8 42 | ipykernel==5.1.2 43 | ipython==7.8.0 44 | ipython-genutils==0.2.0 45 | ipywidgets==7.5.0 46 | isort==4.3.21 47 | jedi==0.15.1 48 | Jinja2==2.10.1 49 | jmespath==0.9.4 50 | joblib==0.14.0 51 | jsonschema==2.6.0 52 | jupyter-client==5.3.3 53 | jupyter-core==4.5.0 54 | kiwisolver==1.1.0 55 | lazy-object-proxy==1.4.2 56 | llvmlite==0.29.0 57 | marisa-trie==0.7.5 58 | MarkupSafe==1.1.1 59 | matplotlib==3.1.1 60 | mccabe==0.6.1 61 | missingno==0.4.2 62 | mistune==0.8.4 63 | more-itertools==7.2.0 64 | munkres==1.1.2 65 | murmurhash==1.0.2 66 | nbconvert==5.6.0 67 | nbformat==4.4.0 68 | networkx==2.3 69 | neuralcoref==4.0 70 | nltk==3.4.5 71 | notebook==6.0.1 72 | numba==0.45.1 73 | numpy==1.17.0 74 | packaging==19.2 75 | pandas==0.25.1 76 | pandas-profiling==2.3.0 77 | 
pandocfilters==1.4.2 78 | panphon==0.15 79 | parso==0.5.1 80 | pathtools==0.1.2 81 | pep8==1.7.1 82 | pexpect==4.7.0 83 | phik==0.9.8 84 | pickleshare==0.7.5 85 | pigar==0.9.2 86 | Pillow==6.2.0 87 | plac==0.9.6 88 | plotly==4.1.1 89 | pluggy==0.13.0 90 | portalocker==1.5.1 91 | praw==6.3.1 92 | prawcore==1.0.1 93 | preshed==2.0.1 94 | prometheus-client==0.7.1 95 | prompt-toolkit==2.0.9 96 | psaw==0.0.7 97 | ptyprocess==0.6.0 98 | py==1.8.0 99 | pycodestyle==2.5.0 100 | pyflakes==2.1.1 101 | Pygments==2.4.2 102 | pylint==2.3.1 103 | pyparsing==2.4.2 104 | pyreqs==0.1.1 105 | pyrsistent==0.15.4 106 | pytest==5.2.0 107 | pytest-pylint==0.14.1 108 | python-dateutil==2.8.0 109 | pytz==2019.2 110 | PyYAML==5.1.2 111 | pyzmq==18.1.0 112 | regex==2019.8.19 113 | requests==2.22.0 114 | retrying==1.3.3 115 | rope==0.14.0 116 | s3transfer==0.2.1 117 | sacrebleu==1.4.1 118 | scikit-learn==0.21.3 119 | scipy==1.3.1 120 | seaborn==0.9.0 121 | selenium==3.141.0 122 | Send2Trash==1.5.0 123 | sh==1.12.14 124 | six==1.12.0 125 | sklearn==0.0 126 | soupsieve==1.9.3 127 | spacy==2.1.0 128 | splinter==0.11.0 129 | srsly==0.1.0 130 | terminado==0.8.2 131 | testpath==0.4.2 132 | thinc==7.0.8 133 | toolz==0.10.0 134 | torch==1.3.0 135 | torchvision==0.4.1 136 | tornado==5.1.1 137 | tqdm==4.35.0 138 | traitlets==4.3.2 139 | typed-ast==1.4.0 140 | typing==3.7.4.1 141 | tzlocal==2.0.0 142 | unicodecsv==0.14.1 143 | update-checker==0.16 144 | urllib3==1.26.5 145 | validators==0.14.0 146 | wasabi==0.2.2 147 | wcwidth==0.1.7 148 | webencodings==0.5.1 149 | websocket-client==0.56.0 150 | widgetsnbextension==3.5.1 151 | wiki-dump-parser==2.0.0 152 | wikipedia==1.4.0 153 | Wikipedia-API==0.5.2 154 | wrapt==1.11.2 155 | zipp==0.6.0 156 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/scripts/prep_generator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | BASEDIR="$1" 4 | OUTDIR="$2" 5 | tgt_lang="$3" 6 | tgt_lang_tag="$4" 7 | UNIMODAL="$5" 8 | POS="$6" 9 | NEG="$7" 10 | 11 | for split in train test val; do 12 | if [ "$UNIMODAL" -eq 1 ]; then 13 | cp ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."$POS" ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.en 14 | else 15 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."$POS" ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."$NEG" > ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.en 16 | fi 17 | done 18 | 19 | for split in train test val; do 20 | if [ "$UNIMODAL" -eq 1 ]; then 21 | cp ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."$POS" ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.${tgt_lang_tag} 22 | sed -i "s/\[${NEG}[0-9]*\]//g;s/ / /g;s/^ //g;s/ $//g" ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.${tgt_lang_tag} 23 | else 24 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."$POS" ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."$NEG" > ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.${tgt_lang_tag} 25 | fi 26 | done 27 | 28 | mv ${OUTDIR}/en${tgt_lang_tag}_parallel.val.en ${OUTDIR}/en${tgt_lang_tag}_parallel.dev.en 29 | mv ${OUTDIR}/en${tgt_lang_tag}_parallel.val.${tgt_lang_tag} ${OUTDIR}/en${tgt_lang_tag}_parallel.dev.${tgt_lang_tag} 30 | 31 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/scripts/prep_tagger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASEDIR="$1" 3 | OUTDIR="$2" 4 | tgt_lang="$3" 5 | 
UNIMODAL="$4" 6 | STYLE_0_LABEL="$5" 7 | STYLE_1_LABEL="$6" 8 | # In the unimodal case, POS is supposed to be the stylistic corpus 9 | 10 | # Step 1: Create source data for tagger 11 | for split in train test val; do 12 | if [ "$UNIMODAL" -eq 1 ]; then 13 | # UNIMODAL: The tags are deleted to create the source data for unimodal case 14 | cp "${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}.${STYLE_0_LABEL}" "${OUTDIR}/en${tgt_lang}_parallel.${split}.en" 15 | # UNIMODAL: The tags are deleted to create the source data for unimodal case 16 | sed -i "s/\[${STYLE_0_LABEL}[0-9]*\]//g;s/ / /g;s/^ //g;s/ $//g" ${OUTDIR}/en${tgt_lang}_parallel.${split}.en 17 | else 18 | # BIMODAL: Source data for the bimodal case is just the concatenation of the two styles 19 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."${STYLE_0_LABEL}" ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."${STYLE_1_LABEL}" > ${OUTDIR}/en${tgt_lang}_parallel.${split}.en 20 | fi 21 | # the following line performs simple strip operations on the lines 22 | sed -i 's/ / /g;s/^ //g;s/ $//g' ${OUTDIR}/en${tgt_lang}_parallel.${split}.en 23 | done 24 | 25 | # Step 2: Create target data for tagger 26 | for split in train test val; do 27 | if [ "$UNIMODAL" -eq 1 ]; then 28 | # UNIMODAL: Target for unimodal tagger is the POS tagged data 29 | cp "${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}.${STYLE_0_LABEL}" "${OUTDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}" 30 | else 31 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."${STYLE_0_LABEL}" ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."${STYLE_1_LABEL}" > ${OUTDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang} 32 | fi 33 | done 34 | 35 | mv ${OUTDIR}/en${tgt_lang}_parallel.val.en ${OUTDIR}/en${tgt_lang}_parallel.dev.en 36 | mv ${OUTDIR}/en${tgt_lang}_parallel.val.${tgt_lang} ${OUTDIR}/en${tgt_lang}_parallel.dev.${tgt_lang} 37 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/src/run.py: -------------------------------------------------------------------------------- 1 | """Generates tags 2 | Usage: 3 | run.py [options] 4 | 5 | Options: 6 | --data_pth= Path to the data directory 7 | --outpath= Output path 8 | --style_0_label= Label for style 0 9 | --style_1_label= Label for style 1 10 | --ngram_range_min= Min n_gram_range [default: 1] 11 | --ngram_range_max= Max n_gram_range [default: 2] 12 | --style_label_col= Name of the column that has style label column [default: style] 13 | --thresh= tf-idf ratio threshold [default: 0.90] 14 | --is_unimodal= Whether the dataset is unimodal (like politeness) or has two styles (like yelp) 15 | --gen_tags= Whether the style labels need to be generated again [default: True] 16 | """ 17 | from docopt import docopt 18 | import json 19 | import pandas as pd 20 | import pandas as pd 21 | import subprocess 22 | import logging 23 | 24 | from src.style_tags import TFIDFStatsGenerator, RelativeTagsGenerator, TrainDataGen 25 | 26 | 27 | def tag_style_markers(data_pth: str, outpath: str, style_0_label: str, style_1_label: str, tgt_lang="tagged", thresh=0.90, ngram_range=(1, 2), 28 | ignore_from_tags=None, style_label_col="label", drop_duplicates=False, 29 | gen_tags=True): 30 | """Runs tag generator. 
After this step, the following files are generated in the ``outpath`` directory: 31 | * entgt_lang_parallel.{split}.en.style_N_label: Sentences in style N 32 | * entgt_lang_parallel.{split}.taged.style_N_label: Sentences in style N with attribute phrases tagged 33 | (Here N is either 0 or 1, and split is one of {train, test, dev}) 34 | * style_N_tags.json: Attribute tags for style N (0 or 1) 35 | 36 | A combination of the above files is sufficient to generate training data 37 | for seq2seq models used by the tag-and-generate approach. 38 | Args: 39 | data_pth ([str]): Path to a file with the data. Each file should have the following columns: 40 | txt: The actual text 41 | split: train/test/dev 42 | style_label_col: indicates the style 43 | outpath ([str]): Path to the folder where the output files are written. 44 | style_0_label ([str]): Label for the 0th style. 45 | style_1_label ([str]): abel for the 1st style. 46 | tgt_lang ([str]): [description] 47 | thresh (float, optional): [description]. Defaults to 0.90. 48 | ngram_range (tuple, optional): [description]. Defaults to (1, 2). 49 | ignore_from_tags ([type], optional): [description]. Defaults to None. 50 | style_label_col (str, optional): [description]. Defaults to "label". 51 | """ 52 | data = pd.read_csv(data_pth, sep="\t") 53 | if drop_duplicates: 54 | data = data.drop_duplicates(subset="txt") 55 | 56 | # Step 1 57 | logging.info("Reading the data") 58 | data_style_0 = data[data[style_label_col] == style_0_label] 59 | data_style_1 = data[data[style_label_col] == style_1_label] 60 | 61 | if gen_tags: 62 | # Step 2 63 | logging.info("Getting TF-IDF stats for both the corpora") 64 | logging.info(f"#Records {style_0_label} = {len(data_style_0)}") 65 | logging.info(f"#Records {style_1_label} = {len(data_style_1)}") 66 | 67 | tags_style_0, tags_style_1 = generate_tags(df_txt_class_1=data_style_0[data_style_0["split"] != "test"]["txt"], 68 | df_txt_class_2=data_style_1[data_style_1["split"] 69 | != "test"]["txt"], 70 | tag_class_1=style_0_label, 71 | tag_class_2=style_1_label, 72 | ignore_from_tags=ignore_from_tags, 73 | thresh=thresh, 74 | ngram_range=ngram_range) 75 | 76 | with open(f"{outpath}/{style_0_label}_tags.json", "w") as f: 77 | json.dump(tags_style_0, f) 78 | 79 | with open(f"{outpath}/{style_1_label}_tags.json", "w") as f: 80 | json.dump(tags_style_1, f) 81 | 82 | else: 83 | with open(f"{outpath}/{style_0_label}_tags.json", "r") as f: 84 | tags_style_0 = json.load(f) 85 | with open(f"{outpath}/{style_1_label}_tags.json", "r") as f: 86 | tags_style_1 = json.load(f) 87 | 88 | # Step 3 89 | logging.info("Generating the tagged data") 90 | TrainDataGen(data=data_style_0, outpath=outpath, tags=tags_style_0, 91 | tag_token=style_0_label, tgt_lang=tgt_lang).generate() 92 | TrainDataGen(data=data_style_1, outpath=outpath, tags=tags_style_1, 93 | tag_token=style_1_label, tgt_lang=tgt_lang).generate() 94 | 95 | 96 | def generate_tags(df_txt_class_1, 97 | df_txt_class_2, 98 | tag_class_1, 99 | tag_class_2, 100 | thresh, 101 | ngram_range, 102 | ignore_from_tags=None, 103 | ): 104 | stats_class_1 = TFIDFStatsGenerator( 105 | df_txt_class_1, tag_class_1, ngram_range=ngram_range) 106 | stats_class_2 = TFIDFStatsGenerator( 107 | df_txt_class_2, tag_class_2, ngram_range=ngram_range) 108 | 109 | class_1_tags = RelativeTagsGenerator(main_class_stats=stats_class_1, 110 | relative_class_stats=stats_class_2, 111 | ignore_from_tags=ignore_from_tags, 112 | thresh=thresh).tags 113 | 114 | class_2_tags = 
RelativeTagsGenerator(main_class_stats=stats_class_2, 115 | relative_class_stats=stats_class_1, 116 | thresh=thresh).tags 117 | return class_1_tags, class_2_tags 118 | 119 | 120 | def prepare_parallel_data_tagger(outdir, style_0_label, style_1_label, is_unimodal): 121 | subprocess.check_call(f"scripts/prep_tagger.sh {outdir} {outdir} tagged {int(is_unimodal)} {style_0_label} {style_1_label}", 122 | shell=True) 123 | 124 | 125 | def prepare_parallel_data_generator(outdir, style_0_label, style_1_label, is_unimodal): 126 | # "${MASKED_OP_DIR}" "${MASKED_OP_DIR}" "$prefix"masked "$prefix"unmasked "$isunimodal" "$posmask" "$negmask" 127 | subprocess.check_call(f"scripts/prep_generator.sh {outdir} {outdir} tagged generated {int(is_unimodal)} {style_0_label} {style_1_label}", 128 | shell=True) 129 | 130 | 131 | if __name__ == '__main__': 132 | logging.basicConfig(level=logging.INFO) 133 | 134 | args = docopt(__doc__) 135 | is_unimodal = int(args["--is_unimodal"] == "True") 136 | 137 | # step 1: generate attribute markers, tagged dataset 138 | tag_style_markers(data_pth=args["--data_pth"], 139 | outpath=args["--outpath"], 140 | style_0_label=args["--style_0_label"], 141 | style_1_label=args["--style_1_label"], 142 | thresh=float(args["--thresh"]), 143 | ngram_range=(int(args["--ngram_range_min"]), 144 | int(args["--ngram_range_max"])), 145 | style_label_col=args["--style_label_col"], 146 | gen_tags=(args["--gen_tags"] == "True")) 147 | 148 | 149 | 150 | # step 2: generate parallel dataset for the tagger 151 | prepare_parallel_data_tagger( 152 | args["--outpath"], args["--style_0_label"], args["--style_1_label"], is_unimodal) 153 | 154 | # step 3: generate parallel dataset for the generator 155 | prepare_parallel_data_generator( 156 | args["--outpath"], args["--style_0_label"], args["--style_1_label"], is_unimodal) 157 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/src/style_tags.py: -------------------------------------------------------------------------------- 1 | """Generates the tags for training the tager 2 | """ 3 | import pandas as pd 4 | from tqdm.autonotebook import tqdm 5 | import numpy as np 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | import numpy 9 | import scipy.sparse 10 | from collections import namedtuple 11 | import pickle 12 | import tqdm 13 | import numpy as np 14 | import csv 15 | from tqdm import tqdm 16 | import logging 17 | tqdm.pandas() 18 | 19 | TFIDFStats = namedtuple("TFIDFStats", ["data_id", "id_to_word", "word_to_id", "tfidf_avg", "word_to_idf", "counts"]) 20 | 21 | 22 | class TrainDataGen: 23 | """ 24 | Generates the training data 25 | """ 26 | def __init__(self, data, outpath, tags, tag_token, tgt_lang): 27 | super().__init__() 28 | self.data = data 29 | 30 | self.outpath = outpath 31 | self.tag_token = tag_token 32 | self.tags = tags 33 | self.tgt_lang = tgt_lang 34 | 35 | def generate(self): 36 | self.tag_and_dump(split="train") 37 | self.tag_and_dump(split="test") 38 | self.tag_and_dump(split="val") 39 | 40 | 41 | def tag_and_dump(self, split): 42 | """Iterate over the given split, tags the sentences and write out the data 43 | 44 | Arguments: 45 | split {[str]} -- [description] 46 | """ 47 | orig_sents, taged_sents = [], [] 48 | data_in = self.data[self.data["split"] == split] 49 | for _, r in data_in.iterrows(): 50 | orig = r["txt"].strip().replace("\n", "") 51 | orig_sents.append(orig) 52 | 
taged_sents.append(TrainDataGen.tag_sentence(orig, self.tags, self.tag_token).strip().replace("\n", "")) 53 | #polite_out.write(f"{orig}\n") 54 | #polite_taged_out.write(f"{taged_sent}\n") 55 | with open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.en.{self.tag_token}", "w") as orig_out,\ 56 | open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.{self.tgt_lang}.{self.tag_token}", "w") as taged_out: 57 | for orig, taged in tqdm(zip(orig_sents, taged_sents), total=len(taged_sents)): 58 | if self.tag_token in taged: 59 | ### ONLY WRITE OUT THE tagED DATA 60 | orig_out.write(f"{orig.strip()}\n") 61 | taged_out.write(f"{taged.strip()}\n") 62 | 63 | def tag_and_dump_batched(self, split): 64 | """Iterate over the given split, tags the sentences and write out the data 65 | 66 | Arguments: 67 | split {[str]} -- [description] 68 | """ 69 | orig_sents, taged_sents = [], [] 70 | self.data["txt_taged"] = self.data["txt"].progress_apply(lambda x: \ 71 | TrainDataGen.tag_sentence(orig, self.tags,\ 72 | self.tag_token).strip().replace("\n", "")) 73 | 74 | 75 | with open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.en.{self.tag_token}]", "w") as orig_out,\ 76 | open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.{self.tgt_lang}.{self.tag_token}]", "w") as taged_out: 77 | for orig, taged in tqdm(zip(orig_sents, taged_sents), total=len(taged_sents)): 78 | if self.tag_token in taged: 79 | ### ONLY WRITE OUT THE tagED DATA 80 | orig_out.write(f"{orig.strip()}\n") 81 | taged_out.write(f"{taged.strip()}\n") 82 | 83 | @staticmethod 84 | def tag_sentence(sent, tag_dict, tag_token, 85 | pos_weight: int = 3, 86 | max_pos_indicator: int = 20, 87 | concat = True): 88 | """Given a sentence and a dictionary from 89 | tag_value to tag_probability, replaces all the words mw that are in the tag_dict 90 | with a probability tag_dict[mw] 91 | 92 | Arguments: 93 | sent {[str]} -- [the given sentence] 94 | tag_dict {[dict]} -- [the tag dictionary] 95 | tag_token {[str]} -- [the taging token] 96 | dont_concat -- [do not concat] 97 | 98 | Returns: 99 | [str] -- [the taged sentence] 100 | """ 101 | i = 0 102 | sent = sent.split() 103 | taged_sent = [] 104 | prev_tag = False 105 | while i < len(sent): 106 | loc = min(i // pos_weight, max_pos_indicator) 107 | key_bi_gram = " ".join(sent[i: i + 2]) 108 | key_tri_gram = " ".join(sent[i: i + 3]) 109 | key_quad_gram = " ".join(sent[i: i + 4]) 110 | 111 | if key_quad_gram in tag_dict and np.random.rand() < tag_dict[key_quad_gram]: 112 | if not concat or not prev_tag: 113 | taged_sent.append(f"[{tag_token}{loc}]") 114 | prev_tag = True 115 | i += 4 116 | 117 | elif key_tri_gram in tag_dict and np.random.rand() < tag_dict[key_tri_gram]: 118 | if not concat or not prev_tag: 119 | taged_sent.append(f"[{tag_token}{loc}]") 120 | prev_tag = True 121 | i += 3 122 | elif key_bi_gram in tag_dict and np.random.rand() < tag_dict[key_bi_gram]: 123 | if not concat or not prev_tag: 124 | taged_sent.append(f"[{tag_token}{loc}]") 125 | prev_tag = True 126 | i += 2 127 | elif sent[i] in tag_dict and np.random.rand()< tag_dict[sent[i]]: 128 | if not concat or not prev_tag: 129 | taged_sent.append(f"[{tag_token}{loc}]") 130 | prev_tag = True 131 | i += 1 132 | else: 133 | taged_sent.append(sent[i]) 134 | prev_tag = False 135 | i += 1 136 | return " ".join(taged_sent) 137 | 138 | class TFIDFStatsGenerator: 139 | 140 | def __init__(self, data, data_id, ngram_range): 141 | super().__init__() 142 | self.ngram_range = ngram_range 143 | self.data_id = data_id 144 | self.data = data 145 | 
self.generate() 146 | 147 | def get_word_counts(self): 148 | """Generates the counts for various n-grams for the given corpus 149 | 150 | Returns: 151 | a dictionary from phrase to word count 152 | """ 153 | cv = CountVectorizer(ngram_range=self.ngram_range) 154 | cv_fit = cv.fit_transform(self.data) 155 | feature_names = cv.get_feature_names() 156 | X = np.asarray(cv_fit.sum(axis=0)) # sum counts across sentences 157 | word_to_id = {feature_names[i]: i for i in range(len(cv.get_feature_names()))} 158 | word_count = {} 159 | for w in word_to_id: 160 | word_count[w] = X[0, word_to_id[w]] 161 | return word_count 162 | 163 | def generate(self): 164 | """Generates various TFIDF related stats 165 | for the given data and wraps them in a namedtuple 166 | 167 | Returns: 168 | [type] -- [description] 169 | """ 170 | logging.info("Running TfidfVectorizer") 171 | vectorizer = TfidfVectorizer(ngram_range=self.ngram_range) 172 | X = vectorizer.fit_transform(self.data) 173 | feature_names = vectorizer.get_feature_names() 174 | id_to_word = {i: feature_names[i] for i in range(len(vectorizer.get_feature_names()))} 175 | word_to_id = {v: k for k, v in id_to_word.items()} 176 | X = np.asarray(X.mean(axis=0)).squeeze(0) # / num_docs 177 | 178 | idf = vectorizer.idf_ 179 | counts = self.get_word_counts() 180 | word_to_idf = dict(zip(feature_names, idf)) 181 | 182 | self.id_to_word = id_to_word 183 | self.word_to_id = word_to_id 184 | self.tfidf_avg = X 185 | self.word_to_idf = word_to_idf 186 | self.counts = counts 187 | 188 | 189 | class RelativeTagsGenerator: 190 | 191 | def __init__(self, main_class_stats, relative_class_stats, 192 | min_freq: int = 2, thresh: float = 0.90, 193 | ignore_from_tags = None): 194 | """Generates tags for the main class relative to 195 | the relative class. This is done on the basis of relative TF-IDF ratios of the words. 196 | 197 | Arguments: 198 | main_class_stats {[type]} -- [description] 199 | ref_class_stats {[type]} -- [description] 200 | 201 | Keyword Arguments: 202 | min_freq {int} -- [Minimum freq in the main class for the phrase to be considered] (default: {1}) 203 | thresh {float} -- [The relative tf-idf scores are converted to percentiles. These percentiles are then 204 | used to select the tag phrases. In this case, the cutoff for such phrases is 0.90] (default: {0.90}) 205 | ignore_from_tags {[set]} -- [Set of words like the NER words, which might have to be ignored] (default: {None}) 206 | """ 207 | super().__init__() 208 | self.main_class_stats = main_class_stats 209 | self.relative_class_stats = relative_class_stats 210 | self.min_freq = min_freq 211 | self.c1_tag = main_class_stats.data_id 212 | self.c2_tag = relative_class_stats.data_id 213 | self.thresh = thresh 214 | self.ignore_from_tags = ignore_from_tags 215 | 216 | self.generate_tfidf_report() 217 | self.generate_relative_tags() 218 | 219 | 220 | def generate_tfidf_report(self): 221 | """Given TFIDF statistics on two datasets, returns a common tf-idf report. 
222 | The report measures various statistics on the words that appear in class_2 223 | 224 | Arguments: 225 | class1_tfidf_report {[TFIDFStats]} -- [TFIDFStats for class1] 226 | class2_tfidf_report {[TFIDFStats]} -- [TFIDFStats for class2] 227 | """ 228 | report = [] 229 | for word in self.main_class_stats.word_to_id.keys(): 230 | if self.main_class_stats.counts[word] >= self.min_freq and word in self.relative_class_stats.word_to_id: 231 | res = {} 232 | res["word"] = word 233 | res["freq"] = self.main_class_stats.counts[word] 234 | res[f"{self.c1_tag}_mean_tfidf"] = self.main_class_stats.tfidf_avg[self.main_class_stats.word_to_id[word]] 235 | res[f"{self.c2_tag}_mean_tfidf"] = self.relative_class_stats.tfidf_avg[self.relative_class_stats.word_to_id[word]] 236 | res[f"{self.c1_tag}_idf"] = self.main_class_stats.word_to_idf[word] 237 | res[f"{self.c2_tag}_idf"] = self.relative_class_stats.word_to_idf[word] 238 | report.append(res) 239 | self.report = pd.DataFrame(report) 240 | 241 | def generate_relative_tags(self): 242 | """Returns a dictionary of phrases that are important in class1 relative to 243 | class2 244 | """ 245 | c1_over_c2 = f"{self.c1_tag}_over_{self.c2_tag}" 246 | c2_over_c1 = f"{self.c2_tag}_over_{self.c1_tag}" 247 | # tfidf_report["np_over_p"] = (tfidf_report["np_mean_tfidf"] / len(data_p_0)) / (tfidf_report["p_mean_tfidf"] / len(data_p_9)) 248 | self.report[c1_over_c2] = self.report[f"{self.c1_tag}_mean_tfidf"] / self.report[f"{self.c2_tag}_mean_tfidf"] #ratio of tf-idf in the two corpora 249 | 250 | self.report[c2_over_c1] = 1 / self.report[c1_over_c2] 251 | 252 | self.report[f"{self.c1_tag}_tag"] = (self.report[c1_over_c2] / self.report[c1_over_c2].sum()) ** 0.75 253 | # ^ add support for the small values 254 | 255 | self.report[f"{self.c1_tag}_tag"] = self.report[f"{self.c1_tag}_tag"] / self.report[f"{self.c1_tag}_tag"].sum() 256 | # ^ make a probability 257 | 258 | self.report.sort_values(by=f"{self.c1_tag}_tag", ascending=False, inplace=True) 259 | self.report['rank'] = self.report[f"{self.c1_tag}_tag"].rank(pct=True) 260 | # ^ assign percentile 261 | 262 | 263 | important_phrases = self.report[self.report["rank"] >= self.thresh] 264 | # ^ only take phrases that clear the threshold (default: 0.9) 265 | 266 | important_phrases["score"] = (important_phrases["rank"] - self.thresh) / (1 - self.thresh) 267 | # ^ make a distribution again 268 | 269 | tags= {} 270 | for i, r in important_phrases.iterrows(): 271 | tags[r["word"]] = r["score"] 272 | 273 | self.tags = tags 274 | 275 | if self.ignore_from_tags is not None: 276 | logging.info("Ignoring tags") 277 | self.tags = self.filter_tags_with_ignored_entities() 278 | 279 | def filter_tags_with_ignored_entities(self): 280 | res = {} 281 | for k, v in self.tags.items(): 282 | if not any(k_part in self.ignore_from_tags for k_part in k.split()): 283 | res[k] = v 284 | return res 285 | -------------------------------------------------------------------------------- /tag-and-generate-train/README.md: -------------------------------------------------------------------------------- 1 | # Training Tagger and Generator 2 | 3 | - This repository contains the code to train the ``tagger`` and ``generator`` [modules](https://arxiv.org/abs/2004.14257). 4 | 5 | - Apart from scripts to train the modules, it also has scripts needed to run inference on the test set and to run evaluation for metrics like `BLEU`, `ROUGE`, and `METEOR`. 
6 | 7 | --- 8 | 9 | ## Background 10 | 11 | - Both `tagger` and `generator` are seq2seq models that require parallel data generated by the [data prep module](https://github.com/tag-and-generate/code-pre-release/tree/master/tag-and-generate-data-prep). 12 | 13 | - The parallel datasets are: 14 | - Tagger: 15 | `entagged_parallel.{split}.en` → `entagged_parallel.{split}.tagged` 16 | - Generator: 17 | `engenerated_parallel.{split}.en` → `engenerated_parallel.{split}.generated` 18 | (where `{split}` is either train, test, or dev.) 19 | 20 | --- 21 | 22 | ## Prepare BPE 23 | 24 | ```sh 25 | bash scripts/prepare_bpe.sh [tagged|generated] {base_folder} 26 | ``` 27 | 28 | Where: 29 | 30 | - `base_folder`: The folder in which the data files are stored (the argument used when creating the training data) 31 | 32 | --- 33 | 34 | ## Train Tagger and Generator 35 | 36 | ```sh 37 | bash scripts/train_tagger.sh tagged {handle} {base_folder} 38 | ``` 39 | 40 | Where: 41 | 42 | - `handle:` An identifier used to bucketize models trained on different datasets. Models for each `handle` are stored in separate folders, with names indexed by `{handle}`, within the `models` directory. 43 | - `base_folder:` The folder in which the data files are stored (the argument used when creating the training data). 44 | 45 | ### Train Generator 46 | 47 | ```sh 48 | bash scripts/train_generator.sh generated {handle} {base_folder} 49 | ``` 50 | 51 | Where: 52 | 53 | - `handle:` An identifier used to bucketize models trained on different datasets. Models for each `handle` are stored in separate folders, with names indexed by `{handle}`, within the `models` directory. 54 | - `base_folder:` The folder in which the data files are stored (the argument used when creating the training data). 55 | 56 | --- 57 | 58 | ## Inference 59 | 60 | ```sh 61 | bash scripts/inference.sh {input_file} {jobname}\ 62 | tagged generated\ 63 | {handle}\ 64 | {style_0_label} {style_1_label}\ 65 | {base_folder} {device} 66 | ``` 67 | 68 | Where: 69 | 70 | - `input_file:` The input test file to be transferred. This is a raw text file with one sentence per line. 71 | - `jobname:` A unique identifier for the inference job. 72 | - `handle:` The dataset argument passed when training the `tagger` or `generator` -- used to identify the model paths for the `tagger` and `generator`. 73 | - `style_0_label:` A label for style 0 74 | - `style_1_label:` A label for style 1 75 | - `base_folder:` The folder in which the data files are stored (the argument used when creating the training data) 76 | - `device:` GPU id 77 | 78 | --- 79 | 80 | ## Evaluation 81 | 82 | ```sh 83 | bash run_context_eval.sh {hypothesis_filepath} {reference_filepath} 84 | ``` 85 | 86 | Where: 87 | 88 | - `hypothesis_filepath:` The full path to the transferred output from the trained model (the hypothesis). 89 | - `reference_filepath:` The full path to the ideal output (for BLEU-r) or the original input file (for BLEU-s). 90 | 91 | ## Trained Models 92 | 93 | The trained models can be found [here](https://drive.google.com/drive/folders/1tXLC4WbXc_WLgvQu2mTa3jDe0efZ3dz1?usp=sharing).
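As a concrete instance of the evaluation command above, reusing the files from the top-level walkthrough (`polite_references.txt` is a hypothetical file of ideal rewrites, shown only to illustrate the two reference choices):

```sh
# BLEU-s: score the transferred output against the original source sentences
bash run_context_eval.sh experiments/sample_output input.txt
# BLEU-r: score the same output against ideal/reference rewrites, if such a file exists
bash run_context_eval.sh experiments/sample_output polite_references.txt
```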
94 | 95 | ## References 96 | 97 | - The code for evaluation has been partially borrowed from https://github.com/Maluuba/nlg-eval 98 | - Most of the code for the training pipeline has been borrowed from https://github.com/pmichel31415/jsalt-2019-mt-tutorial 99 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/context_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compute context based metrics for hypothesis given reference 3 | 4 | Usage: 5 | context_eval.py [options] 6 | 7 | Options: 8 | --hyp= Path to model hypothesis 9 | --ref= Path to model reference 10 | """ 11 | 12 | from nlgeval import compute_metrics 13 | from docopt import docopt 14 | 15 | class Scorer: 16 | 17 | def __init__(self, ref_file, hyp_file): 18 | self.ref_file = ref_file 19 | self.hyp_file = hyp_file 20 | self.references = list(map(lambda x:x.strip('\n'), open(ref_file, 'r').readlines())) 21 | self.hypothesis = list(map(lambda x:x.strip('\n'), open(hyp_file, 'r').readlines())) 22 | self.metrics_dict = {} 23 | 24 | def score(self): 25 | hyp_test_str = "\n".join([h.replace('\n', '') for h in self.hypothesis]) 26 | ref_test_str = "\n".join([r.replace('\n', '') for r in self.references]) 27 | with open("/tmp/hyp.txt", 'w') as fd_hyp: 28 | fd_hyp.write(hyp_test_str) 29 | fd_hyp.close() 30 | with open("/tmp/ref.txt", 'w') as fd_ref: 31 | fd_ref.write(ref_test_str) 32 | fd_ref.close() 33 | 34 | self.metrics_dict = compute_metrics(hypothesis="/tmp/hyp.txt", references=["/tmp/ref.txt"], no_glove=True, no_skipthoughts=True) 35 | 36 | def print_metrics(self): 37 | for key in self.metrics_dict: 38 | print (key + "\t\t" + str(self.metrics_dict[key])) 39 | 40 | def evaluate(): 41 | args = docopt(__doc__) 42 | scorer = Scorer(args["--ref"], args["--hyp"]) 43 | scorer.score() 44 | # scorer.print_metrics() # Script already prints. Uncomment if needed 45 | 46 | if __name__ == '__main__': 47 | evaluate() 48 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/LICENSE.md: -------------------------------------------------------------------------------- 1 | # nlg-eval 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | All rights reserved. 6 | 7 | MIT License 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 14 | 15 | ## THIRD PARTY NOTICES 16 | 17 | This project is based on or incorporates material from the projects listed below (collectively, Third Party Code). 
Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft licenses the Third Party Code to you under the terms set forth in the Use Terms for the Microsoft Product. Microsoft reserves all other rights not expressly granted under this agreement, whether by implication, estoppel or otherwise. 18 | 19 | ### Skip-Thoughts 20 | From the paper "Skip-Thought Vectors." arXiv preprint arXiv:1506.06726 (2015). 21 | Copyright (c) 2015 Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urtasun, and Sanja Fidler 22 | https://github.com/ryankiros/skip-thoughts 23 | 24 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 25 | 26 | You may obtain a copy of the License at 27 | 28 | http://www.apache.org/licenses/LICENSE-2.0 29 | 30 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 31 | 32 | See the License for the specific language governing permissions and limitations under the License. 33 | 34 | ### Microsoft COCO Caption Evaluation 35 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 36 | All rights reserved. 37 | 38 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 39 | 40 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 41 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 42 | 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIEDWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 44 | 45 | The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. 
46 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include nlgeval * 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/Maluuba/nlg-eval.svg?branch=master)](https://travis-ci.org/Maluuba/nlg-eval) 2 | 3 | # nlg-eval 4 | Evaluation code for various unsupervised automated metrics for NLG (Natural Language Generation). 5 | It takes as input a hypothesis file, and one or more references files and outputs values of metrics. 6 | Rows across these files should correspond to the same example. 7 | 8 | ## Metrics ## 9 | - BLEU 10 | - METEOR 11 | - ROUGE 12 | - CIDEr 13 | - SkipThought cosine similarity 14 | - Embedding Average cosine similarity 15 | - Vector Extrema cosine similarity 16 | - Greedy Matching score 17 | 18 | ## Setup ## 19 | 20 | Install Java 1.8.0 (or higher). 21 | Then run: 22 | 23 | ```bash 24 | # Install the Python dependencies. 25 | pip install git+https://github.com/Maluuba/nlg-eval.git@master 26 | 27 | # If using macOS High Sierra or higher, run this before run setup, to allow multithreading 28 | # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES 29 | 30 | # Simple setup: 31 | # Download required data (e.g. models, embeddings) and external code files. 32 | nlg-eval --setup 33 | ``` 34 | 35 | ### Custom Setup ### 36 | ```bash 37 | # If you don't like the default path (~/.cache/nlgeval) for the downloaded data, 38 | # then specify a path where you want the files to be downloaded. 39 | # The value for the data path is stored in ~/.config/nlgeval/rc.json and can be overwritten by 40 | # setting the NLGEVAL_DATA environment variable. 41 | nlg-eval --setup ${data_path} 42 | ``` 43 | 44 | ## Usage ## 45 | Once setup has completed, the metrics can be evaluated with a Python API or in the command line. 46 | 47 | Examples of the Python API can be found in [test_nlgeval.py](nlgeval/tests/test_nlgeval.py). 48 | 49 | ### Standalone ### 50 | 51 | nlg-eval --hypothesis=examples/hyp.txt --references=examples/ref1.txt --references=examples/ref2.txt 52 | 53 | where each line in the hypothesis file is a generated sentence and the corresponding 54 | lines across the reference files are ground truth reference sentences for the 55 | corresponding hypothesis. 56 | 57 | ### functional API: for the entire corpus ### 58 | 59 | ```python 60 | from nlgeval import compute_metrics 61 | metrics_dict = compute_metrics(hypothesis='examples/hyp.txt', 62 | references=['examples/ref1.txt', 'examples/ref2.txt']) 63 | ``` 64 | 65 | ### functional API: for only one sentence ### 66 | 67 | ```python 68 | from nlgeval import compute_individual_metrics 69 | metrics_dict = compute_individual_metrics(references, hypothesis) 70 | ``` 71 | 72 | where `references` is a list of ground truth reference text strings and 73 | `hypothesis` is the hypothesis text string. 
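For instance, a minimal sketch that scores one of the bundled example sentences against two references (the `no_skipthoughts` and `no_glove` flags skip the embedding-based metrics, so this runs without the downloaded skip-thought and GloVe models; METEOR still requires Java as noted in the setup above):

```python
from nlgeval import compute_individual_metrics

# Sentences borrowed from the examples/ directory of this package.
references = ["this is one reference sentence for sentence1",
              "this is one more reference sentence for sentence1"]
hypothesis = "this is the model generated sentence1 which seems good enough"

# Only the overlap metrics (BLEU, METEOR, ROUGE_L, CIDEr) are computed here.
metrics_dict = compute_individual_metrics(references, hypothesis,
                                          no_skipthoughts=True, no_glove=True)
print(metrics_dict["Bleu_4"], metrics_dict["ROUGE_L"])
```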
74 | 75 | ### object oriented API for repeated calls in a script - single example ### 76 | 77 | ```python 78 | from nlgeval import NLGEval 79 | nlgeval = NLGEval() # loads the models 80 | metrics_dict = nlgeval.compute_individual_metrics(references, hypothesis) 81 | ``` 82 | 83 | where `references` is a list of ground truth reference text strings and 84 | `hypothesis` is the hypothesis text string. 85 | 86 | ### object oriented API for repeated calls in a script - multiple examples ### 87 | 88 | ```python 89 | from nlgeval import NLGEval 90 | nlgeval = NLGEval() # loads the models 91 | metrics_dict = nlgeval.compute_metrics(references, hypothesis) 92 | ``` 93 | 94 | where `references` is a list of lists of ground truth reference text strings and 95 | `hypothesis` is a list of hypothesis text strings. Each inner list in `references` 96 | is one set of references for the hypothesis (a list of single reference strings for 97 | each sentence in `hypothesis` in the same order). 98 | 99 | ## Reference ## 100 | If you use this code as part of any published research, please cite the following paper: 101 | 102 | Shikhar Sharma, Layla El Asri, Hannes Schulz, and Jeremie Zumer. 103 | **"Relevance of Unsupervised Metrics in Task-Oriented Dialogue for Evaluating Natural Language Generation"** 104 | *arXiv preprint arXiv:1706.09799* (2017) 105 | 106 | ```bibtex 107 | @article{sharma2017nlgeval, 108 | author = {Sharma, Shikhar and El Asri, Layla and Schulz, Hannes and Zumer, Jeremie}, 109 | title = {Relevance of Unsupervised Metrics in Task-Oriented Dialogue for Evaluating Natural Language Generation}, 110 | journal = {CoRR}, 111 | volume = {abs/1706.09799}, 112 | year = {2017}, 113 | url = {http://arxiv.org/abs/1706.09799} 114 | } 115 | ``` 116 | 117 | ## Example ## 118 | Running 119 | 120 | nlg-eval --hypothesis=examples/hyp.txt --references=examples/ref1.txt --references=examples/ref2.txt 121 | 122 | gives 123 | 124 | Bleu_1: 0.550000 125 | Bleu_2: 0.428174 126 | Bleu_3: 0.284043 127 | Bleu_4: 0.201143 128 | METEOR: 0.295797 129 | ROUGE_L: 0.522104 130 | CIDEr: 1.242192 131 | SkipThoughtsCosineSimilairty: 0.626149 132 | EmbeddingAverageCosineSimilairty: 0.884690 133 | VectorExtremaCosineSimilarity: 0.568696 134 | GreedyMatchingScore: 0.784205 135 | 136 | ## Troubleshooting 137 | If you have issues with Meteor then you can try lowering the `mem` variable in meteor.py 138 | 139 | ## Important Note ## 140 | CIDEr by default (with idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, 141 | CIDEr score for a reference dataset with only 1 image (or example for NLG) will be zero. When evaluating using one (or few) 142 | images, set idf to "coco-val-df" instead, which uses IDF from the MSCOCO Vaildation Dataset for reliable results. This has 143 | not been adapted in this code. For this use-case, apply patches from 144 | [vrama91/coco-caption](https://github.com/vrama91/coco-caption). 145 | 146 | 147 | ## External data directory 148 | 149 | To mount an already prepared data directory to a Docker container or share it between 150 | users, you can set the `NLGEVAL_DATA` environment variable to let nlg-eval know 151 | where to find its models and data. E.g. 
152 | 153 | NLGEVAL_DATA=~/workspace/nlg-eval/nlgeval/data 154 | 155 | This variable overrides the value provided during setup (stored in `~/.config/nlgeval/rc.json`) 156 | 157 | ## Microsoft Open Source Code of Conduct ## 158 | This project has adopted the [Microsoft Open Source Code of 159 | Conduct](https://opensource.microsoft.com/codeofconduct/). 160 | For more information see the [Code of Conduct 161 | FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 162 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) 163 | with any additional questions or comments. 164 | 165 | ## License ## 166 | See [LICENSE.md](LICENSE.md). 167 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/bin/nlg-eval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 5 | 6 | import json 7 | import logging 8 | import os 9 | import stat 10 | import sys 11 | import time 12 | from zipfile import ZipFile 13 | 14 | import click 15 | from xdg import XDG_CONFIG_HOME, XDG_CACHE_HOME 16 | 17 | import nlgeval 18 | import nlgeval.utils 19 | 20 | CODE_PATH = nlgeval.__path__[0] 21 | 22 | 23 | def _download_file(d): 24 | import requests 25 | from tqdm import tqdm 26 | 27 | url, target_dir = d['url'], d['target_dir'] 28 | filename = url[url.rfind('/') + 1:] 29 | target_path = os.path.join(target_dir, filename) 30 | if not os.path.exists(target_path): 31 | # Collect data 1MB at a time. 32 | chunk_size = 1 * 1024 * 1024 33 | 34 | num_attempts = 3 35 | 36 | for attempt_num in range(num_attempts): 37 | try: 38 | print("Downloading {} to {}.".format(url, target_dir)) 39 | r = requests.get(url, stream=True) 40 | r.raise_for_status() 41 | 42 | total = None 43 | length = r.headers.get('Content-length') 44 | if length is not None: 45 | total = int(length) // chunk_size + 1 46 | 47 | with open(target_path, 'wb') as f: 48 | for chunk in tqdm(r.iter_content(chunk_size=chunk_size), 49 | desc="{}".format(filename), 50 | total=total, 51 | unit_scale=True, mininterval=15, unit=" chunks"): 52 | sys.stdout.flush() 53 | f.write(chunk) 54 | break 55 | except: 56 | if attempt_num < num_attempts - 1: 57 | wait_s = 1 * 60 58 | logging.exception("Error downloading file, will retry in %ds.", wait_s) 59 | # Wait and try to download later. 60 | time.sleep(wait_s) 61 | else: 62 | raise 63 | 64 | 65 | @click.command() 66 | @click.argument("data_path", required=False) 67 | def setup(data_path): 68 | """ 69 | Download required code and data files for nlg-eval. 70 | 71 | If the data_path argument is provided, install to the given location. 72 | Otherwise, your cache directory is used (usually ~/.cache/nlgeval). 
73 | """ 74 | from nltk.downloader import download 75 | download('punkt') 76 | 77 | from multiprocessing import Pool 78 | 79 | if data_path is None: 80 | data_path = os.getenv('NLGEVAL_DATA', os.path.join(XDG_CACHE_HOME, 'nlgeval')) 81 | click.secho("Installing to {}".format(data_path), fg='red') 82 | click.secho("In case of incomplete downloads, delete the directory and run `nlg-eval --setup {}' again.".format(data_path), 83 | fg='red') 84 | 85 | path = os.path.join(CODE_PATH, 'word2vec/glove2word2vec.py') 86 | if os.path.exists(path): 87 | os.remove(path) 88 | 89 | downloads = [] 90 | 91 | if sys.version_info[0] == 2: 92 | downloads.append(dict( 93 | url='https://raw.githubusercontent.com/manasRK/glove-gensim/42ce46f00e83d3afa028fb6bf17ed3c90ca65fcc/glove2word2vec.py', 94 | target_dir=os.path.join(CODE_PATH, 'word2vec') 95 | )) 96 | else: 97 | downloads.append(dict( 98 | url='https://raw.githubusercontent.com/robmsmt/glove-gensim/4c2224bccd61627b76c50a5e1d6afd1c82699d22/glove2word2vec.py', 99 | target_dir=os.path.join(CODE_PATH, 'word2vec') 100 | )) 101 | 102 | setup_glove = not os.path.exists(os.path.join(data_path, 'glove.6B.300d.model.bin')) 103 | if setup_glove: 104 | downloads.append(dict( 105 | url='http://nlp.stanford.edu/data/glove.6B.zip', 106 | target_dir=data_path 107 | )) 108 | 109 | # Skip-thoughts data. 110 | downloads.append(dict( 111 | url='http://www.cs.toronto.edu/~rkiros/models/dictionary.txt', 112 | target_dir=data_path 113 | )) 114 | downloads.append(dict( 115 | url='http://www.cs.toronto.edu/~rkiros/models/utable.npy', 116 | target_dir=data_path 117 | )) 118 | downloads.append(dict( 119 | url='http://www.cs.toronto.edu/~rkiros/models/btable.npy', 120 | target_dir=data_path 121 | )) 122 | downloads.append(dict( 123 | url='http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz', 124 | target_dir=data_path 125 | )) 126 | downloads.append(dict( 127 | url='http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl', 128 | target_dir=data_path 129 | )) 130 | downloads.append(dict( 131 | url='http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz', 132 | target_dir=data_path 133 | )) 134 | downloads.append(dict( 135 | url='http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl', 136 | target_dir=data_path 137 | )) 138 | 139 | # multi-bleu.perl 140 | downloads.append(dict( 141 | url='https://raw.githubusercontent.com/moses-smt/mosesdecoder/b199e654df2a26ea58f234cbb642e89d9c1f269d/scripts/generic/multi-bleu.perl', 142 | target_dir=os.path.join(CODE_PATH, 'multibleu') 143 | )) 144 | 145 | for target_dir in {d['target_dir'] for d in downloads}: 146 | if not os.path.exists(target_dir): 147 | os.makedirs(target_dir) 148 | 149 | # Limit the number of threads so that we don't download too much from the same source concurrently. 
150 | pool = Pool(min(4, len(downloads))) 151 | pool.map(_download_file, downloads) 152 | pool.close() 153 | pool.join() 154 | 155 | if setup_glove: 156 | from nlgeval.word2vec.generate_w2v_files import generate 157 | with ZipFile(os.path.join(data_path, 'glove.6B.zip')) as z: 158 | z.extract('glove.6B.300d.txt', data_path) 159 | generate(data_path) 160 | for p in [ 161 | os.path.join(data_path, 'glove.6B.zip'), 162 | os.path.join(data_path, 'glove.6B.300d.txt'), 163 | os.path.join(data_path, 'glove.6B.300d.model.txt'), 164 | ]: 165 | if os.path.exists(p): 166 | os.remove(p) 167 | 168 | path = os.path.join(CODE_PATH, 'multibleu/multi-bleu.perl') 169 | stats = os.stat(path) 170 | os.chmod(path, stats.st_mode | stat.S_IEXEC) 171 | 172 | cfg_path = os.path.join(XDG_CONFIG_HOME, "nlgeval") 173 | if not os.path.exists(cfg_path): 174 | os.makedirs(cfg_path) 175 | rc = dict() 176 | try: 177 | with open(os.path.join(cfg_path, "rc.json"), 'rt') as f: 178 | rc = json.load(f) 179 | except: 180 | print("WARNING: could not read rc.json in %s, overwriting" % cfg_path) 181 | rc['data_path'] = data_path 182 | with open(os.path.join(cfg_path, "rc.json"), 'wt') as f: 183 | f.write(json.dumps(rc)) 184 | 185 | 186 | @click.command() 187 | @click.option('--references', type=click.Path(exists=True), multiple=True, required=True, help='Path of the reference file. This option can be provided multiple times for multiple reference files.') 188 | @click.option('--hypothesis', type=click.Path(exists=True), required=True, help='Path of the hypothesis file.') 189 | @click.option('--no-overlap', is_flag=True, help='Flag. If provided, word overlap based metrics will not be computed.') 190 | @click.option('--no-skipthoughts', is_flag=True, help='Flag. If provided, skip-thought cosine similarity will not be computed.') 191 | @click.option('--no-glove', is_flag=True, help='Flag. If provided, other word embedding based metrics will not be computed.') 192 | def compute_metrics(hypothesis, references, no_overlap, no_skipthoughts, no_glove): 193 | """ 194 | Compute nlg-eval metrics. 195 | 196 | The --hypothesis and at least one --references parameters are required. 197 | 198 | To download the data and additional code files, use `nlg-eval --setup [data path]`. 199 | 200 | Note that nlg-eval also features an API, which may be easier to use. 
201 | """ 202 | try: 203 | data_dir = nlgeval.utils.get_data_dir() 204 | except nlgeval.utils.InvalidDataDirException: 205 | sys.exit(1) 206 | click.secho("Using data from {}".format(data_dir), fg='green') 207 | click.secho("In case of broken downloads, remove the directory and run setup again.", fg='green') 208 | nlgeval.compute_metrics(hypothesis, references, no_overlap, no_skipthoughts, no_glove) 209 | 210 | 211 | if __name__ == '__main__': 212 | if len(sys.argv) > 1 and sys.argv[1] == '--setup': 213 | del sys.argv[0] 214 | setup() 215 | else: 216 | compute_metrics() 217 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/examples/hyp.txt: -------------------------------------------------------------------------------- 1 | this is the model generated sentence1 which seems good enough 2 | this is sentence2 which has been generated by your model 3 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/examples/ref1.txt: -------------------------------------------------------------------------------- 1 | this is one reference sentence for sentence1 2 | this is a reference sentence for sentence2 which was generated by your model 3 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/examples/ref2.txt: -------------------------------------------------------------------------------- 1 | this is one more reference sentence for sentence1 2 | this is the second reference sentence for sentence2 3 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chomp; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chomp; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } 178 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 3 | from __future__ import print_function 4 | 5 | import six 6 | from six.moves import map 7 | 8 | from nlgeval.pycocoevalcap.bleu.bleu import Bleu 9 | from nlgeval.pycocoevalcap.cider.cider import Cider 10 | from nlgeval.pycocoevalcap.meteor.meteor import Meteor 11 | from nlgeval.pycocoevalcap.rouge.rouge import Rouge 12 | 13 | 14 | # str/unicode stripping in Python 2 and 3 instead of `str.strip`. 
15 | def _strip(s): 16 | return s.strip() 17 | 18 | 19 | def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False): 20 | with open(hypothesis, 'r') as f: 21 | hyp_list = f.readlines() 22 | ref_list = [] 23 | for iidx, reference in enumerate(references): 24 | with open(reference, 'r') as f: 25 | ref_list.append(f.readlines()) 26 | ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)] 27 | refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)} 28 | hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)} 29 | assert len(refs) == len(hyps) 30 | 31 | ret_scores = {} 32 | if not no_overlap: 33 | scorers = [ 34 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 35 | (Meteor(), "METEOR"), 36 | (Rouge(), "ROUGE_L"), 37 | (Cider(), "CIDEr") 38 | ] 39 | for scorer, method in scorers: 40 | score, scores = scorer.compute_score(refs, hyps) 41 | if isinstance(method, list): 42 | for sc, scs, m in zip(score, scores, method): 43 | print("%s: %0.6f" % (m, sc)) 44 | ret_scores[m] = sc 45 | else: 46 | print("%s: %0.6f" % (method, score)) 47 | ret_scores[method] = score 48 | del scorers 49 | 50 | if not no_skipthoughts: 51 | from nlgeval.skipthoughts import skipthoughts 52 | import numpy as np 53 | from sklearn.metrics.pairwise import cosine_similarity 54 | 55 | model = skipthoughts.load_model() 56 | encoder = skipthoughts.Encoder(model) 57 | vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False) 58 | ref_list_T = np.array(ref_list).T.tolist() 59 | vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 60 | cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 61 | cosine_similarity = np.max(cosine_similarity, axis=0).mean() 62 | print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity)) 63 | ret_scores['SkipThoughtCS'] = cosine_similarity 64 | del model 65 | 66 | if not no_glove: 67 | from nlgeval.word2vec.evaluate import eval_emb_metrics 68 | import numpy as np 69 | 70 | glove_hyps = [h.strip() for h in hyp_list] 71 | ref_list_T = np.array(ref_list).T.tolist() 72 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 73 | scores = eval_emb_metrics(glove_hyps, glove_refs) 74 | print(scores) 75 | scores = scores.split('\n') 76 | for score in scores: 77 | name, value = score.split(':') 78 | value = float(value.strip()) 79 | ret_scores[name] = value 80 | 81 | return ret_scores 82 | 83 | 84 | def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False): 85 | assert isinstance(hyp, six.string_types) 86 | 87 | if isinstance(ref, six.string_types): 88 | ref = ref.split('||<|>||') # special delimiter for backward compatibility 89 | ref = [a.strip() for a in ref] 90 | refs = {0: ref} 91 | ref_list = [ref] 92 | 93 | hyps = {0: [hyp.strip()]} 94 | hyp_list = [hyp] 95 | 96 | ret_scores = {} 97 | if not no_overlap: 98 | scorers = [ 99 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 100 | (Meteor(), "METEOR"), 101 | (Rouge(), "ROUGE_L"), 102 | (Cider(), "CIDEr") 103 | ] 104 | for scorer, method in scorers: 105 | score, scores = scorer.compute_score(refs, hyps) 106 | if isinstance(method, list): 107 | for sc, scs, m in zip(score, scores, method): 108 | ret_scores[m] = sc 109 | else: 110 | ret_scores[method] = score 111 | 112 | if not no_skipthoughts: 113 | from nlgeval.skipthoughts import skipthoughts 114 | import numpy as np 115 | from 
sklearn.metrics.pairwise import cosine_similarity 116 | 117 | model = skipthoughts.load_model() 118 | encoder = skipthoughts.Encoder(model) 119 | vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False) 120 | ref_list_T = np.array(ref_list).T.tolist() 121 | vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 122 | cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 123 | cosine_similarity = np.max(cosine_similarity, axis=0).mean() 124 | ret_scores['SkipThoughtCS'] = cosine_similarity 125 | 126 | if not no_glove: 127 | from nlgeval.word2vec.evaluate import eval_emb_metrics 128 | import numpy as np 129 | 130 | glove_hyps = [h.strip() for h in hyp_list] 131 | ref_list_T = np.array(ref_list).T.tolist() 132 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 133 | scores = eval_emb_metrics(glove_hyps, glove_refs) 134 | scores = scores.split('\n') 135 | for score in scores: 136 | name, value = score.split(':') 137 | value = float(value.strip()) 138 | ret_scores[name] = value 139 | 140 | return ret_scores 141 | 142 | 143 | class NLGEval(object): 144 | glove_metrics = { 145 | 'EmbeddingAverageCosineSimilairty', 146 | 'VectorExtremaCosineSimilarity', 147 | 'GreedyMatchingScore', 148 | } 149 | 150 | valid_metrics = { 151 | # Overlap 152 | 'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 153 | 'METEOR', 154 | 'ROUGE_L', 155 | 'CIDEr', 156 | 157 | # Skip-thought 158 | 'SkipThoughtCS', 159 | } | glove_metrics 160 | 161 | def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False, 162 | metrics_to_omit=None): 163 | """ 164 | :param no_overlap: Default: Use overlap metrics. 165 | `True` if these metrics should not be used. 166 | :type no_overlap: bool 167 | :param no_skipthoughts: Default: Use the skip-thoughts metric. 168 | `True` if this metrics should not be used. 169 | :type no_skipthoughts: bool 170 | :param no_glove: Default: Use GloVe based metrics. 171 | `True` if these metrics should not be used. 172 | :type no_glove: bool 173 | :param metrics_to_omit: Default: Use all metrics. See `NLGEval.valid_metrics` for all metrics. 174 | The previous parameters will override metrics in this one if they are set. 175 | Metrics to omit. Omitting Bleu_{i} will omit Bleu_{j} for j>=i. 
176 | :type metrics_to_omit: Optional[Collection[str]] 177 | """ 178 | 179 | if metrics_to_omit is None: 180 | self.metrics_to_omit = set() 181 | else: 182 | self.metrics_to_omit = set(metrics_to_omit) 183 | assert len(self.metrics_to_omit - self.valid_metrics) == 0, \ 184 | "Invalid metrics to omit: {}".format(self.metrics_to_omit - self.valid_metrics) 185 | 186 | self.no_overlap = no_overlap 187 | if not no_overlap: 188 | self.load_scorers() 189 | 190 | self.no_skipthoughts = no_skipthoughts or 'SkipThoughtCS' in self.metrics_to_omit 191 | if not self.no_skipthoughts: 192 | self.load_skipthought_model() 193 | 194 | self.no_glove = no_glove or len(self.glove_metrics - self.metrics_to_omit) == 0 195 | if not self.no_glove: 196 | self.load_glove() 197 | 198 | def load_scorers(self): 199 | self.scorers = [] 200 | 201 | omit_bleu_i = False 202 | for i in range(1, 4 + 1): 203 | if 'Bleu_{}'.format(i) in self.metrics_to_omit: 204 | omit_bleu_i = True 205 | if i > 1: 206 | self.scorers.append((Bleu(i - 1), ['Bleu_{}'.format(j) for j in range(1, i)])) 207 | break 208 | if not omit_bleu_i: 209 | self.scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])) 210 | 211 | if 'METEOR' not in self.metrics_to_omit: 212 | self.scorers.append((Meteor(), "METEOR")) 213 | if 'ROUGE_L' not in self.metrics_to_omit: 214 | self.scorers.append((Rouge(), "ROUGE_L")) 215 | if 'CIDEr' not in self.metrics_to_omit: 216 | self.scorers.append((Cider(), "CIDEr")) 217 | 218 | 219 | def load_skipthought_model(self): 220 | from nlgeval.skipthoughts import skipthoughts 221 | import numpy as np 222 | from sklearn.metrics.pairwise import cosine_similarity 223 | self.np = np 224 | self.cosine_similarity = cosine_similarity 225 | 226 | model = skipthoughts.load_model() 227 | self.skipthought_encoder = skipthoughts.Encoder(model) 228 | 229 | def load_glove(self): 230 | from nlgeval.word2vec.evaluate import Embedding 231 | from nlgeval.word2vec.evaluate import eval_emb_metrics 232 | import numpy as np 233 | self.eval_emb_metrics = eval_emb_metrics 234 | self.np = np 235 | self.glove_emb = Embedding() 236 | 237 | def compute_individual_metrics(self, ref, hyp): 238 | assert isinstance(hyp, six.string_types) 239 | ref = [a.strip() for a in ref] 240 | refs = {0: ref} 241 | ref_list = [ref] 242 | 243 | hyps = {0: [hyp.strip()]} 244 | hyp_list = [hyp] 245 | 246 | ret_scores = {} 247 | if not self.no_overlap: 248 | for scorer, method in self.scorers: 249 | score, scores = scorer.compute_score(refs, hyps) 250 | if isinstance(method, list): 251 | for sc, scs, m in zip(score, scores, method): 252 | ret_scores[m] = sc 253 | else: 254 | ret_scores[method] = score 255 | 256 | if not self.no_skipthoughts: 257 | vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False) 258 | ref_list_T = self.np.array(ref_list).T.tolist() 259 | vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 260 | cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 261 | cosine_similarity = self.np.max(cosine_similarity, axis=0).mean() 262 | ret_scores['SkipThoughtCS'] = cosine_similarity 263 | 264 | if not self.no_glove: 265 | glove_hyps = [h.strip() for h in hyp_list] 266 | ref_list_T = self.np.array(ref_list).T.tolist() 267 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 268 | scores = self.eval_emb_metrics(glove_hyps, glove_refs, emb=self.glove_emb, 269 | 
metrics_to_omit=self.metrics_to_omit) 270 | scores = scores.split('\n') 271 | for score in scores: 272 | name, value = score.split(':') 273 | value = float(value.strip()) 274 | ret_scores[name] = value 275 | 276 | return ret_scores 277 | 278 | def compute_metrics(self, ref_list, hyp_list): 279 | ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)] 280 | refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)} 281 | hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)} 282 | assert len(refs) == len(hyps) 283 | 284 | ret_scores = {} 285 | if not self.no_overlap: 286 | for scorer, method in self.scorers: 287 | score, scores = scorer.compute_score(refs, hyps) 288 | if isinstance(method, list): 289 | for sc, scs, m in zip(score, scores, method): 290 | ret_scores[m] = sc 291 | else: 292 | ret_scores[method] = score 293 | 294 | if not self.no_skipthoughts: 295 | vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False) 296 | ref_list_T = self.np.array(ref_list).T.tolist() 297 | vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 298 | cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 299 | cosine_similarity = self.np.max(cosine_similarity, axis=0).mean() 300 | ret_scores['SkipThoughtCS'] = cosine_similarity 301 | 302 | if not self.no_glove: 303 | glove_hyps = [h.strip() for h in hyp_list] 304 | ref_list_T = self.np.array(ref_list).T.tolist() 305 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 306 | scores = self.eval_emb_metrics(glove_hyps, glove_refs, emb=self.glove_emb) 307 | scores = scores.split('\n') 308 | for score in scores: 309 | name, value = score.split(':') 310 | value = float(value.strip()) 311 | ret_scores[name] = value 312 | 313 | return ret_scores 314 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/README.md: -------------------------------------------------------------------------------- 1 | # coco-caption 2 | 3 | Original README can be found at [tylin/coco-caption](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/README.md). 
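All of the scorers vendored here (BLEU, METEOR, ROUGE-L, CIDEr) expose essentially the same `compute_score(gts, res)` interface: both arguments are dicts keyed by example id, each hypothesis value is a single-element list of strings, and each reference value is a list of one or more strings. A minimal sketch of calling the BLEU scorer directly, with illustrative ids and sentences borrowed from this package's `examples/` files:

```python
from nlgeval.pycocoevalcap.bleu.bleu import Bleu

# References: one or more strings per id. Hypotheses: exactly one string per id.
refs = {0: ["this is one reference sentence for sentence1"],
        1: ["this is a reference sentence for sentence2 which was generated by your model"]}
hyps = {0: ["this is the model generated sentence1 which seems good enough"],
        1: ["this is sentence2 which has been generated by your model"]}

corpus_scores, per_sentence_scores = Bleu(4).compute_score(refs, hyps)
print(dict(zip(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], corpus_scores)))
```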
4 | 5 | ## License 6 | 7 | All files in the pycocoevalcap directory are under 8 | [BSD 2-clause "Simplified" License](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/license.txt) 9 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=0) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | import sys, math, re 21 | from collections import defaultdict 22 | 23 | import six 24 | from six.moves import xrange as range 25 | 26 | 27 | def precook(s, n=4, out=False): 28 | """Takes a string as input and returns an object that can be given to 29 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 30 | can take string arguments as well.""" 31 | words = s.split() 32 | counts = defaultdict(int) 33 | for k in range(1,n+1): 34 | for i in range(len(words)-k+1): 35 | ngram = tuple(words[i:i+k]) 36 | counts[ngram] += 1 37 | return (len(words), counts) 38 | 39 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 40 | '''Takes a list of reference sentences for a single segment 41 | and returns an object that encapsulates everything that BLEU 42 | needs to know about them.''' 43 | 44 | reflen = [] 45 | maxcounts = {} 46 | for ref in refs: 47 | rl, counts = precook(ref, n) 48 | reflen.append(rl) 49 | for (ngram,count) in six.iteritems(counts): 50 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 51 | 52 | # Calculate effective reference sentence length. 53 | if eff == "shortest": 54 | reflen = min(reflen) 55 | elif eff == "average": 56 | reflen = float(sum(reflen))/len(reflen) 57 | 58 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 59 | 60 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 61 | 62 | return (reflen, maxcounts) 63 | 64 | def cook_test(test, reflen_refmaxcounts, eff=None, n=4): 65 | '''Takes a test sentence and returns an object that 66 | encapsulates everything that BLEU needs to know about it.''' 67 | 68 | reflen, refmaxcounts = reflen_refmaxcounts 69 | testlen, counts = precook(test, n, True) 70 | 71 | result = {} 72 | 73 | # Calculate effective reference sentence length. 
74 | 75 | if eff == "closest": 76 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 77 | else: ## i.e., "average" or "shortest" or None 78 | result["reflen"] = reflen 79 | 80 | result["testlen"] = testlen 81 | 82 | result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] 83 | 84 | result['correct'] = [0]*n 85 | for (ngram, count) in six.iteritems(counts): 86 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 87 | 88 | return result 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | # special_reflen is used in oracle (proportional effective ref len for a node). 96 | 97 | def copy(self): 98 | ''' copy the refs.''' 99 | new = BleuScorer(n=self.n) 100 | new.ctest = copy.copy(self.ctest) 101 | new.crefs = copy.copy(self.crefs) 102 | new._score = None 103 | return new 104 | 105 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 106 | ''' singular instance ''' 107 | 108 | self.n = n 109 | self.crefs = [] 110 | self.ctest = [] 111 | self.cook_append(test, refs) 112 | self.special_reflen = special_reflen 113 | 114 | def cook_append(self, test, refs): 115 | '''called by constructor and __iadd__ to avoid creating new instances.''' 116 | 117 | if refs is not None: 118 | self.crefs.append(cook_refs(refs)) 119 | if test is not None: 120 | cooked_test = cook_test(test, self.crefs[-1]) 121 | self.ctest.append(cooked_test) ## N.B.: -1 122 | else: 123 | self.ctest.append(None) # lens of crefs and ctest have to match 124 | 125 | self._score = None ## need to recompute 126 | 127 | def ratio(self, option=None): 128 | self.compute_score(option=option) 129 | return self._ratio 130 | 131 | def score_ratio(self, option=None): 132 | '''return (bleu, len_ratio) pair''' 133 | return (self.fscore(option=option), self.ratio(option=option)) 134 | 135 | def score_ratio_str(self, option=None): 136 | return "%.4f (%.2f)" % self.score_ratio(option) 137 | 138 | def reflen(self, option=None): 139 | self.compute_score(option=option) 140 | return self._reflen 141 | 142 | def testlen(self, option=None): 143 | self.compute_score(option=option) 144 | return self._testlen 145 | 146 | def retest(self, new_test): 147 | if type(new_test) is str: 148 | new_test = [new_test] 149 | assert len(new_test) == len(self.crefs), new_test 150 | self.ctest = [] 151 | for t, rs in zip(new_test, self.crefs): 152 | self.ctest.append(cook_test(t, rs)) 153 | self._score = None 154 | 155 | return self 156 | 157 | def rescore(self, new_test): 158 | ''' replace test(s) with new test(s), and returns the new score.''' 159 | 160 | return self.retest(new_test).compute_score() 161 | 162 | def size(self): 163 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 164 | return len(self.crefs) 165 | 166 | def __iadd__(self, other): 167 | '''add an instance (e.g., from another sentence).''' 168 | 169 | if type(other) is tuple: 170 | ## avoid creating new BleuScorer instances 171 | self.cook_append(other[0], other[1]) 172 | else: 173 | assert self.compatible(other), "incompatible BLEUs." 
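            # Merge the other scorer's cooked hypothesis/reference statistics;
            # the cached corpus score is invalidated below and recomputed on the
            # next compute_score() call.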
174 | self.ctest.extend(other.ctest) 175 | self.crefs.extend(other.crefs) 176 | self._score = None ## need to recompute 177 | 178 | return self 179 | 180 | def compatible(self, other): 181 | return isinstance(other, BleuScorer) and self.n == other.n 182 | 183 | def single_reflen(self, option="average"): 184 | return self._single_reflen(self.crefs[0][0], option) 185 | 186 | def _single_reflen(self, reflens, option=None, testlen=None): 187 | 188 | if option == "shortest": 189 | reflen = min(reflens) 190 | elif option == "average": 191 | reflen = float(sum(reflens))/len(reflens) 192 | elif option == "closest": 193 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 194 | else: 195 | assert False, "unsupported reflen option %s" % option 196 | 197 | return reflen 198 | 199 | def recompute_score(self, option=None, verbose=0): 200 | self._score = None 201 | return self.compute_score(option, verbose) 202 | 203 | def compute_score(self, option=None, verbose=0): 204 | n = self.n 205 | small = 1e-9 206 | tiny = 1e-15 ## so that if guess is 0 still return 0 207 | bleu_list = [[] for _ in range(n)] 208 | 209 | if self._score is not None: 210 | return self._score 211 | 212 | if option is None: 213 | option = "average" if len(self.crefs) == 1 else "closest" 214 | 215 | self._testlen = 0 216 | self._reflen = 0 217 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 218 | 219 | # for each sentence 220 | for comps in self.ctest: 221 | testlen = comps['testlen'] 222 | self._testlen += testlen 223 | 224 | if self.special_reflen is None: ## need computation 225 | reflen = self._single_reflen(comps['reflen'], option, testlen) 226 | else: 227 | reflen = self.special_reflen 228 | 229 | self._reflen += reflen 230 | 231 | for key in ['guess','correct']: 232 | for k in range(n): 233 | totalcomps[key][k] += comps[key][k] 234 | 235 | # append per image bleu score 236 | bleu = 1. 237 | for k in range(n): 238 | bleu *= (float(comps['correct'][k]) + tiny) \ 239 | /(float(comps['guess'][k]) + small) 240 | bleu_list[k].append(bleu ** (1./(k+1))) 241 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 242 | if ratio < 1: 243 | for k in range(n): 244 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 245 | 246 | if verbose > 1: 247 | print(comps, reflen) 248 | 249 | totalcomps['reflen'] = self._reflen 250 | totalcomps['testlen'] = self._testlen 251 | 252 | bleus = [] 253 | bleu = 1. 
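        # Corpus-level BLEU_k: the geometric mean of the modified 1..k-gram
        # precisions (tiny/small guard against zero counts), scaled by the
        # brevity penalty below when the hypotheses are shorter than the references.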
254 | for k in range(n): 255 | bleu *= float(totalcomps['correct'][k] + tiny) \ 256 | / (totalcomps['guess'][k] + small) 257 | bleus.append(bleu ** (1./(k+1))) 258 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 259 | if ratio < 1: 260 | for k in range(n): 261 | bleus[k] *= math.exp(1 - 1/ratio) 262 | 263 | if verbose > 0: 264 | print(totalcomps) 265 | print("ratio:", ratio) 266 | 267 | self._score = bleus 268 | return self._score, bleu_list 269 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" 55 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | import math 7 | from collections import defaultdict 8 | 9 | import numpy as np 10 | from six.moves import xrange as range 11 | import six 12 | 13 | def precook(s, n=4, out=False): 14 | """ 15 | Takes a string as input and returns an object that can be given to 16 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 17 | can take string arguments as well. 
18 | :param s: string : sentence to be converted into ngrams 19 | :param n: int : number of ngrams for which representation is calculated 20 | :return: term frequency vector for occuring ngrams 21 | """ 22 | words = s.split() 23 | counts = defaultdict(int) 24 | for k in range(1,n+1): 25 | for i in range(len(words)-k+1): 26 | ngram = tuple(words[i:i+k]) 27 | counts[ngram] += 1 28 | return counts 29 | 30 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 31 | '''Takes a list of reference sentences for a single segment 32 | and returns an object that encapsulates everything that BLEU 33 | needs to know about them. 34 | :param refs: list of string : reference sentences for some image 35 | :param n: int : number of ngrams for which (ngram) representation is calculated 36 | :return: result (list of dict) 37 | ''' 38 | return [precook(ref, n) for ref in refs] 39 | 40 | def cook_test(test, n=4): 41 | '''Takes a test sentence and returns an object that 42 | encapsulates everything that BLEU needs to know about it. 43 | :param test: list of string : hypothesis sentence for some image 44 | :param n: int : number of ngrams for which (ngram) representation is calculated 45 | :return: result (dict) 46 | ''' 47 | return precook(test, n, True) 48 | 49 | class CiderScorer(object): 50 | """CIDEr scorer. 51 | """ 52 | 53 | def copy(self): 54 | ''' copy the refs.''' 55 | new = CiderScorer(n=self.n) 56 | new.ctest = copy.copy(self.ctest) 57 | new.crefs = copy.copy(self.crefs) 58 | return new 59 | 60 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 61 | ''' singular instance ''' 62 | self.n = n 63 | self.sigma = sigma 64 | self.crefs = [] 65 | self.ctest = [] 66 | self.document_frequency = defaultdict(float) 67 | self.cook_append(test, refs) 68 | self.ref_len = None 69 | 70 | def cook_append(self, test, refs): 71 | '''called by constructor and __iadd__ to avoid creating new instances.''' 72 | 73 | if refs is not None: 74 | self.crefs.append(cook_refs(refs)) 75 | if test is not None: 76 | self.ctest.append(cook_test(test)) ## N.B.: -1 77 | else: 78 | self.ctest.append(None) # lens of crefs and ctest have to match 79 | 80 | def size(self): 81 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 82 | return len(self.crefs) 83 | 84 | def __iadd__(self, other): 85 | '''add an instance (e.g., from another sentence).''' 86 | 87 | if type(other) is tuple: 88 | ## avoid creating new CiderScorer instances 89 | self.cook_append(other[0], other[1]) 90 | else: 91 | self.ctest.extend(other.ctest) 92 | self.crefs.extend(other.crefs) 93 | 94 | return self 95 | def compute_doc_freq(self): 96 | ''' 97 | Compute term frequency for reference data. 98 | This will be used to compute idf (inverse document frequency later) 99 | The term frequency is stored in the object 100 | :return: None 101 | ''' 102 | for refs in self.crefs: 103 | # refs, k ref captions of one image 104 | for ngram in set([ngram for ref in refs for (ngram,count) in six.iteritems(ref)]): 105 | self.document_frequency[ngram] += 1 106 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 107 | 108 | def compute_cider(self): 109 | def counts2vec(cnts): 110 | """ 111 | Function maps counts of ngram to vector of tfidf weights. 112 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 113 | The n-th entry of array denotes length of n-grams. 
114 | :param cnts: 115 | :return: vec (array of dict), norm (array of float), length (int) 116 | """ 117 | vec = [defaultdict(float) for _ in range(self.n)] 118 | length = 0 119 | norm = [0.0 for _ in range(self.n)] 120 | for (ngram,term_freq) in six.iteritems(cnts): 121 | # give word count 1 if it doesn't appear in reference corpus 122 | df = np.log(max(1.0, self.document_frequency[ngram])) 123 | # ngram index 124 | n = len(ngram)-1 125 | # tf (term_freq) * idf (precomputed idf) for n-grams 126 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 127 | # compute norm for the vector. the norm will be used for computing similarity 128 | norm[n] += pow(vec[n][ngram], 2) 129 | 130 | if n == 1: 131 | length += term_freq 132 | norm = [np.sqrt(n) for n in norm] 133 | return vec, norm, length 134 | 135 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 136 | ''' 137 | Compute the cosine similarity of two vectors. 138 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 139 | :param vec_ref: array of dictionary for vector corresponding to reference 140 | :param norm_hyp: array of float for vector corresponding to hypothesis 141 | :param norm_ref: array of float for vector corresponding to reference 142 | :param length_hyp: int containing length of hypothesis 143 | :param length_ref: int containing length of reference 144 | :return: array of score for each n-grams cosine similarity 145 | ''' 146 | delta = float(length_hyp - length_ref) 147 | # measure consine similarity 148 | val = np.array([0.0 for _ in range(self.n)]) 149 | for n in range(self.n): 150 | # ngram 151 | for (ngram,count) in six.iteritems(vec_hyp[n]): 152 | # vrama91 : added clipping 153 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 154 | 155 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 156 | val[n] /= (norm_hyp[n]*norm_ref[n]) 157 | 158 | assert(not math.isnan(val[n])) 159 | # vrama91: added a length based gaussian penalty 160 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 161 | return val 162 | 163 | # compute log reference length 164 | self.ref_len = np.log(float(len(self.crefs))) 165 | 166 | scores = [] 167 | for test, refs in zip(self.ctest, self.crefs): 168 | # compute vector for test captions 169 | vec, norm, length = counts2vec(test) 170 | # compute vector for ref captions 171 | score = np.array([0.0 for _ in range(self.n)]) 172 | for ref in refs: 173 | vec_ref, norm_ref, length_ref = counts2vec(ref) 174 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 175 | # change by vrama91 - mean of ngram scores, instead of sum 176 | score_avg = np.mean(score) 177 | # divide by number of references 178 | score_avg /= len(refs) 179 | # multiply score by 10 180 | score_avg *= 10.0 181 | # append score of an image to the score list 182 | scores.append(score_avg) 183 | return scores 184 | 185 | def compute_score(self, option=None, verbose=0): 186 | # compute idf 187 | self.compute_doc_freq() 188 | # assert to check document frequency 189 | assert(len(self.ctest) >= max(self.document_frequency.values())) 190 | # compute cider score 191 | score = self.compute_cider() 192 | # debug 193 | # print score 194 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, 
and Ramakrishna Vedantam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/data/paraphrase-en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/data/paraphrase-en.gz -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | from __future__ import division 6 | 7 | import atexit 8 | import logging 9 | import os 10 | import subprocess 11 | import sys 12 | import threading 13 | 14 | import psutil 15 | 16 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
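# (In this repository the jar and its paraphrase table ship alongside this file
# under pycocoevalcap/meteor/, and a Java runtime must be resolvable on PATH,
# since the scorer below is driven through a `java -jar` subprocess.)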
17 | METEOR_JAR = 'meteor-1.5.jar' 18 | 19 | 20 | def enc(s): 21 | return s.encode('utf-8') 22 | 23 | 24 | def dec(s): 25 | return s.decode('utf-8') 26 | 27 | 28 | class Meteor: 29 | 30 | def __init__(self): 31 | # Used to guarantee thread safety 32 | self.lock = threading.Lock() 33 | 34 | mem = '2G' 35 | mem_available_G = psutil.virtual_memory().available / 1E9 36 | if mem_available_G < 2: 37 | logging.warning("There is less than 2GB of available memory.\n" 38 | "Will try with limiting Meteor to 1GB of memory but this might cause issues.\n" 39 | "If you have problems using Meteor, " 40 | "then you can try to lower the `mem` variable in meteor.py") 41 | mem = '1G' 42 | 43 | meteor_cmd = ['java', '-jar', '-Xmx{}'.format(mem), METEOR_JAR, 44 | '-', '-', '-stdio', '-l', 'en', '-norm'] 45 | env = os.environ.copy() 46 | env['LC_ALL'] = "C" 47 | self.meteor_p = subprocess.Popen(meteor_cmd, 48 | cwd=os.path.dirname(os.path.abspath(__file__)), 49 | env=env, 50 | stdin=subprocess.PIPE, 51 | stdout=subprocess.PIPE, 52 | stderr=subprocess.PIPE) 53 | 54 | atexit.register(self.close) 55 | 56 | def close(self): 57 | with self.lock: 58 | if self.meteor_p: 59 | self.meteor_p.kill() 60 | self.meteor_p.wait() 61 | self.meteor_p = None 62 | # if the user calls close() manually, remove the 63 | # reference from atexit so the object can be garbage-collected. 64 | if atexit is not None and atexit.unregister is not None: 65 | atexit.unregister(self.close) 66 | 67 | def compute_score(self, gts, res): 68 | assert (gts.keys() == res.keys()) 69 | imgIds = gts.keys() 70 | scores = [] 71 | 72 | eval_line = 'EVAL' 73 | with self.lock: 74 | for i in imgIds: 75 | assert (len(res[i]) == 1) 76 | stat = self._stat(res[i][0], gts[i]) 77 | eval_line += ' ||| {}'.format(stat) 78 | 79 | self.meteor_p.stdin.write(enc('{}\n'.format(eval_line))) 80 | self.meteor_p.stdin.flush() 81 | for i in range(0, len(imgIds)): 82 | v = self.meteor_p.stdout.readline() 83 | try: 84 | scores.append(float(dec(v.strip()))) 85 | except: 86 | sys.stderr.write("Error handling value: {}\n".format(v)) 87 | sys.stderr.write("Decoded value: {}\n".format(dec(v.strip()))) 88 | sys.stderr.write("eval_line: {}\n".format(eval_line)) 89 | # You can try uncommenting the next code line to show stderr from the Meteor JAR. 90 | # If the Meteor JAR is not writing to stderr, then the line will just hang. 
91 | # sys.stderr.write("Error from Meteor:\n{}".format(self.meteor_p.stderr.read())) 92 | raise 93 | score = float(dec(self.meteor_p.stdout.readline()).strip()) 94 | 95 | return score, scores 96 | 97 | def method(self): 98 | return "METEOR" 99 | 100 | def _stat(self, hypothesis_str, reference_list): 101 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 102 | hypothesis_str = hypothesis_str.replace('|||', '').replace(' ', ' ') 103 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 104 | self.meteor_p.stdin.write(enc(score_line)) 105 | self.meteor_p.stdin.write(enc('\n')) 106 | self.meteor_p.stdin.flush() 107 | return dec(self.meteor_p.stdout.readline()).strip() 108 | 109 | def _score(self, hypothesis_str, reference_list): 110 | with self.lock: 111 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 112 | hypothesis_str = hypothesis_str.replace('|||', '').replace(' ', ' ') 113 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 114 | self.meteor_p.stdin.write(enc('{}\n'.format(score_line))) 115 | self.meteor_p.stdin.flush() 116 | stats = dec(self.meteor_p.stdout.readline()).strip() 117 | eval_line = 'EVAL ||| {}'.format(stats) 118 | # EVAL ||| stats 119 | self.meteor_p.stdin.write(enc('{}\n'.format(eval_line))) 120 | self.meteor_p.stdin.flush() 121 | score = float(dec(self.meteor_p.stdout.readline()).strip()) 122 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 123 | # thanks for Andrej for pointing this out 124 | score = float(dec(self.meteor_p.stdout.readline()).strip()) 125 | return score 126 | 127 | def __del__(self): 128 | self.close() 129 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/tests/test_meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import unittest 5 | 6 | from nlgeval.pycocoevalcap.meteor.meteor import Meteor 7 | 8 | 9 | class TestMeteor(unittest.TestCase): 10 | def test_compute_score(self): 11 | m = Meteor() 12 | 13 | s = m.compute_score({0: ["test"]}, {0: ["test"]}) 14 | self.assertEqual(s, (1.0, [1.0])) 15 | 16 | s = m.compute_score({0: ["テスト"]}, {0: ["テスト"]}) 17 | self.assertEqual(s, (1.0, [1.0])) 18 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of 
the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 
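# (calc_score above expects `hypo` to be a single-element list holding one
# tokenized candidate sentence and `ref` to be a non-empty list of tokenized
# references; the asserts that follow verify exactly that.)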
96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Downloaded data files 60 | data/ 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/README.md: -------------------------------------------------------------------------------- 1 | # skip-thoughts 2 | 3 | Original README can be found at [ryankiros/skip-thoughts](https://github.com/ryankiros/skip-thoughts/blob/6661cad40664b6c251cac1dad779986eb332c26a/README.md). 4 | 5 | ## License 6 | 7 | All files in the skipthoughts directory are under 8 | [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) 9 | to the authors of [ryankiros/skip-thoughts](https://github.com/ryankiros/skip-thoughts/tree/6661cad40664b6c251cac1dad779986eb332c26a). 
10 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/skipthoughts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Skip-thought vectors 3 | ''' 4 | import copy 5 | import os 6 | from collections import OrderedDict, defaultdict 7 | 8 | import nltk 9 | import numpy 10 | import six 11 | import theano 12 | import theano.tensor as tensor 13 | from nltk.tokenize import word_tokenize 14 | from scipy.linalg import norm 15 | from six.moves import cPickle as pkl 16 | from nlgeval.utils import get_data_dir 17 | import logging 18 | 19 | profile = False 20 | 21 | #-----------------------------------------------------------------------------# 22 | # Specify model and table locations here 23 | #-----------------------------------------------------------------------------# 24 | path_to_models = get_data_dir() 25 | path_to_tables = get_data_dir() 26 | #-----------------------------------------------------------------------------# 27 | 28 | path_to_umodel = os.path.join(path_to_models, 'uni_skip.npz') 29 | path_to_bmodel = os.path.join(path_to_models, 'bi_skip.npz') 30 | 31 | 32 | def load_model(): 33 | """ 34 | Load the model with saved tables 35 | """ 36 | # Load model options 37 | # print 'Loading model parameters...' 38 | with open('%s.pkl'%path_to_umodel, 'rb') as f: 39 | uoptions = pkl.load(f) 40 | with open('%s.pkl'%path_to_bmodel, 'rb') as f: 41 | boptions = pkl.load(f) 42 | 43 | # Load parameters 44 | uparams = init_params(uoptions) 45 | uparams = load_params(path_to_umodel, uparams) 46 | utparams = init_tparams(uparams) 47 | bparams = init_params_bi(boptions) 48 | bparams = load_params(path_to_bmodel, bparams) 49 | btparams = init_tparams(bparams) 50 | 51 | # Extractor functions 52 | # print 'Compiling encoders...' 53 | embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) 54 | f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') 55 | embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) 56 | f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') 57 | 58 | # Tables 59 | # print 'Loading tables...' 60 | utable, btable = load_tables() 61 | 62 | # Store everything we need in a dictionary 63 | # print 'Packing up...' 
64 | model = {} 65 | model['uoptions'] = uoptions 66 | model['boptions'] = boptions 67 | model['utable'] = utable 68 | model['btable'] = btable 69 | model['f_w2v'] = f_w2v 70 | model['f_w2v2'] = f_w2v2 71 | 72 | return model 73 | 74 | 75 | def load_tables(): 76 | """ 77 | Load the tables 78 | """ 79 | words = [] 80 | utable = numpy.load(os.path.join(path_to_tables, 'utable.npy'), allow_pickle=True, encoding='bytes') 81 | btable = numpy.load(os.path.join(path_to_tables, 'btable.npy'), allow_pickle=True, encoding='bytes') 82 | f = open(os.path.join(path_to_tables, 'dictionary.txt'), 'rb') 83 | for line in f: 84 | words.append(line.decode('utf-8').strip()) 85 | f.close() 86 | utable = OrderedDict(zip(words, utable)) 87 | btable = OrderedDict(zip(words, btable)) 88 | return utable, btable 89 | 90 | 91 | class Encoder(object): 92 | """ 93 | Sentence encoder. 94 | """ 95 | 96 | def __init__(self, model): 97 | self._model = model 98 | 99 | def encode(self, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 100 | """ 101 | Encode sentences in the list X. Each entry will return a vector 102 | """ 103 | return encode(self._model, X, use_norm, verbose, batch_size, use_eos) 104 | 105 | 106 | def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 107 | """ 108 | Encode sentences in the list X. Each entry will return a vector 109 | """ 110 | # first, do preprocessing 111 | X = preprocess(X) 112 | 113 | # word dictionary and init 114 | d = defaultdict(lambda : 0) 115 | for w in model['utable'].keys(): 116 | d[w] = 1 117 | ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32') 118 | bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32') 119 | 120 | # length dictionary 121 | ds = defaultdict(list) 122 | captions = [s.split() for s in X] 123 | for i,s in enumerate(captions): 124 | ds[len(s)].append(i) 125 | 126 | # Get features. 
This encodes by length, in order to avoid wasting computation 127 | for k in ds.keys(): 128 | if verbose: 129 | print(k) 130 | numbatches = int(len(ds[k]) / batch_size + 1) 131 | for minibatch in range(numbatches): 132 | caps = ds[k][minibatch::numbatches] 133 | 134 | if use_eos: 135 | uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32') 136 | bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32') 137 | else: 138 | uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32') 139 | bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32') 140 | for ind, c in enumerate(caps): 141 | caption = captions[c] 142 | for j in range(len(caption)): 143 | if d[caption[j]] > 0: 144 | uembedding[j,ind] = model['utable'][caption[j]] 145 | bembedding[j,ind] = model['btable'][caption[j]] 146 | else: 147 | uembedding[j,ind] = model['utable']['UNK'] 148 | bembedding[j,ind] = model['btable']['UNK'] 149 | if use_eos: 150 | uembedding[-1,ind] = model['utable'][''] 151 | bembedding[-1,ind] = model['btable'][''] 152 | if use_eos: 153 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 154 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 155 | else: 156 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 157 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 158 | if use_norm: 159 | for j in range(len(uff)): 160 | uff[j] /= norm(uff[j]) 161 | bff[j] /= norm(bff[j]) 162 | for ind, c in enumerate(caps): 163 | ufeatures[c] = uff[ind] 164 | bfeatures[c] = bff[ind] 165 | 166 | features = numpy.c_[ufeatures, bfeatures] 167 | return features 168 | 169 | 170 | def preprocess(text): 171 | """ 172 | Preprocess text for encoder 173 | """ 174 | X = [] 175 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 176 | for t in text: 177 | sents = sent_detector.tokenize(t) 178 | result = '' 179 | for s in sents: 180 | tokens = word_tokenize(s) 181 | result += ' ' + ' '.join(tokens) 182 | X.append(result) 183 | return X 184 | 185 | 186 | def nn(model, text, vectors, query, k=5): 187 | """ 188 | Return the nearest neighbour sentences to query 189 | text: list of sentences 190 | vectors: the corresponding representations for text 191 | query: a string to search 192 | """ 193 | qf = encode(model, [query]) 194 | qf /= norm(qf) 195 | scores = numpy.dot(qf, vectors.T).flatten() 196 | sorted_args = numpy.argsort(scores)[::-1] 197 | sentences = [text[a] for a in sorted_args[:k]] 198 | print('QUERY: ' + query) 199 | print('NEAREST: ') 200 | for i, s in enumerate(sentences): 201 | print(s, sorted_args[i]) 202 | 203 | 204 | def word_features(table): 205 | """ 206 | Extract word features into a normalized matrix 207 | """ 208 | features = numpy.zeros((len(table), 620), dtype='float32') 209 | keys = table.keys() 210 | for i in range(len(table)): 211 | f = table[keys[i]] 212 | features[i] = f / norm(f) 213 | return features 214 | 215 | 216 | def nn_words(table, wordvecs, query, k=10): 217 | """ 218 | Get the nearest neighbour words 219 | """ 220 | keys = table.keys() 221 | qf = table[query] 222 | scores = numpy.dot(qf, wordvecs.T).flatten() 223 | sorted_args = numpy.argsort(scores)[::-1] 224 | words = [keys[a] for a in sorted_args[:k]] 225 | print('QUERY: ' + query) 226 | print('NEAREST: ') 227 | for i, w in enumerate(words): 228 | print(w) 
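# A minimal usage sketch for this module (assumes the pretrained skip-thought
# weights uni_skip.npz / bi_skip.npz together with utable.npy, btable.npy and
# dictionary.txt have been downloaded into the nlg-eval data directory):
#
#     model = load_model()
#     encoder = Encoder(model)
#     vectors = encoder.encode(["this is a test sentence"])
#
# Each row of `vectors` is the concatenated uni-skip/bi-skip embedding of one
# input sentence; nlg-eval uses these embeddings for the SkipThoughtCS metric.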
229 | 230 | 231 | def _p(pp, name): 232 | """ 233 | make prefix-appended name 234 | """ 235 | return '%s_%s'%(pp, name) 236 | 237 | 238 | def init_tparams(params): 239 | """ 240 | initialize Theano shared variables according to the initial parameters 241 | """ 242 | tparams = OrderedDict() 243 | for kk, pp in six.iteritems(params): 244 | tparams[kk] = theano.shared(params[kk], name=kk) 245 | return tparams 246 | 247 | 248 | def load_params(path, params): 249 | """ 250 | load parameters 251 | """ 252 | pp = numpy.load(path) 253 | for kk, vv in six.iteritems(params): 254 | if kk not in pp: 255 | logging.warning('%s is not in the archive', kk) 256 | continue 257 | params[kk] = pp[kk] 258 | return params 259 | 260 | 261 | # layers: 'name': ('parameter initializer', 'feedforward') 262 | layers = {'gru': ('param_init_gru', 'gru_layer')} 263 | 264 | def get_layer(name): 265 | fns = layers[name] 266 | return (eval(fns[0]), eval(fns[1])) 267 | 268 | 269 | def init_params(options): 270 | """ 271 | initialize all parameters needed for the encoder 272 | """ 273 | params = OrderedDict() 274 | 275 | # embedding 276 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 277 | 278 | # encoder: GRU 279 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 280 | nin=options['dim_word'], dim=options['dim']) 281 | return params 282 | 283 | 284 | def init_params_bi(options): 285 | """ 286 | initialize all paramters needed for bidirectional encoder 287 | """ 288 | params = OrderedDict() 289 | 290 | # embedding 291 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 292 | 293 | # encoder: GRU 294 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 295 | nin=options['dim_word'], dim=options['dim']) 296 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder_r', 297 | nin=options['dim_word'], dim=options['dim']) 298 | return params 299 | 300 | 301 | def build_encoder(tparams, options): 302 | """ 303 | build an encoder, given pre-computed word embeddings 304 | """ 305 | # word embedding (source) 306 | embedding = tensor.tensor3('embedding', dtype='float32') 307 | x_mask = tensor.matrix('x_mask', dtype='float32') 308 | 309 | # encoder 310 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 311 | prefix='encoder', 312 | mask=x_mask) 313 | ctx = proj[0][-1] 314 | 315 | return embedding, x_mask, ctx 316 | 317 | 318 | def build_encoder_bi(tparams, options): 319 | """ 320 | build bidirectional encoder, given pre-computed word embeddings 321 | """ 322 | # word embedding (source) 323 | embedding = tensor.tensor3('embedding', dtype='float32') 324 | embeddingr = embedding[::-1] 325 | x_mask = tensor.matrix('x_mask', dtype='float32') 326 | xr_mask = x_mask[::-1] 327 | 328 | # encoder 329 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 330 | prefix='encoder', 331 | mask=x_mask) 332 | projr = get_layer(options['encoder'])[1](tparams, embeddingr, options, 333 | prefix='encoder_r', 334 | mask=xr_mask) 335 | 336 | ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1) 337 | 338 | return embedding, x_mask, ctx 339 | 340 | 341 | # some utilities 342 | def ortho_weight(ndim): 343 | W = numpy.random.randn(ndim, ndim) 344 | u, s, v = numpy.linalg.svd(W) 345 | return u.astype('float32') 346 | 347 | 348 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 349 | if nout == None: 350 | nout = nin 351 | if nout == nin and ortho: 352 | W = ortho_weight(nin) 353 | else: 354 
| W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 355 | return W.astype('float32') 356 | 357 | 358 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 359 | """ 360 | parameter init for GRU 361 | """ 362 | if nin == None: 363 | nin = options['dim_proj'] 364 | if dim == None: 365 | dim = options['dim_proj'] 366 | W = numpy.concatenate([norm_weight(nin,dim), 367 | norm_weight(nin,dim)], axis=1) 368 | params[_p(prefix,'W')] = W 369 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 370 | U = numpy.concatenate([ortho_weight(dim), 371 | ortho_weight(dim)], axis=1) 372 | params[_p(prefix,'U')] = U 373 | 374 | Wx = norm_weight(nin, dim) 375 | params[_p(prefix,'Wx')] = Wx 376 | Ux = ortho_weight(dim) 377 | params[_p(prefix,'Ux')] = Ux 378 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 379 | 380 | return params 381 | 382 | 383 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs): 384 | """ 385 | Forward pass through GRU layer 386 | """ 387 | nsteps = state_below.shape[0] 388 | if state_below.ndim == 3: 389 | n_samples = state_below.shape[1] 390 | else: 391 | n_samples = 1 392 | 393 | dim = tparams[_p(prefix,'Ux')].shape[1] 394 | 395 | if mask == None: 396 | mask = tensor.alloc(1., state_below.shape[0], 1) 397 | 398 | def _slice(_x, n, dim): 399 | if _x.ndim == 3: 400 | return _x[:, :, n*dim:(n+1)*dim] 401 | return _x[:, n*dim:(n+1)*dim] 402 | 403 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 404 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 405 | U = tparams[_p(prefix, 'U')] 406 | Ux = tparams[_p(prefix, 'Ux')] 407 | 408 | def _step_slice(m_, x_, xx_, h_, U, Ux): 409 | preact = tensor.dot(h_, U) 410 | preact += x_ 411 | 412 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 413 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 414 | 415 | preactx = tensor.dot(h_, Ux) 416 | preactx = preactx * r 417 | preactx = preactx + xx_ 418 | 419 | h = tensor.tanh(preactx) 420 | 421 | h = u * h_ + (1. - u) * h 422 | h = m_[:,None] * h + (1. 
- m_)[:,None] * h_ 423 | 424 | return h 425 | 426 | seqs = [mask, state_below_, state_belowx] 427 | _step = _step_slice 428 | 429 | rval, updates = theano.scan(_step, 430 | sequences=seqs, 431 | outputs_info = [tensor.alloc(0., n_samples, dim)], 432 | non_sequences = [tparams[_p(prefix, 'U')], 433 | tparams[_p(prefix, 'Ux')]], 434 | name=_p(prefix, '_layers'), 435 | n_steps=nsteps, 436 | profile=profile, 437 | strict=True) 438 | rval = [rval] 439 | return rval 440 | 441 | 442 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/tests/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/tests/test_nlgeval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import os 5 | import unittest 6 | 7 | import nlgeval 8 | from nlgeval import NLGEval 9 | 10 | 11 | class TestNlgEval(unittest.TestCase): 12 | def test_compute_metrics_oo(self): 13 | # Create the object in the test so that it can be garbage collected once the test is done. 14 | n = NLGEval() 15 | 16 | # Individual Metrics 17 | scores = n.compute_individual_metrics(ref=["this is a test", 18 | "this is also a test"], 19 | hyp="this is a good test") 20 | self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5) 21 | self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5) 22 | self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5) 23 | self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5) 24 | self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5) 25 | self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5) 26 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=5) 27 | self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5) 28 | self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5) 29 | self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5) 30 | self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5) 31 | self.assertEqual(11, len(scores)) 32 | 33 | scores = n.compute_metrics(ref_list=[ 34 | [ 35 | "this is one reference sentence for sentence1", 36 | "this is a reference sentence for sentence2 which was generated by your model" 37 | ], 38 | [ 39 | "this is one more reference sentence for sentence1", 40 | "this is the second reference sentence for sentence2" 41 | ], 42 | ], 43 | hyp_list=[ 44 | "this is the model generated sentence1 which seems good enough", 45 | "this is sentence2 which has been generated by your model" 46 | ] 47 | ) 48 | self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5) 49 | self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5) 50 | self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5) 51 | self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5) 52 | self.assertAlmostEqual(0.295797, scores['METEOR'], places=5) 53 | self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5) 54 | self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5) 55 | self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5) 56 | self.assertAlmostEqual(0.88469, 
scores['EmbeddingAverageCosineSimilairty'], places=5) 57 | self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5) 58 | self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5) 59 | self.assertEqual(11, len(scores)) 60 | 61 | # Non-ASCII tests. 62 | scores = n.compute_individual_metrics(ref=["Test en français.", 63 | "Le test en français."], 64 | hyp="Le test est en français.") 65 | self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5) 66 | self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5) 67 | self.assertAlmostEqual(0.0000051, scores['Bleu_3'], places=5) 68 | self.assertAlmostEqual(0, scores['Bleu_4'], places=5) 69 | self.assertAlmostEqual(0.48372379050300296, scores['METEOR'], places=5) 70 | self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5) 71 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=5) 72 | self.assertAlmostEqual(0.9192341566085815, scores['SkipThoughtCS'], places=5) 73 | self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5) 74 | self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5) 75 | self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5) 76 | self.assertEqual(11, len(scores)) 77 | 78 | scores = n.compute_individual_metrics(ref=["テスト"], 79 | hyp="テスト") 80 | self.assertAlmostEqual(0.99999999, scores['Bleu_1'], places=5) 81 | self.assertAlmostEqual(1.0, scores['METEOR'], places=3) 82 | self.assertAlmostEqual(1.0, scores['ROUGE_L'], places=3) 83 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=3) 84 | self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3) 85 | self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3) 86 | self.assertEqual(11, len(scores)) 87 | 88 | def test_compute_metrics_omit(self): 89 | n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilairty']) 90 | 91 | # Individual Metrics 92 | scores = n.compute_individual_metrics(ref=["this is a test", 93 | "this is also a test"], 94 | hyp="this is a good test") 95 | self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5) 96 | self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5) 97 | self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5) 98 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=5) 99 | self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5) 100 | self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5) 101 | self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5) 102 | self.assertEqual(7, len(scores)) 103 | 104 | def test_compute_metrics(self): 105 | # The example from the README. 
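# (hyp.txt, ref1.txt and ref2.txt are the small sample files shipped under
# nlg_eval/examples/ in this repository.)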
106 | root_dir = os.path.join(os.path.dirname(__file__), '..', '..') 107 | hypothesis = os.path.join(root_dir, 'examples/hyp.txt') 108 | references = os.path.join(root_dir, 'examples/ref1.txt'), os.path.join(root_dir, 'examples/ref2.txt') 109 | scores = nlgeval.compute_metrics(hypothesis, references) 110 | self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5) 111 | self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5) 112 | self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5) 113 | self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5) 114 | self.assertAlmostEqual(0.295797, scores['METEOR'], places=5) 115 | self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5) 116 | self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5) 117 | self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5) 118 | self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5) 119 | self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5) 120 | self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5) 121 | self.assertEqual(11, len(scores)) 122 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/utils.py: -------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | import os 4 | 5 | from xdg import XDG_CONFIG_HOME 6 | 7 | 8 | class InvalidDataDirException(Exception): 9 | pass 10 | 11 | 12 | def get_data_dir(): 13 | if os.environ.get('NLGEVAL_DATA'): 14 | if not os.path.exists(os.environ.get('NLGEVAL_DATA')): 15 | click.secho("NLGEVAL_DATA variable is set but points to non-existent path.", fg='red', err=True) 16 | raise InvalidDataDirException() 17 | return os.environ.get('NLGEVAL_DATA') 18 | else: 19 | try: 20 | cfg_file = os.path.join(XDG_CONFIG_HOME, 'nlgeval', 'rc.json') 21 | with open(cfg_file, 'rt') as f: 22 | rc = json.load(f) 23 | if not os.path.exists(rc['data_path']): 24 | click.secho("Data path found in {} does not exist: {} " % (cfg_file, rc['data_path']), fg='red', err=True) 25 | click.secho("Run `nlg-eval --setup DATA_DIR' to download or set $NLGEVAL_DATA to an existing location", 26 | fg='red', err=True) 27 | raise InvalidDataDirException() 28 | return rc['data_path'] 29 | except: 30 | click.secho("Could not determine location of data.", fg='red', err=True) 31 | click.secho("Run `nlg-eval --setup DATA_DIR' to download or set $NLGEVAL_DATA to an existing location", fg='red', 32 | err=True) 33 | raise InvalidDataDirException() 34 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
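# This module computes the three embedding-based metrics reported by nlg-eval,
# namely EmbeddingAverageCosineSimilairty, VectorExtremaCosineSimilarity and
# GreedyMatchingScore, using GloVe vectors loaded through gensim (see the
# Embedding class and eval_emb_metrics below).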
3 | import os 4 | import numpy as np 5 | 6 | from nlgeval.utils import get_data_dir 7 | 8 | 9 | try: 10 | from gensim.models import KeyedVectors 11 | except ImportError: 12 | from gensim.models import Word2Vec as KeyedVectors 13 | 14 | 15 | class Embedding(object): 16 | def __init__(self): 17 | path = get_data_dir() 18 | self.m = KeyedVectors.load(os.path.join(path, 'glove.6B.300d.model.bin'), mmap='r') 19 | try: 20 | self.unk = self.m.vectors.mean(axis=0) 21 | except AttributeError: 22 | self.unk = self.m.syn0.mean(axis=0) 23 | 24 | @property 25 | def w2v(self): 26 | return np.concatenate((self.m.syn0, self.unk[None,:]), axis=0) 27 | 28 | def __getitem__(self, key): 29 | try: 30 | return self.m.vocab[key].index 31 | except KeyError: 32 | return len(self.m.syn0) 33 | 34 | def vec(self, key): 35 | try: 36 | vectors = self.m.vectors 37 | except AttributeError: 38 | vectors = self.m.syn0 39 | try: 40 | return vectors[self.m.vocab[key].index] 41 | except KeyError: 42 | return self.unk 43 | 44 | 45 | def eval_emb_metrics(hypothesis, references, emb=None, metrics_to_omit=None): 46 | from sklearn.metrics.pairwise import cosine_similarity 47 | from nltk.tokenize import word_tokenize 48 | import numpy as np 49 | if emb is None: 50 | emb = Embedding() 51 | 52 | if metrics_to_omit is None: 53 | metrics_to_omit = set() 54 | 55 | emb_hyps = [] 56 | avg_emb_hyps = [] 57 | extreme_emb_hyps = [] 58 | for hyp in hypothesis: 59 | embs = [emb.vec(word) for word in word_tokenize(hyp)] 60 | 61 | avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0)) 62 | assert not np.any(np.isnan(avg_emb)) 63 | 64 | maxemb = np.max(embs, axis=0) 65 | minemb = np.min(embs, axis=0) 66 | extreme_emb = list(map(lambda x, y: x if ((x>y or x<-y) and y>0) or ((x-y) and y<0) else y, maxemb, minemb)) 67 | 68 | emb_hyps.append(embs) 69 | avg_emb_hyps.append(avg_emb) 70 | extreme_emb_hyps.append(extreme_emb) 71 | 72 | emb_refs = [] 73 | avg_emb_refs = [] 74 | extreme_emb_refs = [] 75 | for refsource in references: 76 | emb_refsource = [] 77 | avg_emb_refsource = [] 78 | extreme_emb_refsource = [] 79 | for ref in refsource: 80 | embs = [emb.vec(word) for word in word_tokenize(ref)] 81 | 82 | avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0)) 83 | assert not np.any(np.isnan(avg_emb)) 84 | 85 | maxemb = np.max(embs, axis=0) 86 | minemb = np.min(embs, axis=0) 87 | extreme_emb = list(map(lambda x, y: x if ((x>y or x<-y) and y>0) or ((x-y) and y<0) else y, maxemb, minemb)) 88 | 89 | emb_refsource.append(embs) 90 | avg_emb_refsource.append(avg_emb) 91 | extreme_emb_refsource.append(extreme_emb) 92 | emb_refs.append(emb_refsource) 93 | avg_emb_refs.append(avg_emb_refsource) 94 | extreme_emb_refs.append(extreme_emb_refsource) 95 | 96 | rval = [] 97 | if 'EmbeddingAverageCosineSimilairty' not in metrics_to_omit: 98 | cos_similarity = list(map(lambda refv: cosine_similarity(refv, avg_emb_hyps).diagonal(), avg_emb_refs)) 99 | cos_similarity = np.max(cos_similarity, axis=0).mean() 100 | rval.append("EmbeddingAverageCosineSimilairty: %0.6f" % (cos_similarity)) 101 | 102 | if 'VectorExtremaCosineSimilarity' not in metrics_to_omit: 103 | cos_similarity = list(map(lambda refv: cosine_similarity(refv, extreme_emb_hyps).diagonal(), extreme_emb_refs)) 104 | cos_similarity = np.max(cos_similarity, axis=0).mean() 105 | rval.append("VectorExtremaCosineSimilarity: %0.6f" % (cos_similarity)) 106 | 107 | if 'GreedyMatchingScore' not in metrics_to_omit: 108 | scores = [] 109 | for emb_refsource in emb_refs: 110 | score_source = 
[] 111 | for emb_ref, emb_hyp in zip(emb_refsource, emb_hyps): 112 | simi_matrix = cosine_similarity(emb_ref, emb_hyp) 113 | dir1 = simi_matrix.max(axis=0).mean() 114 | dir2 = simi_matrix.max(axis=1).mean() 115 | score_source.append((dir1 + dir2) / 2) 116 | scores.append(score_source) 117 | scores = np.max(scores, axis=0).mean() 118 | rval.append("GreedyMatchingScore: %0.6f" % (scores)) 119 | 120 | rval = "\n".join(rval) 121 | return rval 122 | 123 | 124 | if __name__ == '__main__': 125 | emb = Embedding() 126 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/generate_w2v_files.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 3 | import os 4 | 5 | try: 6 | from gensim.models import KeyedVectors 7 | except ImportError: 8 | from gensim.models import Word2Vec as KeyedVectors 9 | 10 | import six 11 | from nlgeval.word2vec.glove2word2vec import glove2word2vec 12 | 13 | 14 | def txt2bin(filename): 15 | m = KeyedVectors.load_word2vec_format(filename) 16 | m.vocab[next(six.iterkeys(m.vocab))].sample_int = 1 17 | m.save(filename.replace('txt', 'bin'), separately=None) 18 | KeyedVectors.load(filename.replace('txt', 'bin'), mmap='r') 19 | 20 | 21 | def generate(path): 22 | glove_vector_file = os.path.join(path, 'glove.6B.300d.txt') 23 | output_model_file = os.path.join(path, 'glove.6B.300d.model.txt') 24 | 25 | txt2bin(glove2word2vec(glove_vector_file, output_model_file)) 26 | 27 | 28 | if __name__ == "__main__": 29 | generate() 30 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | click>=6.3 2 | nltk>=3.1 3 | numpy>=1.11.0 4 | psutil>=5.6.2 5 | requests>=2.19 6 | six>=1.11 7 | scipy>=0.17.0 8 | scikit-learn>=0.17 9 | gensim>=3 10 | Theano>=0.8.1 11 | tqdm>=4.24 12 | xdg 13 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/requirements_py2.txt: -------------------------------------------------------------------------------- 1 | click>=6.3 2 | nltk>=3.1 3 | numpy>=1.11.0<=1.17 4 | psutil>=5.6.2 5 | requests>=2.19 6 | six>=1.11 7 | scipy>=0.17.0 8 | scikit-learn<0.21 9 | gensim<1 10 | Theano>=0.8.1 11 | tqdm>=4.24 12 | xdg==1.0.7 13 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
5 | 6 | import sys 7 | 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from setuptools.command.develop import develop 11 | from setuptools.command.install import install 12 | 13 | try: 14 | from pip._internal.req import parse_requirements 15 | except: 16 | from pip.req import parse_requirements 17 | 18 | 19 | if __name__ == '__main__': 20 | requirements_path = 'requirements.txt' 21 | if sys.version_info[0] < 3: 22 | requirements_path = 'requirements_py2.txt' 23 | install_reqs = parse_requirements(requirements_path, session=False) 24 | reqs = [str(ir.req) for ir in install_reqs] 25 | 26 | setup(name='nlg-eval', 27 | version='2.2', 28 | description="Wrapper for multiple NLG evaluation methods and metrics.", 29 | author='Shikhar Sharma, Hannes Schulz, Justin Harris', 30 | author_email='shikhar.sharma@microsoft.com, hannes.schulz@microsoft.com, justin.harris@microsoft.com', 31 | url='https://github.com/Maluuba/nlg-eval', 32 | packages=find_packages(), 33 | include_package_data=True, 34 | scripts=['bin/nlg-eval'], 35 | install_requires=reqs, 36 | ) 37 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/test/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/test/api.py: -------------------------------------------------------------------------------- 1 | from nlgeval import NLGEval 2 | 3 | def test_oo_api(): 4 | with open("examples/hyp.txt") as f: 5 | hyp = f.readlines() 6 | hyp = [x.strip() for x in hyp] 7 | with open("examples/ref1.txt") as f: 8 | ref1 = f.readlines() 9 | ref1 = [x.strip() for x in ref1] 10 | with open("examples/ref2.txt") as f: 11 | ref2 = f.readlines() 12 | ref2 = [x.strip() for x in ref2] 13 | 14 | nlge = NLGEval() 15 | 16 | res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0]) 17 | res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1]) 18 | 19 | hyp_list = hyp 20 | ref_list = [ref1, ref2] 21 | res = nlge.compute_metrics(ref_list, hyp_list) 22 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/run_context_eval.sh: -------------------------------------------------------------------------------- 1 | ########### 2 | # Usage: bash eval/run_context_eval.sh 3 | ########### 4 | 5 | export PYTHONPATH='eval/nlg_eval:.' 
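# Illustrative invocation (the file names below are hypothetical placeholders):
#   bash eval/run_context_eval.sh experiments/polite_output.txt data/polite.test.ref
# $1 is the hypothesis (system output) file and $2 the reference file; the script
# runs eval/context_eval.py on the pair and then prints the corpus BLEU computed
# by sacrebleu.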
6 | hyp=$1 7 | ref=$2 8 | 9 | python3 eval/context_eval.py --hyp "$1" --ref "$2" 10 | tail -n 1 <(cat $hyp | sacrebleu -w 2 $ref) 11 | -------------------------------------------------------------------------------- /tag-and-generate-train/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.0 2 | adjustText==0.7.3 3 | altair==3.2.0 4 | appnope==0.1.0 5 | argh==0.26.2 6 | astroid==2.2.5 7 | astropy==3.2.1 8 | atomicwrites==1.3.0 9 | attrs==19.1.0 10 | autopep8==1.4.4 11 | backcall==0.1.0 12 | backports.functools-lru-cache==1.5 13 | base58==1.0.3 14 | beautifulsoup4==4.8.0 15 | bleach==3.1.4 16 | blis==0.2.4 17 | boto3==1.9.243 18 | botocore==1.12.243 19 | certifi==2019.6.16 20 | chardet==3.0.4 21 | Click==7.0 22 | colorama==0.4.1 23 | compare-mt==0.2.6 24 | confuse==1.0.0 25 | cycler==0.10.0 26 | cymem==2.0.2 27 | decorator==4.4.0 28 | defusedxml==0.6.0 29 | docopt==0.6.2 30 | docutils==0.15.2 31 | editdistance==0.5.3 32 | en-core-web-sm==2.1.0 33 | entrypoints==0.3 34 | enum-compat==0.0.2 35 | epitran==1.1 36 | flake8==3.7.9 37 | future==0.17.1 38 | htmlmin==0.1.12 39 | idna==2.8 40 | importlib-metadata==0.23 41 | indic-transliteration==1.8.8 42 | ipykernel==5.1.2 43 | ipython==7.8.0 44 | ipython-genutils==0.2.0 45 | ipywidgets==7.5.0 46 | isort==4.3.21 47 | jedi==0.15.1 48 | Jinja2==2.10.1 49 | jmespath==0.9.4 50 | joblib==0.14.0 51 | jsonschema==2.6.0 52 | jupyter-client==5.3.3 53 | jupyter-core==4.5.0 54 | kiwisolver==1.1.0 55 | lazy-object-proxy==1.4.2 56 | llvmlite==0.29.0 57 | marisa-trie==0.7.5 58 | MarkupSafe==1.1.1 59 | matplotlib==3.1.1 60 | mccabe==0.6.1 61 | missingno==0.4.2 62 | mistune==0.8.4 63 | more-itertools==7.2.0 64 | munkres==1.1.2 65 | murmurhash==1.0.2 66 | nbconvert==5.6.0 67 | nbformat==4.4.0 68 | networkx==2.3 69 | neuralcoref==4.0 70 | nltk==3.4.5 71 | notebook==6.0.1 72 | numba==0.45.1 73 | numpy==1.17.0 74 | packaging==19.2 75 | pandas==0.25.1 76 | pandas-profiling==2.3.0 77 | pandocfilters==1.4.2 78 | panphon==0.15 79 | parso==0.5.1 80 | pathtools==0.1.2 81 | pep8==1.7.1 82 | pexpect==4.7.0 83 | phik==0.9.8 84 | pickleshare==0.7.5 85 | pigar==0.9.2 86 | Pillow==6.2.0 87 | plac==0.9.6 88 | plotly==4.1.1 89 | pluggy==0.13.0 90 | portalocker==1.5.1 91 | praw==6.3.1 92 | prawcore==1.0.1 93 | preshed==2.0.1 94 | prometheus-client==0.7.1 95 | prompt-toolkit==2.0.9 96 | psaw==0.0.7 97 | ptyprocess==0.6.0 98 | py==1.8.0 99 | pycodestyle==2.5.0 100 | pyflakes==2.1.1 101 | Pygments==2.4.2 102 | pylint==2.3.1 103 | pyparsing==2.4.2 104 | pyreqs==0.1.1 105 | pyrsistent==0.15.4 106 | pytest==5.2.0 107 | pytest-pylint==0.14.1 108 | python-dateutil==2.8.0 109 | pytz==2019.2 110 | PyYAML==5.1.2 111 | pyzmq==18.1.0 112 | regex==2019.8.19 113 | requests==2.22.0 114 | retrying==1.3.3 115 | rope==0.14.0 116 | s3transfer==0.2.1 117 | sacrebleu==1.4.1 118 | scikit-learn==0.21.3 119 | scipy==1.3.1 120 | seaborn==0.9.0 121 | selenium==3.141.0 122 | Send2Trash==1.5.0 123 | sh==1.12.14 124 | six==1.12.0 125 | sklearn==0.0 126 | soupsieve==1.9.3 127 | spacy==2.1.0 128 | splinter==0.11.0 129 | srsly==0.1.0 130 | terminado==0.8.2 131 | testpath==0.4.2 132 | thinc==7.0.8 133 | toolz==0.10.0 134 | torch==1.3.0 135 | torchvision==0.4.1 136 | tornado==5.1.1 137 | tqdm==4.35.0 138 | traitlets==4.3.2 139 | typed-ast==1.4.0 140 | typing==3.7.4.1 141 | tzlocal==2.0.0 142 | unicodecsv==0.14.1 143 | update-checker==0.16 144 | urllib3==1.26.5 145 | validators==0.14.0 146 | wasabi==0.2.2 147 | wcwidth==0.1.7 148 | webencodings==0.5.1 149 
| websocket-client==0.56.0 150 | widgetsnbextension==3.5.1 151 | wiki-dump-parser==2.0.0 152 | wikipedia==1.4.0 153 | Wikipedia-API==0.5.2 154 | wrapt==1.11.2 155 | zipp==0.6.0 156 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Given a sentence, runs the tag-generate model on it 3 | set -u 4 | 5 | 6 | input_file="$1" # the input test file which needs to be transferred 7 | jobname="$2" # unique identifier for the inference job 8 | tagger_target="$3" # target [argument we pass when we run scripts/train_tagger.sh] used to train tagger 9 | generator_target="$4" # target [argument we pass when we run scripts/train_generator.sh] used to train generator 10 | dataset="$5" # dataset [argument we pass when we train tagger or generator -- used to identify model paths for tagger and generator] 11 | src_tag="$6" # style_0_label used to create training data [src style] 12 | tgt_tag="$7" # style_1_label used to create training data [tgt style] 13 | base_folder="$8" # path to data folder, where outputs of the training data creation process are stored 14 | device="$9" # gpu id [comment line in script if it needs to be run on cpu] 15 | 16 | tag_generate_base="experiments/" # base dir to store outputs of inference 17 | mkdir -p $tag_generate_base 18 | 19 | # SET UNSET BPE HERE 20 | BPE=1 21 | if [ "$BPE" -eq 1 ]; then 22 | MODEL_PTH="bpe" 23 | echo "Using BPE" 24 | else 25 | MODEL_PTH="nobpe" 26 | echo "Not using BPE" 27 | fi 28 | # 29 | 30 | ## ARCHITECTURE 31 | HSZ=512 32 | EMBED_DIM=512 33 | NHEAD=4 34 | NL=4 35 | ## 36 | 37 | 38 | 39 | function infer() { 40 | # run inference 41 | infile="$1" 42 | src="$2" 43 | tgt="$3" 44 | model="$4" 45 | outfile="$5" 46 | prefer_gtag="$6" 47 | if [ "$BPE" -eq 1 ]; then 48 | CUDA_VISIBLE_DEVICES=$device python src/translate.py --cuda --src "$src" \ 49 | --tgt "$tgt" \ 50 | --model-file "$model" \ 51 | --search "beam_search" \ 52 | --hidden-dim $HSZ \ 53 | --embed-dim $EMBED_DIM \ 54 | --n-heads $NHEAD \ 55 | --n-layers $NL \ 56 | --beam-size 5 \ 57 | --bpe \ 58 | --prefer_gtag "$prefer_gtag" \ 59 | --tag "$src_tag" \ 60 | --input-file "$infile" \ 61 | --output-file "$outfile" \ 62 | --base-folder "$base_folder" 63 | else 64 | CUDA_VISIBLE_DEVICES=$device python src/translate.py --cuda --src "$src" \ 65 | --tgt "$tgt" \ 66 | --model-file "$model" \ 67 | --search "beam_search" \ 68 | --hidden-dim $HSZ \ 69 | --embed-dim $EMBED_DIM \ 70 | --n-heads $NHEAD \ 71 | --n-layers $NL \ 72 | --beam-size 5 \ 73 | --tag "$src_tag" \ 74 | --prefer_gtag "$prefer_gtag" \ 75 | --input-file "$infile" \ 76 | --output-file "$outfile" \ 77 | --base-folder "$base_folder" 78 | fi 79 | 80 | } 81 | 82 | 83 | function add_eos() { 84 | # append eos to each line of the file 85 | ip="$1" 86 | awk '{printf("%s \n", $0)}' $ip > "${ip}.bak" 87 | mv "${ip}.bak" $ip 88 | } 89 | 90 | 91 | 92 | # Step 1: Run Preprocess/BPE on the input 93 | TAGGER_INPUT="${tag_generate_base}/${jobname}_tagger_input" 94 | if [ $BPE -eq 1 ]; then 95 | echo "Running BPE on input" 96 | CUDA_VISIBLE_deviceS=$device python src/subwords.py segment\ 97 | --model "$base_folder/en${tagger_target}_subwords.model" < "$input_file"\ 98 | > "$TAGGER_INPUT" 99 | else 100 | cp "$input_file" "$TAGGER_INPUT" 101 | fi 102 | echo "Adding eos to the input" 103 | add_eos "$TAGGER_INPUT" 104 | 105 | 106 | # Step 2: Tag the input 107 | echo "Running tagger" 108 | 
infer "$TAGGER_INPUT" "en" "$tagger_target" "models/${dataset}/${MODEL_PTH}/en-${tagger_target}-tagger.pt"\ 109 | "${tag_generate_base}/${jobname}_tagged" 1 110 | ### SRC_TAG -> TGT_TAG 111 | sed -i "s/${src_tag}/${tgt_tag}/g" "${tag_generate_base}/${jobname}_tagged" 112 | 113 | 114 | # Step 3: Run Preprocess/BPE on the tagger output 115 | GENERATOR_INPUT="${tag_generate_base}/${jobname}_generator_input" 116 | if [ $BPE -eq 1 ]; then 117 | echo "Running BPE on masked output" 118 | CUDA_VISIBLE_deviceS=$device python src/subwords.py segment\ 119 | --model "$base_folder/en${generator_target}_subwords.model" < "${tag_generate_base}/${jobname}_tagged" > "$GENERATOR_INPUT" 120 | 121 | else 122 | cp "${tag_generate_base}/${jobname}_tagged" "$GENERATOR_INPUT" 123 | fi 124 | add_eos "$GENERATOR_INPUT" 125 | 126 | 127 | # Step 4: Generate 128 | echo "Running generator" 129 | infer "$GENERATOR_INPUT" "en" "${generator_target}" "models/${dataset}/${MODEL_PTH}/en-${generator_target}-generator.pt"\ 130 | "${tag_generate_base}/${jobname}_output" 0 131 | sed -i 's/^\"//g' "${tag_generate_base}/${jobname}_output" 132 | 133 | 134 | # Step 5: Run sacrebleu 135 | cat "$input_file"|sacrebleu -w2 "${tag_generate_base}/${jobname}_output" 136 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/prepare_bpe.sh: -------------------------------------------------------------------------------- 1 | # Prepare BPE 2 | 3 | tgt="$1" 4 | base_folder="$2" 5 | VOCAB_SIZE=16000 6 | python src/subwords.py train \ 7 | --model_prefix "${base_folder}"/en${tgt}_subwords \ 8 | --vocab_size "${VOCAB_SIZE}" \ 9 | --model_type bpe \ 10 | --input "${base_folder}"/en${tgt}_parallel.train.$tgt,"${base_folder}"/en${tgt}_parallel.train.en 11 | 12 | # Apply BPE 13 | for split in train dev test 14 | do 15 | for l in $tgt en 16 | do 17 | python src/subwords.py segment \ 18 | --model "${base_folder}"/en${tgt}_subwords.model \ 19 | < "${base_folder}"/en${tgt}_parallel.$split.$l \ 20 | > "${base_folder}"/en${tgt}_parallel.bpe.$split.$l 21 | done 22 | done 23 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/train_generator.sh: -------------------------------------------------------------------------------- 1 | ########## 2 | # Usage: bash train_generator.sh 3 | ########## 4 | 5 | #!/usr/bin/env bash 6 | #SBATCH --mem=8G 7 | #SBATCH --gres=gpu:1 8 | #SBATCH -t 0 9 | tgt="$1" 10 | dataset="$2" 11 | base_folder="$3" 12 | 13 | # Switch to 0 for no bpe 14 | BPE=1 15 | if [ "$BPE" -eq 1 ]; then 16 | MODEL_PTH=models/$dataset/"bpe" 17 | echo "Using BPE" 18 | else 19 | MODEL_PTH=models/$dataset/"nobpe" 20 | echo "Not using BPE" 21 | fi 22 | 23 | mkdir -p $MODEL_PTH 24 | 25 | if [ "$BPE" -eq 1 ]; then 26 | python src/training.py \ 27 | --cuda \ 28 | --src en \ 29 | --tgt "$tgt" \ 30 | --model-file "$MODEL_PTH/en-${tgt}-generator.pt" \ 31 | --n-layers 4 \ 32 | --n-heads 4 \ 33 | --embed-dim 512 \ 34 | --hidden-dim 512 \ 35 | --dropout 0.3 \ 36 | --bpe \ 37 | --word-dropout 0.1 \ 38 | --lr 1e-3 \ 39 | --n-epochs 5 \ 40 | --tokens-per-batch 8000 \ 41 | --clip-grad 1.1 \ 42 | --base-folder "$base_folder" 43 | else 44 | python src/training.py \ 45 | --cuda \ 46 | --src en \ 47 | --tgt "$tgt" \ 48 | --model-file "$MODEL_PTH/en-${tgt}-generator.pt" \ 49 | --n-layers 4 \ 50 | --n-heads 4 \ 51 | --embed-dim 512 \ 52 | --hidden-dim 512 \ 53 | --dropout 0.3 \ 54 | --word-dropout 0.1 \ 55 | --lr 1e-3 \ 56 | --n-epochs 5 \ 57 | 
--tokens-per-batch 8000 \ 58 | --clip-grad 1.1 \ 59 | --base-folder "$base_folder" 60 | fi 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/train_tagger.sh: -------------------------------------------------------------------------------- 1 | ########## 2 | # Usage: bash train_tagger.sh <tgt> <dataset> <base_folder> 3 | ########## 4 | 5 | #!/usr/bin/env bash 6 | #SBATCH --mem=8G 7 | #SBATCH --gres=gpu:1 8 | #SBATCH -t 0 9 | tgt="$1" 10 | dataset="$2" 11 | base_folder="$3" 12 | 13 | # Switch to 0 for no bpe 14 | BPE=1 15 | if [ "$BPE" -eq 1 ]; then 16 | MODEL_PTH=models/$dataset/"bpe" 17 | echo "Using BPE" 18 | else 19 | MODEL_PTH=models/$dataset/"nobpe" 20 | echo "Not using BPE" 21 | fi 22 | 23 | mkdir -p $MODEL_PTH 24 | 25 | if [ "$BPE" -eq 1 ]; then 26 | python src/training.py \ 27 | --cuda \ 28 | --src en \ 29 | --tgt "$tgt" \ 30 | --model-file "$MODEL_PTH/en-${tgt}-tagger.pt" \ 31 | --n-layers 4 \ 32 | --n-heads 4 \ 33 | --embed-dim 512 \ 34 | --hidden-dim 512 \ 35 | --dropout 0.3 \ 36 | --bpe \ 37 | --word-dropout 0.1 \ 38 | --lr 1e-3 \ 39 | --n-epochs 5 \ 40 | --tokens-per-batch 8000 \ 41 | --clip-grad 1.1 \ 42 | --base-folder "$base_folder" 43 | else 44 | python src/training.py \ 45 | --cuda \ 46 | --src en \ 47 | --tgt "$tgt" \ 48 | --model-file "$MODEL_PTH/en-${tgt}-tagger.pt" \ 49 | --n-layers 4 \ 50 | --n-heads 4 \ 51 | --embed-dim 512 \ 52 | --hidden-dim 512 \ 53 | --dropout 0.3 \ 54 | --word-dropout 0.1 \ 55 | --lr 1e-3 \ 56 | --n-epochs 5 \ 57 | --tokens-per-batch 8000 \ 58 | --clip-grad 1.1 \ 59 | --base-folder "$base_folder" 60 | fi 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/data.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import torch as th 3 | from torch.utils import data 4 | import json 5 | import numpy as np 6 | 7 | 8 | def loadtxt(filename): 9 | txt = [] 10 | with open(filename, encoding="utf-8") as f: 11 | for line in f: 12 | txt.append(line.rstrip()) 13 | return txt 14 | 15 | 16 | class Vocab(object): 17 | """Maps symbols (words/tokens) to indices""" 18 | 19 | def __init__(self): 20 | # Containers 21 | self.symbols = [] 22 | self.idxs = {} 23 | # State 24 | self.frozen = False 25 | # Special symbols 26 | self.add_symbol("<pad>") # Padding token 27 | self.add_symbol("<sos>") # Start of sentence token 28 | self.add_symbol("<eos>") # End of sentence token 29 | self.add_symbol("<unk>") # Unknown token 30 | self.add_symbol("[GMASK]") # add GMASK 31 | 32 | def __len__(self): 33 | return len(self.idxs) 34 | 35 | def add_symbol(self, symbol): 36 | """Add a symbol to the dictionary and return its index 37 | 38 | If the symbol already exists in the dictionary this just returns 39 | the index""" 40 | if symbol not in self.idxs: 41 | if self.frozen: 42 | raise ValueError("Can't add symbol to frozen dictionary") 43 | self.symbols.append(symbol) 44 | # print(symbol, len(self.idxs)) 45 | self.idxs[symbol] = len(self.idxs) 46 | return self.idxs[symbol] 47 | 48 | def to_idx(self, symbol): 49 | """Return symbol's index 50 | 51 | If the symbol is not in the dictionary, returns the index of <unk>""" 52 | if symbol in self.idxs: 53 | return self.idxs[symbol] 54 | else: 55 | return self.idxs["<unk>"] 56 | 57 | def to_symbol(self, idx): 58 | """Return idx's symbol""" 59 | return self.symbols[idx] 60 | 61 | def __getitem__(self, symbol_or_idx): 62 | if isinstance(symbol_or_idx, int): 63 | return self.to_symbol(symbol_or_idx) 64 |
else: 65 | return self.to_idx(symbol_or_idx) 66 | 67 | @staticmethod 68 | def from_data_files(*filenames, max_size=-1, min_freq=2): # AB Change 1 69 | """Builds a dictionary from the most frequent tokens in files""" 70 | vocab = Vocab() 71 | # Record token counts 72 | token_counts = defaultdict(lambda: 0) 73 | for filename in filenames: 74 | with open(filename, encoding="utf-8") as f: 75 | for line in f: 76 | tokens = line.rstrip().split() 77 | for token in tokens: 78 | token_counts[token] += 1 79 | # Filter out least frequent tokens 80 | token_counts = { 81 | tok: cnt 82 | for tok, cnt in token_counts.items() 83 | if cnt >= min_freq 84 | } 85 | # Only keep most common tokens 86 | tokens = list(token_counts.keys()) 87 | sorted_tokens = sorted(tokens, key=lambda x: token_counts[x])[::-1] 88 | if max_size > 0: 89 | sorted_tokens = sorted_tokens[:max_size] 90 | # Add the remaining tokens to the dictionary 91 | for token in sorted_tokens: 92 | vocab.add_symbol(token) 93 | 94 | return vocab 95 | 96 | 97 | def _make_tagged_tokens(sents, pad_idx): 98 | """Pad sentences to the max length and create the relevant tag""" 99 | lengths = [len(sent) for sent in sents] 100 | max_len = max(lengths) 101 | bsz = len(lengths) 102 | # Tensor containing the (right) padded tokens 103 | tokens = th.full((max_len, bsz), pad_idx).long() 104 | for i in range(bsz): 105 | tokens[:lengths[i], i] = th.LongTensor(sents[i]) 106 | # Mask such that tag[i, b] = 1 iff lengths[b] < i 107 | lengths = th.LongTensor(lengths).view(1, -1) 108 | tag = th.gt(th.arange(max_len).view(-1, 1), lengths) 109 | # print (lengths, th.arange(max_len).view(-1, 1), tag) 110 | return tokens, tag 111 | 112 | 113 | class MTDataset(data.Dataset): 114 | 115 | def __init__(self, vocab, prefix, src_lang="en", tgt_lang="fr"): 116 | # Attributes 117 | self.vocab = vocab 118 | self.src_lang = src_lang 119 | self.tgt_lang = tgt_lang 120 | # Load from files 121 | src_file = prefix + "." + src_lang 122 | tgt_file = prefix + "." 
+ tgt_lang 123 | self.src_txt = loadtxt(src_file) 124 | self.tgt_txt = loadtxt(tgt_file) 125 | # Check length 126 | self.length = len(self.src_txt) 127 | if self.length != len(self.tgt_txt): 128 | raise ValueError("Mismatched source and target length") 129 | # Append start/end of sentence token to the target 130 | for idx, tgt_sent in enumerate(self.tgt_txt): 131 | self.tgt_txt[idx] = f" {tgt_sent} " 132 | # Convert to indices 133 | self.src_idxs = [ 134 | [self.vocab[tok] for tok in sent.split()] + [self.vocab[""]] 135 | for sent in self.src_txt 136 | ] 137 | self.tgt_idxs = [ 138 | [self.vocab[tok] for tok in sent.split()] 139 | for sent in self.tgt_txt 140 | ] 141 | 142 | def __getitem__(self, i): 143 | return self.src_idxs[i], self.tgt_idxs[i] 144 | 145 | def __len__(self): 146 | return self.length 147 | 148 | 149 | class MTDataLoader(data.DataLoader): 150 | """Special Dataloader for MT datasets 151 | 152 | Batches by number of sentences and/or tokens 153 | """ 154 | 155 | def __init__(self, dataset, vocab, dynamic_tag=False, max_bsz=1, max_tokens=1000, shuffle=False): 156 | 157 | self.dataset = dataset 158 | self.max_bsz = max_bsz 159 | self.max_tokens = max_tokens 160 | self.shuffle = shuffle 161 | self.vocab = vocab 162 | self.dynamic_tag = dynamic_tag 163 | if self.dynamic_tag: 164 | print("Training with Dynamic Mask.") 165 | # Order of batches 166 | 167 | def init_epoch(self): 168 | """Make batches that contain no more than 169 | `max_tokens` tokens and `max_bsz` samples""" 170 | N = len(self.dataset) 171 | if self.shuffle: 172 | self.order = th.randperm(N).numpy() 173 | else: 174 | self.order = th.arange(N).long().numpy() 175 | self.batches = [] 176 | batch_size = max_src_tokens = max_tgt_tokens = 0 177 | current_batch = [] 178 | pointer = 0 179 | while pointer < N: 180 | idx = self.order[pointer] 181 | src, tgt = self.dataset[idx] 182 | # Check whether adding this sample would bring us over 183 | # the size limit 184 | batch_size += 1 185 | max_src_tokens = max(max_src_tokens, len(src)) 186 | max_tgt_tokens = max(max_tgt_tokens, len(tgt)) 187 | tot_tokens = (max_src_tokens + max_tgt_tokens) * batch_size 188 | # If this is the case, wrap up current batch 189 | if batch_size > self.max_bsz or tot_tokens > self.max_tokens: 190 | if len(current_batch) > 0: 191 | self.batches.append(current_batch) 192 | else: 193 | # If this happens then there is one sample that is too big, 194 | # just ignore it wth a warning 195 | print(f"WARNING: ignoring sample {idx}" 196 | "(too big for specified batch size)") 197 | pointer += 1 198 | batch_size = max_src_tokens = max_tgt_tokens = 0 199 | current_batch = [] 200 | else: 201 | current_batch.append(idx) 202 | pointer += 1 203 | # Add the last batch 204 | if len(current_batch) > 0: 205 | self.batches.append(current_batch) 206 | 207 | 208 | 209 | def process_tokens(self, tag_dict): 210 | 211 | processed_tag_dict = { 212 | self.vocab[k] : v for k, v in tag_dict.items() if k in self.vocab.idxs 213 | } 214 | 215 | return processed_tag_dict 216 | 217 | 218 | def __iter__(self): 219 | self.init_epoch() 220 | self.pos = 0 221 | return self 222 | 223 | def __len__(self): 224 | return len(self.batches) 225 | 226 | def get_batch(self, pos): 227 | samples = [self.dataset[i] for i in self.batches[pos]] 228 | src_sents = [src for src, _ in samples] 229 | if self.dynamic_tag: 230 | tgt_sents = [self.get_gtagged(tgt) for _, tgt in samples] 231 | # for tgt in tgt_sents: 232 | # cnt = 0 233 | # for k in tgt: 234 | # print(k) 235 | # cnt+=(k == 4) 236 | # print 
("count",cnt) 237 | tgt_sents = np.array(tgt_sents) 238 | selection = np.ones(len(tgt_sents), dtype=bool) 239 | selection[1:] = tgt_sents[1:] != tgt_sents[:-1] 240 | tgt_sents = tgt_sents[selection] 241 | else: 242 | tgt_sents = [tgt for _, tgt in samples] 243 | # Input tensor 244 | pad_idx = self.dataset.vocab[""] 245 | src_tokens, src_tag = _make_tagged_tokens(src_sents, pad_idx) 246 | tgt_tokens, tgt_tag = _make_tagged_tokens(tgt_sents, pad_idx) 247 | # print(sum(tgt_tokens==4)) 248 | return src_tokens, src_tag, tgt_tokens, tgt_tag 249 | 250 | 251 | def get_gtagged(self, tgt_sent): 252 | 253 | output = [] 254 | for tok in tgt_sent: 255 | if tok in self.p9_tags: 256 | output.append(self.get_random(tok, self.p9_tags[tok])) 257 | else: 258 | output.append(tok) 259 | 260 | return output 261 | 262 | def get_random(self, tok, prob): 263 | 264 | if np.random.uniform() < prob: 265 | return self.vocab["[GMASK]"] 266 | else: 267 | return tok 268 | 269 | def __next__(self): 270 | if self.pos >= len(self.batches): 271 | raise StopIteration() 272 | batch = self.get_batch(self.pos) 273 | self.pos += 1 274 | return batch 275 | 276 | 277 | 278 | 279 | class MTNoisyDataset(data.Dataset): 280 | 281 | def __init__(self, vocab, prefix, src_lang="en", tgt_lang="fr"): 282 | # Attributes 283 | self.vocab = vocab 284 | self.src_lang = src_lang 285 | self.tgt_lang = tgt_lang 286 | 287 | # Load from files 288 | src_file = prefix + "." + src_lang 289 | tgt_file = prefix + "." + tgt_lang 290 | self.src_txt = loadtxt(src_file) 291 | self.tgt_txt = loadtxt(tgt_file) 292 | # Check length 293 | self.length = len(self.src_txt) 294 | if self.length != len(self.tgt_txt): 295 | raise ValueError("Mismatched source and target length") 296 | # Append start/end of sentence token to the target 297 | for idx, tgt_sent in enumerate(self.tgt_txt): 298 | self.tgt_txt[idx] = f" {tgt_sent} " 299 | # Convert to indices 300 | self.src_idxs = [ 301 | [self.vocab[tok] for tok in sent.split()] + [self.vocab[""]] 302 | for sent in self.src_txt 303 | ] 304 | self.tgt_idxs = [ 305 | [self.vocab[tok] for tok in sent.split()] 306 | for sent in self.tgt_txt 307 | ] 308 | 309 | def __getitem__(self, i): 310 | return self.src_idxs[i], self.tgt_idxs[i] 311 | 312 | def __len__(self): 313 | return self.length 314 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/decoding.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import math 3 | 4 | 5 | def sample(model, src_tokens, temperature=1.0, max_len=200, device=None): 6 | # Either decode on the model's device or specified device 7 | # (in which case move the model accordingly) 8 | if device is None: 9 | device = list(model.parameters())[0].device 10 | else: 11 | model = model.to(device) 12 | # Go into eval mode (e.g. 
disable dropout) 13 | model.eval() 14 | # Encode source sentence 15 | src_tensor = th.LongTensor(src_tokens).to(device).view(-1, 1) 16 | encodings = model.encode(src_tensor) 17 | # Initialize decoder state 18 | state = model.initial_state() 19 | # Start decoding 20 | out_tokens = [model.vocab["<sos>"]] 21 | eos_token = model.vocab["<eos>"] 22 | while out_tokens[-1] != eos_token and len(out_tokens) <= max_len: 23 | current_token = th.LongTensor([out_tokens[-1]]).view(1, 1).to(device) 24 | # One step of the decoder 25 | log_p, state = model.decode_step(current_token, encodings, state) 26 | # Probabilities 27 | probs = th.exp(log_p / temperature).view(-1) 28 | # Sample 29 | next_token = th.multinomial(probs.view(-1), 1).item() 30 | # Add to the generated sentence 31 | out_tokens.append(next_token) 32 | # Return generated tokens (idxs) without <sos> and <eos> 33 | out_tokens = out_tokens[1:] 34 | if out_tokens[-1] == eos_token: 35 | out_tokens = out_tokens[:-1] 36 | return out_tokens 37 | 38 | 39 | def greedy(model, src_tokens, max_len=200, device=None): 40 | # Either decode on the model's device or specified device 41 | # (in which case move the model accordingly) 42 | if device is None: 43 | device = list(model.parameters())[0].device 44 | else: 45 | model = model.to(device) 46 | # Go into eval mode (e.g. disable dropout) 47 | model.eval() 48 | # Encode source sentence 49 | src_tensor = th.LongTensor(src_tokens).to(device).view(-1, 1) 50 | encodings = model.encode(src_tensor) 51 | # Initialize decoder state 52 | state = model.initial_state() 53 | # Start decoding 54 | out_tokens = [model.vocab["<sos>"]] 55 | eos_token = model.vocab["<eos>"] 56 | while out_tokens[-1] != eos_token and len(out_tokens) <= max_len: 57 | current_token = th.LongTensor([out_tokens[-1]]).view(1, 1).to(device) 58 | # One step of the decoder 59 | log_p, state = model.decode_step(current_token, encodings, state) 60 | # Sample 61 | next_token = log_p.view(-1).argmax() 62 | # Add to the generated sentence 63 | out_tokens.append(next_token.item()) 64 | # Return generated tokens (idxs) without <sos> and <eos> 65 | out_tokens = out_tokens[1:] 66 | if out_tokens[-1] == eos_token: 67 | out_tokens = out_tokens[:-1] 68 | return out_tokens 69 | 70 | 71 | def beam_search( 72 | model, 73 | src_tokens, 74 | prefer_gtag, 75 | src_tag, 76 | beam_size=1, 77 | len_penalty=0.0, 78 | max_len=200, 79 | # style_prior=1, # lower the better! 80 | device=None 81 | ): 82 | # assert style_prior <= 1 and style_prior > 0 83 | # Either decode on the model's device or specified device 84 | # (in which case move the model accordingly) 85 | if device is None: 86 | device = list(model.parameters())[0].device 87 | else: 88 | model = model.to(device) 89 | # Go into eval mode (e.g.
disable dropout) 90 | model.eval() 91 | # Encode source sentence 92 | src_tensor = th.LongTensor(src_tokens).to(device).view(-1, 1) 93 | encodings = model.encode(src_tensor) 94 | # Initialize beams 95 | beams = [{ 96 | # Tokens generated in this beam 97 | "tokens": [model.vocab["<sos>"]], 98 | # Internal decoder state 99 | "state": model.initial_state(), 100 | # log probability of the sequence 101 | "log_p": 0, 102 | # Whether this beam is dead 103 | "is_over": False, 104 | }] 105 | # Start decoding 106 | eos_token = model.vocab["<eos>"] 107 | t = 0 108 | while not beams[-1]["is_over"]: 109 | # Pass on dead beams 110 | beam_candidates = [beam for beam in beams if beam["is_over"]] 111 | # Take a step on all active beams 112 | active_beams = [beam for beam in beams if not beam["is_over"]] 113 | # Last produced tokens 114 | current_tokens = th.LongTensor( 115 | [beam["tokens"][-1] for beam in active_beams]) 116 | current_tokens = current_tokens.view(1, -1).to(device) 117 | # Decoder states 118 | states = [ 119 | th.cat([beam["state"][layer] for beam in active_beams], dim=1) 120 | if beams[0]["state"][0] is not None 121 | else None 122 | for layer in range(model.n_layers) 123 | ] 124 | # Take a step 125 | log_ps, new_states = model.decode_step( 126 | current_tokens, 127 | encodings.repeat(1, len(active_beams), 1), 128 | states, 129 | ) 130 | # Topk tokens at this step 131 | log_ps = log_ps.view(log_ps.size(1), -1) 132 | 133 | # Style Prior 134 | # log_ps[:, model.vocab["GMASK"]] -= math.log(style_prior) 135 | 136 | log_p_tokens, top_tokens = log_ps.topk(beam_size, dim=-1) 137 | #print(log_ps.shape) 138 | 139 | # Append to candidates 140 | for i, beam in enumerate(active_beams): 141 | for token, log_p_token in zip(top_tokens[i], log_p_tokens[i]): 142 | # Update tokens, state and log_p 143 | candidate = { 144 | "tokens": beam["tokens"] + [token.item()], 145 | "state": [h[:, i:i+1].detach() for h in new_states], 146 | "log_p": beam["log_p"] + log_p_token.item(), 147 | "is_over": False, 148 | } 149 | # check whether this beam is over 150 | generated_eos = candidate["tokens"][-1] == eos_token 151 | too_long = len(candidate["tokens"]) > max_len 152 | candidate["is_over"] = generated_eos or too_long 153 | # Save candidate 154 | beam_candidates.append(candidate) 155 | t += 1 156 | # Now rerank and keep top beams 157 | beams = sorted( 158 | beam_candidates, 159 | key=lambda beam: beam["log_p"] / # log probability 160 | (len(beam["tokens"]))**len_penalty, # Length penalty 161 | )[-beam_size:] # top k 162 | # Return generated tokens (idxs) without <sos> and <eos> 163 | 164 | if prefer_gtag == 1: # prefer the hypothesis that's mostly gtags 165 | num_gtag_criterion = lambda x: len([x_i for x_i in x["tokens"] if src_tag in model.vocab[x_i]]) 166 | beams = sorted(beams, key=num_gtag_criterion) 167 | 168 | out_tokens = beams[-1]["tokens"][1:] 169 | if out_tokens[-1] == eos_token: 170 | out_tokens = out_tokens[:-1] 171 | return out_tokens 172 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/noisy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from data import _make_tagged_tokens 4 | import itertools 5 | 6 | # TODO: Add reference 7 | 8 | def word_shuffle(vocab, x, k): # slight shuffle such that |sigma[i]-i| <= k 9 | base = torch.arange(x.size(0), dtype=torch.float).repeat(x.size(1), 1).t() 10 | inc = (k+1) * torch.rand(x.size()) 11 | inc[x == vocab['<sos>']] = 0 # do not shuffle the start sentence
symbol 12 | inc[x == vocab['']] = k+1 # do not shuffle end paddings 13 | inc[x == vocab['']] = k+1 14 | _, sigma = (base + inc).sort(dim=0) 15 | return x[sigma, torch.arange(x.size(1))] 16 | 17 | def word_drop(vocab, x, p): # drop words with probability p 18 | x_ = [] 19 | for i in range(x.size(1)): 20 | words = x[:, i].tolist() 21 | keep = np.random.rand(len(words)) > p 22 | keep[0] = True # do not drop the start sentence symbol 23 | sent = [w for j, w in enumerate(words) if keep[j]] 24 | sent += [vocab['']] * (len(words)-len(sent)) 25 | x_.append(sent) 26 | return torch.LongTensor(x_).t().contiguous().to(x.device) 27 | 28 | def word_blank(vocab, x, p): # blank words with probability p 29 | blank = (torch.rand(x.size(), device=x.device) < p) & \ 30 | (x != vocab['']) & (x != vocab['']) & (x != vocab['']) 31 | x_ = x.clone() 32 | x_[blank] = vocab[''] 33 | return x_ 34 | 35 | def word_substitute(vocab, x, p): # substitute words with probability p 36 | keep = (torch.rand(x.size(), device=x.device) > p) | \ 37 | (x == vocab['']) | (x == vocab['']) | (x == vocab['']) | (x == vocab['[GMASK]']) 38 | x_ = x.clone() 39 | x_.random_(0, len(vocab)) 40 | x_[keep] = x[keep] 41 | return x_ 42 | 43 | def add_gtag(vocab, x, p): # drop words with probability p 44 | x_ = [] 45 | for i in range(x.size(1)): 46 | words = x[:, i].tolist() 47 | add = np.random.rand(len(words)) < p 48 | add[-1] = False 49 | # sent = [[w , vocab['▁['], vocab['GMASK'] , vocab[']']] if add[j] else [w] for j, w in enumerate(words)] 50 | sent = [[w , vocab[f'[GMASK{j//3}]']] if add[j] else [w] for j, w in enumerate(words)] 51 | sent = list(itertools.chain.from_iterable(sent)) + [vocab['']] 52 | x_.append(sent) 53 | sent, _ = _make_tagged_tokens(x_, vocab['']) 54 | return sent.to(x.device) 55 | 56 | def add_intelligent_gtag(vocab, x, p): # drop words with probability p 57 | x_ = [] 58 | for i in range(x.size(1)): 59 | words = x[:, i].tolist() 60 | add = np.random.rand(len(words)) < p 61 | add[-1] = False 62 | sent = [[w, vocab['GMASK']] if add[j] and w==vocab['GMASK'] else [w] for j, w in enumerate(words)] 63 | sent = list(itertools.chain.from_iterable(sent)) + [vocab['']] 64 | x_.append(sent) 65 | sent, _ = _make_tagged_tokens(x_, vocab['']) 66 | return sent.to(x.device) 67 | 68 | 69 | def intelligent_word_shuffle(vocab, x, k): # slight shuffle such that |sigma[i]-i| <= k 70 | base = torch.arange(x.size(0), dtype=torch.float).repeat(x.size(1), 1).t() 71 | inc = (k+1) * torch.rand(x.size()) 72 | for j in range(x.size(1)): 73 | for i in range(x.size(0)): 74 | do_shuf = 0 75 | for l in range(k//2): 76 | if x[max(i-l,0)][j] == vocab['GMASK']: 77 | do_shuf = 1 78 | for l in range(k//2): 79 | if x[min(i+l,x.size(0)-1)][j] == vocab['GMASK']: 80 | do_shuf = 1 81 | inc[i][j] *= do_shuf 82 | inc[x == vocab['']] = 0 # do not shuffle the start sentence symbol 83 | inc[x == vocab['']] = k+1 # do not shuffle end paddings 84 | inc[x == vocab['']] = k+1 85 | _, sigma = (base + inc).sort(dim=0) 86 | return x[sigma, torch.arange(x.size(1))] 87 | 88 | 89 | def noisy(vocab, x, drop_prob, blank_prob, sub_prob, shuffle_dist, add_gtag_prob, add_int_gtag_prob): 90 | if drop_prob > 0: 91 | x = word_drop(vocab, x, drop_prob) 92 | if blank_prob > 0: 93 | x = word_blank(vocab, x, blank_prob) 94 | if sub_prob > 0: 95 | x = word_substitute(vocab, x, sub_prob) 96 | if add_int_gtag_prob > 0: 97 | x = add_intelligent_gtag(vocab, x, add_gtag_prob) 98 | x = intelligent_word_shuffle(vocab, x, 3) 99 | if add_gtag_prob > 0: 100 | x = add_gtag(vocab, x, add_gtag_prob) 
101 | if shuffle_dist > 0: 102 | x = word_shuffle(vocab, x, shuffle_dist) 103 | return x 104 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/subwords.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import sentencepiece as sp 3 | import argparse 4 | 5 | 6 | def train(args): 7 | arg_string = "".join( 8 | arg + ("=" if arg.startswith("--") else " ") 9 | for arg in args 10 | ).strip() 11 | sp.SentencePieceTrainer.Train(arg_string) 12 | 13 | 14 | def load(model_path): 15 | model = sp.SentencePieceProcessor() 16 | model.Load(model_path) 17 | return model 18 | 19 | 20 | def desegment(tokens): 21 | return ("".join(tokens)).replace("▁", " ").strip() 22 | 23 | 24 | def get_args(): 25 | parser = argparse.ArgumentParser("Subword training/segmentation") 26 | subparsers = parser.add_subparsers(help="Actions") 27 | # Training 28 | train_parser = subparsers.add_parser("train") 29 | train_parser.set_defaults(which="train") 30 | train_parser.add_argument("--input", required=True, type=str) 31 | train_parser.add_argument("--model_prefix", required=True, type=str) 32 | train_parser.add_argument("--vocab_size", required=True, type=int) 33 | train_parser.add_argument("--model_type", required=True, type=str) 34 | # Segmentation 35 | segment_parser = subparsers.add_parser("segment") 36 | segment_parser.set_defaults(which="segment") 37 | segment_parser.add_argument("--model", required=True, type=str) 38 | # De-segmentation 39 | segment_parser = subparsers.add_parser("desegment") 40 | segment_parser.set_defaults(which="desegment") 41 | # Parse 42 | args = parser.parse_args() 43 | return args 44 | 45 | 46 | def main(): 47 | args = get_args() 48 | if args.which == "train": 49 | train(sys.argv[2:]) 50 | elif args.which == "segment": 51 | model = load(args.model) 52 | for line in sys.stdin: 53 | print(" ".join(model.EncodeAsPieces(line))) 54 | elif args.which == "desegment": 55 | for line in sys.stdin: 56 | print(desegment(line.strip().split())) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/training.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import argparse 3 | from math import sqrt, exp 4 | import torch as th 5 | from data import MTDataset, MTDataLoader, Vocab, MTNoisyDataset 6 | from transformer import Transformer 7 | from tqdm import tqdm 8 | import noisy 9 | 10 | 11 | def load_data(src_lang, tgt_lang, base_folder, bpe=False): 12 | 13 | if bpe: 14 | train_prefix = os.path.join( 15 | base_folder, 16 | f"{src_lang}{tgt_lang}_parallel.bpe.train" 17 | ) 18 | 19 | dev_prefix = os.path.join( 20 | base_folder, 21 | f"{src_lang}{tgt_lang}_parallel.bpe.dev" 22 | ) 23 | print("loading", train_prefix, dev_prefix) 24 | vocab = Vocab.from_data_files( 25 | f"{train_prefix}.{src_lang}", 26 | f"{train_prefix}.{tgt_lang}", 27 | ) 28 | else: 29 | train_prefix = os.path.join( 30 | base_folder, 31 | f"{src_lang}{tgt_lang}_parallel.train" 32 | ) 33 | 34 | dev_prefix = os.path.join( 35 | base_folder, 36 | f"{src_lang}{tgt_lang}_parallel.dev" 37 | ) 38 | vocab = Vocab.from_data_files( 39 | f"{train_prefix}.{src_lang}", 40 | f"{train_prefix}.{tgt_lang}", 41 | min_freq=2 42 | ) 43 | print("loading", train_prefix, dev_prefix) 44 | train = MTNoisyDataset(vocab, train_prefix, 45 | src_lang=src_lang, tgt_lang=tgt_lang) 46 | valid = 
MTNoisyDataset(vocab, dev_prefix, 47 | src_lang=src_lang, tgt_lang=tgt_lang) 48 | return vocab, train, valid 49 | 50 | 51 | def get_args(): 52 | parser = argparse.ArgumentParser("Train an MT model") 53 | # General params 54 | parser.add_argument("--seed", type=int, default=11731) 55 | parser.add_argument("--src", type=str, default="en") 56 | parser.add_argument("--tgt", type=str) 57 | parser.add_argument("--model-file", type=str, default="model.pt") 58 | parser.add_argument("--overwrite-model", action="store_true") 59 | parser.add_argument("--cuda", action="store_true") 60 | parser.add_argument("--validate-only", action="store_true") 61 | parser.add_argument("--noisy-input", action="store_true") 62 | parser.add_argument("--noisy-output", action="store_true") 63 | parser.add_argument("--bpe", action="store_true") 64 | parser.add_argument("--dynamic-tag", action="store_true") 65 | parser.add_argument("--base-folder", type=str) 66 | # Model parameters 67 | parser.add_argument("--n-layers", type=int, default=4) 68 | parser.add_argument("--n-heads", type=int, default=4) 69 | parser.add_argument("--embed-dim", type=int, default=512) 70 | parser.add_argument("--hidden-dim", type=int, default=512) 71 | parser.add_argument("--dropout", type=float, default=0.1) 72 | parser.add_argument("--word-dropout", type=float, default=0.1) 73 | # Optimization parameters 74 | parser.add_argument("--n-epochs", type=int, default=15) 75 | parser.add_argument("--lr", type=float, default=4e-2) 76 | parser.add_argument("--lr-decay", type=float, default=0.8) 77 | parser.add_argument("--inverse-sqrt-schedule", action="store_true") 78 | parser.add_argument("--clip-grad", type=float, default=1.0) 79 | parser.add_argument("--tokens-per-batch", type=int, default=8000) 80 | parser.add_argument("--samples-per-batch", type=int, default=128) 81 | return parser.parse_args() 82 | 83 | 84 | def move_to_device(tensors, device): 85 | return [tensor.to(device) for tensor in tensors] 86 | 87 | 88 | def inverse_sqrt_schedule(warmup, lr0): 89 | """Inverse sqrt learning rate schedule with warmup""" 90 | step = 0 91 | # Trick for allowing warmup of 0 92 | warmup = max(warmup, 0.01) 93 | while True: 94 | scale = min(1/sqrt(step+1e-20), step/sqrt(warmup**3)) 95 | step += 1 96 | yield lr0 * scale 97 | 98 | 99 | def train_epoch(model, optim, dataloader, lr_schedule=None, clip_grad=5.0, is_noisy=(False, False)): 100 | # Model device 101 | device = list(model.parameters())[0].device 102 | # Iterate over batches 103 | itr = tqdm(dataloader) 104 | print("Train with noisy input : ", is_noisy[0]) 105 | print("Train with noisy output. ", is_noisy[1]) 106 | 107 | 108 | 109 | for batch in itr: 110 | optim.zero_grad() 111 | itr.total = len(dataloader) 112 | # Cast input to device 113 | batch = move_to_device(batch, device) 114 | # Various inputs 115 | src_tokens, src_tag, tgt_tokens, tgt_tag = batch 116 | # print(model.vocab["[GMASK]"]) 117 | # print(tgt_tokens, th.sum(tgt_tokens==model.vocab["[GMASK]"])) 118 | # Noise 119 | if is_noisy[0] and model.training: 120 | src_tokens = noisy.noisy(model.vocab, src_tokens, drop_prob=0.025, blank_prob=0., sub_prob=0.075,\ 121 | shuffle_dist=2, add_gtag_prob=0.0, add_int_gtag_prob=0) 122 | src_tag = (src_tokens == model.vocab['']) 123 | 124 | if is_noisy[1] and model.training: 125 | tgt_tokens = noisy.noisy(model.vocab, tgt_tokens, drop_prob=0., blank_prob=0., sub_prob=0.0, shuffle_dist=0,\ 126 | add_gtag_prob=0.0, add_int_gtag_prob=0.) 
127 | 128 | # Get log probs 129 | log_p = model(src_tokens, tgt_tokens[:-1], src_tag) 130 | # Negative log likelihood of the target tokens 131 | # (this selects log_p[i, b, tgt_tokens[i+1, b]] 132 | # for each batch b, position i) 133 | nll = th.nn.functional.nll_loss( 134 | # Log probabilities (flattened to (l*b) x V) 135 | log_p.view(-1, log_p.size(-1)), 136 | # Target tokens (we start from the 1st real token, ignoring ) 137 | tgt_tokens[1:].view(-1), 138 | # Don't compute the nll of padding tokens 139 | ignore_index=model.vocab[""], 140 | # Take the average 141 | reduction="mean", 142 | ) 143 | # Perplexity (for logging) 144 | ppl = th.exp(nll).item() 145 | # Backprop 146 | nll.backward() 147 | # Adjust learning rate with schedule 148 | if lr_schedule is not None: 149 | learning_rate = next(lr_schedule) 150 | for param_group in optim.param_groups: 151 | param_group["lr"] = learning_rate 152 | # Gradient clipping 153 | if clip_grad > 0: 154 | th.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) 155 | # Optimizer step 156 | optim.step() 157 | # Update stats 158 | itr.set_postfix(loss=f"{nll.item():.3f}", ppl=f"{ppl:.2f}") 159 | 160 | 161 | def evaluate_ppl(model, dataloader): 162 | model.eval() 163 | # Model device 164 | device = list(model.parameters())[0].device 165 | # total tokens 166 | tot_tokens = tot_nll = 0 167 | # Iterate over batches 168 | for batch in tqdm(dataloader): 169 | # Cast input to device 170 | batch = move_to_device(batch, device) 171 | # Various inputs 172 | src_tokens, src_tag, tgt_tokens, tgt_tag = batch 173 | with th.no_grad(): 174 | # Get log probs 175 | log_p = model(src_tokens, tgt_tokens[:-1], src_tag) 176 | # Negative log likelihood of the target tokens 177 | # (this selects log_p[i, b, tgt_tokens[i+1, b]] 178 | # for each batch b, position i) 179 | nll = th.nn.functional.nll_loss( 180 | # Log probabilities (flattened to (l*b) x V) 181 | log_p.view(-1, log_p.size(-1)), 182 | # Target tokens (we start from the 1st real token) 183 | tgt_tokens[1:].view(-1), 184 | # Don't compute the nll of padding tokens 185 | ignore_index=model.vocab[""], 186 | # Take the average 187 | reduction="sum", 188 | ) 189 | # Number of tokens (ignoring and ) 190 | n_sos = tgt_tokens.eq(model.vocab[""]).float().sum().item() 191 | n_pad = tgt_tokens.eq(model.vocab[""]).float().sum().item() 192 | n_tokens = tgt_tokens.numel() - n_pad - n_sos 193 | # Keep track 194 | tot_nll += nll.item() 195 | tot_tokens += n_tokens 196 | return exp(tot_nll / tot_tokens) 197 | 198 | def read_embeddings(embeddings_path, vocab, device): 199 | word_to_weights = {} 200 | with open(embeddings_path, "r") as f: 201 | for line in f: 202 | elems = line.strip().split() 203 | word = elems[0] 204 | emebeddings = [float(e) for e in elems[1:]] 205 | word_to_weights[vocab[word]] = emebeddings # store id -> W 206 | W = [] 207 | zero_embed = [0 for _ in range(len(elems[1:]))] 208 | for i in range(len(vocab)): 209 | if i in word_to_weights: 210 | W.append(word_to_weights[i]) 211 | else: 212 | W.append(zero_embed) 213 | W = torch.FloatTensor(W) 214 | return torch.nn.Embedding.from_pretrained(W, freeze=False) 215 | 216 | def main(): 217 | # Command line arguments 218 | args = get_args() 219 | # Set random seed 220 | th.manual_seed(args.seed) 221 | # data 222 | vocab, train_data, valid_data = load_data(args.src, args.tgt, base_folder=args.base_folder, bpe=args.bpe) 223 | # Model 224 | model = Transformer( 225 | args.n_layers, 226 | args.embed_dim, 227 | args.hidden_dim, 228 | args.n_heads, 229 | vocab, 230 | 
args.dropout, 231 | args.word_dropout, 232 | ) 233 | if args.cuda: 234 | model = model.cuda() 235 | # Load existing model 236 | if os.path.isfile(args.model_file) and not args.overwrite_model: 237 | model.load_state_dict(th.load(args.model_file)) 238 | # Optimizer 239 | optim = th.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98)) 240 | # Learning rate schedule 241 | lr_schedule = None 242 | if args.inverse_sqrt_schedule: 243 | inverse_sqrt_schedule(2000, args.lr) 244 | # Dataloader 245 | train_loader = MTDataLoader( 246 | train_data, 247 | max_bsz=args.samples_per_batch, 248 | max_tokens=args.tokens_per_batch, 249 | shuffle=True, 250 | dynamic_tag=args.dynamic_tag, 251 | vocab=vocab 252 | ) 253 | valid_loader = MTDataLoader( 254 | valid_data, 255 | max_bsz=args.samples_per_batch, 256 | max_tokens=args.tokens_per_batch, 257 | shuffle=False, 258 | dynamic_tag=args.dynamic_tag, 259 | vocab=vocab 260 | ) 261 | # Either validate 262 | if args.validate_only: 263 | valid_ppl = evaluate_ppl(model, valid_loader) 264 | print(f"Validation perplexity: {valid_ppl:.2f}") 265 | else: 266 | # Train epochs 267 | best_ppl = 1e12 268 | for epoch in range(1, args.n_epochs+1): 269 | print(f"----- Epoch {epoch} -----", flush=True) 270 | # Train for one epoch 271 | model.train() 272 | train_epoch(model, optim, train_loader, 273 | lr_schedule, args.clip_grad, (args.noisy_input, args.noisy_output)) 274 | # Check dev ppl 275 | model.eval() 276 | valid_ppl = evaluate_ppl(model, valid_loader) 277 | print(f"Validation perplexity: {valid_ppl:.2f}", flush=True) 278 | # Early stopping maybe 279 | if valid_ppl < best_ppl: 280 | best_ppl = valid_ppl 281 | print(f"Saving new best model (epoch {epoch} ppl {valid_ppl})") 282 | th.save(model.state_dict(), args.model_file) 283 | else: 284 | for param_group in optim.param_groups: 285 | param_group["lr"] *= args.lr_decay 286 | 287 | 288 | if __name__ == "__main__": 289 | main() 290 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/translate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import torch as th 4 | from tqdm import tqdm 5 | import numpy as np 6 | import random 7 | 8 | from transformer import Transformer 9 | from decoding import sample, greedy, beam_search 10 | from training import load_data 11 | from subwords import desegment 12 | 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser("Translate with an MT model") 16 | # General params 17 | parser.add_argument("--src", type=str, default="en") 18 | parser.add_argument("--tgt", type=str) 19 | parser.add_argument("--model-file", type=str, 20 | default="model.pt", required=True) 21 | parser.add_argument("--input-file", type=str, default=None) 22 | parser.add_argument("--output-file", type=str, default=None) 23 | parser.add_argument("--cuda", action="store_true") 24 | parser.add_argument("--seed", type=int, default=15062019) 25 | parser.add_argument("--bpe", action="store_true") 26 | parser.add_argument("--base-folder", type=str) 27 | # Model parameters 28 | parser.add_argument("--n-layers", type=int, default=4) 29 | parser.add_argument("--n-heads", type=int, default=4) 30 | parser.add_argument("--embed-dim", type=int, default=512) 31 | parser.add_argument("--hidden-dim", type=int, default=512) 32 | parser.add_argument("--dropout", type=float, default=0.3) 33 | # Translation parameters 34 | parser.add_argument("--search", type=str, default="beam_search", 35 | 
choices=["random", "greedy", "beam_search"]) 36 | parser.add_argument("--beam-size", type=int, default=2) 37 | parser.add_argument("--prefer_gtag", type=int, default=0) 38 | parser.add_argument("--tag", type=str) 39 | return parser.parse_args() 40 | 41 | 42 | def move_to_device(tensors, device): 43 | return [tensor.to(device) for tensor in tensors] 44 | 45 | 46 | def translate_sentence( 47 | model, 48 | sentence, 49 | prefer_gtag, 50 | tag, 51 | beam_size=1, 52 | search="beam_search", 53 | vocab=None 54 | ): 55 | # Convert string to indices 56 | src_tokens = [model.vocab[word] for word in sentence] 57 | # Decode 58 | with th.no_grad(): 59 | if search == "random": 60 | out_tokens = sample(model, src_tokens) 61 | elif search == "greedy": 62 | out_tokens = greedy(model, src_tokens) 63 | elif search == "beam_search": 64 | out_tokens = beam_search(model=model, src_tokens=src_tokens, beam_size=beam_size, src_tag=tag, 65 | prefer_gtag=prefer_gtag) 66 | 67 | # Convert back to strings 68 | return [model.vocab[tok] for tok in out_tokens] 69 | 70 | 71 | def main(): 72 | # Command line arguments 73 | args = get_args() 74 | # Fix seed for consistent sampling 75 | th.manual_seed(args.seed) 76 | #np.random.seed(args.seed) 77 | #random.seed(args.seed) 78 | 79 | # data 80 | vocab, _, _ = load_data(args.src, args.tgt, base_folder=args.base_folder, bpe=args.bpe) 81 | # Model 82 | model = Transformer( 83 | args.n_layers, 84 | args.embed_dim, 85 | args.hidden_dim, 86 | args.n_heads, 87 | vocab, 88 | args.dropout 89 | ) 90 | if args.cuda: 91 | model = model.cuda() 92 | # Load existing model 93 | model.load_state_dict(th.load(args.model_file, map_location="cpu")) 94 | # Read from file/stdin 95 | if args.input_file is not None: 96 | input_stream = open(args.input_file, "r", encoding="utf-8") 97 | else: 98 | input_stream = sys.stdin 99 | # Write to file/stdout 100 | if args.output_file is not None: 101 | output_stream = open(args.output_file, "w", encoding="utf-8") 102 | # If we're printing to a file, display stats in stdout 103 | input_stream = tqdm(input_stream) 104 | else: 105 | output_stream = sys.stdout 106 | # Translate 107 | try: 108 | for line in input_stream: 109 | in_words = line.strip().split() 110 | out_words = translate_sentence( 111 | model, 112 | in_words, 113 | beam_size=args.beam_size, 114 | search=args.search, 115 | vocab=vocab, 116 | prefer_gtag=args.prefer_gtag == 1, 117 | tag=args.tag 118 | ) 119 | if args.bpe: 120 | print(desegment(out_words), file=output_stream) 121 | else: 122 | print(" ".join(out_words), file=output_stream) 123 | output_stream.flush() 124 | except KeyboardInterrupt: 125 | pass 126 | finally: 127 | input_stream.close() 128 | output_stream.close() 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | --------------------------------------------------------------------------------