├── .gitignore ├── LICENSE ├── README.md ├── tag-and-generate-data-prep ├── .gitignore ├── README.md ├── data │ └── catcher │ │ └── data.tsv ├── requirements.txt ├── scripts │ ├── prep_generator.sh │ └── prep_tagger.sh └── src │ ├── run.py │ └── style_tags.py └── tag-and-generate-train ├── README.md ├── data └── catcher │ └── data.tsv ├── eval ├── context_eval.py ├── nlg_eval │ ├── LICENSE.md │ ├── MANIFEST.in │ ├── README.md │ ├── __init__.py │ ├── bin │ │ └── nlg-eval │ ├── examples │ │ ├── hyp.txt │ │ ├── ref1.txt │ │ └── ref2.txt │ ├── multi-bleu.perl │ ├── nlgeval │ │ ├── __init__.py │ │ ├── pycocoevalcap │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── bleu │ │ │ │ ├── LICENSE │ │ │ │ ├── __init__.py │ │ │ │ ├── bleu.py │ │ │ │ └── bleu_scorer.py │ │ │ ├── cider │ │ │ │ ├── __init__.py │ │ │ │ ├── cider.py │ │ │ │ └── cider_scorer.py │ │ │ ├── license.txt │ │ │ ├── meteor │ │ │ │ ├── __init__.py │ │ │ │ ├── data │ │ │ │ │ └── paraphrase-en.gz │ │ │ │ ├── meteor-1.5.jar │ │ │ │ ├── meteor.py │ │ │ │ └── tests │ │ │ │ │ └── test_meteor.py │ │ │ └── rouge │ │ │ │ ├── __init__.py │ │ │ │ └── rouge.py │ │ ├── skipthoughts │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ └── skipthoughts.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── test_nlgeval.py │ │ ├── utils.py │ │ └── word2vec │ │ │ ├── __init__.py │ │ │ ├── evaluate.py │ │ │ └── generate_w2v_files.py │ ├── requirements.txt │ ├── requirements_py2.txt │ ├── setup.py │ └── test │ │ ├── __init__.py │ │ └── api.py └── run_context_eval.sh ├── requirements.txt ├── scripts ├── inference.sh ├── prepare_bpe.sh ├── train_generator.sh └── train_tagger.sh └── src ├── data.py ├── decoding.py ├── noisy.py ├── subwords.py ├── training.py ├── transformer.py └── translate.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Politeness Transfer: A Tag and Generate Approach 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tagger and Generator 2 | 3 | ## Dataset preparation: [tag-and-generate/tagger-generator/tag-and-generate-data-prep](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-data-prep) 4 | ## Training, inference, evaluation: [tag-and-generate/tagger-generator/tag-and-generate-train](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-train) 5 | 6 | --- 7 | 8 | ## Walkthrough 9 | We will now present an example of training the politeness transfer system from scratch. 
10 | The process has five steps: 11 | * [Step 1: Getting the code](#step-1-getting-the-code) 12 | * [Step 2: Getting the training data](#step-2-getting-the-training-data) 13 | * [Step 3: Preparing parallel data for training](#step-3-preparing-parallel-data-for-training) 14 | * [Step 4: Training the tagger and generator](#step-4-training-the-tagger-and-generator) 15 | * [Step 5: Running inference](#step-5-running-inference) 16 | 17 | ### Step 1: Getting the code 18 | 19 | We begin by cloning this repo: 20 | 21 | ```sh 22 | git clone https://github.com/tag-and-generate/tagger-generator.git 23 | ``` 24 | The cloned folder contains: i) ``tag-and-generate-data-prep`` the codebase used for creating the parallel tag and generate dataset, and ii) ``tag-and-generate-train``, the training code. 25 | 26 | Each of these folders has a ``requirements.txt`` file that can be used to download the dependencies. 27 | 28 | Next, let's create a folder inside ``tagger-generator`` to save all the datasets/tags: 29 | 30 | ```sh 31 | cd tagger-generator 32 | mkdir data 33 | ``` 34 | 35 | 36 | ### Step 2: Getting the training data. 37 | 38 | The training data in a ready to use format is located [here](https://drive.google.com/file/d/1E9GHwmVM9DL9-KiaIaG5lm_oagLWe908/view?usp=sharing). 39 | 40 | Download the zip file to the ``data`` folder created above and extract ```politeness.tsv```. 41 | 42 | ```sh 43 | unzip politeness_processed.zip 44 | head politeness.tsv 45 | ``` 46 | **txt**|**style**|**split** 47 | -----|-----|----- 48 | forwarded by tana jones / hou / ect on 09/28/2000|P\_2|train 49 | the clickpaper approvals for 9/27/00 are attached below .|P\_7|train 50 | "hello everyone : please let me know if you have a subscription to "" telerate "" ?"|P\_7|train 51 | we are being billed for this service and i do not know who is using it .|P\_0|train 52 | 53 | As we can see, the data is in the tsv format and has the right header. 54 | 55 | 56 | You can also use ``gdown`` to directly download the file: 57 | 58 | ```sh 59 | gdown --id 1E9GHwmVM9DL9-KiaIaG5lm_oagLWe908 60 | ``` 61 | 62 | 63 | 64 | 65 | 66 | 67 | Now that we have the codebase and the dataset, let's start by creating the parallel data required for training the models. Let's do a listing of the folder so far to make sure we are on the same page: 68 | 69 | ```sh 70 | (dl) tutorial@sa:~/tagger-generator$ ls 71 | data LICENSE README.md tag-and-generate-data-prep tag-and-generate-train 72 | ``` 73 | So, we are in the repo (tagger-generator), and see the two code folders (``tag-and-generate-data-prep`` and ``tag-and-generate-train``), as well as the data folder (``data``). 74 | Further, the data folder has the ``politeness.tsv`` file that we just downloaded: 75 | ```sh 76 | (dl) tutorial@sa:~/tagger-generator$ ls data/ 77 | politeness_processed.zip politeness.tsv 78 | ``` 79 | 80 | ### Step 3: Preparing parallel data for training 81 | 82 | We prepare the parallel data using ``tag-and-generate-data-prep``: 83 | 84 | ```sh 85 | cd tag-and-generate-data-prep 86 | python src/run.py --data_pth ../data/politeness.tsv --outpath ../data/ --style_0_label P_9 --style_1_label P_0 --is_unimodal True 87 | ``` 88 | More details on these options are located in [tag-and-generate/tagger-generator/tag-and-generate-data-prep](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-data-prep). In summary, we specify the input file, the label for the style of interest (``P_9``) and a neutral/contrastive style (``P_0``). 
Importantly, we specify ``--is_unimodal True``. This option ensures that the parallel data is created as per the unimodal style setting (Figure 3 in [the paper](https://arxiv.org/pdf/2004.14257.pdf)). 89 | 90 | After data-prep finishes, we see several files in ``../data/``. 91 | The important files are described below: 92 | 93 | * ``P_9_tags.json``: the phrases identified as politeness markers (attribute tags): 94 | 95 | ```"thank you" 96 | "thank" 97 | "looking forward" 98 | "glad" 99 | "be interested" 100 | ``` 101 | 102 | * The data prep code creates two sets of training files: one for the ``tagger`` and another for the ``generator``. 103 | To understand these, let's take a sample sentence ```please get back to me if you have any additional concerns .``` and look at how it is represented in different files: 104 | 105 | - ``entagged_parallel.train.en`` (input to the tagger): 106 | - ``back to me have concerns .`` 107 | - ``entagged_parallel.train.tagged`` (output of the tagger): 108 | - ``[P_90] back to me [P_91] have [P_92] concerns .`` 109 | - ``engenerated_parallel.train.en`` (input to the generator): 110 | - ``[P_90] back to me [P_91] have [P_92] concerns .`` 111 | - ``engenerated_parallel.train.generated`` (output of the generator): 112 | - ``please get back to me if you have any additional concerns .`` 113 | 114 | Here, ``P_9`` is the style tag, and the number after the style tag captures the position of the tag in the sentence. 115 | 116 | With the data files in place, we are ready to run training. 117 | 118 | 119 | ### Step 4: Training the tagger and generator 120 | 121 | All the training- and inference-related scripts and code are present in ``tag-and-generate-train``, so let's ``cd`` to it. 122 | 123 | ```sh 124 | cd tag-and-generate-train 125 | ``` 126 | 127 | To prepare the files for training, we first process them using ``BPE``: 128 | 129 | ```sh 130 | bash scripts/prepare_bpe.sh tagged ../data/ 131 | bash scripts/prepare_bpe.sh generated ../data/ 132 | ``` 133 | 134 | We can now start training the tagger and generator: 135 | 136 | ```sh 137 | nohup bash scripts/train_tagger.sh tagged politeness ../data/ > tagger.log & 138 | nohup bash scripts/train_generator.sh generated politeness ../data/ > generator.log & 139 | ``` 140 | 141 | ```politeness``` is a user-defined handle that we will use during inference. 142 | 143 | After the training finishes, the best models (selected by validation perplexity) are stored in ``models``: 144 | 145 | ```sh 146 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ ls models/politeness/bpe/ 147 | en-generated-generator.pt en-tagged-tagger.pt 148 | ``` 149 | 150 | For our run, at the end of 5 epochs, the validation perplexity was 1.26 for the tagger and 1.76 for the generator. 151 | 152 | ### Step 5: Running inference 153 | 154 | Let's test the trained models on some sample sentences: 155 | 156 | ```sh 157 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ cat > input.txt 158 | send me the text files. 159 | look into this issue. 160 | 161 | bash scripts/inference.sh input.txt sample tagged generated politeness P_9 P_9 ../data/ 3 162 | ``` 163 | 164 | Here ``sample`` is a unique identifier for the inference job, and ``politeness`` is the identifier we used for the training job. ``P_9`` is the style tag (kept the same for unimodal jobs).
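For reference, the positional arguments of ``scripts/inference.sh`` in the command above line up with the parameters documented in the training README as follows (a commented copy of the same command):

```sh
# scripts/inference.sh takes, in order:
#   input_file jobname <tagger data> <generator data> handle style_0_label style_1_label base_folder device
#   input.txt         -> raw input file, one sentence per line
#   sample            -> jobname, a unique identifier for this inference run
#   tagged, generated -> the data handles used while training the tagger and generator
#   politeness        -> the training handle from Step 4
#   P_9 P_9           -> style_0_label and style_1_label (identical in the unimodal setting)
#   ../data/          -> base_folder holding the prepared data
#   3                 -> GPU id
bash scripts/inference.sh input.txt sample tagged generated politeness P_9 P_9 ../data/ 3
```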
(Please see the README at [tag-and-generate/tagger-generator/tag-and-generate-train](https://github.com/tag-and-generate/tagger-generator/tree/master/tag-and-generate-train) for more details). 165 | 166 | The final and intermediate outputs are located in experiments folder: 167 | 168 | ```sh 169 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ ls experiments/sample_* 170 | experiments/sample_generator_input experiments/sample_tagged 171 | experiments/sample_output experiments/sample_tagger_input 172 | ``` 173 | 174 | Let's look at the final output: 175 | 176 | ```sh 177 | (dl) tutorial@sa:~/tagger-generator/tag-and-generate-train$ cat experiments/sample_output 178 | please send me the text files. 179 | we would like to look into this issue. 180 | ``` 181 | Not bad! 182 | 183 | We hope this walkthrough is helpful in understanding and using the codebase. Here are some additional helpful links: 184 | 185 | - [Trained Models](https://drive.google.com/drive/folders/1tXLC4WbXc_WLgvQu2mTa3jDe0efZ3dz1?usp=sharing). 186 | - [Outputs](https://github.com/tag-and-generate/outputs) 187 | - [Datasets](https://github.com/tag-and-generate/politeness-dataset) 188 | 189 | 190 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/README.md: -------------------------------------------------------------------------------- 1 | ## Data Preparation 2 | 3 | - This repository contains the code for creating the parallel data used to train the ``tagger`` and ``generator`` modules ([paper](https://arxiv.org/abs/2004.14257)) on your dataset. 4 | 5 | - The `data/catcher/` directory contains some sample text that can be used to test the codebase. 6 | 7 | ## Usage 8 | 9 | 10 | ```py 11 | python src/run.py --data_pth PTH\ 12 | --outpath OUTPATH\ 13 | --style_0_label label0\ 14 | --style_1_label label1\ 15 | --is_unimodal True | False 16 | ``` 17 | 18 | Where: 19 | - `PTH` should point to a tab-separated file (tsv) that contains the corpus. We assume that the corpus is made up of a set of sentences. The `tsv` is expected to have three fields: 1) txt: the sentence, 2) split: train/test/val, and 3) style: a label that identifies the style of the sentence (one of `label0` or `label1`). Sample: 20 | 21 | | txt | style | split | 22 | |- |- |- | 23 | | How've you been, Mrs. Spencer? | catcher | test | 24 | | C'mon, c'mon | catcher | train | 25 | | And the place death, considering who thou art, | romeo-juliet | train | 26 | | He's got a lot of dough, now. | catcher | test | 27 | 28 | 29 | 30 | - `OUTPATH` is the location of the output 31 | 32 | - `label0` and `label1` are tags that identify the individual styles. This explicit assignment is important for unimodal cases, such as politeness and captions (please see the paper for more details) 33 | 34 | - `is_unimodal` should be set to `True` for datasets that contain only one style of interest. `style_0_label` should be used to specify the style of interest, and `style_1_label` should be the tag for a neutral/style-free corpus. In the case of politeness transfer, you can use `style_0_label` as `P_9` and `style_1_label` as `P_0` or `P_1`. 35 | 36 | Please see `run.py` for details on the other options. 37 | 38 | ## Outputs 39 | 40 | While the program creates a number of files in the `OUTPATH` dir, only a subset of them are required for training the `tagger` and `generator`. All of the files are named according to the following format: 41 | 42 | `en{target}\_parallel.{split}.[en | {target}]` 43 | 44 | Where `split` is either `train`, `test`, or `val`, and `target` is either `tagged` (for the tagger) or `generated` (for the generator). We always use `en` to refer to the source files. 45 | 46 | Further, the attribute tags can also be found under the name `{style_label}_tags.json` 47 | 48 | ## Walkthrough 49 | 50 | We walk through the usage of the data prep codebase by creating parallel data for our tag and generate system using the sample data present in `data/catcher`. The (toy) data consists of a few lines from The Catcher in the Rye and Romeo & Juliet.
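If you want to build a similar `tsv` for your own corpora, a minimal sketch along the following lines should work (the input files `style0.txt` and `style1.txt` are hypothetical, one sentence per line; `run.py` only needs the `txt`, `style`, and `split` columns):

```py
import numpy as np
import pandas as pd

rows = []
# hypothetical raw corpora, one sentence per line, one file per style
for path, style in [("style0.txt", "romeo-juliet"), ("style1.txt", "catcher")]:
    with open(path) as f:
        rows.extend({"txt": line.strip(), "style": style} for line in f if line.strip())

df = pd.DataFrame(rows)
# assign a random train/val/test split, as expected by src/run.py
df["split"] = np.random.choice(["train", "val", "test"], size=len(df), p=[0.8, 0.1, 0.1])
df.to_csv("data.tsv", sep="\t", index=False)
```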
51 | 52 | Some sample rows from the dataset are shown below: 53 | 54 | | txt | style | split | 55 | |- |- |- | 56 | | How've you been, Mrs. Spencer? | catcher | test | 57 | | C'mon, c'mon | catcher | train | 58 | | And the place death, considering who thou art, | romeo-juliet | train | 59 | | He's got a lot of dough, now. | catcher | test | 60 | | My life were better ended by their hate, | romeo-juliet | train | 61 | | He lent me counsel and I lent him eyes. | romeo-juliet | train | 62 | | It wasn't all my fault. | catcher | test | 63 | | If you don't, you feel even worse. | catcher | test | 64 | 65 | 66 | Using the defaults specified in src/run.py, we can generate the parallel data for training tag and generator using the following command: 67 | 68 | ```py 69 | python3 src/run.py --data_pth data/catcher/data.tsv\ 70 | --outpath data/tmp/\ 71 | --style_0_label romeo-juliet\ 72 | --style_1_label catcher\ 73 | --is_unimodal False 74 | ``` 75 | 76 | After running this command, the specified output directory `data/tmp` will contain a number of files. The important ones are listed below. 77 | 78 | - Style attribute tags: 79 | - `romeo-juliet_tags.json`: The style tags for style 0 (romeo-juliet) 80 | - `catcher_tags.json`: The style tags for style 1 (catcher) 81 | 82 | - Tagger training files: 83 | - `entagged_parallel.[train|test|val].en`: Source files for the tagger 84 | - `entagged_parallel.[train|test|val].tagged`: Target files for the tagger 85 | 86 | - Generator training files: 87 | - `engenerated_parallel.[train|test|val].en`: Source files for the generator 88 | - `engenerated_parallel.[train|test|val].generated`: Target files for the generator 89 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.0 2 | adjustText==0.7.3 3 | altair==3.2.0 4 | appnope==0.1.0 5 | argh==0.26.2 6 | astroid==2.2.5 7 | astropy==3.2.1 8 | atomicwrites==1.3.0 9 | attrs==19.1.0 10 | autopep8==1.4.4 11 | backcall==0.1.0 12 | backports.functools-lru-cache==1.5 13 | base58==1.0.3 14 | beautifulsoup4==4.8.0 15 | bleach==3.1.4 16 | blis==0.2.4 17 | boto3==1.9.243 18 | botocore==1.12.243 19 | certifi==2019.6.16 20 | chardet==3.0.4 21 | Click==7.0 22 | colorama==0.4.1 23 | compare-mt==0.2.6 24 | confuse==1.0.0 25 | cycler==0.10.0 26 | cymem==2.0.2 27 | decorator==4.4.0 28 | defusedxml==0.6.0 29 | docopt==0.6.2 30 | docutils==0.15.2 31 | editdistance==0.5.3 32 | en-core-web-sm==2.1.0 33 | entrypoints==0.3 34 | enum-compat==0.0.2 35 | epitran==1.1 36 | flake8==3.7.9 37 | future==0.17.1 38 | htmlmin==0.1.12 39 | idna==2.8 40 | importlib-metadata==0.23 41 | indic-transliteration==1.8.8 42 | ipykernel==5.1.2 43 | ipython==7.8.0 44 | ipython-genutils==0.2.0 45 | ipywidgets==7.5.0 46 | isort==4.3.21 47 | jedi==0.15.1 48 | Jinja2==2.10.1 49 | jmespath==0.9.4 50 | joblib==0.14.0 51 | jsonschema==2.6.0 52 | jupyter-client==5.3.3 53 | jupyter-core==4.5.0 54 | kiwisolver==1.1.0 55 | lazy-object-proxy==1.4.2 56 | llvmlite==0.29.0 57 | marisa-trie==0.7.5 58 | MarkupSafe==1.1.1 59 | matplotlib==3.1.1 60 | mccabe==0.6.1 61 | missingno==0.4.2 62 | mistune==0.8.4 63 | more-itertools==7.2.0 64 | munkres==1.1.2 65 | murmurhash==1.0.2 66 | nbconvert==5.6.0 67 | nbformat==4.4.0 68 | networkx==2.3 69 | neuralcoref==4.0 70 | nltk==3.4.5 71 | notebook==6.0.1 72 | numba==0.45.1 73 | numpy==1.17.0 74 | packaging==19.2 75 | pandas==0.25.1 76 | pandas-profiling==2.3.0 77 | 
pandocfilters==1.4.2 78 | panphon==0.15 79 | parso==0.5.1 80 | pathtools==0.1.2 81 | pep8==1.7.1 82 | pexpect==4.7.0 83 | phik==0.9.8 84 | pickleshare==0.7.5 85 | pigar==0.9.2 86 | Pillow==6.2.0 87 | plac==0.9.6 88 | plotly==4.1.1 89 | pluggy==0.13.0 90 | portalocker==1.5.1 91 | praw==6.3.1 92 | prawcore==1.0.1 93 | preshed==2.0.1 94 | prometheus-client==0.7.1 95 | prompt-toolkit==2.0.9 96 | psaw==0.0.7 97 | ptyprocess==0.6.0 98 | py==1.8.0 99 | pycodestyle==2.5.0 100 | pyflakes==2.1.1 101 | Pygments==2.4.2 102 | pylint==2.3.1 103 | pyparsing==2.4.2 104 | pyreqs==0.1.1 105 | pyrsistent==0.15.4 106 | pytest==5.2.0 107 | pytest-pylint==0.14.1 108 | python-dateutil==2.8.0 109 | pytz==2019.2 110 | PyYAML==5.1.2 111 | pyzmq==18.1.0 112 | regex==2019.8.19 113 | requests==2.22.0 114 | retrying==1.3.3 115 | rope==0.14.0 116 | s3transfer==0.2.1 117 | sacrebleu==1.4.1 118 | scikit-learn==0.21.3 119 | scipy==1.3.1 120 | seaborn==0.9.0 121 | selenium==3.141.0 122 | Send2Trash==1.5.0 123 | sh==1.12.14 124 | six==1.12.0 125 | sklearn==0.0 126 | soupsieve==1.9.3 127 | spacy==2.1.0 128 | splinter==0.11.0 129 | srsly==0.1.0 130 | terminado==0.8.2 131 | testpath==0.4.2 132 | thinc==7.0.8 133 | toolz==0.10.0 134 | torch==1.3.0 135 | torchvision==0.4.1 136 | tornado==5.1.1 137 | tqdm==4.35.0 138 | traitlets==4.3.2 139 | typed-ast==1.4.0 140 | typing==3.7.4.1 141 | tzlocal==2.0.0 142 | unicodecsv==0.14.1 143 | update-checker==0.16 144 | urllib3==1.26.5 145 | validators==0.14.0 146 | wasabi==0.2.2 147 | wcwidth==0.1.7 148 | webencodings==0.5.1 149 | websocket-client==0.56.0 150 | widgetsnbextension==3.5.1 151 | wiki-dump-parser==2.0.0 152 | wikipedia==1.4.0 153 | Wikipedia-API==0.5.2 154 | wrapt==1.11.2 155 | zipp==0.6.0 156 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/scripts/prep_generator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -u 3 | BASEDIR="$1" 4 | OUTDIR="$2" 5 | tgt_lang="$3" 6 | tgt_lang_tag="$4" 7 | UNIMODAL="$5" 8 | POS="$6" 9 | NEG="$7" 10 | 11 | for split in train test val; do 12 | if [ "$UNIMODAL" -eq 1 ]; then 13 | cp ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."$POS" ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.en 14 | else 15 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."$POS" ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."$NEG" > ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.en 16 | fi 17 | done 18 | 19 | for split in train test val; do 20 | if [ "$UNIMODAL" -eq 1 ]; then 21 | cp ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."$POS" ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.${tgt_lang_tag} 22 | sed -i "s/\[${NEG}[0-9]*\]//g;s/ / /g;s/^ //g;s/ $//g" ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.${tgt_lang_tag} 23 | else 24 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."$POS" ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."$NEG" > ${OUTDIR}/en${tgt_lang_tag}_parallel.${split}.${tgt_lang_tag} 25 | fi 26 | done 27 | 28 | mv ${OUTDIR}/en${tgt_lang_tag}_parallel.val.en ${OUTDIR}/en${tgt_lang_tag}_parallel.dev.en 29 | mv ${OUTDIR}/en${tgt_lang_tag}_parallel.val.${tgt_lang_tag} ${OUTDIR}/en${tgt_lang_tag}_parallel.dev.${tgt_lang_tag} 30 | 31 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/scripts/prep_tagger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | BASEDIR="$1" 3 | OUTDIR="$2" 4 | tgt_lang="$3" 5 | 
UNIMODAL="$4" 6 | STYLE_0_LABEL="$5" 7 | STYLE_1_LABEL="$6" 8 | # In the unimodal case, POS is supposed to be the stylistic corpus 9 | 10 | # Step 1: Create source data for tagger 11 | for split in train test val; do 12 | if [ "$UNIMODAL" -eq 1 ]; then 13 | # UNIMODAL: The tags are deleted to create the source data for unimodal case 14 | cp "${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}.${STYLE_0_LABEL}" "${OUTDIR}/en${tgt_lang}_parallel.${split}.en" 15 | # UNIMODAL: The tags are deleted to create the source data for unimodal case 16 | sed -i "s/\[${STYLE_0_LABEL}[0-9]*\]//g;s/ / /g;s/^ //g;s/ $//g" ${OUTDIR}/en${tgt_lang}_parallel.${split}.en 17 | else 18 | # BIMODAL: Source data for the bimodal case is just the concatenation of the two styles 19 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."${STYLE_0_LABEL}" ${BASEDIR}/en${tgt_lang}_parallel.${split}.en."${STYLE_1_LABEL}" > ${OUTDIR}/en${tgt_lang}_parallel.${split}.en 20 | fi 21 | # the following line performs simple strip operations on the lines 22 | sed -i 's/ / /g;s/^ //g;s/ $//g' ${OUTDIR}/en${tgt_lang}_parallel.${split}.en 23 | done 24 | 25 | # Step 2: Create target data for tagger 26 | for split in train test val; do 27 | if [ "$UNIMODAL" -eq 1 ]; then 28 | # UNIMODAL: Target for unimodal tagger is the POS tagged data 29 | cp "${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}.${STYLE_0_LABEL}" "${OUTDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}" 30 | else 31 | cat ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."${STYLE_0_LABEL}" ${BASEDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang}."${STYLE_1_LABEL}" > ${OUTDIR}/en${tgt_lang}_parallel.${split}.${tgt_lang} 32 | fi 33 | done 34 | 35 | mv ${OUTDIR}/en${tgt_lang}_parallel.val.en ${OUTDIR}/en${tgt_lang}_parallel.dev.en 36 | mv ${OUTDIR}/en${tgt_lang}_parallel.val.${tgt_lang} ${OUTDIR}/en${tgt_lang}_parallel.dev.${tgt_lang} 37 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/src/run.py: -------------------------------------------------------------------------------- 1 | """Generates tags 2 | Usage: 3 | run.py [options] 4 | 5 | Options: 6 | --data_pth= Path to the data directory 7 | --outpath= Output path 8 | --style_0_label= Label for style 0 9 | --style_1_label= Label for style 1 10 | --ngram_range_min= Min n_gram_range [default: 1] 11 | --ngram_range_max= Max n_gram_range [default: 2] 12 | --style_label_col= Name of the column that has style label column [default: style] 13 | --thresh= tf-idf ratio threshold [default: 0.90] 14 | --is_unimodal= Whether the dataset is unimodal (like politeness) or has two styles (like yelp) 15 | --gen_tags= Whether the style labels need to be generated again [default: True] 16 | """ 17 | from docopt import docopt 18 | import json 19 | import pandas as pd 20 | import pandas as pd 21 | import subprocess 22 | import logging 23 | 24 | from src.style_tags import TFIDFStatsGenerator, RelativeTagsGenerator, TrainDataGen 25 | 26 | 27 | def tag_style_markers(data_pth: str, outpath: str, style_0_label: str, style_1_label: str, tgt_lang="tagged", thresh=0.90, ngram_range=(1, 2), 28 | ignore_from_tags=None, style_label_col="label", drop_duplicates=False, 29 | gen_tags=True): 30 | """Runs tag generator. 
After this step, the following files are generated in the ``outpath`` directory: 31 | * entgt_lang_parallel.{split}.en.style_N_label: Sentences in style N 32 | * entgt_lang_parallel.{split}.taged.style_N_label: Sentences in style N with attribute phrases tagged 33 | (Here N is either 0 or 1, and split is one of {train, test, dev}) 34 | * style_N_tags.json: Attribute tags for style N (0 or 1) 35 | 36 | A combination of the above files is sufficient to generate training data 37 | for seq2seq models used by the tag-and-generate approach. 38 | Args: 39 | data_pth ([str]): Path to a file with the data. Each file should have the following columns: 40 | txt: The actual text 41 | split: train/test/dev 42 | style_label_col: indicates the style 43 | outpath ([str]): Path to the folder where the output files are written. 44 | style_0_label ([str]): Label for the 0th style. 45 | style_1_label ([str]): abel for the 1st style. 46 | tgt_lang ([str]): [description] 47 | thresh (float, optional): [description]. Defaults to 0.90. 48 | ngram_range (tuple, optional): [description]. Defaults to (1, 2). 49 | ignore_from_tags ([type], optional): [description]. Defaults to None. 50 | style_label_col (str, optional): [description]. Defaults to "label". 51 | """ 52 | data = pd.read_csv(data_pth, sep="\t") 53 | if drop_duplicates: 54 | data = data.drop_duplicates(subset="txt") 55 | 56 | # Step 1 57 | logging.info("Reading the data") 58 | data_style_0 = data[data[style_label_col] == style_0_label] 59 | data_style_1 = data[data[style_label_col] == style_1_label] 60 | 61 | if gen_tags: 62 | # Step 2 63 | logging.info("Getting TF-IDF stats for both the corpora") 64 | logging.info(f"#Records {style_0_label} = {len(data_style_0)}") 65 | logging.info(f"#Records {style_1_label} = {len(data_style_1)}") 66 | 67 | tags_style_0, tags_style_1 = generate_tags(df_txt_class_1=data_style_0[data_style_0["split"] != "test"]["txt"], 68 | df_txt_class_2=data_style_1[data_style_1["split"] 69 | != "test"]["txt"], 70 | tag_class_1=style_0_label, 71 | tag_class_2=style_1_label, 72 | ignore_from_tags=ignore_from_tags, 73 | thresh=thresh, 74 | ngram_range=ngram_range) 75 | 76 | with open(f"{outpath}/{style_0_label}_tags.json", "w") as f: 77 | json.dump(tags_style_0, f) 78 | 79 | with open(f"{outpath}/{style_1_label}_tags.json", "w") as f: 80 | json.dump(tags_style_1, f) 81 | 82 | else: 83 | with open(f"{outpath}/{style_0_label}_tags.json", "r") as f: 84 | tags_style_0 = json.load(f) 85 | with open(f"{outpath}/{style_1_label}_tags.json", "r") as f: 86 | tags_style_1 = json.load(f) 87 | 88 | # Step 3 89 | logging.info("Generating the tagged data") 90 | TrainDataGen(data=data_style_0, outpath=outpath, tags=tags_style_0, 91 | tag_token=style_0_label, tgt_lang=tgt_lang).generate() 92 | TrainDataGen(data=data_style_1, outpath=outpath, tags=tags_style_1, 93 | tag_token=style_1_label, tgt_lang=tgt_lang).generate() 94 | 95 | 96 | def generate_tags(df_txt_class_1, 97 | df_txt_class_2, 98 | tag_class_1, 99 | tag_class_2, 100 | thresh, 101 | ngram_range, 102 | ignore_from_tags=None, 103 | ): 104 | stats_class_1 = TFIDFStatsGenerator( 105 | df_txt_class_1, tag_class_1, ngram_range=ngram_range) 106 | stats_class_2 = TFIDFStatsGenerator( 107 | df_txt_class_2, tag_class_2, ngram_range=ngram_range) 108 | 109 | class_1_tags = RelativeTagsGenerator(main_class_stats=stats_class_1, 110 | relative_class_stats=stats_class_2, 111 | ignore_from_tags=ignore_from_tags, 112 | thresh=thresh).tags 113 | 114 | class_2_tags = 
RelativeTagsGenerator(main_class_stats=stats_class_2, 115 | relative_class_stats=stats_class_1, 116 | thresh=thresh).tags 117 | return class_1_tags, class_2_tags 118 | 119 | 120 | def prepare_parallel_data_tagger(outdir, style_0_label, style_1_label, is_unimodal): 121 | subprocess.check_call(f"scripts/prep_tagger.sh {outdir} {outdir} tagged {int(is_unimodal)} {style_0_label} {style_1_label}", 122 | shell=True) 123 | 124 | 125 | def prepare_parallel_data_generator(outdir, style_0_label, style_1_label, is_unimodal): 126 | # "${MASKED_OP_DIR}" "${MASKED_OP_DIR}" "$prefix"masked "$prefix"unmasked "$isunimodal" "$posmask" "$negmask" 127 | subprocess.check_call(f"scripts/prep_generator.sh {outdir} {outdir} tagged generated {int(is_unimodal)} {style_0_label} {style_1_label}", 128 | shell=True) 129 | 130 | 131 | if __name__ == '__main__': 132 | logging.basicConfig(level=logging.INFO) 133 | 134 | args = docopt(__doc__) 135 | is_unimodal = int(args["--is_unimodal"] == "True") 136 | 137 | # step 1: generate attribute markers, tagged dataset 138 | tag_style_markers(data_pth=args["--data_pth"], 139 | outpath=args["--outpath"], 140 | style_0_label=args["--style_0_label"], 141 | style_1_label=args["--style_1_label"], 142 | thresh=float(args["--thresh"]), 143 | ngram_range=(int(args["--ngram_range_min"]), 144 | int(args["--ngram_range_max"])), 145 | style_label_col=args["--style_label_col"], 146 | gen_tags=(args["--gen_tags"] == "True")) 147 | 148 | 149 | 150 | # step 2: generate parallel dataset for the tagger 151 | prepare_parallel_data_tagger( 152 | args["--outpath"], args["--style_0_label"], args["--style_1_label"], is_unimodal) 153 | 154 | # step 3: generate parallel dataset for the generator 155 | prepare_parallel_data_generator( 156 | args["--outpath"], args["--style_0_label"], args["--style_1_label"], is_unimodal) 157 | -------------------------------------------------------------------------------- /tag-and-generate-data-prep/src/style_tags.py: -------------------------------------------------------------------------------- 1 | """Generates the tags for training the tager 2 | """ 3 | import pandas as pd 4 | from tqdm.autonotebook import tqdm 5 | import numpy as np 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | import numpy 9 | import scipy.sparse 10 | from collections import namedtuple 11 | import pickle 12 | import tqdm 13 | import numpy as np 14 | import csv 15 | from tqdm import tqdm 16 | import logging 17 | tqdm.pandas() 18 | 19 | TFIDFStats = namedtuple("TFIDFStats", ["data_id", "id_to_word", "word_to_id", "tfidf_avg", "word_to_idf", "counts"]) 20 | 21 | 22 | class TrainDataGen: 23 | """ 24 | Generates the training data 25 | """ 26 | def __init__(self, data, outpath, tags, tag_token, tgt_lang): 27 | super().__init__() 28 | self.data = data 29 | 30 | self.outpath = outpath 31 | self.tag_token = tag_token 32 | self.tags = tags 33 | self.tgt_lang = tgt_lang 34 | 35 | def generate(self): 36 | self.tag_and_dump(split="train") 37 | self.tag_and_dump(split="test") 38 | self.tag_and_dump(split="val") 39 | 40 | 41 | def tag_and_dump(self, split): 42 | """Iterate over the given split, tags the sentences and write out the data 43 | 44 | Arguments: 45 | split {[str]} -- [description] 46 | """ 47 | orig_sents, taged_sents = [], [] 48 | data_in = self.data[self.data["split"] == split] 49 | for _, r in data_in.iterrows(): 50 | orig = r["txt"].strip().replace("\n", "") 51 | orig_sents.append(orig) 52 | 
taged_sents.append(TrainDataGen.tag_sentence(orig, self.tags, self.tag_token).strip().replace("\n", "")) 53 | #polite_out.write(f"{orig}\n") 54 | #polite_taged_out.write(f"{taged_sent}\n") 55 | with open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.en.{self.tag_token}", "w") as orig_out,\ 56 | open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.{self.tgt_lang}.{self.tag_token}", "w") as taged_out: 57 | for orig, taged in tqdm(zip(orig_sents, taged_sents), total=len(taged_sents)): 58 | if self.tag_token in taged: 59 | ### ONLY WRITE OUT THE tagED DATA 60 | orig_out.write(f"{orig.strip()}\n") 61 | taged_out.write(f"{taged.strip()}\n") 62 | 63 | def tag_and_dump_batched(self, split): 64 | """Iterate over the given split, tags the sentences and write out the data 65 | 66 | Arguments: 67 | split {[str]} -- [description] 68 | """ 69 | orig_sents, taged_sents = [], [] 70 | self.data["txt_taged"] = self.data["txt"].progress_apply(lambda x: \ 71 | TrainDataGen.tag_sentence(orig, self.tags,\ 72 | self.tag_token).strip().replace("\n", "")) 73 | 74 | 75 | with open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.en.{self.tag_token}]", "w") as orig_out,\ 76 | open(f"{self.outpath}/en{self.tgt_lang}_parallel.{split}.{self.tgt_lang}.{self.tag_token}]", "w") as taged_out: 77 | for orig, taged in tqdm(zip(orig_sents, taged_sents), total=len(taged_sents)): 78 | if self.tag_token in taged: 79 | ### ONLY WRITE OUT THE tagED DATA 80 | orig_out.write(f"{orig.strip()}\n") 81 | taged_out.write(f"{taged.strip()}\n") 82 | 83 | @staticmethod 84 | def tag_sentence(sent, tag_dict, tag_token, 85 | pos_weight: int = 3, 86 | max_pos_indicator: int = 20, 87 | concat = True): 88 | """Given a sentence and a dictionary from 89 | tag_value to tag_probability, replaces all the words mw that are in the tag_dict 90 | with a probability tag_dict[mw] 91 | 92 | Arguments: 93 | sent {[str]} -- [the given sentence] 94 | tag_dict {[dict]} -- [the tag dictionary] 95 | tag_token {[str]} -- [the taging token] 96 | dont_concat -- [do not concat] 97 | 98 | Returns: 99 | [str] -- [the taged sentence] 100 | """ 101 | i = 0 102 | sent = sent.split() 103 | taged_sent = [] 104 | prev_tag = False 105 | while i < len(sent): 106 | loc = min(i // pos_weight, max_pos_indicator) 107 | key_bi_gram = " ".join(sent[i: i + 2]) 108 | key_tri_gram = " ".join(sent[i: i + 3]) 109 | key_quad_gram = " ".join(sent[i: i + 4]) 110 | 111 | if key_quad_gram in tag_dict and np.random.rand() < tag_dict[key_quad_gram]: 112 | if not concat or not prev_tag: 113 | taged_sent.append(f"[{tag_token}{loc}]") 114 | prev_tag = True 115 | i += 4 116 | 117 | elif key_tri_gram in tag_dict and np.random.rand() < tag_dict[key_tri_gram]: 118 | if not concat or not prev_tag: 119 | taged_sent.append(f"[{tag_token}{loc}]") 120 | prev_tag = True 121 | i += 3 122 | elif key_bi_gram in tag_dict and np.random.rand() < tag_dict[key_bi_gram]: 123 | if not concat or not prev_tag: 124 | taged_sent.append(f"[{tag_token}{loc}]") 125 | prev_tag = True 126 | i += 2 127 | elif sent[i] in tag_dict and np.random.rand()< tag_dict[sent[i]]: 128 | if not concat or not prev_tag: 129 | taged_sent.append(f"[{tag_token}{loc}]") 130 | prev_tag = True 131 | i += 1 132 | else: 133 | taged_sent.append(sent[i]) 134 | prev_tag = False 135 | i += 1 136 | return " ".join(taged_sent) 137 | 138 | class TFIDFStatsGenerator: 139 | 140 | def __init__(self, data, data_id, ngram_range): 141 | super().__init__() 142 | self.ngram_range = ngram_range 143 | self.data_id = data_id 144 | self.data = data 145 | 
self.generate() 146 | 147 | def get_word_counts(self): 148 | """Generates the counts for various n-grams for the given corpus 149 | 150 | Returns: 151 | a dictionary from phrase to word count 152 | """ 153 | cv = CountVectorizer(ngram_range=self.ngram_range) 154 | cv_fit = cv.fit_transform(self.data) 155 | feature_names = cv.get_feature_names() 156 | X = np.asarray(cv_fit.sum(axis=0)) # sum counts across sentences 157 | word_to_id = {feature_names[i]: i for i in range(len(cv.get_feature_names()))} 158 | word_count = {} 159 | for w in word_to_id: 160 | word_count[w] = X[0, word_to_id[w]] 161 | return word_count 162 | 163 | def generate(self): 164 | """Generates various TFIDF related stats 165 | for the given data and wraps them in a namedtuple 166 | 167 | Returns: 168 | [type] -- [description] 169 | """ 170 | logging.info("Running TfidfVectorizer") 171 | vectorizer = TfidfVectorizer(ngram_range=self.ngram_range) 172 | X = vectorizer.fit_transform(self.data) 173 | feature_names = vectorizer.get_feature_names() 174 | id_to_word = {i: feature_names[i] for i in range(len(vectorizer.get_feature_names()))} 175 | word_to_id = {v: k for k, v in id_to_word.items()} 176 | X = np.asarray(X.mean(axis=0)).squeeze(0) # / num_docs 177 | 178 | idf = vectorizer.idf_ 179 | counts = self.get_word_counts() 180 | word_to_idf = dict(zip(feature_names, idf)) 181 | 182 | self.id_to_word = id_to_word 183 | self.word_to_id = word_to_id 184 | self.tfidf_avg = X 185 | self.word_to_idf = word_to_idf 186 | self.counts = counts 187 | 188 | 189 | class RelativeTagsGenerator: 190 | 191 | def __init__(self, main_class_stats, relative_class_stats, 192 | min_freq: int = 2, thresh: float = 0.90, 193 | ignore_from_tags = None): 194 | """Generates tags for the main class relative to 195 | the relative class. This is done on the basis of relative TF-IDF ratios of the words. 196 | 197 | Arguments: 198 | main_class_stats {[type]} -- [description] 199 | ref_class_stats {[type]} -- [description] 200 | 201 | Keyword Arguments: 202 | min_freq {int} -- [Minimum freq in the main class for the phrase to be considered] (default: {1}) 203 | thresh {float} -- [The relative tf-idf scores are converted to percentiles. These percentiles are then 204 | used to select the tag phrases. In this case, the cutoff for such phrases is 0.90] (default: {0.90}) 205 | ignore_from_tags {[set]} -- [Set of words like the NER words, which might have to be ignored] (default: {None}) 206 | """ 207 | super().__init__() 208 | self.main_class_stats = main_class_stats 209 | self.relative_class_stats = relative_class_stats 210 | self.min_freq = min_freq 211 | self.c1_tag = main_class_stats.data_id 212 | self.c2_tag = relative_class_stats.data_id 213 | self.thresh = thresh 214 | self.ignore_from_tags = ignore_from_tags 215 | 216 | self.generate_tfidf_report() 217 | self.generate_relative_tags() 218 | 219 | 220 | def generate_tfidf_report(self): 221 | """Given TFIDF statistics on two datasets, returns a common tf-idf report. 
222 | The report measures various statistics on the words that appear in class_2 223 | 224 | Arguments: 225 | class1_tfidf_report {[TFIDFStats]} -- [TFIDFStats for class1] 226 | class2_tfidf_report {[TFIDFStats]} -- [TFIDFStats for class2] 227 | """ 228 | report = [] 229 | for word in self.main_class_stats.word_to_id.keys(): 230 | if self.main_class_stats.counts[word] >= self.min_freq and word in self.relative_class_stats.word_to_id: 231 | res = {} 232 | res["word"] = word 233 | res["freq"] = self.main_class_stats.counts[word] 234 | res[f"{self.c1_tag}_mean_tfidf"] = self.main_class_stats.tfidf_avg[self.main_class_stats.word_to_id[word]] 235 | res[f"{self.c2_tag}_mean_tfidf"] = self.relative_class_stats.tfidf_avg[self.relative_class_stats.word_to_id[word]] 236 | res[f"{self.c1_tag}_idf"] = self.main_class_stats.word_to_idf[word] 237 | res[f"{self.c2_tag}_idf"] = self.relative_class_stats.word_to_idf[word] 238 | report.append(res) 239 | self.report = pd.DataFrame(report) 240 | 241 | def generate_relative_tags(self): 242 | """Returns a dictionary of phrases that are important in class1 relative to 243 | class2 244 | """ 245 | c1_over_c2 = f"{self.c1_tag}_over_{self.c2_tag}" 246 | c2_over_c1 = f"{self.c2_tag}_over_{self.c1_tag}" 247 | # tfidf_report["np_over_p"] = (tfidf_report["np_mean_tfidf"] / len(data_p_0)) / (tfidf_report["p_mean_tfidf"] / len(data_p_9)) 248 | self.report[c1_over_c2] = self.report[f"{self.c1_tag}_mean_tfidf"] / self.report[f"{self.c2_tag}_mean_tfidf"] #ratio of tf-idf in the two corpora 249 | 250 | self.report[c2_over_c1] = 1 / self.report[c1_over_c2] 251 | 252 | self.report[f"{self.c1_tag}_tag"] = (self.report[c1_over_c2] / self.report[c1_over_c2].sum()) ** 0.75 253 | # ^ add support for the small values 254 | 255 | self.report[f"{self.c1_tag}_tag"] = self.report[f"{self.c1_tag}_tag"] / self.report[f"{self.c1_tag}_tag"].sum() 256 | # ^ make a probability 257 | 258 | self.report.sort_values(by=f"{self.c1_tag}_tag", ascending=False, inplace=True) 259 | self.report['rank'] = self.report[f"{self.c1_tag}_tag"].rank(pct=True) 260 | # ^ assign percentile 261 | 262 | 263 | important_phrases = self.report[self.report["rank"] >= self.thresh] 264 | # ^ only take phrases that clear the threshold (default: 0.9) 265 | 266 | important_phrases["score"] = (important_phrases["rank"] - self.thresh) / (1 - self.thresh) 267 | # ^ make a distribution again 268 | 269 | tags= {} 270 | for i, r in important_phrases.iterrows(): 271 | tags[r["word"]] = r["score"] 272 | 273 | self.tags = tags 274 | 275 | if self.ignore_from_tags is not None: 276 | logging.info("Ignoring tags") 277 | self.tags = self.filter_tags_with_ignored_entities() 278 | 279 | def filter_tags_with_ignored_entities(self): 280 | res = {} 281 | for k, v in self.tags.items(): 282 | if not any(k_part in self.ignore_from_tags for k_part in k.split()): 283 | res[k] = v 284 | return res 285 | -------------------------------------------------------------------------------- /tag-and-generate-train/README.md: -------------------------------------------------------------------------------- 1 | # Training Tagger and Generator 2 | 3 | - This repository contains the code to train the ``tagger`` and ``generator`` [modules](https://arxiv.org/abs/2004.14257). 4 | 5 | - Apart from scripts to train the modules, it also has scripts needed to run inference on the test set and to run evaluation for metrics like `BLEU`, `ROUGE`, and `METEOR`. 
6 | 7 | --- 8 | 9 | ## Background 10 | 11 | - Both `tagger` and `generator` are seq2seq models that require parallel data generated by the [data prep module](https://github.com/tag-and-generate/code-pre-release/tree/master/tag-and-generate-data-prep). 12 | 13 | - The parallel datasets are: 14 | - Tagger: 15 | `entagged_parallel.{split}.en` → `entagged_parallel.{split}.tagged` 16 | - Generator: 17 | `engenerated_parallel.{split}.en` → `engenerated_parallel.{split}.generated` 18 | (where `{split}` is either train, test, or dev.) 19 | 20 | --- 21 | 22 | ## Prepare BPE 23 | 24 | ```sh 25 | bash scripts/prepare_bpe.sh [tagged|generated] {base_folder} 26 | ``` 27 | 28 | Where: 29 | 30 | - `base_folder`: The folder in which the data files are stored (the argument used when creating the training data) 31 | 32 | --- 33 | 34 | ## Train Tagger and Generator 35 | 36 | ```sh 37 | bash scripts/train_tagger.sh tagged {handle} {base_folder} 38 | ``` 39 | 40 | Where: 41 | 42 | - `handle:` An identifier used to bucketize models trained on different datasets. Models for each `handle` are stored in separate folders, with names indexed by `{handle}`, within the `models` directory. 43 | - `base_folder:` The folder in which the data files are stored (the argument used when creating the training data). 44 | 45 | ### Train Generator 46 | 47 | ```sh 48 | bash scripts/train_generator.sh generated {handle} {base_folder} 49 | ``` 50 | 51 | Where: 52 | 53 | - `handle:` An identifier used to bucketize models trained on different datasets. Models for each `handle` are stored in separate folders, with names indexed by `{handle}`, within the `models` directory. 54 | - `base_folder:` The folder in which the data files are stored (the argument used when creating the training data). 55 | 56 | --- 57 | 58 | ## Inference 59 | 60 | ```sh 61 | bash scripts/inference.sh {input_file} {jobname}\ 62 | tagged generated\ 63 | {handle}\ 64 | {style_0_label} {style_1_label}\ 65 | {base_folder} {device} 66 | ``` 67 | 68 | Where: 69 | 70 | - `input_file:` The input test file to be transferred. This is a raw text file with one sentence per line. 71 | - `jobname:` A unique identifier for the inference job. 72 | - `handle:` The dataset argument passed when training the `tagger` or `generator` -- used to identify the model paths for the `tagger` and `generator`. 73 | - `style_0_label:` A label for style 0 74 | - `style_1_label:` A label for style 1 75 | - `base_folder:` The folder in which the data files are stored (the argument used when creating the training data) 76 | - `device:` GPU id 77 | 78 | --- 79 | 80 | ## Evaluation 81 | 82 | ```sh 83 | bash run_context_eval.sh {hypothesis_filepath} {reference_filepath} 84 | ``` 85 | 86 | Where: 87 | 88 | - `hypothesis_filepath:` The full path to the transferred output from the trained model (the hypothesis). 89 | - `reference_filepath:` The full path to the ideal output (for BLEU-r) or the original input file (for BLEU-s). 90 | 91 | ## Trained Models 92 | 93 | The trained models can be found [here](https://drive.google.com/drive/folders/1tXLC4WbXc_WLgvQu2mTa3jDe0efZ3dz1?usp=sharing).
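As a concrete instance of the evaluation command above, reusing the files from the top-level walkthrough (`polite_references.txt` is a hypothetical file of ideal rewrites, shown only to illustrate the two reference choices):

```sh
# BLEU-s: score the transferred output against the original source sentences
bash run_context_eval.sh experiments/sample_output input.txt
# BLEU-r: score the same output against ideal/reference rewrites, if such a file exists
bash run_context_eval.sh experiments/sample_output polite_references.txt
```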
94 | 95 | ## References 96 | 97 | - The code for evaluation has been partially borrowed from https://github.com/Maluuba/nlg-eval 98 | - Most of the code for the training pipeline has been borrowed from https://github.com/pmichel31415/jsalt-2019-mt-tutorial 99 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/context_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Compute context based metrics for hypothesis given reference 3 | 4 | Usage: 5 | context_eval.py [options] 6 | 7 | Options: 8 | --hyp= Path to model hypothesis 9 | --ref= Path to model reference 10 | """ 11 | 12 | from nlgeval import compute_metrics 13 | from docopt import docopt 14 | 15 | class Scorer: 16 | 17 | def __init__(self, ref_file, hyp_file): 18 | self.ref_file = ref_file 19 | self.hyp_file = hyp_file 20 | self.references = list(map(lambda x:x.strip('\n'), open(ref_file, 'r').readlines())) 21 | self.hypothesis = list(map(lambda x:x.strip('\n'), open(hyp_file, 'r').readlines())) 22 | self.metrics_dict = {} 23 | 24 | def score(self): 25 | hyp_test_str = "\n".join([h.replace('\n', '') for h in self.hypothesis]) 26 | ref_test_str = "\n".join([r.replace('\n', '') for r in self.references]) 27 | with open("/tmp/hyp.txt", 'w') as fd_hyp: 28 | fd_hyp.write(hyp_test_str) 29 | fd_hyp.close() 30 | with open("/tmp/ref.txt", 'w') as fd_ref: 31 | fd_ref.write(ref_test_str) 32 | fd_ref.close() 33 | 34 | self.metrics_dict = compute_metrics(hypothesis="/tmp/hyp.txt", references=["/tmp/ref.txt"], no_glove=True, no_skipthoughts=True) 35 | 36 | def print_metrics(self): 37 | for key in self.metrics_dict: 38 | print (key + "\t\t" + str(self.metrics_dict[key])) 39 | 40 | def evaluate(): 41 | args = docopt(__doc__) 42 | scorer = Scorer(args["--ref"], args["--hyp"]) 43 | scorer.score() 44 | # scorer.print_metrics() # Script already prints. Uncomment if needed 45 | 46 | if __name__ == '__main__': 47 | evaluate() 48 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/LICENSE.md: -------------------------------------------------------------------------------- 1 | # nlg-eval 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | All rights reserved. 6 | 7 | MIT License 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 14 | 15 | ## THIRD PARTY NOTICES 16 | 17 | This project is based on or incorporates material from the projects listed below (collectively, Third Party Code). 
Microsoft is not the original author of the Third Party Code. The original copyright notice and the license under which Microsoft received such Third Party Code, are set forth below. Such licenses and notices are provided for informational purposes only. Microsoft licenses the Third Party Code to you under the terms set forth in the Use Terms for the Microsoft Product. Microsoft reserves all other rights not expressly granted under this agreement, whether by implication, estoppel or otherwise. 18 | 19 | ### Skip-Thoughts 20 | From the paper "Skip-Thought Vectors." arXiv preprint arXiv:1506.06726 (2015). 21 | Copyright (c) 2015 Ryan Kiros, Yukun Zhu, Ruslan Salakhutdinov, Richard S. Zemel, Antonio Torralba, Raquel Urtasun, and Sanja Fidler 22 | https://github.com/ryankiros/skip-thoughts 23 | 24 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 25 | 26 | You may obtain a copy of the License at 27 | 28 | http://www.apache.org/licenses/LICENSE-2.0 29 | 30 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 31 | 32 | See the License for the specific language governing permissions and limitations under the License. 33 | 34 | ### Microsoft COCO Caption Evaluation 35 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 36 | All rights reserved. 37 | 38 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 39 | 40 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 41 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 42 | 43 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIEDWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 44 | 45 | The views and conclusions contained in the software and documentation are those of the authors and should not be interpreted as representing official policies, either expressed or implied, of the FreeBSD Project. 
46 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include nlgeval * 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/Maluuba/nlg-eval.svg?branch=master)](https://travis-ci.org/Maluuba/nlg-eval) 2 | 3 | # nlg-eval 4 | Evaluation code for various unsupervised automated metrics for NLG (Natural Language Generation). 5 | It takes as input a hypothesis file, and one or more references files and outputs values of metrics. 6 | Rows across these files should correspond to the same example. 7 | 8 | ## Metrics ## 9 | - BLEU 10 | - METEOR 11 | - ROUGE 12 | - CIDEr 13 | - SkipThought cosine similarity 14 | - Embedding Average cosine similarity 15 | - Vector Extrema cosine similarity 16 | - Greedy Matching score 17 | 18 | ## Setup ## 19 | 20 | Install Java 1.8.0 (or higher). 21 | Then run: 22 | 23 | ```bash 24 | # Install the Python dependencies. 25 | pip install git+https://github.com/Maluuba/nlg-eval.git@master 26 | 27 | # If using macOS High Sierra or higher, run this before run setup, to allow multithreading 28 | # export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES 29 | 30 | # Simple setup: 31 | # Download required data (e.g. models, embeddings) and external code files. 32 | nlg-eval --setup 33 | ``` 34 | 35 | ### Custom Setup ### 36 | ```bash 37 | # If you don't like the default path (~/.cache/nlgeval) for the downloaded data, 38 | # then specify a path where you want the files to be downloaded. 39 | # The value for the data path is stored in ~/.config/nlgeval/rc.json and can be overwritten by 40 | # setting the NLGEVAL_DATA environment variable. 41 | nlg-eval --setup ${data_path} 42 | ``` 43 | 44 | ## Usage ## 45 | Once setup has completed, the metrics can be evaluated with a Python API or in the command line. 46 | 47 | Examples of the Python API can be found in [test_nlgeval.py](nlgeval/tests/test_nlgeval.py). 48 | 49 | ### Standalone ### 50 | 51 | nlg-eval --hypothesis=examples/hyp.txt --references=examples/ref1.txt --references=examples/ref2.txt 52 | 53 | where each line in the hypothesis file is a generated sentence and the corresponding 54 | lines across the reference files are ground truth reference sentences for the 55 | corresponding hypothesis. 56 | 57 | ### functional API: for the entire corpus ### 58 | 59 | ```python 60 | from nlgeval import compute_metrics 61 | metrics_dict = compute_metrics(hypothesis='examples/hyp.txt', 62 | references=['examples/ref1.txt', 'examples/ref2.txt']) 63 | ``` 64 | 65 | ### functional API: for only one sentence ### 66 | 67 | ```python 68 | from nlgeval import compute_individual_metrics 69 | metrics_dict = compute_individual_metrics(references, hypothesis) 70 | ``` 71 | 72 | where `references` is a list of ground truth reference text strings and 73 | `hypothesis` is the hypothesis text string. 
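For instance, a minimal sketch that scores one of the bundled example sentences against two references (the `no_skipthoughts` and `no_glove` flags skip the embedding-based metrics, so this runs without the downloaded skip-thought and GloVe models; METEOR still requires Java as noted in the setup above):

```python
from nlgeval import compute_individual_metrics

# Sentences borrowed from the examples/ directory of this package.
references = ["this is one reference sentence for sentence1",
              "this is one more reference sentence for sentence1"]
hypothesis = "this is the model generated sentence1 which seems good enough"

# Only the overlap metrics (BLEU, METEOR, ROUGE_L, CIDEr) are computed here.
metrics_dict = compute_individual_metrics(references, hypothesis,
                                          no_skipthoughts=True, no_glove=True)
print(metrics_dict["Bleu_4"], metrics_dict["ROUGE_L"])
```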
74 | 75 | ### object oriented API for repeated calls in a script - single example ### 76 | 77 | ```python 78 | from nlgeval import NLGEval 79 | nlgeval = NLGEval() # loads the models 80 | metrics_dict = nlgeval.compute_individual_metrics(references, hypothesis) 81 | ``` 82 | 83 | where `references` is a list of ground truth reference text strings and 84 | `hypothesis` is the hypothesis text string. 85 | 86 | ### object oriented API for repeated calls in a script - multiple examples ### 87 | 88 | ```python 89 | from nlgeval import NLGEval 90 | nlgeval = NLGEval() # loads the models 91 | metrics_dict = nlgeval.compute_metrics(references, hypothesis) 92 | ``` 93 | 94 | where `references` is a list of lists of ground truth reference text strings and 95 | `hypothesis` is a list of hypothesis text strings. Each inner list in `references` 96 | is one set of references for the hypothesis (a list of single reference strings for 97 | each sentence in `hypothesis` in the same order). 98 | 99 | ## Reference ## 100 | If you use this code as part of any published research, please cite the following paper: 101 | 102 | Shikhar Sharma, Layla El Asri, Hannes Schulz, and Jeremie Zumer. 103 | **"Relevance of Unsupervised Metrics in Task-Oriented Dialogue for Evaluating Natural Language Generation"** 104 | *arXiv preprint arXiv:1706.09799* (2017) 105 | 106 | ```bibtex 107 | @article{sharma2017nlgeval, 108 | author = {Sharma, Shikhar and El Asri, Layla and Schulz, Hannes and Zumer, Jeremie}, 109 | title = {Relevance of Unsupervised Metrics in Task-Oriented Dialogue for Evaluating Natural Language Generation}, 110 | journal = {CoRR}, 111 | volume = {abs/1706.09799}, 112 | year = {2017}, 113 | url = {http://arxiv.org/abs/1706.09799} 114 | } 115 | ``` 116 | 117 | ## Example ## 118 | Running 119 | 120 | nlg-eval --hypothesis=examples/hyp.txt --references=examples/ref1.txt --references=examples/ref2.txt 121 | 122 | gives 123 | 124 | Bleu_1: 0.550000 125 | Bleu_2: 0.428174 126 | Bleu_3: 0.284043 127 | Bleu_4: 0.201143 128 | METEOR: 0.295797 129 | ROUGE_L: 0.522104 130 | CIDEr: 1.242192 131 | SkipThoughtsCosineSimilairty: 0.626149 132 | EmbeddingAverageCosineSimilairty: 0.884690 133 | VectorExtremaCosineSimilarity: 0.568696 134 | GreedyMatchingScore: 0.784205 135 | 136 | ## Troubleshooting 137 | If you have issues with Meteor then you can try lowering the `mem` variable in meteor.py 138 | 139 | ## Important Note ## 140 | CIDEr by default (with idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, 141 | CIDEr score for a reference dataset with only 1 image (or example for NLG) will be zero. When evaluating using one (or few) 142 | images, set idf to "coco-val-df" instead, which uses IDF from the MSCOCO Vaildation Dataset for reliable results. This has 143 | not been adapted in this code. For this use-case, apply patches from 144 | [vrama91/coco-caption](https://github.com/vrama91/coco-caption). 145 | 146 | 147 | ## External data directory 148 | 149 | To mount an already prepared data directory to a Docker container or share it between 150 | users, you can set the `NLGEVAL_DATA` environment variable to let nlg-eval know 151 | where to find its models and data. E.g. 
152 | 153 | NLGEVAL_DATA=~/workspace/nlg-eval/nlgeval/data 154 | 155 | This variable overrides the value provided during setup (stored in `~/.config/nlgeval/rc.json`) 156 | 157 | ## Microsoft Open Source Code of Conduct ## 158 | This project has adopted the [Microsoft Open Source Code of 159 | Conduct](https://opensource.microsoft.com/codeofconduct/). 160 | For more information see the [Code of Conduct 161 | FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 162 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) 163 | with any additional questions or comments. 164 | 165 | ## License ## 166 | See [LICENSE.md](LICENSE.md). 167 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/bin/nlg-eval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 5 | 6 | import json 7 | import logging 8 | import os 9 | import stat 10 | import sys 11 | import time 12 | from zipfile import ZipFile 13 | 14 | import click 15 | from xdg import XDG_CONFIG_HOME, XDG_CACHE_HOME 16 | 17 | import nlgeval 18 | import nlgeval.utils 19 | 20 | CODE_PATH = nlgeval.__path__[0] 21 | 22 | 23 | def _download_file(d): 24 | import requests 25 | from tqdm import tqdm 26 | 27 | url, target_dir = d['url'], d['target_dir'] 28 | filename = url[url.rfind('/') + 1:] 29 | target_path = os.path.join(target_dir, filename) 30 | if not os.path.exists(target_path): 31 | # Collect data 1MB at a time. 32 | chunk_size = 1 * 1024 * 1024 33 | 34 | num_attempts = 3 35 | 36 | for attempt_num in range(num_attempts): 37 | try: 38 | print("Downloading {} to {}.".format(url, target_dir)) 39 | r = requests.get(url, stream=True) 40 | r.raise_for_status() 41 | 42 | total = None 43 | length = r.headers.get('Content-length') 44 | if length is not None: 45 | total = int(length) // chunk_size + 1 46 | 47 | with open(target_path, 'wb') as f: 48 | for chunk in tqdm(r.iter_content(chunk_size=chunk_size), 49 | desc="{}".format(filename), 50 | total=total, 51 | unit_scale=True, mininterval=15, unit=" chunks"): 52 | sys.stdout.flush() 53 | f.write(chunk) 54 | break 55 | except: 56 | if attempt_num < num_attempts - 1: 57 | wait_s = 1 * 60 58 | logging.exception("Error downloading file, will retry in %ds.", wait_s) 59 | # Wait and try to download later. 60 | time.sleep(wait_s) 61 | else: 62 | raise 63 | 64 | 65 | @click.command() 66 | @click.argument("data_path", required=False) 67 | def setup(data_path): 68 | """ 69 | Download required code and data files for nlg-eval. 70 | 71 | If the data_path argument is provided, install to the given location. 72 | Otherwise, your cache directory is used (usually ~/.cache/nlgeval). 
73 | """ 74 | from nltk.downloader import download 75 | download('punkt') 76 | 77 | from multiprocessing import Pool 78 | 79 | if data_path is None: 80 | data_path = os.getenv('NLGEVAL_DATA', os.path.join(XDG_CACHE_HOME, 'nlgeval')) 81 | click.secho("Installing to {}".format(data_path), fg='red') 82 | click.secho("In case of incomplete downloads, delete the directory and run `nlg-eval --setup {}' again.".format(data_path), 83 | fg='red') 84 | 85 | path = os.path.join(CODE_PATH, 'word2vec/glove2word2vec.py') 86 | if os.path.exists(path): 87 | os.remove(path) 88 | 89 | downloads = [] 90 | 91 | if sys.version_info[0] == 2: 92 | downloads.append(dict( 93 | url='https://raw.githubusercontent.com/manasRK/glove-gensim/42ce46f00e83d3afa028fb6bf17ed3c90ca65fcc/glove2word2vec.py', 94 | target_dir=os.path.join(CODE_PATH, 'word2vec') 95 | )) 96 | else: 97 | downloads.append(dict( 98 | url='https://raw.githubusercontent.com/robmsmt/glove-gensim/4c2224bccd61627b76c50a5e1d6afd1c82699d22/glove2word2vec.py', 99 | target_dir=os.path.join(CODE_PATH, 'word2vec') 100 | )) 101 | 102 | setup_glove = not os.path.exists(os.path.join(data_path, 'glove.6B.300d.model.bin')) 103 | if setup_glove: 104 | downloads.append(dict( 105 | url='http://nlp.stanford.edu/data/glove.6B.zip', 106 | target_dir=data_path 107 | )) 108 | 109 | # Skip-thoughts data. 110 | downloads.append(dict( 111 | url='http://www.cs.toronto.edu/~rkiros/models/dictionary.txt', 112 | target_dir=data_path 113 | )) 114 | downloads.append(dict( 115 | url='http://www.cs.toronto.edu/~rkiros/models/utable.npy', 116 | target_dir=data_path 117 | )) 118 | downloads.append(dict( 119 | url='http://www.cs.toronto.edu/~rkiros/models/btable.npy', 120 | target_dir=data_path 121 | )) 122 | downloads.append(dict( 123 | url='http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz', 124 | target_dir=data_path 125 | )) 126 | downloads.append(dict( 127 | url='http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl', 128 | target_dir=data_path 129 | )) 130 | downloads.append(dict( 131 | url='http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz', 132 | target_dir=data_path 133 | )) 134 | downloads.append(dict( 135 | url='http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl', 136 | target_dir=data_path 137 | )) 138 | 139 | # multi-bleu.perl 140 | downloads.append(dict( 141 | url='https://raw.githubusercontent.com/moses-smt/mosesdecoder/b199e654df2a26ea58f234cbb642e89d9c1f269d/scripts/generic/multi-bleu.perl', 142 | target_dir=os.path.join(CODE_PATH, 'multibleu') 143 | )) 144 | 145 | for target_dir in {d['target_dir'] for d in downloads}: 146 | if not os.path.exists(target_dir): 147 | os.makedirs(target_dir) 148 | 149 | # Limit the number of threads so that we don't download too much from the same source concurrently. 
150 | pool = Pool(min(4, len(downloads))) 151 | pool.map(_download_file, downloads) 152 | pool.close() 153 | pool.join() 154 | 155 | if setup_glove: 156 | from nlgeval.word2vec.generate_w2v_files import generate 157 | with ZipFile(os.path.join(data_path, 'glove.6B.zip')) as z: 158 | z.extract('glove.6B.300d.txt', data_path) 159 | generate(data_path) 160 | for p in [ 161 | os.path.join(data_path, 'glove.6B.zip'), 162 | os.path.join(data_path, 'glove.6B.300d.txt'), 163 | os.path.join(data_path, 'glove.6B.300d.model.txt'), 164 | ]: 165 | if os.path.exists(p): 166 | os.remove(p) 167 | 168 | path = os.path.join(CODE_PATH, 'multibleu/multi-bleu.perl') 169 | stats = os.stat(path) 170 | os.chmod(path, stats.st_mode | stat.S_IEXEC) 171 | 172 | cfg_path = os.path.join(XDG_CONFIG_HOME, "nlgeval") 173 | if not os.path.exists(cfg_path): 174 | os.makedirs(cfg_path) 175 | rc = dict() 176 | try: 177 | with open(os.path.join(cfg_path, "rc.json"), 'rt') as f: 178 | rc = json.load(f) 179 | except: 180 | print("WARNING: could not read rc.json in %s, overwriting" % cfg_path) 181 | rc['data_path'] = data_path 182 | with open(os.path.join(cfg_path, "rc.json"), 'wt') as f: 183 | f.write(json.dumps(rc)) 184 | 185 | 186 | @click.command() 187 | @click.option('--references', type=click.Path(exists=True), multiple=True, required=True, help='Path of the reference file. This option can be provided multiple times for multiple reference files.') 188 | @click.option('--hypothesis', type=click.Path(exists=True), required=True, help='Path of the hypothesis file.') 189 | @click.option('--no-overlap', is_flag=True, help='Flag. If provided, word overlap based metrics will not be computed.') 190 | @click.option('--no-skipthoughts', is_flag=True, help='Flag. If provided, skip-thought cosine similarity will not be computed.') 191 | @click.option('--no-glove', is_flag=True, help='Flag. If provided, other word embedding based metrics will not be computed.') 192 | def compute_metrics(hypothesis, references, no_overlap, no_skipthoughts, no_glove): 193 | """ 194 | Compute nlg-eval metrics. 195 | 196 | The --hypothesis and at least one --references parameters are required. 197 | 198 | To download the data and additional code files, use `nlg-eval --setup [data path]`. 199 | 200 | Note that nlg-eval also features an API, which may be easier to use. 
201 | """ 202 | try: 203 | data_dir = nlgeval.utils.get_data_dir() 204 | except nlgeval.utils.InvalidDataDirException: 205 | sys.exit(1) 206 | click.secho("Using data from {}".format(data_dir), fg='green') 207 | click.secho("In case of broken downloads, remove the directory and run setup again.", fg='green') 208 | nlgeval.compute_metrics(hypothesis, references, no_overlap, no_skipthoughts, no_glove) 209 | 210 | 211 | if __name__ == '__main__': 212 | if len(sys.argv) > 1 and sys.argv[1] == '--setup': 213 | del sys.argv[0] 214 | setup() 215 | else: 216 | compute_metrics() 217 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/examples/hyp.txt: -------------------------------------------------------------------------------- 1 | this is the model generated sentence1 which seems good enough 2 | this is sentence2 which has been generated by your model 3 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/examples/ref1.txt: -------------------------------------------------------------------------------- 1 | this is one reference sentence for sentence1 2 | this is a reference sentence for sentence2 which was generated by your model 3 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/examples/ref2.txt: -------------------------------------------------------------------------------- 1 | this is one more reference sentence for sentence1 2 | this is the second reference sentence for sentence2 3 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while() { 51 | chomp; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while() { 60 | chomp; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } 178 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 3 | from __future__ import print_function 4 | 5 | import six 6 | from six.moves import map 7 | 8 | from nlgeval.pycocoevalcap.bleu.bleu import Bleu 9 | from nlgeval.pycocoevalcap.cider.cider import Cider 10 | from nlgeval.pycocoevalcap.meteor.meteor import Meteor 11 | from nlgeval.pycocoevalcap.rouge.rouge import Rouge 12 | 13 | 14 | # str/unicode stripping in Python 2 and 3 instead of `str.strip`. 
15 | def _strip(s): 16 | return s.strip() 17 | 18 | 19 | def compute_metrics(hypothesis, references, no_overlap=False, no_skipthoughts=False, no_glove=False): 20 | with open(hypothesis, 'r') as f: 21 | hyp_list = f.readlines() 22 | ref_list = [] 23 | for iidx, reference in enumerate(references): 24 | with open(reference, 'r') as f: 25 | ref_list.append(f.readlines()) 26 | ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)] 27 | refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)} 28 | hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)} 29 | assert len(refs) == len(hyps) 30 | 31 | ret_scores = {} 32 | if not no_overlap: 33 | scorers = [ 34 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 35 | (Meteor(), "METEOR"), 36 | (Rouge(), "ROUGE_L"), 37 | (Cider(), "CIDEr") 38 | ] 39 | for scorer, method in scorers: 40 | score, scores = scorer.compute_score(refs, hyps) 41 | if isinstance(method, list): 42 | for sc, scs, m in zip(score, scores, method): 43 | print("%s: %0.6f" % (m, sc)) 44 | ret_scores[m] = sc 45 | else: 46 | print("%s: %0.6f" % (method, score)) 47 | ret_scores[method] = score 48 | del scorers 49 | 50 | if not no_skipthoughts: 51 | from nlgeval.skipthoughts import skipthoughts 52 | import numpy as np 53 | from sklearn.metrics.pairwise import cosine_similarity 54 | 55 | model = skipthoughts.load_model() 56 | encoder = skipthoughts.Encoder(model) 57 | vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False) 58 | ref_list_T = np.array(ref_list).T.tolist() 59 | vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 60 | cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 61 | cosine_similarity = np.max(cosine_similarity, axis=0).mean() 62 | print("SkipThoughtsCosineSimilairty: %0.6f" % (cosine_similarity)) 63 | ret_scores['SkipThoughtCS'] = cosine_similarity 64 | del model 65 | 66 | if not no_glove: 67 | from nlgeval.word2vec.evaluate import eval_emb_metrics 68 | import numpy as np 69 | 70 | glove_hyps = [h.strip() for h in hyp_list] 71 | ref_list_T = np.array(ref_list).T.tolist() 72 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 73 | scores = eval_emb_metrics(glove_hyps, glove_refs) 74 | print(scores) 75 | scores = scores.split('\n') 76 | for score in scores: 77 | name, value = score.split(':') 78 | value = float(value.strip()) 79 | ret_scores[name] = value 80 | 81 | return ret_scores 82 | 83 | 84 | def compute_individual_metrics(ref, hyp, no_overlap=False, no_skipthoughts=False, no_glove=False): 85 | assert isinstance(hyp, six.string_types) 86 | 87 | if isinstance(ref, six.string_types): 88 | ref = ref.split('||<|>||') # special delimiter for backward compatibility 89 | ref = [a.strip() for a in ref] 90 | refs = {0: ref} 91 | ref_list = [ref] 92 | 93 | hyps = {0: [hyp.strip()]} 94 | hyp_list = [hyp] 95 | 96 | ret_scores = {} 97 | if not no_overlap: 98 | scorers = [ 99 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 100 | (Meteor(), "METEOR"), 101 | (Rouge(), "ROUGE_L"), 102 | (Cider(), "CIDEr") 103 | ] 104 | for scorer, method in scorers: 105 | score, scores = scorer.compute_score(refs, hyps) 106 | if isinstance(method, list): 107 | for sc, scs, m in zip(score, scores, method): 108 | ret_scores[m] = sc 109 | else: 110 | ret_scores[method] = score 111 | 112 | if not no_skipthoughts: 113 | from nlgeval.skipthoughts import skipthoughts 114 | import numpy as np 115 | from 
sklearn.metrics.pairwise import cosine_similarity 116 | 117 | model = skipthoughts.load_model() 118 | encoder = skipthoughts.Encoder(model) 119 | vector_hyps = encoder.encode([h.strip() for h in hyp_list], verbose=False) 120 | ref_list_T = np.array(ref_list).T.tolist() 121 | vector_refs = map(lambda refl: encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 122 | cosine_similarity = list(map(lambda refv: cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 123 | cosine_similarity = np.max(cosine_similarity, axis=0).mean() 124 | ret_scores['SkipThoughtCS'] = cosine_similarity 125 | 126 | if not no_glove: 127 | from nlgeval.word2vec.evaluate import eval_emb_metrics 128 | import numpy as np 129 | 130 | glove_hyps = [h.strip() for h in hyp_list] 131 | ref_list_T = np.array(ref_list).T.tolist() 132 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 133 | scores = eval_emb_metrics(glove_hyps, glove_refs) 134 | scores = scores.split('\n') 135 | for score in scores: 136 | name, value = score.split(':') 137 | value = float(value.strip()) 138 | ret_scores[name] = value 139 | 140 | return ret_scores 141 | 142 | 143 | class NLGEval(object): 144 | glove_metrics = { 145 | 'EmbeddingAverageCosineSimilairty', 146 | 'VectorExtremaCosineSimilarity', 147 | 'GreedyMatchingScore', 148 | } 149 | 150 | valid_metrics = { 151 | # Overlap 152 | 'Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 153 | 'METEOR', 154 | 'ROUGE_L', 155 | 'CIDEr', 156 | 157 | # Skip-thought 158 | 'SkipThoughtCS', 159 | } | glove_metrics 160 | 161 | def __init__(self, no_overlap=False, no_skipthoughts=False, no_glove=False, 162 | metrics_to_omit=None): 163 | """ 164 | :param no_overlap: Default: Use overlap metrics. 165 | `True` if these metrics should not be used. 166 | :type no_overlap: bool 167 | :param no_skipthoughts: Default: Use the skip-thoughts metric. 168 | `True` if this metrics should not be used. 169 | :type no_skipthoughts: bool 170 | :param no_glove: Default: Use GloVe based metrics. 171 | `True` if these metrics should not be used. 172 | :type no_glove: bool 173 | :param metrics_to_omit: Default: Use all metrics. See `NLGEval.valid_metrics` for all metrics. 174 | The previous parameters will override metrics in this one if they are set. 175 | Metrics to omit. Omitting Bleu_{i} will omit Bleu_{j} for j>=i. 
176 | :type metrics_to_omit: Optional[Collection[str]] 177 | """ 178 | 179 | if metrics_to_omit is None: 180 | self.metrics_to_omit = set() 181 | else: 182 | self.metrics_to_omit = set(metrics_to_omit) 183 | assert len(self.metrics_to_omit - self.valid_metrics) == 0, \ 184 | "Invalid metrics to omit: {}".format(self.metrics_to_omit - self.valid_metrics) 185 | 186 | self.no_overlap = no_overlap 187 | if not no_overlap: 188 | self.load_scorers() 189 | 190 | self.no_skipthoughts = no_skipthoughts or 'SkipThoughtCS' in self.metrics_to_omit 191 | if not self.no_skipthoughts: 192 | self.load_skipthought_model() 193 | 194 | self.no_glove = no_glove or len(self.glove_metrics - self.metrics_to_omit) == 0 195 | if not self.no_glove: 196 | self.load_glove() 197 | 198 | def load_scorers(self): 199 | self.scorers = [] 200 | 201 | omit_bleu_i = False 202 | for i in range(1, 4 + 1): 203 | if 'Bleu_{}'.format(i) in self.metrics_to_omit: 204 | omit_bleu_i = True 205 | if i > 1: 206 | self.scorers.append((Bleu(i - 1), ['Bleu_{}'.format(j) for j in range(1, i)])) 207 | break 208 | if not omit_bleu_i: 209 | self.scorers.append((Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"])) 210 | 211 | if 'METEOR' not in self.metrics_to_omit: 212 | self.scorers.append((Meteor(), "METEOR")) 213 | if 'ROUGE_L' not in self.metrics_to_omit: 214 | self.scorers.append((Rouge(), "ROUGE_L")) 215 | if 'CIDEr' not in self.metrics_to_omit: 216 | self.scorers.append((Cider(), "CIDEr")) 217 | 218 | 219 | def load_skipthought_model(self): 220 | from nlgeval.skipthoughts import skipthoughts 221 | import numpy as np 222 | from sklearn.metrics.pairwise import cosine_similarity 223 | self.np = np 224 | self.cosine_similarity = cosine_similarity 225 | 226 | model = skipthoughts.load_model() 227 | self.skipthought_encoder = skipthoughts.Encoder(model) 228 | 229 | def load_glove(self): 230 | from nlgeval.word2vec.evaluate import Embedding 231 | from nlgeval.word2vec.evaluate import eval_emb_metrics 232 | import numpy as np 233 | self.eval_emb_metrics = eval_emb_metrics 234 | self.np = np 235 | self.glove_emb = Embedding() 236 | 237 | def compute_individual_metrics(self, ref, hyp): 238 | assert isinstance(hyp, six.string_types) 239 | ref = [a.strip() for a in ref] 240 | refs = {0: ref} 241 | ref_list = [ref] 242 | 243 | hyps = {0: [hyp.strip()]} 244 | hyp_list = [hyp] 245 | 246 | ret_scores = {} 247 | if not self.no_overlap: 248 | for scorer, method in self.scorers: 249 | score, scores = scorer.compute_score(refs, hyps) 250 | if isinstance(method, list): 251 | for sc, scs, m in zip(score, scores, method): 252 | ret_scores[m] = sc 253 | else: 254 | ret_scores[method] = score 255 | 256 | if not self.no_skipthoughts: 257 | vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False) 258 | ref_list_T = self.np.array(ref_list).T.tolist() 259 | vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 260 | cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 261 | cosine_similarity = self.np.max(cosine_similarity, axis=0).mean() 262 | ret_scores['SkipThoughtCS'] = cosine_similarity 263 | 264 | if not self.no_glove: 265 | glove_hyps = [h.strip() for h in hyp_list] 266 | ref_list_T = self.np.array(ref_list).T.tolist() 267 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 268 | scores = self.eval_emb_metrics(glove_hyps, glove_refs, emb=self.glove_emb, 269 | 
metrics_to_omit=self.metrics_to_omit) 270 | scores = scores.split('\n') 271 | for score in scores: 272 | name, value = score.split(':') 273 | value = float(value.strip()) 274 | ret_scores[name] = value 275 | 276 | return ret_scores 277 | 278 | def compute_metrics(self, ref_list, hyp_list): 279 | ref_list = [list(map(_strip, refs)) for refs in zip(*ref_list)] 280 | refs = {idx: strippedlines for (idx, strippedlines) in enumerate(ref_list)} 281 | hyps = {idx: [lines.strip()] for (idx, lines) in enumerate(hyp_list)} 282 | assert len(refs) == len(hyps) 283 | 284 | ret_scores = {} 285 | if not self.no_overlap: 286 | for scorer, method in self.scorers: 287 | score, scores = scorer.compute_score(refs, hyps) 288 | if isinstance(method, list): 289 | for sc, scs, m in zip(score, scores, method): 290 | ret_scores[m] = sc 291 | else: 292 | ret_scores[method] = score 293 | 294 | if not self.no_skipthoughts: 295 | vector_hyps = self.skipthought_encoder.encode([h.strip() for h in hyp_list], verbose=False) 296 | ref_list_T = self.np.array(ref_list).T.tolist() 297 | vector_refs = map(lambda refl: self.skipthought_encoder.encode([r.strip() for r in refl], verbose=False), ref_list_T) 298 | cosine_similarity = list(map(lambda refv: self.cosine_similarity(refv, vector_hyps).diagonal(), vector_refs)) 299 | cosine_similarity = self.np.max(cosine_similarity, axis=0).mean() 300 | ret_scores['SkipThoughtCS'] = cosine_similarity 301 | 302 | if not self.no_glove: 303 | glove_hyps = [h.strip() for h in hyp_list] 304 | ref_list_T = self.np.array(ref_list).T.tolist() 305 | glove_refs = map(lambda refl: [r.strip() for r in refl], ref_list_T) 306 | scores = self.eval_emb_metrics(glove_hyps, glove_refs, emb=self.glove_emb) 307 | scores = scores.split('\n') 308 | for score in scores: 309 | name, value = score.split(':') 310 | value = float(value.strip()) 311 | ret_scores[name] = value 312 | 313 | return ret_scores 314 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/README.md: -------------------------------------------------------------------------------- 1 | # coco-caption 2 | 3 | Original README can be found at [tylin/coco-caption](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/README.md). 
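All of the scorers vendored here (BLEU, METEOR, ROUGE-L, CIDEr) expose essentially the same `compute_score(gts, res)` interface: both arguments are dicts keyed by example id, each hypothesis value is a single-element list of strings, and each reference value is a list of one or more strings. A minimal sketch of calling the BLEU scorer directly, with illustrative ids and sentences borrowed from this package's `examples/` files:

```python
from nlgeval.pycocoevalcap.bleu.bleu import Bleu

# References: one or more strings per id. Hypotheses: exactly one string per id.
refs = {0: ["this is one reference sentence for sentence1"],
        1: ["this is a reference sentence for sentence2 which was generated by your model"]}
hyps = {0: ["this is the model generated sentence1 which seems good enough"],
        1: ["this is sentence2 which has been generated by your model"]}

corpus_scores, per_sentence_scores = Bleu(4).compute_score(refs, hyps)
print(dict(zip(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], corpus_scores)))
```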
4 | 5 | ## License 6 | 7 | All files in the pycocoevalcap directory are under 8 | [BSD 2-clause "Simplified" License](https://github.com/tylin/coco-caption/blob/3f0fe9b819c0ea881a56441e4de1146924a394eb/license.txt) 9 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=0) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | import sys, math, re 21 | from collections import defaultdict 22 | 23 | import six 24 | from six.moves import xrange as range 25 | 26 | 27 | def precook(s, n=4, out=False): 28 | """Takes a string as input and returns an object that can be given to 29 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 30 | can take string arguments as well.""" 31 | words = s.split() 32 | counts = defaultdict(int) 33 | for k in range(1,n+1): 34 | for i in range(len(words)-k+1): 35 | ngram = tuple(words[i:i+k]) 36 | counts[ngram] += 1 37 | return (len(words), counts) 38 | 39 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 40 | '''Takes a list of reference sentences for a single segment 41 | and returns an object that encapsulates everything that BLEU 42 | needs to know about them.''' 43 | 44 | reflen = [] 45 | maxcounts = {} 46 | for ref in refs: 47 | rl, counts = precook(ref, n) 48 | reflen.append(rl) 49 | for (ngram,count) in six.iteritems(counts): 50 | maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 51 | 52 | # Calculate effective reference sentence length. 53 | if eff == "shortest": 54 | reflen = min(reflen) 55 | elif eff == "average": 56 | reflen = float(sum(reflen))/len(reflen) 57 | 58 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 59 | 60 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 61 | 62 | return (reflen, maxcounts) 63 | 64 | def cook_test(test, reflen_refmaxcounts, eff=None, n=4): 65 | '''Takes a test sentence and returns an object that 66 | encapsulates everything that BLEU needs to know about it.''' 67 | 68 | reflen, refmaxcounts = reflen_refmaxcounts 69 | testlen, counts = precook(test, n, True) 70 | 71 | result = {} 72 | 73 | # Calculate effective reference sentence length. 
74 | 75 | if eff == "closest": 76 | result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1] 77 | else: ## i.e., "average" or "shortest" or None 78 | result["reflen"] = reflen 79 | 80 | result["testlen"] = testlen 81 | 82 | result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)] 83 | 84 | result['correct'] = [0]*n 85 | for (ngram, count) in six.iteritems(counts): 86 | result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count) 87 | 88 | return result 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | # special_reflen is used in oracle (proportional effective ref len for a node). 96 | 97 | def copy(self): 98 | ''' copy the refs.''' 99 | new = BleuScorer(n=self.n) 100 | new.ctest = copy.copy(self.ctest) 101 | new.crefs = copy.copy(self.crefs) 102 | new._score = None 103 | return new 104 | 105 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 106 | ''' singular instance ''' 107 | 108 | self.n = n 109 | self.crefs = [] 110 | self.ctest = [] 111 | self.cook_append(test, refs) 112 | self.special_reflen = special_reflen 113 | 114 | def cook_append(self, test, refs): 115 | '''called by constructor and __iadd__ to avoid creating new instances.''' 116 | 117 | if refs is not None: 118 | self.crefs.append(cook_refs(refs)) 119 | if test is not None: 120 | cooked_test = cook_test(test, self.crefs[-1]) 121 | self.ctest.append(cooked_test) ## N.B.: -1 122 | else: 123 | self.ctest.append(None) # lens of crefs and ctest have to match 124 | 125 | self._score = None ## need to recompute 126 | 127 | def ratio(self, option=None): 128 | self.compute_score(option=option) 129 | return self._ratio 130 | 131 | def score_ratio(self, option=None): 132 | '''return (bleu, len_ratio) pair''' 133 | return (self.fscore(option=option), self.ratio(option=option)) 134 | 135 | def score_ratio_str(self, option=None): 136 | return "%.4f (%.2f)" % self.score_ratio(option) 137 | 138 | def reflen(self, option=None): 139 | self.compute_score(option=option) 140 | return self._reflen 141 | 142 | def testlen(self, option=None): 143 | self.compute_score(option=option) 144 | return self._testlen 145 | 146 | def retest(self, new_test): 147 | if type(new_test) is str: 148 | new_test = [new_test] 149 | assert len(new_test) == len(self.crefs), new_test 150 | self.ctest = [] 151 | for t, rs in zip(new_test, self.crefs): 152 | self.ctest.append(cook_test(t, rs)) 153 | self._score = None 154 | 155 | return self 156 | 157 | def rescore(self, new_test): 158 | ''' replace test(s) with new test(s), and returns the new score.''' 159 | 160 | return self.retest(new_test).compute_score() 161 | 162 | def size(self): 163 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 164 | return len(self.crefs) 165 | 166 | def __iadd__(self, other): 167 | '''add an instance (e.g., from another sentence).''' 168 | 169 | if type(other) is tuple: 170 | ## avoid creating new BleuScorer instances 171 | self.cook_append(other[0], other[1]) 172 | else: 173 | assert self.compatible(other), "incompatible BLEUs." 
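            # Merge the other scorer's cooked hypothesis/reference statistics;
            # the cached corpus score is invalidated below and recomputed on the
            # next compute_score() call.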
174 | self.ctest.extend(other.ctest) 175 | self.crefs.extend(other.crefs) 176 | self._score = None ## need to recompute 177 | 178 | return self 179 | 180 | def compatible(self, other): 181 | return isinstance(other, BleuScorer) and self.n == other.n 182 | 183 | def single_reflen(self, option="average"): 184 | return self._single_reflen(self.crefs[0][0], option) 185 | 186 | def _single_reflen(self, reflens, option=None, testlen=None): 187 | 188 | if option == "shortest": 189 | reflen = min(reflens) 190 | elif option == "average": 191 | reflen = float(sum(reflens))/len(reflens) 192 | elif option == "closest": 193 | reflen = min((abs(l-testlen), l) for l in reflens)[1] 194 | else: 195 | assert False, "unsupported reflen option %s" % option 196 | 197 | return reflen 198 | 199 | def recompute_score(self, option=None, verbose=0): 200 | self._score = None 201 | return self.compute_score(option, verbose) 202 | 203 | def compute_score(self, option=None, verbose=0): 204 | n = self.n 205 | small = 1e-9 206 | tiny = 1e-15 ## so that if guess is 0 still return 0 207 | bleu_list = [[] for _ in range(n)] 208 | 209 | if self._score is not None: 210 | return self._score 211 | 212 | if option is None: 213 | option = "average" if len(self.crefs) == 1 else "closest" 214 | 215 | self._testlen = 0 216 | self._reflen = 0 217 | totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n} 218 | 219 | # for each sentence 220 | for comps in self.ctest: 221 | testlen = comps['testlen'] 222 | self._testlen += testlen 223 | 224 | if self.special_reflen is None: ## need computation 225 | reflen = self._single_reflen(comps['reflen'], option, testlen) 226 | else: 227 | reflen = self.special_reflen 228 | 229 | self._reflen += reflen 230 | 231 | for key in ['guess','correct']: 232 | for k in range(n): 233 | totalcomps[key][k] += comps[key][k] 234 | 235 | # append per image bleu score 236 | bleu = 1. 237 | for k in range(n): 238 | bleu *= (float(comps['correct'][k]) + tiny) \ 239 | /(float(comps['guess'][k]) + small) 240 | bleu_list[k].append(bleu ** (1./(k+1))) 241 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 242 | if ratio < 1: 243 | for k in range(n): 244 | bleu_list[k][-1] *= math.exp(1 - 1/ratio) 245 | 246 | if verbose > 1: 247 | print(comps, reflen) 248 | 249 | totalcomps['reflen'] = self._reflen 250 | totalcomps['testlen'] = self._testlen 251 | 252 | bleus = [] 253 | bleu = 1. 
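        # Corpus-level BLEU_k: the geometric mean of the modified 1..k-gram
        # precisions (tiny/small guard against zero counts), scaled by the
        # brevity penalty below when the hypotheses are shorter than the references.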
254 | for k in range(n): 255 | bleu *= float(totalcomps['correct'][k] + tiny) \ 256 | / (totalcomps['guess'][k] + small) 257 | bleus.append(bleu ** (1./(k+1))) 258 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 259 | if ratio < 1: 260 | for k in range(n): 261 | bleus[k] *= math.exp(1 - 1/ratio) 262 | 263 | if verbose > 0: 264 | print(totalcomps) 265 | print("ratio:", ratio) 266 | 267 | self._score = bleus 268 | return self._score, bleu_list 269 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" 55 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | import math 7 | from collections import defaultdict 8 | 9 | import numpy as np 10 | from six.moves import xrange as range 11 | import six 12 | 13 | def precook(s, n=4, out=False): 14 | """ 15 | Takes a string as input and returns an object that can be given to 16 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 17 | can take string arguments as well. 
18 | :param s: string : sentence to be converted into ngrams 19 | :param n: int : number of ngrams for which representation is calculated 20 | :return: term frequency vector for occuring ngrams 21 | """ 22 | words = s.split() 23 | counts = defaultdict(int) 24 | for k in range(1,n+1): 25 | for i in range(len(words)-k+1): 26 | ngram = tuple(words[i:i+k]) 27 | counts[ngram] += 1 28 | return counts 29 | 30 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 31 | '''Takes a list of reference sentences for a single segment 32 | and returns an object that encapsulates everything that BLEU 33 | needs to know about them. 34 | :param refs: list of string : reference sentences for some image 35 | :param n: int : number of ngrams for which (ngram) representation is calculated 36 | :return: result (list of dict) 37 | ''' 38 | return [precook(ref, n) for ref in refs] 39 | 40 | def cook_test(test, n=4): 41 | '''Takes a test sentence and returns an object that 42 | encapsulates everything that BLEU needs to know about it. 43 | :param test: list of string : hypothesis sentence for some image 44 | :param n: int : number of ngrams for which (ngram) representation is calculated 45 | :return: result (dict) 46 | ''' 47 | return precook(test, n, True) 48 | 49 | class CiderScorer(object): 50 | """CIDEr scorer. 51 | """ 52 | 53 | def copy(self): 54 | ''' copy the refs.''' 55 | new = CiderScorer(n=self.n) 56 | new.ctest = copy.copy(self.ctest) 57 | new.crefs = copy.copy(self.crefs) 58 | return new 59 | 60 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 61 | ''' singular instance ''' 62 | self.n = n 63 | self.sigma = sigma 64 | self.crefs = [] 65 | self.ctest = [] 66 | self.document_frequency = defaultdict(float) 67 | self.cook_append(test, refs) 68 | self.ref_len = None 69 | 70 | def cook_append(self, test, refs): 71 | '''called by constructor and __iadd__ to avoid creating new instances.''' 72 | 73 | if refs is not None: 74 | self.crefs.append(cook_refs(refs)) 75 | if test is not None: 76 | self.ctest.append(cook_test(test)) ## N.B.: -1 77 | else: 78 | self.ctest.append(None) # lens of crefs and ctest have to match 79 | 80 | def size(self): 81 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 82 | return len(self.crefs) 83 | 84 | def __iadd__(self, other): 85 | '''add an instance (e.g., from another sentence).''' 86 | 87 | if type(other) is tuple: 88 | ## avoid creating new CiderScorer instances 89 | self.cook_append(other[0], other[1]) 90 | else: 91 | self.ctest.extend(other.ctest) 92 | self.crefs.extend(other.crefs) 93 | 94 | return self 95 | def compute_doc_freq(self): 96 | ''' 97 | Compute term frequency for reference data. 98 | This will be used to compute idf (inverse document frequency later) 99 | The term frequency is stored in the object 100 | :return: None 101 | ''' 102 | for refs in self.crefs: 103 | # refs, k ref captions of one image 104 | for ngram in set([ngram for ref in refs for (ngram,count) in six.iteritems(ref)]): 105 | self.document_frequency[ngram] += 1 106 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 107 | 108 | def compute_cider(self): 109 | def counts2vec(cnts): 110 | """ 111 | Function maps counts of ngram to vector of tfidf weights. 112 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 113 | The n-th entry of array denotes length of n-grams. 
114 | :param cnts: 115 | :return: vec (array of dict), norm (array of float), length (int) 116 | """ 117 | vec = [defaultdict(float) for _ in range(self.n)] 118 | length = 0 119 | norm = [0.0 for _ in range(self.n)] 120 | for (ngram,term_freq) in six.iteritems(cnts): 121 | # give word count 1 if it doesn't appear in reference corpus 122 | df = np.log(max(1.0, self.document_frequency[ngram])) 123 | # ngram index 124 | n = len(ngram)-1 125 | # tf (term_freq) * idf (precomputed idf) for n-grams 126 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 127 | # compute norm for the vector. the norm will be used for computing similarity 128 | norm[n] += pow(vec[n][ngram], 2) 129 | 130 | if n == 1: 131 | length += term_freq 132 | norm = [np.sqrt(n) for n in norm] 133 | return vec, norm, length 134 | 135 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 136 | ''' 137 | Compute the cosine similarity of two vectors. 138 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 139 | :param vec_ref: array of dictionary for vector corresponding to reference 140 | :param norm_hyp: array of float for vector corresponding to hypothesis 141 | :param norm_ref: array of float for vector corresponding to reference 142 | :param length_hyp: int containing length of hypothesis 143 | :param length_ref: int containing length of reference 144 | :return: array of score for each n-grams cosine similarity 145 | ''' 146 | delta = float(length_hyp - length_ref) 147 | # measure consine similarity 148 | val = np.array([0.0 for _ in range(self.n)]) 149 | for n in range(self.n): 150 | # ngram 151 | for (ngram,count) in six.iteritems(vec_hyp[n]): 152 | # vrama91 : added clipping 153 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 154 | 155 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 156 | val[n] /= (norm_hyp[n]*norm_ref[n]) 157 | 158 | assert(not math.isnan(val[n])) 159 | # vrama91: added a length based gaussian penalty 160 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 161 | return val 162 | 163 | # compute log reference length 164 | self.ref_len = np.log(float(len(self.crefs))) 165 | 166 | scores = [] 167 | for test, refs in zip(self.ctest, self.crefs): 168 | # compute vector for test captions 169 | vec, norm, length = counts2vec(test) 170 | # compute vector for ref captions 171 | score = np.array([0.0 for _ in range(self.n)]) 172 | for ref in refs: 173 | vec_ref, norm_ref, length_ref = counts2vec(ref) 174 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 175 | # change by vrama91 - mean of ngram scores, instead of sum 176 | score_avg = np.mean(score) 177 | # divide by number of references 178 | score_avg /= len(refs) 179 | # multiply score by 10 180 | score_avg *= 10.0 181 | # append score of an image to the score list 182 | scores.append(score_avg) 183 | return scores 184 | 185 | def compute_score(self, option=None, verbose=0): 186 | # compute idf 187 | self.compute_doc_freq() 188 | # assert to check document frequency 189 | assert(len(self.ctest) >= max(self.document_frequency.values())) 190 | # compute cider score 191 | score = self.compute_cider() 192 | # debug 193 | # print score 194 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, 
and Ramakrishna Vedantam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/data/paraphrase-en.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/data/paraphrase-en.gz -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Acknowledge Michael Denkowski for the generous discussion and help 5 | from __future__ import division 6 | 7 | import atexit 8 | import logging 9 | import os 10 | import subprocess 11 | import sys 12 | import threading 13 | 14 | import psutil 15 | 16 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 
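# (In this repository the jar and its paraphrase table ship alongside this file
# under pycocoevalcap/meteor/, and a Java runtime must be resolvable on PATH,
# since the scorer below is driven through a `java -jar` subprocess.)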
17 | METEOR_JAR = 'meteor-1.5.jar' 18 | 19 | 20 | def enc(s): 21 | return s.encode('utf-8') 22 | 23 | 24 | def dec(s): 25 | return s.decode('utf-8') 26 | 27 | 28 | class Meteor: 29 | 30 | def __init__(self): 31 | # Used to guarantee thread safety 32 | self.lock = threading.Lock() 33 | 34 | mem = '2G' 35 | mem_available_G = psutil.virtual_memory().available / 1E9 36 | if mem_available_G < 2: 37 | logging.warning("There is less than 2GB of available memory.\n" 38 | "Will try with limiting Meteor to 1GB of memory but this might cause issues.\n" 39 | "If you have problems using Meteor, " 40 | "then you can try to lower the `mem` variable in meteor.py") 41 | mem = '1G' 42 | 43 | meteor_cmd = ['java', '-jar', '-Xmx{}'.format(mem), METEOR_JAR, 44 | '-', '-', '-stdio', '-l', 'en', '-norm'] 45 | env = os.environ.copy() 46 | env['LC_ALL'] = "C" 47 | self.meteor_p = subprocess.Popen(meteor_cmd, 48 | cwd=os.path.dirname(os.path.abspath(__file__)), 49 | env=env, 50 | stdin=subprocess.PIPE, 51 | stdout=subprocess.PIPE, 52 | stderr=subprocess.PIPE) 53 | 54 | atexit.register(self.close) 55 | 56 | def close(self): 57 | with self.lock: 58 | if self.meteor_p: 59 | self.meteor_p.kill() 60 | self.meteor_p.wait() 61 | self.meteor_p = None 62 | # if the user calls close() manually, remove the 63 | # reference from atexit so the object can be garbage-collected. 64 | if atexit is not None and atexit.unregister is not None: 65 | atexit.unregister(self.close) 66 | 67 | def compute_score(self, gts, res): 68 | assert (gts.keys() == res.keys()) 69 | imgIds = gts.keys() 70 | scores = [] 71 | 72 | eval_line = 'EVAL' 73 | with self.lock: 74 | for i in imgIds: 75 | assert (len(res[i]) == 1) 76 | stat = self._stat(res[i][0], gts[i]) 77 | eval_line += ' ||| {}'.format(stat) 78 | 79 | self.meteor_p.stdin.write(enc('{}\n'.format(eval_line))) 80 | self.meteor_p.stdin.flush() 81 | for i in range(0, len(imgIds)): 82 | v = self.meteor_p.stdout.readline() 83 | try: 84 | scores.append(float(dec(v.strip()))) 85 | except: 86 | sys.stderr.write("Error handling value: {}\n".format(v)) 87 | sys.stderr.write("Decoded value: {}\n".format(dec(v.strip()))) 88 | sys.stderr.write("eval_line: {}\n".format(eval_line)) 89 | # You can try uncommenting the next code line to show stderr from the Meteor JAR. 90 | # If the Meteor JAR is not writing to stderr, then the line will just hang. 
91 | # sys.stderr.write("Error from Meteor:\n{}".format(self.meteor_p.stderr.read())) 92 | raise 93 | score = float(dec(self.meteor_p.stdout.readline()).strip()) 94 | 95 | return score, scores 96 | 97 | def method(self): 98 | return "METEOR" 99 | 100 | def _stat(self, hypothesis_str, reference_list): 101 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 102 | hypothesis_str = hypothesis_str.replace('|||', '').replace(' ', ' ') 103 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 104 | self.meteor_p.stdin.write(enc(score_line)) 105 | self.meteor_p.stdin.write(enc('\n')) 106 | self.meteor_p.stdin.flush() 107 | return dec(self.meteor_p.stdout.readline()).strip() 108 | 109 | def _score(self, hypothesis_str, reference_list): 110 | with self.lock: 111 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 112 | hypothesis_str = hypothesis_str.replace('|||', '').replace(' ', ' ') 113 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 114 | self.meteor_p.stdin.write(enc('{}\n'.format(score_line))) 115 | self.meteor_p.stdin.flush() 116 | stats = dec(self.meteor_p.stdout.readline()).strip() 117 | eval_line = 'EVAL ||| {}'.format(stats) 118 | # EVAL ||| stats 119 | self.meteor_p.stdin.write(enc('{}\n'.format(eval_line))) 120 | self.meteor_p.stdin.flush() 121 | score = float(dec(self.meteor_p.stdout.readline()).strip()) 122 | # bug fix: there are two values returned by the jar file, one average, and one all, so do it twice 123 | # thanks for Andrej for pointing this out 124 | score = float(dec(self.meteor_p.stdout.readline()).strip()) 125 | return score 126 | 127 | def __del__(self): 128 | self.close() 129 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/meteor/tests/test_meteor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import unittest 5 | 6 | from nlgeval.pycocoevalcap.meteor.meteor import Meteor 7 | 8 | 9 | class TestMeteor(unittest.TestCase): 10 | def test_compute_score(self): 11 | m = Meteor() 12 | 13 | s = m.compute_score({0: ["test"]}, {0: ["test"]}) 14 | self.assertEqual(s, (1.0, [1.0])) 15 | 16 | s = m.compute_score({0: ["テスト"]}, {0: ["テスト"]}) 17 | self.assertEqual(s, (1.0, [1.0])) 18 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of 
the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 
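# (calc_score above expects `hypo` to be a single-element list holding one
# tokenized candidate sentence and `ref` to be a non-empty list of tokenized
# references; the asserts that follow verify exactly that.)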
96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Downloaded data files 60 | data/ 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/README.md: -------------------------------------------------------------------------------- 1 | # skip-thoughts 2 | 3 | Original README can be found at [ryankiros/skip-thoughts](https://github.com/ryankiros/skip-thoughts/blob/6661cad40664b6c251cac1dad779986eb332c26a/README.md). 4 | 5 | ## License 6 | 7 | All files in the skipthoughts directory are under 8 | [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) 9 | to the authors of [ryankiros/skip-thoughts](https://github.com/ryankiros/skip-thoughts/tree/6661cad40664b6c251cac1dad779986eb332c26a). 
10 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/skipthoughts/skipthoughts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Skip-thought vectors 3 | ''' 4 | import copy 5 | import os 6 | from collections import OrderedDict, defaultdict 7 | 8 | import nltk 9 | import numpy 10 | import six 11 | import theano 12 | import theano.tensor as tensor 13 | from nltk.tokenize import word_tokenize 14 | from scipy.linalg import norm 15 | from six.moves import cPickle as pkl 16 | from nlgeval.utils import get_data_dir 17 | import logging 18 | 19 | profile = False 20 | 21 | #-----------------------------------------------------------------------------# 22 | # Specify model and table locations here 23 | #-----------------------------------------------------------------------------# 24 | path_to_models = get_data_dir() 25 | path_to_tables = get_data_dir() 26 | #-----------------------------------------------------------------------------# 27 | 28 | path_to_umodel = os.path.join(path_to_models, 'uni_skip.npz') 29 | path_to_bmodel = os.path.join(path_to_models, 'bi_skip.npz') 30 | 31 | 32 | def load_model(): 33 | """ 34 | Load the model with saved tables 35 | """ 36 | # Load model options 37 | # print 'Loading model parameters...' 38 | with open('%s.pkl'%path_to_umodel, 'rb') as f: 39 | uoptions = pkl.load(f) 40 | with open('%s.pkl'%path_to_bmodel, 'rb') as f: 41 | boptions = pkl.load(f) 42 | 43 | # Load parameters 44 | uparams = init_params(uoptions) 45 | uparams = load_params(path_to_umodel, uparams) 46 | utparams = init_tparams(uparams) 47 | bparams = init_params_bi(boptions) 48 | bparams = load_params(path_to_bmodel, bparams) 49 | btparams = init_tparams(bparams) 50 | 51 | # Extractor functions 52 | # print 'Compiling encoders...' 53 | embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions) 54 | f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v') 55 | embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions) 56 | f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2') 57 | 58 | # Tables 59 | # print 'Loading tables...' 60 | utable, btable = load_tables() 61 | 62 | # Store everything we need in a dictionary 63 | # print 'Packing up...' 
64 | model = {} 65 | model['uoptions'] = uoptions 66 | model['boptions'] = boptions 67 | model['utable'] = utable 68 | model['btable'] = btable 69 | model['f_w2v'] = f_w2v 70 | model['f_w2v2'] = f_w2v2 71 | 72 | return model 73 | 74 | 75 | def load_tables(): 76 | """ 77 | Load the tables 78 | """ 79 | words = [] 80 | utable = numpy.load(os.path.join(path_to_tables, 'utable.npy'), allow_pickle=True, encoding='bytes') 81 | btable = numpy.load(os.path.join(path_to_tables, 'btable.npy'), allow_pickle=True, encoding='bytes') 82 | f = open(os.path.join(path_to_tables, 'dictionary.txt'), 'rb') 83 | for line in f: 84 | words.append(line.decode('utf-8').strip()) 85 | f.close() 86 | utable = OrderedDict(zip(words, utable)) 87 | btable = OrderedDict(zip(words, btable)) 88 | return utable, btable 89 | 90 | 91 | class Encoder(object): 92 | """ 93 | Sentence encoder. 94 | """ 95 | 96 | def __init__(self, model): 97 | self._model = model 98 | 99 | def encode(self, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 100 | """ 101 | Encode sentences in the list X. Each entry will return a vector 102 | """ 103 | return encode(self._model, X, use_norm, verbose, batch_size, use_eos) 104 | 105 | 106 | def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False): 107 | """ 108 | Encode sentences in the list X. Each entry will return a vector 109 | """ 110 | # first, do preprocessing 111 | X = preprocess(X) 112 | 113 | # word dictionary and init 114 | d = defaultdict(lambda : 0) 115 | for w in model['utable'].keys(): 116 | d[w] = 1 117 | ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32') 118 | bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32') 119 | 120 | # length dictionary 121 | ds = defaultdict(list) 122 | captions = [s.split() for s in X] 123 | for i,s in enumerate(captions): 124 | ds[len(s)].append(i) 125 | 126 | # Get features. 
This encodes by length, in order to avoid wasting computation 127 | for k in ds.keys(): 128 | if verbose: 129 | print(k) 130 | numbatches = int(len(ds[k]) / batch_size + 1) 131 | for minibatch in range(numbatches): 132 | caps = ds[k][minibatch::numbatches] 133 | 134 | if use_eos: 135 | uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32') 136 | bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32') 137 | else: 138 | uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32') 139 | bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32') 140 | for ind, c in enumerate(caps): 141 | caption = captions[c] 142 | for j in range(len(caption)): 143 | if d[caption[j]] > 0: 144 | uembedding[j,ind] = model['utable'][caption[j]] 145 | bembedding[j,ind] = model['btable'][caption[j]] 146 | else: 147 | uembedding[j,ind] = model['utable']['UNK'] 148 | bembedding[j,ind] = model['btable']['UNK'] 149 | if use_eos: 150 | uembedding[-1,ind] = model['utable'][''] 151 | bembedding[-1,ind] = model['btable'][''] 152 | if use_eos: 153 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 154 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32')) 155 | else: 156 | uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 157 | bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32')) 158 | if use_norm: 159 | for j in range(len(uff)): 160 | uff[j] /= norm(uff[j]) 161 | bff[j] /= norm(bff[j]) 162 | for ind, c in enumerate(caps): 163 | ufeatures[c] = uff[ind] 164 | bfeatures[c] = bff[ind] 165 | 166 | features = numpy.c_[ufeatures, bfeatures] 167 | return features 168 | 169 | 170 | def preprocess(text): 171 | """ 172 | Preprocess text for encoder 173 | """ 174 | X = [] 175 | sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 176 | for t in text: 177 | sents = sent_detector.tokenize(t) 178 | result = '' 179 | for s in sents: 180 | tokens = word_tokenize(s) 181 | result += ' ' + ' '.join(tokens) 182 | X.append(result) 183 | return X 184 | 185 | 186 | def nn(model, text, vectors, query, k=5): 187 | """ 188 | Return the nearest neighbour sentences to query 189 | text: list of sentences 190 | vectors: the corresponding representations for text 191 | query: a string to search 192 | """ 193 | qf = encode(model, [query]) 194 | qf /= norm(qf) 195 | scores = numpy.dot(qf, vectors.T).flatten() 196 | sorted_args = numpy.argsort(scores)[::-1] 197 | sentences = [text[a] for a in sorted_args[:k]] 198 | print('QUERY: ' + query) 199 | print('NEAREST: ') 200 | for i, s in enumerate(sentences): 201 | print(s, sorted_args[i]) 202 | 203 | 204 | def word_features(table): 205 | """ 206 | Extract word features into a normalized matrix 207 | """ 208 | features = numpy.zeros((len(table), 620), dtype='float32') 209 | keys = table.keys() 210 | for i in range(len(table)): 211 | f = table[keys[i]] 212 | features[i] = f / norm(f) 213 | return features 214 | 215 | 216 | def nn_words(table, wordvecs, query, k=10): 217 | """ 218 | Get the nearest neighbour words 219 | """ 220 | keys = table.keys() 221 | qf = table[query] 222 | scores = numpy.dot(qf, wordvecs.T).flatten() 223 | sorted_args = numpy.argsort(scores)[::-1] 224 | words = [keys[a] for a in sorted_args[:k]] 225 | print('QUERY: ' + query) 226 | print('NEAREST: ') 227 | for i, w in enumerate(words): 228 | print(w) 
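# A minimal usage sketch for this module (assumes the pretrained skip-thought
# weights uni_skip.npz / bi_skip.npz together with utable.npy, btable.npy and
# dictionary.txt have been downloaded into the nlg-eval data directory):
#
#     model = load_model()
#     encoder = Encoder(model)
#     vectors = encoder.encode(["this is a test sentence"])
#
# Each row of `vectors` is the concatenated uni-skip/bi-skip embedding of one
# input sentence; nlg-eval uses these embeddings for the SkipThoughtCS metric.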
229 | 230 | 231 | def _p(pp, name): 232 | """ 233 | make prefix-appended name 234 | """ 235 | return '%s_%s'%(pp, name) 236 | 237 | 238 | def init_tparams(params): 239 | """ 240 | initialize Theano shared variables according to the initial parameters 241 | """ 242 | tparams = OrderedDict() 243 | for kk, pp in six.iteritems(params): 244 | tparams[kk] = theano.shared(params[kk], name=kk) 245 | return tparams 246 | 247 | 248 | def load_params(path, params): 249 | """ 250 | load parameters 251 | """ 252 | pp = numpy.load(path) 253 | for kk, vv in six.iteritems(params): 254 | if kk not in pp: 255 | logging.warning('%s is not in the archive', kk) 256 | continue 257 | params[kk] = pp[kk] 258 | return params 259 | 260 | 261 | # layers: 'name': ('parameter initializer', 'feedforward') 262 | layers = {'gru': ('param_init_gru', 'gru_layer')} 263 | 264 | def get_layer(name): 265 | fns = layers[name] 266 | return (eval(fns[0]), eval(fns[1])) 267 | 268 | 269 | def init_params(options): 270 | """ 271 | initialize all parameters needed for the encoder 272 | """ 273 | params = OrderedDict() 274 | 275 | # embedding 276 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 277 | 278 | # encoder: GRU 279 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 280 | nin=options['dim_word'], dim=options['dim']) 281 | return params 282 | 283 | 284 | def init_params_bi(options): 285 | """ 286 | initialize all paramters needed for bidirectional encoder 287 | """ 288 | params = OrderedDict() 289 | 290 | # embedding 291 | params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word']) 292 | 293 | # encoder: GRU 294 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder', 295 | nin=options['dim_word'], dim=options['dim']) 296 | params = get_layer(options['encoder'])[0](options, params, prefix='encoder_r', 297 | nin=options['dim_word'], dim=options['dim']) 298 | return params 299 | 300 | 301 | def build_encoder(tparams, options): 302 | """ 303 | build an encoder, given pre-computed word embeddings 304 | """ 305 | # word embedding (source) 306 | embedding = tensor.tensor3('embedding', dtype='float32') 307 | x_mask = tensor.matrix('x_mask', dtype='float32') 308 | 309 | # encoder 310 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 311 | prefix='encoder', 312 | mask=x_mask) 313 | ctx = proj[0][-1] 314 | 315 | return embedding, x_mask, ctx 316 | 317 | 318 | def build_encoder_bi(tparams, options): 319 | """ 320 | build bidirectional encoder, given pre-computed word embeddings 321 | """ 322 | # word embedding (source) 323 | embedding = tensor.tensor3('embedding', dtype='float32') 324 | embeddingr = embedding[::-1] 325 | x_mask = tensor.matrix('x_mask', dtype='float32') 326 | xr_mask = x_mask[::-1] 327 | 328 | # encoder 329 | proj = get_layer(options['encoder'])[1](tparams, embedding, options, 330 | prefix='encoder', 331 | mask=x_mask) 332 | projr = get_layer(options['encoder'])[1](tparams, embeddingr, options, 333 | prefix='encoder_r', 334 | mask=xr_mask) 335 | 336 | ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1) 337 | 338 | return embedding, x_mask, ctx 339 | 340 | 341 | # some utilities 342 | def ortho_weight(ndim): 343 | W = numpy.random.randn(ndim, ndim) 344 | u, s, v = numpy.linalg.svd(W) 345 | return u.astype('float32') 346 | 347 | 348 | def norm_weight(nin,nout=None, scale=0.1, ortho=True): 349 | if nout == None: 350 | nout = nin 351 | if nout == nin and ortho: 352 | W = ortho_weight(nin) 353 | else: 354 
| W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout)) 355 | return W.astype('float32') 356 | 357 | 358 | def param_init_gru(options, params, prefix='gru', nin=None, dim=None): 359 | """ 360 | parameter init for GRU 361 | """ 362 | if nin == None: 363 | nin = options['dim_proj'] 364 | if dim == None: 365 | dim = options['dim_proj'] 366 | W = numpy.concatenate([norm_weight(nin,dim), 367 | norm_weight(nin,dim)], axis=1) 368 | params[_p(prefix,'W')] = W 369 | params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32') 370 | U = numpy.concatenate([ortho_weight(dim), 371 | ortho_weight(dim)], axis=1) 372 | params[_p(prefix,'U')] = U 373 | 374 | Wx = norm_weight(nin, dim) 375 | params[_p(prefix,'Wx')] = Wx 376 | Ux = ortho_weight(dim) 377 | params[_p(prefix,'Ux')] = Ux 378 | params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32') 379 | 380 | return params 381 | 382 | 383 | def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs): 384 | """ 385 | Forward pass through GRU layer 386 | """ 387 | nsteps = state_below.shape[0] 388 | if state_below.ndim == 3: 389 | n_samples = state_below.shape[1] 390 | else: 391 | n_samples = 1 392 | 393 | dim = tparams[_p(prefix,'Ux')].shape[1] 394 | 395 | if mask == None: 396 | mask = tensor.alloc(1., state_below.shape[0], 1) 397 | 398 | def _slice(_x, n, dim): 399 | if _x.ndim == 3: 400 | return _x[:, :, n*dim:(n+1)*dim] 401 | return _x[:, n*dim:(n+1)*dim] 402 | 403 | state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')] 404 | state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')] 405 | U = tparams[_p(prefix, 'U')] 406 | Ux = tparams[_p(prefix, 'Ux')] 407 | 408 | def _step_slice(m_, x_, xx_, h_, U, Ux): 409 | preact = tensor.dot(h_, U) 410 | preact += x_ 411 | 412 | r = tensor.nnet.sigmoid(_slice(preact, 0, dim)) 413 | u = tensor.nnet.sigmoid(_slice(preact, 1, dim)) 414 | 415 | preactx = tensor.dot(h_, Ux) 416 | preactx = preactx * r 417 | preactx = preactx + xx_ 418 | 419 | h = tensor.tanh(preactx) 420 | 421 | h = u * h_ + (1. - u) * h 422 | h = m_[:,None] * h + (1. 
- m_)[:,None] * h_ 423 | 424 | return h 425 | 426 | seqs = [mask, state_below_, state_belowx] 427 | _step = _step_slice 428 | 429 | rval, updates = theano.scan(_step, 430 | sequences=seqs, 431 | outputs_info = [tensor.alloc(0., n_samples, dim)], 432 | non_sequences = [tparams[_p(prefix, 'U')], 433 | tparams[_p(prefix, 'Ux')]], 434 | name=_p(prefix, '_layers'), 435 | n_steps=nsteps, 436 | profile=profile, 437 | strict=True) 438 | rval = [rval] 439 | return rval 440 | 441 | 442 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/tests/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/tests/test_nlgeval.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | import os 5 | import unittest 6 | 7 | import nlgeval 8 | from nlgeval import NLGEval 9 | 10 | 11 | class TestNlgEval(unittest.TestCase): 12 | def test_compute_metrics_oo(self): 13 | # Create the object in the test so that it can be garbage collected once the test is done. 14 | n = NLGEval() 15 | 16 | # Individual Metrics 17 | scores = n.compute_individual_metrics(ref=["this is a test", 18 | "this is also a test"], 19 | hyp="this is a good test") 20 | self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5) 21 | self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5) 22 | self.assertAlmostEqual(0.5108729, scores['Bleu_3'], places=5) 23 | self.assertAlmostEqual(0.0000903602, scores['Bleu_4'], places=5) 24 | self.assertAlmostEqual(0.44434387, scores['METEOR'], places=5) 25 | self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5) 26 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=5) 27 | self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5) 28 | self.assertAlmostEqual(0.980075, scores['EmbeddingAverageCosineSimilairty'], places=5) 29 | self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5) 30 | self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5) 31 | self.assertEqual(11, len(scores)) 32 | 33 | scores = n.compute_metrics(ref_list=[ 34 | [ 35 | "this is one reference sentence for sentence1", 36 | "this is a reference sentence for sentence2 which was generated by your model" 37 | ], 38 | [ 39 | "this is one more reference sentence for sentence1", 40 | "this is the second reference sentence for sentence2" 41 | ], 42 | ], 43 | hyp_list=[ 44 | "this is the model generated sentence1 which seems good enough", 45 | "this is sentence2 which has been generated by your model" 46 | ] 47 | ) 48 | self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5) 49 | self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5) 50 | self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5) 51 | self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5) 52 | self.assertAlmostEqual(0.295797, scores['METEOR'], places=5) 53 | self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5) 54 | self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5) 55 | self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5) 56 | self.assertAlmostEqual(0.88469, 
scores['EmbeddingAverageCosineSimilairty'], places=5) 57 | self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5) 58 | self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5) 59 | self.assertEqual(11, len(scores)) 60 | 61 | # Non-ASCII tests. 62 | scores = n.compute_individual_metrics(ref=["Test en français.", 63 | "Le test en français."], 64 | hyp="Le test est en français.") 65 | self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5) 66 | self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5) 67 | self.assertAlmostEqual(0.0000051, scores['Bleu_3'], places=5) 68 | self.assertAlmostEqual(0, scores['Bleu_4'], places=5) 69 | self.assertAlmostEqual(0.48372379050300296, scores['METEOR'], places=5) 70 | self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5) 71 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=5) 72 | self.assertAlmostEqual(0.9192341566085815, scores['SkipThoughtCS'], places=5) 73 | self.assertAlmostEqual(0.906562, scores['EmbeddingAverageCosineSimilairty'], places=5) 74 | self.assertAlmostEqual(0.815158, scores['VectorExtremaCosineSimilarity'], places=5) 75 | self.assertAlmostEqual(0.940959, scores['GreedyMatchingScore'], places=5) 76 | self.assertEqual(11, len(scores)) 77 | 78 | scores = n.compute_individual_metrics(ref=["テスト"], 79 | hyp="テスト") 80 | self.assertAlmostEqual(0.99999999, scores['Bleu_1'], places=5) 81 | self.assertAlmostEqual(1.0, scores['METEOR'], places=3) 82 | self.assertAlmostEqual(1.0, scores['ROUGE_L'], places=3) 83 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=3) 84 | self.assertAlmostEqual(1.0, scores['SkipThoughtCS'], places=3) 85 | self.assertAlmostEqual(1.0, scores['GreedyMatchingScore'], places=3) 86 | self.assertEqual(11, len(scores)) 87 | 88 | def test_compute_metrics_omit(self): 89 | n = NLGEval(metrics_to_omit=['Bleu_3', 'METEOR', 'EmbeddingAverageCosineSimilairty']) 90 | 91 | # Individual Metrics 92 | scores = n.compute_individual_metrics(ref=["this is a test", 93 | "this is also a test"], 94 | hyp="this is a good test") 95 | self.assertAlmostEqual(0.799999, scores['Bleu_1'], places=5) 96 | self.assertAlmostEqual(0.632455, scores['Bleu_2'], places=5) 97 | self.assertAlmostEqual(0.9070631, scores['ROUGE_L'], places=5) 98 | self.assertAlmostEqual(0.0, scores['CIDEr'], places=5) 99 | self.assertAlmostEqual(0.8375251, scores['SkipThoughtCS'], places=5) 100 | self.assertAlmostEqual(0.94509, scores['VectorExtremaCosineSimilarity'], places=5) 101 | self.assertAlmostEqual(0.960771, scores['GreedyMatchingScore'], places=5) 102 | self.assertEqual(7, len(scores)) 103 | 104 | def test_compute_metrics(self): 105 | # The example from the README. 
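# (hyp.txt, ref1.txt and ref2.txt are the small sample files shipped under
# nlg_eval/examples/ in this repository.)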
106 | root_dir = os.path.join(os.path.dirname(__file__), '..', '..') 107 | hypothesis = os.path.join(root_dir, 'examples/hyp.txt') 108 | references = os.path.join(root_dir, 'examples/ref1.txt'), os.path.join(root_dir, 'examples/ref2.txt') 109 | scores = nlgeval.compute_metrics(hypothesis, references) 110 | self.assertAlmostEqual(0.55, scores['Bleu_1'], places=5) 111 | self.assertAlmostEqual(0.428174, scores['Bleu_2'], places=5) 112 | self.assertAlmostEqual(0.284043, scores['Bleu_3'], places=5) 113 | self.assertAlmostEqual(0.201143, scores['Bleu_4'], places=5) 114 | self.assertAlmostEqual(0.295797, scores['METEOR'], places=5) 115 | self.assertAlmostEqual(0.522104, scores['ROUGE_L'], places=5) 116 | self.assertAlmostEqual(1.242192, scores['CIDEr'], places=5) 117 | self.assertAlmostEqual(0.626149, scores['SkipThoughtCS'], places=5) 118 | self.assertAlmostEqual(0.88469, scores['EmbeddingAverageCosineSimilairty'], places=5) 119 | self.assertAlmostEqual(0.568696, scores['VectorExtremaCosineSimilarity'], places=5) 120 | self.assertAlmostEqual(0.784205, scores['GreedyMatchingScore'], places=5) 121 | self.assertEqual(11, len(scores)) 122 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/utils.py: -------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | import os 4 | 5 | from xdg import XDG_CONFIG_HOME 6 | 7 | 8 | class InvalidDataDirException(Exception): 9 | pass 10 | 11 | 12 | def get_data_dir(): 13 | if os.environ.get('NLGEVAL_DATA'): 14 | if not os.path.exists(os.environ.get('NLGEVAL_DATA')): 15 | click.secho("NLGEVAL_DATA variable is set but points to non-existent path.", fg='red', err=True) 16 | raise InvalidDataDirException() 17 | return os.environ.get('NLGEVAL_DATA') 18 | else: 19 | try: 20 | cfg_file = os.path.join(XDG_CONFIG_HOME, 'nlgeval', 'rc.json') 21 | with open(cfg_file, 'rt') as f: 22 | rc = json.load(f) 23 | if not os.path.exists(rc['data_path']): 24 | click.secho("Data path found in {} does not exist: {} " % (cfg_file, rc['data_path']), fg='red', err=True) 25 | click.secho("Run `nlg-eval --setup DATA_DIR' to download or set $NLGEVAL_DATA to an existing location", 26 | fg='red', err=True) 27 | raise InvalidDataDirException() 28 | return rc['data_path'] 29 | except: 30 | click.secho("Could not determine location of data.", fg='red', err=True) 31 | click.secho("Run `nlg-eval --setup DATA_DIR' to download or set $NLGEVAL_DATA to an existing location", fg='red', 32 | err=True) 33 | raise InvalidDataDirException() 34 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
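# This module computes the three embedding-based metrics reported by nlg-eval,
# namely EmbeddingAverageCosineSimilairty, VectorExtremaCosineSimilarity and
# GreedyMatchingScore, using GloVe vectors loaded through gensim (see the
# Embedding class and eval_emb_metrics below).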
3 | import os 4 | import numpy as np 5 | 6 | from nlgeval.utils import get_data_dir 7 | 8 | 9 | try: 10 | from gensim.models import KeyedVectors 11 | except ImportError: 12 | from gensim.models import Word2Vec as KeyedVectors 13 | 14 | 15 | class Embedding(object): 16 | def __init__(self): 17 | path = get_data_dir() 18 | self.m = KeyedVectors.load(os.path.join(path, 'glove.6B.300d.model.bin'), mmap='r') 19 | try: 20 | self.unk = self.m.vectors.mean(axis=0) 21 | except AttributeError: 22 | self.unk = self.m.syn0.mean(axis=0) 23 | 24 | @property 25 | def w2v(self): 26 | return np.concatenate((self.m.syn0, self.unk[None,:]), axis=0) 27 | 28 | def __getitem__(self, key): 29 | try: 30 | return self.m.vocab[key].index 31 | except KeyError: 32 | return len(self.m.syn0) 33 | 34 | def vec(self, key): 35 | try: 36 | vectors = self.m.vectors 37 | except AttributeError: 38 | vectors = self.m.syn0 39 | try: 40 | return vectors[self.m.vocab[key].index] 41 | except KeyError: 42 | return self.unk 43 | 44 | 45 | def eval_emb_metrics(hypothesis, references, emb=None, metrics_to_omit=None): 46 | from sklearn.metrics.pairwise import cosine_similarity 47 | from nltk.tokenize import word_tokenize 48 | import numpy as np 49 | if emb is None: 50 | emb = Embedding() 51 | 52 | if metrics_to_omit is None: 53 | metrics_to_omit = set() 54 | 55 | emb_hyps = [] 56 | avg_emb_hyps = [] 57 | extreme_emb_hyps = [] 58 | for hyp in hypothesis: 59 | embs = [emb.vec(word) for word in word_tokenize(hyp)] 60 | 61 | avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0)) 62 | assert not np.any(np.isnan(avg_emb)) 63 | 64 | maxemb = np.max(embs, axis=0) 65 | minemb = np.min(embs, axis=0) 66 | extreme_emb = list(map(lambda x, y: x if ((x>y or x<-y) and y>0) or ((x-y) and y<0) else y, maxemb, minemb)) 67 | 68 | emb_hyps.append(embs) 69 | avg_emb_hyps.append(avg_emb) 70 | extreme_emb_hyps.append(extreme_emb) 71 | 72 | emb_refs = [] 73 | avg_emb_refs = [] 74 | extreme_emb_refs = [] 75 | for refsource in references: 76 | emb_refsource = [] 77 | avg_emb_refsource = [] 78 | extreme_emb_refsource = [] 79 | for ref in refsource: 80 | embs = [emb.vec(word) for word in word_tokenize(ref)] 81 | 82 | avg_emb = np.sum(embs, axis=0) / np.linalg.norm(np.sum(embs, axis=0)) 83 | assert not np.any(np.isnan(avg_emb)) 84 | 85 | maxemb = np.max(embs, axis=0) 86 | minemb = np.min(embs, axis=0) 87 | extreme_emb = list(map(lambda x, y: x if ((x>y or x<-y) and y>0) or ((x-y) and y<0) else y, maxemb, minemb)) 88 | 89 | emb_refsource.append(embs) 90 | avg_emb_refsource.append(avg_emb) 91 | extreme_emb_refsource.append(extreme_emb) 92 | emb_refs.append(emb_refsource) 93 | avg_emb_refs.append(avg_emb_refsource) 94 | extreme_emb_refs.append(extreme_emb_refsource) 95 | 96 | rval = [] 97 | if 'EmbeddingAverageCosineSimilairty' not in metrics_to_omit: 98 | cos_similarity = list(map(lambda refv: cosine_similarity(refv, avg_emb_hyps).diagonal(), avg_emb_refs)) 99 | cos_similarity = np.max(cos_similarity, axis=0).mean() 100 | rval.append("EmbeddingAverageCosineSimilairty: %0.6f" % (cos_similarity)) 101 | 102 | if 'VectorExtremaCosineSimilarity' not in metrics_to_omit: 103 | cos_similarity = list(map(lambda refv: cosine_similarity(refv, extreme_emb_hyps).diagonal(), extreme_emb_refs)) 104 | cos_similarity = np.max(cos_similarity, axis=0).mean() 105 | rval.append("VectorExtremaCosineSimilarity: %0.6f" % (cos_similarity)) 106 | 107 | if 'GreedyMatchingScore' not in metrics_to_omit: 108 | scores = [] 109 | for emb_refsource in emb_refs: 110 | score_source = 
[] 111 | for emb_ref, emb_hyp in zip(emb_refsource, emb_hyps): 112 | simi_matrix = cosine_similarity(emb_ref, emb_hyp) 113 | dir1 = simi_matrix.max(axis=0).mean() 114 | dir2 = simi_matrix.max(axis=1).mean() 115 | score_source.append((dir1 + dir2) / 2) 116 | scores.append(score_source) 117 | scores = np.max(scores, axis=0).mean() 118 | rval.append("GreedyMatchingScore: %0.6f" % (scores)) 119 | 120 | rval = "\n".join(rval) 121 | return rval 122 | 123 | 124 | if __name__ == '__main__': 125 | emb = Embedding() 126 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/nlgeval/word2vec/generate_w2v_files.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 3 | import os 4 | 5 | try: 6 | from gensim.models import KeyedVectors 7 | except ImportError: 8 | from gensim.models import Word2Vec as KeyedVectors 9 | 10 | import six 11 | from nlgeval.word2vec.glove2word2vec import glove2word2vec 12 | 13 | 14 | def txt2bin(filename): 15 | m = KeyedVectors.load_word2vec_format(filename) 16 | m.vocab[next(six.iterkeys(m.vocab))].sample_int = 1 17 | m.save(filename.replace('txt', 'bin'), separately=None) 18 | KeyedVectors.load(filename.replace('txt', 'bin'), mmap='r') 19 | 20 | 21 | def generate(path): 22 | glove_vector_file = os.path.join(path, 'glove.6B.300d.txt') 23 | output_model_file = os.path.join(path, 'glove.6B.300d.model.txt') 24 | 25 | txt2bin(glove2word2vec(glove_vector_file, output_model_file)) 26 | 27 | 28 | if __name__ == "__main__": 29 | generate() 30 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | click>=6.3 2 | nltk>=3.1 3 | numpy>=1.11.0 4 | psutil>=5.6.2 5 | requests>=2.19 6 | six>=1.11 7 | scipy>=0.17.0 8 | scikit-learn>=0.17 9 | gensim>=3 10 | Theano>=0.8.1 11 | tqdm>=4.24 12 | xdg 13 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/requirements_py2.txt: -------------------------------------------------------------------------------- 1 | click>=6.3 2 | nltk>=3.1 3 | numpy>=1.11.0<=1.17 4 | psutil>=5.6.2 5 | requests>=2.19 6 | six>=1.11 7 | scipy>=0.17.0 8 | scikit-learn<0.21 9 | gensim<1 10 | Theano>=0.8.1 11 | tqdm>=4.24 12 | xdg==1.0.7 13 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT license. See LICENSE.md file in the project root for full license information. 
5 | 6 | import sys 7 | 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from setuptools.command.develop import develop 11 | from setuptools.command.install import install 12 | 13 | try: 14 | from pip._internal.req import parse_requirements 15 | except: 16 | from pip.req import parse_requirements 17 | 18 | 19 | if __name__ == '__main__': 20 | requirements_path = 'requirements.txt' 21 | if sys.version_info[0] < 3: 22 | requirements_path = 'requirements_py2.txt' 23 | install_reqs = parse_requirements(requirements_path, session=False) 24 | reqs = [str(ir.req) for ir in install_reqs] 25 | 26 | setup(name='nlg-eval', 27 | version='2.2', 28 | description="Wrapper for multiple NLG evaluation methods and metrics.", 29 | author='Shikhar Sharma, Hannes Schulz, Justin Harris', 30 | author_email='shikhar.sharma@microsoft.com, hannes.schulz@microsoft.com, justin.harris@microsoft.com', 31 | url='https://github.com/Maluuba/nlg-eval', 32 | packages=find_packages(), 33 | include_package_data=True, 34 | scripts=['bin/nlg-eval'], 35 | install_requires=reqs, 36 | ) 37 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tag-and-generate/tagger-generator/0059c49fd7df273be9421cb683c96f362663057d/tag-and-generate-train/eval/nlg_eval/test/__init__.py -------------------------------------------------------------------------------- /tag-and-generate-train/eval/nlg_eval/test/api.py: -------------------------------------------------------------------------------- 1 | from nlgeval import NLGEval 2 | 3 | def test_oo_api(): 4 | with open("examples/hyp.txt") as f: 5 | hyp = f.readlines() 6 | hyp = [x.strip() for x in hyp] 7 | with open("examples/ref1.txt") as f: 8 | ref1 = f.readlines() 9 | ref1 = [x.strip() for x in ref1] 10 | with open("examples/ref2.txt") as f: 11 | ref2 = f.readlines() 12 | ref2 = [x.strip() for x in ref2] 13 | 14 | nlge = NLGEval() 15 | 16 | res = nlge.compute_individual_metrics([ref1[0]] + [ref2[0]], hyp[0]) 17 | res = nlge.compute_individual_metrics([ref1[1]] + [ref2[1]], hyp[1]) 18 | 19 | hyp_list = hyp 20 | ref_list = [ref1, ref2] 21 | res = nlge.compute_metrics(ref_list, hyp_list) 22 | -------------------------------------------------------------------------------- /tag-and-generate-train/eval/run_context_eval.sh: -------------------------------------------------------------------------------- 1 | ########### 2 | # Usage: bash eval/run_context_eval.sh 3 | ########### 4 | 5 | export PYTHONPATH='eval/nlg_eval:.' 
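# Illustrative invocation (the file names below are hypothetical placeholders):
#   bash eval/run_context_eval.sh experiments/polite_output.txt data/polite.test.ref
# $1 is the hypothesis (system output) file and $2 the reference file; the script
# runs eval/context_eval.py on the pair and then prints the corpus BLEU computed
# by sacrebleu.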
6 | hyp=$1 7 | ref=$2 8 | 9 | python3 eval/context_eval.py --hyp "$1" --ref "$2" 10 | tail -n 1 <(cat $hyp | sacrebleu -w 2 $ref) 11 | -------------------------------------------------------------------------------- /tag-and-generate-train/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.8.0 2 | adjustText==0.7.3 3 | altair==3.2.0 4 | appnope==0.1.0 5 | argh==0.26.2 6 | astroid==2.2.5 7 | astropy==3.2.1 8 | atomicwrites==1.3.0 9 | attrs==19.1.0 10 | autopep8==1.4.4 11 | backcall==0.1.0 12 | backports.functools-lru-cache==1.5 13 | base58==1.0.3 14 | beautifulsoup4==4.8.0 15 | bleach==3.1.4 16 | blis==0.2.4 17 | boto3==1.9.243 18 | botocore==1.12.243 19 | certifi==2019.6.16 20 | chardet==3.0.4 21 | Click==7.0 22 | colorama==0.4.1 23 | compare-mt==0.2.6 24 | confuse==1.0.0 25 | cycler==0.10.0 26 | cymem==2.0.2 27 | decorator==4.4.0 28 | defusedxml==0.6.0 29 | docopt==0.6.2 30 | docutils==0.15.2 31 | editdistance==0.5.3 32 | en-core-web-sm==2.1.0 33 | entrypoints==0.3 34 | enum-compat==0.0.2 35 | epitran==1.1 36 | flake8==3.7.9 37 | future==0.17.1 38 | htmlmin==0.1.12 39 | idna==2.8 40 | importlib-metadata==0.23 41 | indic-transliteration==1.8.8 42 | ipykernel==5.1.2 43 | ipython==7.8.0 44 | ipython-genutils==0.2.0 45 | ipywidgets==7.5.0 46 | isort==4.3.21 47 | jedi==0.15.1 48 | Jinja2==2.10.1 49 | jmespath==0.9.4 50 | joblib==0.14.0 51 | jsonschema==2.6.0 52 | jupyter-client==5.3.3 53 | jupyter-core==4.5.0 54 | kiwisolver==1.1.0 55 | lazy-object-proxy==1.4.2 56 | llvmlite==0.29.0 57 | marisa-trie==0.7.5 58 | MarkupSafe==1.1.1 59 | matplotlib==3.1.1 60 | mccabe==0.6.1 61 | missingno==0.4.2 62 | mistune==0.8.4 63 | more-itertools==7.2.0 64 | munkres==1.1.2 65 | murmurhash==1.0.2 66 | nbconvert==5.6.0 67 | nbformat==4.4.0 68 | networkx==2.3 69 | neuralcoref==4.0 70 | nltk==3.4.5 71 | notebook==6.0.1 72 | numba==0.45.1 73 | numpy==1.17.0 74 | packaging==19.2 75 | pandas==0.25.1 76 | pandas-profiling==2.3.0 77 | pandocfilters==1.4.2 78 | panphon==0.15 79 | parso==0.5.1 80 | pathtools==0.1.2 81 | pep8==1.7.1 82 | pexpect==4.7.0 83 | phik==0.9.8 84 | pickleshare==0.7.5 85 | pigar==0.9.2 86 | Pillow==6.2.0 87 | plac==0.9.6 88 | plotly==4.1.1 89 | pluggy==0.13.0 90 | portalocker==1.5.1 91 | praw==6.3.1 92 | prawcore==1.0.1 93 | preshed==2.0.1 94 | prometheus-client==0.7.1 95 | prompt-toolkit==2.0.9 96 | psaw==0.0.7 97 | ptyprocess==0.6.0 98 | py==1.8.0 99 | pycodestyle==2.5.0 100 | pyflakes==2.1.1 101 | Pygments==2.4.2 102 | pylint==2.3.1 103 | pyparsing==2.4.2 104 | pyreqs==0.1.1 105 | pyrsistent==0.15.4 106 | pytest==5.2.0 107 | pytest-pylint==0.14.1 108 | python-dateutil==2.8.0 109 | pytz==2019.2 110 | PyYAML==5.1.2 111 | pyzmq==18.1.0 112 | regex==2019.8.19 113 | requests==2.22.0 114 | retrying==1.3.3 115 | rope==0.14.0 116 | s3transfer==0.2.1 117 | sacrebleu==1.4.1 118 | scikit-learn==0.21.3 119 | scipy==1.3.1 120 | seaborn==0.9.0 121 | selenium==3.141.0 122 | Send2Trash==1.5.0 123 | sh==1.12.14 124 | six==1.12.0 125 | sklearn==0.0 126 | soupsieve==1.9.3 127 | spacy==2.1.0 128 | splinter==0.11.0 129 | srsly==0.1.0 130 | terminado==0.8.2 131 | testpath==0.4.2 132 | thinc==7.0.8 133 | toolz==0.10.0 134 | torch==1.3.0 135 | torchvision==0.4.1 136 | tornado==5.1.1 137 | tqdm==4.35.0 138 | traitlets==4.3.2 139 | typed-ast==1.4.0 140 | typing==3.7.4.1 141 | tzlocal==2.0.0 142 | unicodecsv==0.14.1 143 | update-checker==0.16 144 | urllib3==1.26.5 145 | validators==0.14.0 146 | wasabi==0.2.2 147 | wcwidth==0.1.7 148 | webencodings==0.5.1 149 
| websocket-client==0.56.0 150 | widgetsnbextension==3.5.1 151 | wiki-dump-parser==2.0.0 152 | wikipedia==1.4.0 153 | Wikipedia-API==0.5.2 154 | wrapt==1.11.2 155 | zipp==0.6.0 156 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/inference.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Given a sentence, runs the tag-generate model on it 3 | set -u 4 | 5 | 6 | input_file="$1" # the input test file which needs to be transferred 7 | jobname="$2" # unique identifier for the inference job 8 | tagger_target="$3" # target [argument we pass when we run scripts/train_tagger.sh] used to train tagger 9 | generator_target="$4" # target [argument we pass when we run scripts/train_generator.sh] used to train generator 10 | dataset="$5" # dataset [argument we pass when we train tagger or generator -- used to identify model paths for tagger and generator] 11 | src_tag="$6" # style_0_label used to create training data [src style] 12 | tgt_tag="$7" # style_1_label used to create training data [tgt style] 13 | base_folder="$8" # path to data folder, where outputs of the training data creation process are stored 14 | device="$9" # gpu id [comment line in script if it needs to be run on cpu] 15 | 16 | tag_generate_base="experiments/" # base dir to store outputs of inference 17 | mkdir -p $tag_generate_base 18 | 19 | # SET UNSET BPE HERE 20 | BPE=1 21 | if [ "$BPE" -eq 1 ]; then 22 | MODEL_PTH="bpe" 23 | echo "Using BPE" 24 | else 25 | MODEL_PTH="nobpe" 26 | echo "Not using BPE" 27 | fi 28 | # 29 | 30 | ## ARCHITECTURE 31 | HSZ=512 32 | EMBED_DIM=512 33 | NHEAD=4 34 | NL=4 35 | ## 36 | 37 | 38 | 39 | function infer() { 40 | # run inference 41 | infile="$1" 42 | src="$2" 43 | tgt="$3" 44 | model="$4" 45 | outfile="$5" 46 | prefer_gtag="$6" 47 | if [ "$BPE" -eq 1 ]; then 48 | CUDA_VISIBLE_DEVICES=$device python src/translate.py --cuda --src "$src" \ 49 | --tgt "$tgt" \ 50 | --model-file "$model" \ 51 | --search "beam_search" \ 52 | --hidden-dim $HSZ \ 53 | --embed-dim $EMBED_DIM \ 54 | --n-heads $NHEAD \ 55 | --n-layers $NL \ 56 | --beam-size 5 \ 57 | --bpe \ 58 | --prefer_gtag "$prefer_gtag" \ 59 | --tag "$src_tag" \ 60 | --input-file "$infile" \ 61 | --output-file "$outfile" \ 62 | --base-folder "$base_folder" 63 | else 64 | CUDA_VISIBLE_DEVICES=$device python src/translate.py --cuda --src "$src" \ 65 | --tgt "$tgt" \ 66 | --model-file "$model" \ 67 | --search "beam_search" \ 68 | --hidden-dim $HSZ \ 69 | --embed-dim $EMBED_DIM \ 70 | --n-heads $NHEAD \ 71 | --n-layers $NL \ 72 | --beam-size 5 \ 73 | --tag "$src_tag" \ 74 | --prefer_gtag "$prefer_gtag" \ 75 | --input-file "$infile" \ 76 | --output-file "$outfile" \ 77 | --base-folder "$base_folder" 78 | fi 79 | 80 | } 81 | 82 | 83 | function add_eos() { 84 | # append eos to each line of the file 85 | ip="$1" 86 | awk '{printf("%s \n", $0)}' $ip > "${ip}.bak" 87 | mv "${ip}.bak" $ip 88 | } 89 | 90 | 91 | 92 | # Step 1: Run Preprocess/BPE on the input 93 | TAGGER_INPUT="${tag_generate_base}/${jobname}_tagger_input" 94 | if [ $BPE -eq 1 ]; then 95 | echo "Running BPE on input" 96 | CUDA_VISIBLE_deviceS=$device python src/subwords.py segment\ 97 | --model "$base_folder/en${tagger_target}_subwords.model" < "$input_file"\ 98 | > "$TAGGER_INPUT" 99 | else 100 | cp "$input_file" "$TAGGER_INPUT" 101 | fi 102 | echo "Adding eos to the input" 103 | add_eos "$TAGGER_INPUT" 104 | 105 | 106 | # Step 2: Tag the input 107 | echo "Running tagger" 108 | 
infer "$TAGGER_INPUT" "en" "$tagger_target" "models/${dataset}/${MODEL_PTH}/en-${tagger_target}-tagger.pt"\ 109 | "${tag_generate_base}/${jobname}_tagged" 1 110 | ### SRC_TAG -> TGT_TAG 111 | sed -i "s/${src_tag}/${tgt_tag}/g" "${tag_generate_base}/${jobname}_tagged" 112 | 113 | 114 | # Step 3: Run Preprocess/BPE on the tagger output 115 | GENERATOR_INPUT="${tag_generate_base}/${jobname}_generator_input" 116 | if [ $BPE -eq 1 ]; then 117 | echo "Running BPE on masked output" 118 | CUDA_VISIBLE_deviceS=$device python src/subwords.py segment\ 119 | --model "$base_folder/en${generator_target}_subwords.model" < "${tag_generate_base}/${jobname}_tagged" > "$GENERATOR_INPUT" 120 | 121 | else 122 | cp "${tag_generate_base}/${jobname}_tagged" "$GENERATOR_INPUT" 123 | fi 124 | add_eos "$GENERATOR_INPUT" 125 | 126 | 127 | # Step 4: Generate 128 | echo "Running generator" 129 | infer "$GENERATOR_INPUT" "en" "${generator_target}" "models/${dataset}/${MODEL_PTH}/en-${generator_target}-generator.pt"\ 130 | "${tag_generate_base}/${jobname}_output" 0 131 | sed -i 's/^\"//g' "${tag_generate_base}/${jobname}_output" 132 | 133 | 134 | # Step 5: Run sacrebleu 135 | cat "$input_file"|sacrebleu -w2 "${tag_generate_base}/${jobname}_output" 136 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/prepare_bpe.sh: -------------------------------------------------------------------------------- 1 | # Prepare BPE 2 | 3 | tgt="$1" 4 | base_folder="$2" 5 | VOCAB_SIZE=16000 6 | python src/subwords.py train \ 7 | --model_prefix "${base_folder}"/en${tgt}_subwords \ 8 | --vocab_size "${VOCAB_SIZE}" \ 9 | --model_type bpe \ 10 | --input "${base_folder}"/en${tgt}_parallel.train.$tgt,"${base_folder}"/en${tgt}_parallel.train.en 11 | 12 | # Apply BPE 13 | for split in train dev test 14 | do 15 | for l in $tgt en 16 | do 17 | python src/subwords.py segment \ 18 | --model "${base_folder}"/en${tgt}_subwords.model \ 19 | < "${base_folder}"/en${tgt}_parallel.$split.$l \ 20 | > "${base_folder}"/en${tgt}_parallel.bpe.$split.$l 21 | done 22 | done 23 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/train_generator.sh: -------------------------------------------------------------------------------- 1 | ########## 2 | # Usage: bash train_generator.sh 3 | ########## 4 | 5 | #!/usr/bin/env bash 6 | #SBATCH --mem=8G 7 | #SBATCH --gres=gpu:1 8 | #SBATCH -t 0 9 | tgt="$1" 10 | dataset="$2" 11 | base_folder="$3" 12 | 13 | # Switch to 0 for no bpe 14 | BPE=1 15 | if [ "$BPE" -eq 1 ]; then 16 | MODEL_PTH=models/$dataset/"bpe" 17 | echo "Using BPE" 18 | else 19 | MODEL_PTH=models/$dataset/"nobpe" 20 | echo "Not using BPE" 21 | fi 22 | 23 | mkdir -p $MODEL_PTH 24 | 25 | if [ "$BPE" -eq 1 ]; then 26 | python src/training.py \ 27 | --cuda \ 28 | --src en \ 29 | --tgt "$tgt" \ 30 | --model-file "$MODEL_PTH/en-${tgt}-generator.pt" \ 31 | --n-layers 4 \ 32 | --n-heads 4 \ 33 | --embed-dim 512 \ 34 | --hidden-dim 512 \ 35 | --dropout 0.3 \ 36 | --bpe \ 37 | --word-dropout 0.1 \ 38 | --lr 1e-3 \ 39 | --n-epochs 5 \ 40 | --tokens-per-batch 8000 \ 41 | --clip-grad 1.1 \ 42 | --base-folder "$base_folder" 43 | else 44 | python src/training.py \ 45 | --cuda \ 46 | --src en \ 47 | --tgt "$tgt" \ 48 | --model-file "$MODEL_PTH/en-${tgt}-generator.pt" \ 49 | --n-layers 4 \ 50 | --n-heads 4 \ 51 | --embed-dim 512 \ 52 | --hidden-dim 512 \ 53 | --dropout 0.3 \ 54 | --word-dropout 0.1 \ 55 | --lr 1e-3 \ 56 | --n-epochs 5 \ 57 | 
--tokens-per-batch 8000 \ 58 | --clip-grad 1.1 \ 59 | --base-folder "$base_folder" 60 | fi 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/scripts/train_tagger.sh: -------------------------------------------------------------------------------- 1 | ########## 2 | # Usage: bash train_tagger.sh <tgt> <dataset> <base_folder> 3 | ########## 4 | 5 | #!/usr/bin/env bash 6 | #SBATCH --mem=8G 7 | #SBATCH --gres=gpu:1 8 | #SBATCH -t 0 9 | tgt="$1" 10 | dataset="$2" 11 | base_folder="$3" 12 | 13 | # Switch to 0 for no bpe 14 | BPE=1 15 | if [ "$BPE" -eq 1 ]; then 16 | MODEL_PTH=models/$dataset/"bpe" 17 | echo "Using BPE" 18 | else 19 | MODEL_PTH=models/$dataset/"nobpe" 20 | echo "Not using BPE" 21 | fi 22 | 23 | mkdir -p $MODEL_PTH 24 | 25 | if [ "$BPE" -eq 1 ]; then 26 | python src/training.py \ 27 | --cuda \ 28 | --src en \ 29 | --tgt "$tgt" \ 30 | --model-file "$MODEL_PTH/en-${tgt}-tagger.pt" \ 31 | --n-layers 4 \ 32 | --n-heads 4 \ 33 | --embed-dim 512 \ 34 | --hidden-dim 512 \ 35 | --dropout 0.3 \ 36 | --bpe \ 37 | --word-dropout 0.1 \ 38 | --lr 1e-3 \ 39 | --n-epochs 5 \ 40 | --tokens-per-batch 8000 \ 41 | --clip-grad 1.1 \ 42 | --base-folder "$base_folder" 43 | else 44 | python src/training.py \ 45 | --cuda \ 46 | --src en \ 47 | --tgt "$tgt" \ 48 | --model-file "$MODEL_PTH/en-${tgt}-tagger.pt" \ 49 | --n-layers 4 \ 50 | --n-heads 4 \ 51 | --embed-dim 512 \ 52 | --hidden-dim 512 \ 53 | --dropout 0.3 \ 54 | --word-dropout 0.1 \ 55 | --lr 1e-3 \ 56 | --n-epochs 5 \ 57 | --tokens-per-batch 8000 \ 58 | --clip-grad 1.1 \ 59 | --base-folder "$base_folder" 60 | fi 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/data.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import torch as th 3 | from torch.utils import data 4 | import json 5 | import numpy as np 6 | 7 | 8 | def loadtxt(filename): 9 | txt = [] 10 | with open(filename, encoding="utf-8") as f: 11 | for line in f: 12 | txt.append(line.rstrip()) 13 | return txt 14 | 15 | 16 | class Vocab(object): 17 | """Maps symbols (words/tokens) to indices""" 18 | 19 | def __init__(self): 20 | # Containers 21 | self.symbols = [] 22 | self.idxs = {} 23 | # State 24 | self.frozen = False 25 | # Special symbols 26 | self.add_symbol("<pad>") # Padding token 27 | self.add_symbol("<sos>") # Start of sentence token 28 | self.add_symbol("<eos>") # End of sentence token 29 | self.add_symbol("<unk>") # Unknown token 30 | self.add_symbol("[GMASK]") # add GMASK 31 | 32 | def __len__(self): 33 | return len(self.idxs) 34 | 35 | def add_symbol(self, symbol): 36 | """Add a symbol to the dictionary and return its index 37 | 38 | If the symbol already exists in the dictionary this just returns 39 | the index""" 40 | if symbol not in self.idxs: 41 | if self.frozen: 42 | raise ValueError("Can't add symbol to frozen dictionary") 43 | self.symbols.append(symbol) 44 | # print(symbol, len(self.idxs)) 45 | self.idxs[symbol] = len(self.idxs) 46 | return self.idxs[symbol] 47 | 48 | def to_idx(self, symbol): 49 | """Return symbol's index 50 | 51 | If the symbol is not in the dictionary, returns the index of <unk>""" 52 | if symbol in self.idxs: 53 | return self.idxs[symbol] 54 | else: 55 | return self.idxs["<unk>"] 56 | 57 | def to_symbol(self, idx): 58 | """Return idx's symbol""" 59 | return self.symbols[idx] 60 | 61 | def __getitem__(self, symbol_or_idx): 62 | if isinstance(symbol_or_idx, int): 63 | return self.to_symbol(symbol_or_idx) 64 |
else: 65 | return self.to_idx(symbol_or_idx) 66 | 67 | @staticmethod 68 | def from_data_files(*filenames, max_size=-1, min_freq=2): # AB Change 1 69 | """Builds a dictionary from the most frequent tokens in files""" 70 | vocab = Vocab() 71 | # Record token counts 72 | token_counts = defaultdict(lambda: 0) 73 | for filename in filenames: 74 | with open(filename, encoding="utf-8") as f: 75 | for line in f: 76 | tokens = line.rstrip().split() 77 | for token in tokens: 78 | token_counts[token] += 1 79 | # Filter out least frequent tokens 80 | token_counts = { 81 | tok: cnt 82 | for tok, cnt in token_counts.items() 83 | if cnt >= min_freq 84 | } 85 | # Only keep most common tokens 86 | tokens = list(token_counts.keys()) 87 | sorted_tokens = sorted(tokens, key=lambda x: token_counts[x])[::-1] 88 | if max_size > 0: 89 | sorted_tokens = sorted_tokens[:max_size] 90 | # Add the remaining tokens to the dictionary 91 | for token in sorted_tokens: 92 | vocab.add_symbol(token) 93 | 94 | return vocab 95 | 96 | 97 | def _make_tagged_tokens(sents, pad_idx): 98 | """Pad sentences to the max length and create the relevant tag""" 99 | lengths = [len(sent) for sent in sents] 100 | max_len = max(lengths) 101 | bsz = len(lengths) 102 | # Tensor containing the (right) padded tokens 103 | tokens = th.full((max_len, bsz), pad_idx).long() 104 | for i in range(bsz): 105 | tokens[:lengths[i], i] = th.LongTensor(sents[i]) 106 | # Mask such that tag[i, b] = 1 iff lengths[b] < i 107 | lengths = th.LongTensor(lengths).view(1, -1) 108 | tag = th.gt(th.arange(max_len).view(-1, 1), lengths) 109 | # print (lengths, th.arange(max_len).view(-1, 1), tag) 110 | return tokens, tag 111 | 112 | 113 | class MTDataset(data.Dataset): 114 | 115 | def __init__(self, vocab, prefix, src_lang="en", tgt_lang="fr"): 116 | # Attributes 117 | self.vocab = vocab 118 | self.src_lang = src_lang 119 | self.tgt_lang = tgt_lang 120 | # Load from files 121 | src_file = prefix + "." + src_lang 122 | tgt_file = prefix + "." 
+ tgt_lang 123 | self.src_txt = loadtxt(src_file) 124 | self.tgt_txt = loadtxt(tgt_file) 125 | # Check length 126 | self.length = len(self.src_txt) 127 | if self.length != len(self.tgt_txt): 128 | raise ValueError("Mismatched source and target length") 129 | # Append start/end of sentence token to the target 130 | for idx, tgt_sent in enumerate(self.tgt_txt): 131 | self.tgt_txt[idx] = f" {tgt_sent} " 132 | # Convert to indices 133 | self.src_idxs = [ 134 | [self.vocab[tok] for tok in sent.split()] + [self.vocab[""]] 135 | for sent in self.src_txt 136 | ] 137 | self.tgt_idxs = [ 138 | [self.vocab[tok] for tok in sent.split()] 139 | for sent in self.tgt_txt 140 | ] 141 | 142 | def __getitem__(self, i): 143 | return self.src_idxs[i], self.tgt_idxs[i] 144 | 145 | def __len__(self): 146 | return self.length 147 | 148 | 149 | class MTDataLoader(data.DataLoader): 150 | """Special Dataloader for MT datasets 151 | 152 | Batches by number of sentences and/or tokens 153 | """ 154 | 155 | def __init__(self, dataset, vocab, dynamic_tag=False, max_bsz=1, max_tokens=1000, shuffle=False): 156 | 157 | self.dataset = dataset 158 | self.max_bsz = max_bsz 159 | self.max_tokens = max_tokens 160 | self.shuffle = shuffle 161 | self.vocab = vocab 162 | self.dynamic_tag = dynamic_tag 163 | if self.dynamic_tag: 164 | print("Training with Dynamic Mask.") 165 | # Order of batches 166 | 167 | def init_epoch(self): 168 | """Make batches that contain no more than 169 | `max_tokens` tokens and `max_bsz` samples""" 170 | N = len(self.dataset) 171 | if self.shuffle: 172 | self.order = th.randperm(N).numpy() 173 | else: 174 | self.order = th.arange(N).long().numpy() 175 | self.batches = [] 176 | batch_size = max_src_tokens = max_tgt_tokens = 0 177 | current_batch = [] 178 | pointer = 0 179 | while pointer < N: 180 | idx = self.order[pointer] 181 | src, tgt = self.dataset[idx] 182 | # Check whether adding this sample would bring us over 183 | # the size limit 184 | batch_size += 1 185 | max_src_tokens = max(max_src_tokens, len(src)) 186 | max_tgt_tokens = max(max_tgt_tokens, len(tgt)) 187 | tot_tokens = (max_src_tokens + max_tgt_tokens) * batch_size 188 | # If this is the case, wrap up current batch 189 | if batch_size > self.max_bsz or tot_tokens > self.max_tokens: 190 | if len(current_batch) > 0: 191 | self.batches.append(current_batch) 192 | else: 193 | # If this happens then there is one sample that is too big, 194 | # just ignore it wth a warning 195 | print(f"WARNING: ignoring sample {idx}" 196 | "(too big for specified batch size)") 197 | pointer += 1 198 | batch_size = max_src_tokens = max_tgt_tokens = 0 199 | current_batch = [] 200 | else: 201 | current_batch.append(idx) 202 | pointer += 1 203 | # Add the last batch 204 | if len(current_batch) > 0: 205 | self.batches.append(current_batch) 206 | 207 | 208 | 209 | def process_tokens(self, tag_dict): 210 | 211 | processed_tag_dict = { 212 | self.vocab[k] : v for k, v in tag_dict.items() if k in self.vocab.idxs 213 | } 214 | 215 | return processed_tag_dict 216 | 217 | 218 | def __iter__(self): 219 | self.init_epoch() 220 | self.pos = 0 221 | return self 222 | 223 | def __len__(self): 224 | return len(self.batches) 225 | 226 | def get_batch(self, pos): 227 | samples = [self.dataset[i] for i in self.batches[pos]] 228 | src_sents = [src for src, _ in samples] 229 | if self.dynamic_tag: 230 | tgt_sents = [self.get_gtagged(tgt) for _, tgt in samples] 231 | # for tgt in tgt_sents: 232 | # cnt = 0 233 | # for k in tgt: 234 | # print(k) 235 | # cnt+=(k == 4) 236 | # print 
("count",cnt) 237 | tgt_sents = np.array(tgt_sents) 238 | selection = np.ones(len(tgt_sents), dtype=bool) 239 | selection[1:] = tgt_sents[1:] != tgt_sents[:-1] 240 | tgt_sents = tgt_sents[selection] 241 | else: 242 | tgt_sents = [tgt for _, tgt in samples] 243 | # Input tensor 244 | pad_idx = self.dataset.vocab[""] 245 | src_tokens, src_tag = _make_tagged_tokens(src_sents, pad_idx) 246 | tgt_tokens, tgt_tag = _make_tagged_tokens(tgt_sents, pad_idx) 247 | # print(sum(tgt_tokens==4)) 248 | return src_tokens, src_tag, tgt_tokens, tgt_tag 249 | 250 | 251 | def get_gtagged(self, tgt_sent): 252 | 253 | output = [] 254 | for tok in tgt_sent: 255 | if tok in self.p9_tags: 256 | output.append(self.get_random(tok, self.p9_tags[tok])) 257 | else: 258 | output.append(tok) 259 | 260 | return output 261 | 262 | def get_random(self, tok, prob): 263 | 264 | if np.random.uniform() < prob: 265 | return self.vocab["[GMASK]"] 266 | else: 267 | return tok 268 | 269 | def __next__(self): 270 | if self.pos >= len(self.batches): 271 | raise StopIteration() 272 | batch = self.get_batch(self.pos) 273 | self.pos += 1 274 | return batch 275 | 276 | 277 | 278 | 279 | class MTNoisyDataset(data.Dataset): 280 | 281 | def __init__(self, vocab, prefix, src_lang="en", tgt_lang="fr"): 282 | # Attributes 283 | self.vocab = vocab 284 | self.src_lang = src_lang 285 | self.tgt_lang = tgt_lang 286 | 287 | # Load from files 288 | src_file = prefix + "." + src_lang 289 | tgt_file = prefix + "." + tgt_lang 290 | self.src_txt = loadtxt(src_file) 291 | self.tgt_txt = loadtxt(tgt_file) 292 | # Check length 293 | self.length = len(self.src_txt) 294 | if self.length != len(self.tgt_txt): 295 | raise ValueError("Mismatched source and target length") 296 | # Append start/end of sentence token to the target 297 | for idx, tgt_sent in enumerate(self.tgt_txt): 298 | self.tgt_txt[idx] = f" {tgt_sent} " 299 | # Convert to indices 300 | self.src_idxs = [ 301 | [self.vocab[tok] for tok in sent.split()] + [self.vocab[""]] 302 | for sent in self.src_txt 303 | ] 304 | self.tgt_idxs = [ 305 | [self.vocab[tok] for tok in sent.split()] 306 | for sent in self.tgt_txt 307 | ] 308 | 309 | def __getitem__(self, i): 310 | return self.src_idxs[i], self.tgt_idxs[i] 311 | 312 | def __len__(self): 313 | return self.length 314 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/decoding.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import math 3 | 4 | 5 | def sample(model, src_tokens, temperature=1.0, max_len=200, device=None): 6 | # Either decode on the model's device or specified device 7 | # (in which case move the model accordingly) 8 | if device is None: 9 | device = list(model.parameters())[0].device 10 | else: 11 | model = model.to(device) 12 | # Go into eval mode (e.g. 
disable dropout) 13 | model.eval() 14 | # Encode source sentence 15 | src_tensor = th.LongTensor(src_tokens).to(device).view(-1, 1) 16 | encodings = model.encode(src_tensor) 17 | # Initialize decoder state 18 | state = model.initial_state() 19 | # Start decoding 20 | out_tokens = [model.vocab["<sos>"]] 21 | eos_token = model.vocab["<eos>"] 22 | while out_tokens[-1] != eos_token and len(out_tokens) <= max_len: 23 | current_token = th.LongTensor([out_tokens[-1]]).view(1, 1).to(device) 24 | # One step of the decoder 25 | log_p, state = model.decode_step(current_token, encodings, state) 26 | # Probabilities 27 | probs = th.exp(log_p / temperature).view(-1) 28 | # Sample 29 | next_token = th.multinomial(probs.view(-1), 1).item() 30 | # Add to the generated sentence 31 | out_tokens.append(next_token) 32 | # Return generated tokens (idxs) without <sos> and <eos> 33 | out_tokens = out_tokens[1:] 34 | if out_tokens[-1] == eos_token: 35 | out_tokens = out_tokens[:-1] 36 | return out_tokens 37 | 38 | 39 | def greedy(model, src_tokens, max_len=200, device=None): 40 | # Either decode on the model's device or specified device 41 | # (in which case move the model accordingly) 42 | if device is None: 43 | device = list(model.parameters())[0].device 44 | else: 45 | model = model.to(device) 46 | # Go into eval mode (e.g. disable dropout) 47 | model.eval() 48 | # Encode source sentence 49 | src_tensor = th.LongTensor(src_tokens).to(device).view(-1, 1) 50 | encodings = model.encode(src_tensor) 51 | # Initialize decoder state 52 | state = model.initial_state() 53 | # Start decoding 54 | out_tokens = [model.vocab["<sos>"]] 55 | eos_token = model.vocab["<eos>"] 56 | while out_tokens[-1] != eos_token and len(out_tokens) <= max_len: 57 | current_token = th.LongTensor([out_tokens[-1]]).view(1, 1).to(device) 58 | # One step of the decoder 59 | log_p, state = model.decode_step(current_token, encodings, state) 60 | # Sample 61 | next_token = log_p.view(-1).argmax() 62 | # Add to the generated sentence 63 | out_tokens.append(next_token.item()) 64 | # Return generated tokens (idxs) without <sos> and <eos> 65 | out_tokens = out_tokens[1:] 66 | if out_tokens[-1] == eos_token: 67 | out_tokens = out_tokens[:-1] 68 | return out_tokens 69 | 70 | 71 | def beam_search( 72 | model, 73 | src_tokens, 74 | prefer_gtag, 75 | src_tag, 76 | beam_size=1, 77 | len_penalty=0.0, 78 | max_len=200, 79 | # style_prior=1, # lower the better! 80 | device=None 81 | ): 82 | # assert style_prior <= 1 and style_prior > 0 83 | # Either decode on the model's device or specified device 84 | # (in which case move the model accordingly) 85 | if device is None: 86 | device = list(model.parameters())[0].device 87 | else: 88 | model = model.to(device) 89 | # Go into eval mode (e.g.
disable dropout) 90 | model.eval() 91 | # Encode source sentence 92 | src_tensor = th.LongTensor(src_tokens).to(device).view(-1, 1) 93 | encodings = model.encode(src_tensor) 94 | # Initialize beams 95 | beams = [{ 96 | # Tokens generated in this beam 97 | "tokens": [model.vocab["<sos>"]], 98 | # Internal decoder state 99 | "state": model.initial_state(), 100 | # log probability of the sequence 101 | "log_p": 0, 102 | # Whether this beam is dead 103 | "is_over": False, 104 | }] 105 | # Start decoding 106 | eos_token = model.vocab["<eos>"] 107 | t = 0 108 | while not beams[-1]["is_over"]: 109 | # Pass on dead beams 110 | beam_candidates = [beam for beam in beams if beam["is_over"]] 111 | # Take a step on all active beams 112 | active_beams = [beam for beam in beams if not beam["is_over"]] 113 | # Last produced tokens 114 | current_tokens = th.LongTensor( 115 | [beam["tokens"][-1] for beam in active_beams]) 116 | current_tokens = current_tokens.view(1, -1).to(device) 117 | # Decoder states 118 | states = [ 119 | th.cat([beam["state"][layer] for beam in active_beams], dim=1) 120 | if beams[0]["state"][0] is not None 121 | else None 122 | for layer in range(model.n_layers) 123 | ] 124 | # Take a step 125 | log_ps, new_states = model.decode_step( 126 | current_tokens, 127 | encodings.repeat(1, len(active_beams), 1), 128 | states, 129 | ) 130 | # Topk tokens at this step 131 | log_ps = log_ps.view(log_ps.size(1), -1) 132 | 133 | # Style Prior 134 | # log_ps[:, model.vocab["GMASK"]] -= math.log(style_prior) 135 | 136 | log_p_tokens, top_tokens = log_ps.topk(beam_size, dim=-1) 137 | #print(log_ps.shape) 138 | 139 | # Append to candidates 140 | for i, beam in enumerate(active_beams): 141 | for token, log_p_token in zip(top_tokens[i], log_p_tokens[i]): 142 | # Update tokens, state and log_p 143 | candidate = { 144 | "tokens": beam["tokens"] + [token.item()], 145 | "state": [h[:, i:i+1].detach() for h in new_states], 146 | "log_p": beam["log_p"] + log_p_token.item(), 147 | "is_over": False, 148 | } 149 | # check whether this beam is over 150 | generated_eos = candidate["tokens"][-1] == eos_token 151 | too_long = len(candidate["tokens"]) > max_len 152 | candidate["is_over"] = generated_eos or too_long 153 | # Save candidate 154 | beam_candidates.append(candidate) 155 | t += 1 156 | # Now rerank and keep top beams 157 | beams = sorted( 158 | beam_candidates, 159 | key=lambda beam: beam["log_p"] / # log probability 160 | (len(beam["tokens"]))**len_penalty, # Length penalty 161 | )[-beam_size:] # top k 162 | # Return generated tokens (idxs) without <sos> and <eos> 163 | 164 | if prefer_gtag == 1: # prefer the hypothesis that's mostly gtags 165 | num_gtag_criterion = lambda x: len([x_i for x_i in x["tokens"] if src_tag in model.vocab[x_i]]) 166 | beams = sorted(beams, key=num_gtag_criterion) 167 | 168 | out_tokens = beams[-1]["tokens"][1:] 169 | if out_tokens[-1] == eos_token: 170 | out_tokens = out_tokens[:-1] 171 | return out_tokens 172 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/noisy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from data import _make_tagged_tokens 4 | import itertools 5 | 6 | # TODO: Add reference 7 | 8 | def word_shuffle(vocab, x, k): # slight shuffle such that |sigma[i]-i| <= k 9 | base = torch.arange(x.size(0), dtype=torch.float).repeat(x.size(1), 1).t() 10 | inc = (k+1) * torch.rand(x.size()) 11 | inc[x == vocab['<sos>']] = 0 # do not shuffle the start sentence
symbol 12 | inc[x == vocab['']] = k+1 # do not shuffle end paddings 13 | inc[x == vocab['']] = k+1 14 | _, sigma = (base + inc).sort(dim=0) 15 | return x[sigma, torch.arange(x.size(1))] 16 | 17 | def word_drop(vocab, x, p): # drop words with probability p 18 | x_ = [] 19 | for i in range(x.size(1)): 20 | words = x[:, i].tolist() 21 | keep = np.random.rand(len(words)) > p 22 | keep[0] = True # do not drop the start sentence symbol 23 | sent = [w for j, w in enumerate(words) if keep[j]] 24 | sent += [vocab['']] * (len(words)-len(sent)) 25 | x_.append(sent) 26 | return torch.LongTensor(x_).t().contiguous().to(x.device) 27 | 28 | def word_blank(vocab, x, p): # blank words with probability p 29 | blank = (torch.rand(x.size(), device=x.device) < p) & \ 30 | (x != vocab['']) & (x != vocab['']) & (x != vocab['']) 31 | x_ = x.clone() 32 | x_[blank] = vocab[''] 33 | return x_ 34 | 35 | def word_substitute(vocab, x, p): # substitute words with probability p 36 | keep = (torch.rand(x.size(), device=x.device) > p) | \ 37 | (x == vocab['']) | (x == vocab['']) | (x == vocab['']) | (x == vocab['[GMASK]']) 38 | x_ = x.clone() 39 | x_.random_(0, len(vocab)) 40 | x_[keep] = x[keep] 41 | return x_ 42 | 43 | def add_gtag(vocab, x, p): # drop words with probability p 44 | x_ = [] 45 | for i in range(x.size(1)): 46 | words = x[:, i].tolist() 47 | add = np.random.rand(len(words)) < p 48 | add[-1] = False 49 | # sent = [[w , vocab['▁['], vocab['GMASK'] , vocab[']']] if add[j] else [w] for j, w in enumerate(words)] 50 | sent = [[w , vocab[f'[GMASK{j//3}]']] if add[j] else [w] for j, w in enumerate(words)] 51 | sent = list(itertools.chain.from_iterable(sent)) + [vocab['']] 52 | x_.append(sent) 53 | sent, _ = _make_tagged_tokens(x_, vocab['']) 54 | return sent.to(x.device) 55 | 56 | def add_intelligent_gtag(vocab, x, p): # drop words with probability p 57 | x_ = [] 58 | for i in range(x.size(1)): 59 | words = x[:, i].tolist() 60 | add = np.random.rand(len(words)) < p 61 | add[-1] = False 62 | sent = [[w, vocab['GMASK']] if add[j] and w==vocab['GMASK'] else [w] for j, w in enumerate(words)] 63 | sent = list(itertools.chain.from_iterable(sent)) + [vocab['']] 64 | x_.append(sent) 65 | sent, _ = _make_tagged_tokens(x_, vocab['']) 66 | return sent.to(x.device) 67 | 68 | 69 | def intelligent_word_shuffle(vocab, x, k): # slight shuffle such that |sigma[i]-i| <= k 70 | base = torch.arange(x.size(0), dtype=torch.float).repeat(x.size(1), 1).t() 71 | inc = (k+1) * torch.rand(x.size()) 72 | for j in range(x.size(1)): 73 | for i in range(x.size(0)): 74 | do_shuf = 0 75 | for l in range(k//2): 76 | if x[max(i-l,0)][j] == vocab['GMASK']: 77 | do_shuf = 1 78 | for l in range(k//2): 79 | if x[min(i+l,x.size(0)-1)][j] == vocab['GMASK']: 80 | do_shuf = 1 81 | inc[i][j] *= do_shuf 82 | inc[x == vocab['']] = 0 # do not shuffle the start sentence symbol 83 | inc[x == vocab['']] = k+1 # do not shuffle end paddings 84 | inc[x == vocab['']] = k+1 85 | _, sigma = (base + inc).sort(dim=0) 86 | return x[sigma, torch.arange(x.size(1))] 87 | 88 | 89 | def noisy(vocab, x, drop_prob, blank_prob, sub_prob, shuffle_dist, add_gtag_prob, add_int_gtag_prob): 90 | if drop_prob > 0: 91 | x = word_drop(vocab, x, drop_prob) 92 | if blank_prob > 0: 93 | x = word_blank(vocab, x, blank_prob) 94 | if sub_prob > 0: 95 | x = word_substitute(vocab, x, sub_prob) 96 | if add_int_gtag_prob > 0: 97 | x = add_intelligent_gtag(vocab, x, add_gtag_prob) 98 | x = intelligent_word_shuffle(vocab, x, 3) 99 | if add_gtag_prob > 0: 100 | x = add_gtag(vocab, x, add_gtag_prob) 
101 | if shuffle_dist > 0: 102 | x = word_shuffle(vocab, x, shuffle_dist) 103 | return x 104 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/subwords.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import sentencepiece as sp 3 | import argparse 4 | 5 | 6 | def train(args): 7 | arg_string = "".join( 8 | arg + ("=" if arg.startswith("--") else " ") 9 | for arg in args 10 | ).strip() 11 | sp.SentencePieceTrainer.Train(arg_string) 12 | 13 | 14 | def load(model_path): 15 | model = sp.SentencePieceProcessor() 16 | model.Load(model_path) 17 | return model 18 | 19 | 20 | def desegment(tokens): 21 | return ("".join(tokens)).replace("▁", " ").strip() 22 | 23 | 24 | def get_args(): 25 | parser = argparse.ArgumentParser("Subword training/segmentation") 26 | subparsers = parser.add_subparsers(help="Actions") 27 | # Training 28 | train_parser = subparsers.add_parser("train") 29 | train_parser.set_defaults(which="train") 30 | train_parser.add_argument("--input", required=True, type=str) 31 | train_parser.add_argument("--model_prefix", required=True, type=str) 32 | train_parser.add_argument("--vocab_size", required=True, type=int) 33 | train_parser.add_argument("--model_type", required=True, type=str) 34 | # Segmentation 35 | segment_parser = subparsers.add_parser("segment") 36 | segment_parser.set_defaults(which="segment") 37 | segment_parser.add_argument("--model", required=True, type=str) 38 | # De-segmentation 39 | segment_parser = subparsers.add_parser("desegment") 40 | segment_parser.set_defaults(which="desegment") 41 | # Parse 42 | args = parser.parse_args() 43 | return args 44 | 45 | 46 | def main(): 47 | args = get_args() 48 | if args.which == "train": 49 | train(sys.argv[2:]) 50 | elif args.which == "segment": 51 | model = load(args.model) 52 | for line in sys.stdin: 53 | print(" ".join(model.EncodeAsPieces(line))) 54 | elif args.which == "desegment": 55 | for line in sys.stdin: 56 | print(desegment(line.strip().split())) 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/training.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import argparse 3 | from math import sqrt, exp 4 | import torch as th 5 | from data import MTDataset, MTDataLoader, Vocab, MTNoisyDataset 6 | from transformer import Transformer 7 | from tqdm import tqdm 8 | import noisy 9 | 10 | 11 | def load_data(src_lang, tgt_lang, base_folder, bpe=False): 12 | 13 | if bpe: 14 | train_prefix = os.path.join( 15 | base_folder, 16 | f"{src_lang}{tgt_lang}_parallel.bpe.train" 17 | ) 18 | 19 | dev_prefix = os.path.join( 20 | base_folder, 21 | f"{src_lang}{tgt_lang}_parallel.bpe.dev" 22 | ) 23 | print("loading", train_prefix, dev_prefix) 24 | vocab = Vocab.from_data_files( 25 | f"{train_prefix}.{src_lang}", 26 | f"{train_prefix}.{tgt_lang}", 27 | ) 28 | else: 29 | train_prefix = os.path.join( 30 | base_folder, 31 | f"{src_lang}{tgt_lang}_parallel.train" 32 | ) 33 | 34 | dev_prefix = os.path.join( 35 | base_folder, 36 | f"{src_lang}{tgt_lang}_parallel.dev" 37 | ) 38 | vocab = Vocab.from_data_files( 39 | f"{train_prefix}.{src_lang}", 40 | f"{train_prefix}.{tgt_lang}", 41 | min_freq=2 42 | ) 43 | print("loading", train_prefix, dev_prefix) 44 | train = MTNoisyDataset(vocab, train_prefix, 45 | src_lang=src_lang, tgt_lang=tgt_lang) 46 | valid = 
MTNoisyDataset(vocab, dev_prefix, 47 | src_lang=src_lang, tgt_lang=tgt_lang) 48 | return vocab, train, valid 49 | 50 | 51 | def get_args(): 52 | parser = argparse.ArgumentParser("Train an MT model") 53 | # General params 54 | parser.add_argument("--seed", type=int, default=11731) 55 | parser.add_argument("--src", type=str, default="en") 56 | parser.add_argument("--tgt", type=str) 57 | parser.add_argument("--model-file", type=str, default="model.pt") 58 | parser.add_argument("--overwrite-model", action="store_true") 59 | parser.add_argument("--cuda", action="store_true") 60 | parser.add_argument("--validate-only", action="store_true") 61 | parser.add_argument("--noisy-input", action="store_true") 62 | parser.add_argument("--noisy-output", action="store_true") 63 | parser.add_argument("--bpe", action="store_true") 64 | parser.add_argument("--dynamic-tag", action="store_true") 65 | parser.add_argument("--base-folder", type=str) 66 | # Model parameters 67 | parser.add_argument("--n-layers", type=int, default=4) 68 | parser.add_argument("--n-heads", type=int, default=4) 69 | parser.add_argument("--embed-dim", type=int, default=512) 70 | parser.add_argument("--hidden-dim", type=int, default=512) 71 | parser.add_argument("--dropout", type=float, default=0.1) 72 | parser.add_argument("--word-dropout", type=float, default=0.1) 73 | # Optimization parameters 74 | parser.add_argument("--n-epochs", type=int, default=15) 75 | parser.add_argument("--lr", type=float, default=4e-2) 76 | parser.add_argument("--lr-decay", type=float, default=0.8) 77 | parser.add_argument("--inverse-sqrt-schedule", action="store_true") 78 | parser.add_argument("--clip-grad", type=float, default=1.0) 79 | parser.add_argument("--tokens-per-batch", type=int, default=8000) 80 | parser.add_argument("--samples-per-batch", type=int, default=128) 81 | return parser.parse_args() 82 | 83 | 84 | def move_to_device(tensors, device): 85 | return [tensor.to(device) for tensor in tensors] 86 | 87 | 88 | def inverse_sqrt_schedule(warmup, lr0): 89 | """Inverse sqrt learning rate schedule with warmup""" 90 | step = 0 91 | # Trick for allowing warmup of 0 92 | warmup = max(warmup, 0.01) 93 | while True: 94 | scale = min(1/sqrt(step+1e-20), step/sqrt(warmup**3)) 95 | step += 1 96 | yield lr0 * scale 97 | 98 | 99 | def train_epoch(model, optim, dataloader, lr_schedule=None, clip_grad=5.0, is_noisy=(False, False)): 100 | # Model device 101 | device = list(model.parameters())[0].device 102 | # Iterate over batches 103 | itr = tqdm(dataloader) 104 | print("Train with noisy input : ", is_noisy[0]) 105 | print("Train with noisy output. ", is_noisy[1]) 106 | 107 | 108 | 109 | for batch in itr: 110 | optim.zero_grad() 111 | itr.total = len(dataloader) 112 | # Cast input to device 113 | batch = move_to_device(batch, device) 114 | # Various inputs 115 | src_tokens, src_tag, tgt_tokens, tgt_tag = batch 116 | # print(model.vocab["[GMASK]"]) 117 | # print(tgt_tokens, th.sum(tgt_tokens==model.vocab["[GMASK]"])) 118 | # Noise 119 | if is_noisy[0] and model.training: 120 | src_tokens = noisy.noisy(model.vocab, src_tokens, drop_prob=0.025, blank_prob=0., sub_prob=0.075,\ 121 | shuffle_dist=2, add_gtag_prob=0.0, add_int_gtag_prob=0) 122 | src_tag = (src_tokens == model.vocab['']) 123 | 124 | if is_noisy[1] and model.training: 125 | tgt_tokens = noisy.noisy(model.vocab, tgt_tokens, drop_prob=0., blank_prob=0., sub_prob=0.0, shuffle_dist=0,\ 126 | add_gtag_prob=0.0, add_int_gtag_prob=0.) 
127 | 128 | # Get log probs 129 | log_p = model(src_tokens, tgt_tokens[:-1], src_tag) 130 | # Negative log likelihood of the target tokens 131 | # (this selects log_p[i, b, tgt_tokens[i+1, b]] 132 | # for each batch b, position i) 133 | nll = th.nn.functional.nll_loss( 134 | # Log probabilities (flattened to (l*b) x V) 135 | log_p.view(-1, log_p.size(-1)), 136 | # Target tokens (we start from the 1st real token, ignoring ) 137 | tgt_tokens[1:].view(-1), 138 | # Don't compute the nll of padding tokens 139 | ignore_index=model.vocab[""], 140 | # Take the average 141 | reduction="mean", 142 | ) 143 | # Perplexity (for logging) 144 | ppl = th.exp(nll).item() 145 | # Backprop 146 | nll.backward() 147 | # Adjust learning rate with schedule 148 | if lr_schedule is not None: 149 | learning_rate = next(lr_schedule) 150 | for param_group in optim.param_groups: 151 | param_group["lr"] = learning_rate 152 | # Gradient clipping 153 | if clip_grad > 0: 154 | th.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) 155 | # Optimizer step 156 | optim.step() 157 | # Update stats 158 | itr.set_postfix(loss=f"{nll.item():.3f}", ppl=f"{ppl:.2f}") 159 | 160 | 161 | def evaluate_ppl(model, dataloader): 162 | model.eval() 163 | # Model device 164 | device = list(model.parameters())[0].device 165 | # total tokens 166 | tot_tokens = tot_nll = 0 167 | # Iterate over batches 168 | for batch in tqdm(dataloader): 169 | # Cast input to device 170 | batch = move_to_device(batch, device) 171 | # Various inputs 172 | src_tokens, src_tag, tgt_tokens, tgt_tag = batch 173 | with th.no_grad(): 174 | # Get log probs 175 | log_p = model(src_tokens, tgt_tokens[:-1], src_tag) 176 | # Negative log likelihood of the target tokens 177 | # (this selects log_p[i, b, tgt_tokens[i+1, b]] 178 | # for each batch b, position i) 179 | nll = th.nn.functional.nll_loss( 180 | # Log probabilities (flattened to (l*b) x V) 181 | log_p.view(-1, log_p.size(-1)), 182 | # Target tokens (we start from the 1st real token) 183 | tgt_tokens[1:].view(-1), 184 | # Don't compute the nll of padding tokens 185 | ignore_index=model.vocab[""], 186 | # Take the average 187 | reduction="sum", 188 | ) 189 | # Number of tokens (ignoring and ) 190 | n_sos = tgt_tokens.eq(model.vocab[""]).float().sum().item() 191 | n_pad = tgt_tokens.eq(model.vocab[""]).float().sum().item() 192 | n_tokens = tgt_tokens.numel() - n_pad - n_sos 193 | # Keep track 194 | tot_nll += nll.item() 195 | tot_tokens += n_tokens 196 | return exp(tot_nll / tot_tokens) 197 | 198 | def read_embeddings(embeddings_path, vocab, device): 199 | word_to_weights = {} 200 | with open(embeddings_path, "r") as f: 201 | for line in f: 202 | elems = line.strip().split() 203 | word = elems[0] 204 | emebeddings = [float(e) for e in elems[1:]] 205 | word_to_weights[vocab[word]] = emebeddings # store id -> W 206 | W = [] 207 | zero_embed = [0 for _ in range(len(elems[1:]))] 208 | for i in range(len(vocab)): 209 | if i in word_to_weights: 210 | W.append(word_to_weights[i]) 211 | else: 212 | W.append(zero_embed) 213 | W = torch.FloatTensor(W) 214 | return torch.nn.Embedding.from_pretrained(W, freeze=False) 215 | 216 | def main(): 217 | # Command line arguments 218 | args = get_args() 219 | # Set random seed 220 | th.manual_seed(args.seed) 221 | # data 222 | vocab, train_data, valid_data = load_data(args.src, args.tgt, base_folder=args.base_folder, bpe=args.bpe) 223 | # Model 224 | model = Transformer( 225 | args.n_layers, 226 | args.embed_dim, 227 | args.hidden_dim, 228 | args.n_heads, 229 | vocab, 230 | 
args.dropout, 231 | args.word_dropout, 232 | ) 233 | if args.cuda: 234 | model = model.cuda() 235 | # Load existing model 236 | if os.path.isfile(args.model_file) and not args.overwrite_model: 237 | model.load_state_dict(th.load(args.model_file)) 238 | # Optimizer 239 | optim = th.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98)) 240 | # Learning rate schedule 241 | lr_schedule = None 242 | if args.inverse_sqrt_schedule: 243 | inverse_sqrt_schedule(2000, args.lr) 244 | # Dataloader 245 | train_loader = MTDataLoader( 246 | train_data, 247 | max_bsz=args.samples_per_batch, 248 | max_tokens=args.tokens_per_batch, 249 | shuffle=True, 250 | dynamic_tag=args.dynamic_tag, 251 | vocab=vocab 252 | ) 253 | valid_loader = MTDataLoader( 254 | valid_data, 255 | max_bsz=args.samples_per_batch, 256 | max_tokens=args.tokens_per_batch, 257 | shuffle=False, 258 | dynamic_tag=args.dynamic_tag, 259 | vocab=vocab 260 | ) 261 | # Either validate 262 | if args.validate_only: 263 | valid_ppl = evaluate_ppl(model, valid_loader) 264 | print(f"Validation perplexity: {valid_ppl:.2f}") 265 | else: 266 | # Train epochs 267 | best_ppl = 1e12 268 | for epoch in range(1, args.n_epochs+1): 269 | print(f"----- Epoch {epoch} -----", flush=True) 270 | # Train for one epoch 271 | model.train() 272 | train_epoch(model, optim, train_loader, 273 | lr_schedule, args.clip_grad, (args.noisy_input, args.noisy_output)) 274 | # Check dev ppl 275 | model.eval() 276 | valid_ppl = evaluate_ppl(model, valid_loader) 277 | print(f"Validation perplexity: {valid_ppl:.2f}", flush=True) 278 | # Early stopping maybe 279 | if valid_ppl < best_ppl: 280 | best_ppl = valid_ppl 281 | print(f"Saving new best model (epoch {epoch} ppl {valid_ppl})") 282 | th.save(model.state_dict(), args.model_file) 283 | else: 284 | for param_group in optim.param_groups: 285 | param_group["lr"] *= args.lr_decay 286 | 287 | 288 | if __name__ == "__main__": 289 | main() 290 | -------------------------------------------------------------------------------- /tag-and-generate-train/src/translate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import torch as th 4 | from tqdm import tqdm 5 | import numpy as np 6 | import random 7 | 8 | from transformer import Transformer 9 | from decoding import sample, greedy, beam_search 10 | from training import load_data 11 | from subwords import desegment 12 | 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser("Translate with an MT model") 16 | # General params 17 | parser.add_argument("--src", type=str, default="en") 18 | parser.add_argument("--tgt", type=str) 19 | parser.add_argument("--model-file", type=str, 20 | default="model.pt", required=True) 21 | parser.add_argument("--input-file", type=str, default=None) 22 | parser.add_argument("--output-file", type=str, default=None) 23 | parser.add_argument("--cuda", action="store_true") 24 | parser.add_argument("--seed", type=int, default=15062019) 25 | parser.add_argument("--bpe", action="store_true") 26 | parser.add_argument("--base-folder", type=str) 27 | # Model parameters 28 | parser.add_argument("--n-layers", type=int, default=4) 29 | parser.add_argument("--n-heads", type=int, default=4) 30 | parser.add_argument("--embed-dim", type=int, default=512) 31 | parser.add_argument("--hidden-dim", type=int, default=512) 32 | parser.add_argument("--dropout", type=float, default=0.3) 33 | # Translation parameters 34 | parser.add_argument("--search", type=str, default="beam_search", 35 | 
choices=["random", "greedy", "beam_search"]) 36 | parser.add_argument("--beam-size", type=int, default=2) 37 | parser.add_argument("--prefer_gtag", type=int, default=0) 38 | parser.add_argument("--tag", type=str) 39 | return parser.parse_args() 40 | 41 | 42 | def move_to_device(tensors, device): 43 | return [tensor.to(device) for tensor in tensors] 44 | 45 | 46 | def translate_sentence( 47 | model, 48 | sentence, 49 | prefer_gtag, 50 | tag, 51 | beam_size=1, 52 | search="beam_search", 53 | vocab=None 54 | ): 55 | # Convert string to indices 56 | src_tokens = [model.vocab[word] for word in sentence] 57 | # Decode 58 | with th.no_grad(): 59 | if search == "random": 60 | out_tokens = sample(model, src_tokens) 61 | elif search == "greedy": 62 | out_tokens = greedy(model, src_tokens) 63 | elif search == "beam_search": 64 | out_tokens = beam_search(model=model, src_tokens=src_tokens, beam_size=beam_size, src_tag=tag, 65 | prefer_gtag=prefer_gtag) 66 | 67 | # Convert back to strings 68 | return [model.vocab[tok] for tok in out_tokens] 69 | 70 | 71 | def main(): 72 | # Command line arguments 73 | args = get_args() 74 | # Fix seed for consistent sampling 75 | th.manual_seed(args.seed) 76 | #np.random.seed(args.seed) 77 | #random.seed(args.seed) 78 | 79 | # data 80 | vocab, _, _ = load_data(args.src, args.tgt, base_folder=args.base_folder, bpe=args.bpe) 81 | # Model 82 | model = Transformer( 83 | args.n_layers, 84 | args.embed_dim, 85 | args.hidden_dim, 86 | args.n_heads, 87 | vocab, 88 | args.dropout 89 | ) 90 | if args.cuda: 91 | model = model.cuda() 92 | # Load existing model 93 | model.load_state_dict(th.load(args.model_file, map_location="cpu")) 94 | # Read from file/stdin 95 | if args.input_file is not None: 96 | input_stream = open(args.input_file, "r", encoding="utf-8") 97 | else: 98 | input_stream = sys.stdin 99 | # Write to file/stdout 100 | if args.output_file is not None: 101 | output_stream = open(args.output_file, "w", encoding="utf-8") 102 | # If we're printing to a file, display stats in stdout 103 | input_stream = tqdm(input_stream) 104 | else: 105 | output_stream = sys.stdout 106 | # Translate 107 | try: 108 | for line in input_stream: 109 | in_words = line.strip().split() 110 | out_words = translate_sentence( 111 | model, 112 | in_words, 113 | beam_size=args.beam_size, 114 | search=args.search, 115 | vocab=vocab, 116 | prefer_gtag=args.prefer_gtag == 1, 117 | tag=args.tag 118 | ) 119 | if args.bpe: 120 | print(desegment(out_words), file=output_stream) 121 | else: 122 | print(" ".join(out_words), file=output_stream) 123 | output_stream.flush() 124 | except KeyboardInterrupt: 125 | pass 126 | finally: 127 | input_stream.close() 128 | output_stream.close() 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | --------------------------------------------------------------------------------