├── LICENSE ├── README.md ├── data ├── msmarco │ ├── get_data.sh │ └── process.py └── setup.sh ├── main ├── multitask.py ├── ranker.py └── recommender.py ├── neuroir ├── __init__.py ├── config.py ├── decoders │ ├── __init__.py │ ├── decoder.py │ ├── rnn_decoder.py │ └── state.py ├── encoders │ ├── __init__.py │ ├── encoder.py │ └── rnn_encoder.py ├── eval │ ├── __init__.py │ ├── bleu │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── ltorank.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ └── squad_eval.py ├── hyparam.py ├── inputters │ ├── __init__.py │ ├── constants.py │ ├── multitask │ │ ├── __init__.py │ │ ├── data.py │ │ ├── utils.py │ │ └── vector.py │ ├── ranker │ │ ├── __init__.py │ │ ├── data.py │ │ ├── utils.py │ │ └── vector.py │ ├── recommender │ │ ├── __init__.py │ │ ├── data.py │ │ ├── utils.py │ │ └── vector.py │ └── vocabulary.py ├── models │ ├── __init__.py │ ├── multitask.py │ ├── ranker.py │ └── recommender.py ├── modules │ ├── __init__.py │ ├── copy_generator.py │ ├── embeddings.py │ ├── global_attention.py │ ├── maxout.py │ └── util_class.py ├── multitask │ ├── __init.py │ ├── cars.py │ ├── layers.py │ ├── mmtensor.py │ └── mnsrf.py ├── objects │ ├── __init__.py │ ├── document.py │ ├── query.py │ └── session.py ├── rankers │ ├── __init.py │ ├── arci.py │ ├── arcii.py │ ├── cdssm.py │ ├── drmm.py │ ├── dssm.py │ ├── duet.py │ ├── esm.py │ └── mtensor.py ├── recommender │ ├── __init.py │ ├── hredqs.py │ ├── layers.py │ └── seq2seq.py └── utils │ ├── __init__.py │ ├── copy_utils.py │ ├── logging.py │ ├── misc.py │ └── timer.py └── scripts ├── multitask.sh ├── ranker.sh └── recommender.sh /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Wasi Ahmad 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Context-aware Neural Information Retrieval 2 | 3 | ### Introduction 4 | 5 | PyTorch code for our ICLR 2018 and SIGIR 2019 papers. 
6 | - [ICLR 2018] [Multi-task Learning for Document Ranking and Query Suggestion](https://openreview.net/pdf?id=SJ1nzBeA-) 7 | - [SIGIR 2019] [Context Attentive Document Ranking and Query Suggestion](https://arxiv.org/abs/1906.02329) 8 | 9 | The codebase contains the source code of 8 document ranking models, 3 query suggestion models, and 3 multi-task context-aware ranking and suggestion models. 10 | 11 | ##### Document Ranking Models 12 | - ESM: Embedding Space Model [See the details in our ICLR'18 paper] 13 | - DSSM: [Learning Deep Structured Semantic Models for Web Search using Clickthrough Data](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf) 14 | - CDSSM: [Learning Semantic Representations Using Convolutional Neural Networks for Web Search](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/www2014_cdssm_p07.pdf) 15 | - DRMM: [A Deep Relevance Matching Model for Ad-hoc Retrieval](https://arxiv.org/abs/1711.08611) 16 | - ARCI/ARCII: [Convolutional Neural Network Architectures for Matching Natural Language Sentences](https://arxiv.org/pdf/1503.03244.pdf) 17 | - DUET: [Learning to Match using Local and Distributed Representations of Text for Web Search](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/10/wwwfp0192-mitra.pdf) 18 | - MATCH TENSOR: [Match-Tensor: a Deep Relevance Model for Search](https://arxiv.org/abs/1701.07795) 19 | 20 | ##### Query Suggestion Models 21 | 22 | - HREDQS: [A Hierarchical Recurrent Encoder-Decoder for Generative Context-Aware Query Suggestion](https://arxiv.org/abs/1507.02221) 23 | - Seq2seq with Attention: [Effective Approaches to Attention-based Neural Machine Translation](https://nlp.stanford.edu/pubs/emnlp15_attn.pdf) 24 | - ACG: [Learning to Attend, Copy, and Generate for Session-Based Query Suggestion](https://arxiv.org/abs/1708.03418) 25 | 26 | Please note that we have a simplified implementation of ACG. 27 | 28 | ##### Multi-task Learning Models 29 | 30 | - MNSRF/M-MATCH-TENSOR: [Multi-task Learning for Document Ranking and Query Suggestion](https://openreview.net/pdf?id=SJ1nzBeA-) 31 | - CARS: [Context Attentive Document Ranking and Query Suggestion](https://arxiv.org/abs/1906.02329) 32 | 33 | ### Requirements 34 | 35 | * python 3.6 36 | * pytorch >= 0.4 (tested on pytorch 0.4.1) 37 | * [spaCy](https://spacy.io/usage) 38 | * [tqdm](https://pypi.org/project/tqdm/) 39 | * [prettytable](https://pypi.org/project/PrettyTable/) 40 | 41 | 42 | ### Training/Testing Models 43 | 44 | ``` 45 | $ cd scripts 46 | $ bash SCRIPT_NAME GPU_ID MODEL_NAME 47 | ``` 48 | 49 | - To train/test document ranking models, use `ranker.sh` in place of `SCRIPT_NAME` 50 | - To train/test query suggestion models, use `recommender.sh` in place of `SCRIPT_NAME` 51 | - To train/test multitask models, use `multitask.sh` in place of `SCRIPT_NAME` 52 | 53 | Here is a list of models that you can use in place of `MODEL_NAME`. 54 | 55 | - Document Ranking Models: `esm, dssm, cdssm, drmm, arci, arcii, duet, match_tensor` 56 | - Query Suggestion Models: `seq2seq, hredqs, acg` 57 | - Multitask Models: `mnsrf, m_match_tensor, cars` 58 | 59 | For example, if you want to run our CARS model, run the following command. 60 | 61 | ``` 62 | bash multitask.sh GPU_ID cars 63 | ``` 64 | 65 | ##### Running experiments on CPU/GPU/Multi-GPU 66 | 67 | - If `GPU_ID` is set to -1, CPU will be used. 68 | - If `GPU_ID` is set to one specific number, only one GPU will be used.
69 | - If `GPU_ID` is set to multiple numbers (e.g., 0,1,2), then parallel computing will be used. 70 | 71 | ### An Artificial Dataset 72 | 73 | We are unable to make our experimental dataset publicly available. However, we are sharing scripts to create an artificial dataset from the [MSMARCO Q&A v2.1](https://github.com/microsoft/MSMARCO-Question-Answering#qa) and [MSMARCO Conversational Search](https://github.com/microsoft/MSMARCO-Conversational-Search#corpus-generation) datasets. Please run the [script](https://github.com/wasiahmad/context_attentive_ir/blob/master/data/msmarco/get_data.sh) by going into the `/data/msmarco/` directory. Once the data is generated, you should be able to see a table showing the following statistics. 74 | 75 | | Attribute | Train | Dev | Test | 76 | | :--- | ---: | ---: | ---: | 77 | | Sessions | 223876 | 24832 | 27673 | 78 | | Queries | 1530546 | 169413 | 189095 | 79 | | Avg Session Len | 6.84 | 6.82 | 6.83 | 80 | | Avg Query Len | 3.84 | 3.85 | 3.84 | 81 | | Max Query Len | 40 | 32 | 32 | 82 | | Avg Doc Len | 63.41 | 63.43 | 63.48 | 83 | | Max Doc Len | 290 | 290 | 290 | 84 | | Avg Click Per Query | 1.05 | 1.05 | 1.05 | 85 | | Max Click Per Query | 6 | 6 | 6 | 86 | 87 | ### Results on the Artificial Dataset 88 | 89 | Coming soon! 90 | 91 | ### Acknowledgement 92 | 93 | I borrowed and modified code from [DrQA](https://github.com/facebookresearch/DrQA) and [OpenNMT](https://github.com/OpenNMT/OpenNMT-py). I would like to express my gratitude to the authors of these repositories. 94 | 95 | 96 | ### Citation 97 | 98 | If you find the resources in this repo useful, please cite our papers. 99 | 100 | ``` 101 | @inproceedings{Ahmad:2019:CAD:3331184.3331246, 102 | author = {Ahmad, Wasi Uddin and Chang, Kai-Wei and Wang, Hongning}, 103 | title = {Context Attentive Document Ranking and Query Suggestion}, 104 | booktitle = {Proceedings of the 42nd International ACM SIGIR Conference on Research and Development in Information Retrieval}, 105 | year = {2019}, 106 | pages = {385--394} 107 | } 108 | ``` 109 | 110 | ``` 111 | @inproceedings{uddin2018multitask, 112 | title={Multi-Task Learning for Document Ranking and Query Suggestion}, 113 | author={Wasi Uddin Ahmad and Kai-Wei Chang and Hongning Wang}, 114 | booktitle={International Conference on Learning Representations}, 115 | year={2018} 116 | } 117 | ``` 118 | -------------------------------------------------------------------------------- /data/msmarco/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # download Q&A v2.1 dataset 4 | wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz 5 | wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz 6 | 7 | # download conversational search dataset 8 | wget https://msmarco.blob.core.windows.net/conversationalsearch/ann_session_train.tar.gz 9 | wget https://msmarco.blob.core.windows.net/conversationalsearch/ann_session_dev.tar.gz 10 | 11 | # decompress the conversational search dataset 12 | tar -xvzf ann_session_train.tar.gz 13 | tar -xvzf ann_session_dev.tar.gz 14 | 15 | # remove unnecessary files 16 | rm marco_ann_session.*.half* 17 | rm full_marco_sessions* 18 | 19 | # remove original tar files 20 | rm ann_session_train.tar.gz 21 | rm ann_session_dev.tar.gz 22 | 23 | # download data for document title 24 | wget https://msmarco.blob.core.windows.net/msmarcoranking/fulldocs.tsv.gz 25 | python process.py 1 26 | rm fulldocs.tsv.gz 27 | 28 | # split, process data 29 | python
process.py 2 30 | 31 | # print the statistics of the data 32 | python process.py 3 33 | 34 | # all done, remove all intermediate and src files 35 | rm doctitles.tsv 36 | rm marco_ann_session.train.all.tsv 37 | rm marco_ann_session.dev.all.tsv 38 | rm train_v2.1.json.gz 39 | rm dev_v2.1.json.gz 40 | -------------------------------------------------------------------------------- /data/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function make_dir () { 4 | if [[ ! -d "$1" ]]; then 5 | mkdir $1 6 | fi 7 | } 8 | 9 | TMP_DIR=../tmp 10 | DATA_DIR=../data/ 11 | FASTTEXT=../data/fasttext 12 | 13 | for dir in $TMP_DIR $FASTTEXT; 14 | do 15 | make_dir $dir; 16 | done 17 | 18 | echo "Downloading Fasttext word embeddings" 19 | if [[ "$(ls -A $FASTTEXT)" ]]; then 20 | echo "$FASTTEXT is not empty, skipping download" 21 | else 22 | # download GloVe 840B version 23 | curl -Lo ${FASTTEXT}/crawl-300d-2M-subword.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip 24 | unzip ${FASTTEXT}/crawl-300d-2M-subword.zip -d ${FASTTEXT}/ 25 | rm -f ${FASTTEXT}/crawl-300d-2M-subword.bin 26 | fi 27 | -------------------------------------------------------------------------------- /neuroir/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | -------------------------------------------------------------------------------- /neuroir/config.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/config.py 2 | """ Implementation of all available options """ 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import logging 7 | from .hyparam import get_model_specific_params 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Index of arguments concerning the core model architecture 12 | MODEL_OPTIONS = { 13 | 'model_type', 'emsize', 'use_word', 'use_char_ngram', 14 | 'copy_attn', 'resue_copy_attn', 'force_copy' 15 | } 16 | 17 | # Index of arguments concerning the model optimizer/training 18 | MODEL_OPTIMIZER = { 19 | 'fix_embeddings', 'optimizer', 'learning_rate', 'momentum', 20 | 'weight_decay', 'rnn_padding', 'dropout_rnn', 'dropout', 21 | 'dropout_emb', 'cuda', 'grad_clipping', 'lr_decay' 22 | } 23 | 24 | DATA_OPTIONS = { 25 | 'max_doc_len', 'max_query_len', 'num_candidates', 'force_pad' 26 | } 27 | 28 | 29 | def str2bool(v): 30 | return v.lower() in ('yes', 'true', 't', '1', 'y') 31 | 32 | 33 | def add_model_args(parser): 34 | parser.register('type', 'bool', str2bool) 35 | 36 | # Data options 37 | data = parser.add_argument_group('Data parameters') 38 | data.add_argument('--max_doc_len', type=int, default=200, 39 | help='Maximum allowed length for the documents') 40 | data.add_argument('--max_query_len', type=int, default=10, 41 | help='Maximum allowed length for the queries') 42 | data.add_argument('--num_candidates', type=int, default=10, 43 | help='Number of candidates per query') 44 | 45 | # Model architecture 46 | model = parser.add_argument_group('Neural QA Reader Architecture') 47 | model.add_argument('--use_word', type='bool', default=True, 48 | help='Use word embeddings as a part of the input representations.') 49 | model.add_argument('--use_char_ngram', type=int, default=0, 50 | help='Use char ngram for the input representations.') 51 | model.add_argument('--emsize', type=int, default=300, 52 | help='Embedding 
size if embedding_file is not given') 53 | model.add_argument('--rnn_type', type=str, default='LSTM', 54 | help='RNN type: LSTM, GRU') 55 | model.add_argument('--bidirection', type='bool', default=True, 56 | help='use bidirectional recurrent unit') 57 | model.add_argument('--nlayers', type=int, default=1, 58 | help='Number of encoding layers') 59 | 60 | seq2seq = parser.add_argument_group('Seq2seq Model Specific Params') 61 | seq2seq.add_argument('--attn_type', type=str, default='general', 62 | help='Attention type for the seq2seq [dot, general, mlp]') 63 | seq2seq.add_argument('--coverage_attn', type='bool', default=False, 64 | help='Use coverage attention') 65 | seq2seq.add_argument('--copy_attn', type='bool', default=False, 66 | help='Use copy attention') 67 | seq2seq.add_argument('--force_copy', type='bool', default=False, 68 | help='Apply force copying') 69 | seq2seq.add_argument('--reuse_copy_attn', type='bool', default=False, 70 | help='Reuse encoder attention') 71 | 72 | # Optimization details 73 | optim = parser.add_argument_group('Neural QA Reader Optimization') 74 | optim.add_argument('--dropout_emb', type=float, default=0.2, 75 | help='Dropout rate for word embeddings') 76 | optim.add_argument('--dropout_rnn', type=float, default=0.2, 77 | help='Dropout rate for RNN states') 78 | optim.add_argument('--dropout', type=float, default=0.2, 79 | help='Dropout for NN layers') 80 | optim.add_argument('--optimizer', type=str, default='adam', 81 | help='Optimizer: sgd or adamax') 82 | optim.add_argument('--learning_rate', type=float, default=0.001, 83 | help='Learning rate for the optimizer') 84 | optim.add_argument('--lr_decay', type=float, default=0.95, 85 | help='Decay ratio for learning rate') 86 | optim.add_argument('--grad_clipping', type=float, default=10, 87 | help='Gradient clipping') 88 | optim.add_argument('--early_stop', type=int, default=5, 89 | help='Stop training if performance doesn\'t improve') 90 | optim.add_argument('--weight_decay', type=float, default=0, 91 | help='Weight decay factor') 92 | optim.add_argument('--momentum', type=float, default=0, 93 | help='Momentum factor') 94 | optim.add_argument('--fix_embeddings', type='bool', default=False, 95 | help='Keep word embeddings fixed (use pretrained)') 96 | 97 | 98 | def get_model_args(args): 99 | """Filter args for model ones. 100 | From a args Namespace, return a new Namespace with *only* the args specific 101 | to the model architecture or optimization. (i.e. the ones defined here.) 102 | """ 103 | global MODEL_OPTIONS, MODEL_OPTIMIZER, DATA_OPTIONS 104 | 105 | model = args.model_type.upper() 106 | required_args = MODEL_OPTIONS | MODEL_OPTIMIZER | DATA_OPTIONS 107 | 108 | arg_values = {k: v for k, v in vars(args).items() if k in required_args} 109 | # using a fixed set of hyper-parameters that are model specific 110 | for k, v in get_model_specific_params(model, field='arch').items(): 111 | arg_values[k] = v 112 | return argparse.Namespace(**arg_values) 113 | 114 | 115 | def update_model_args(args): 116 | model = args.model_type.upper() 117 | old_args = vars(args) 118 | for k, v in get_model_specific_params(model, field='data').items(): 119 | old_args[k] = v 120 | return argparse.Namespace(**old_args) 121 | 122 | 123 | def override_model_args(old_args, new_args): 124 | """Set args to new parameters. 125 | Decide which model args to keep and which to override when resolving a set 126 | of saved args and new args. 127 | We keep the new optimization or RL setting, and leave the model architecture alone. 
128 | """ 129 | global MODEL_OPTIMIZER 130 | old_args, new_args = vars(old_args), vars(new_args) 131 | for k in old_args.keys(): 132 | if k in new_args and old_args[k] != new_args[k]: 133 | if k in MODEL_OPTIMIZER: 134 | logger.info('Overriding saved %s: %s --> %s' % 135 | (k, old_args[k], new_args[k])) 136 | old_args[k] = new_args[k] 137 | else: 138 | logger.info('Keeping saved %s: %s' % (k, old_args[k])) 139 | 140 | return argparse.Namespace(**old_args) 141 | -------------------------------------------------------------------------------- /neuroir/decoders/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .decoder import * 4 | from .rnn_decoder import * 5 | from .state import * 6 | -------------------------------------------------------------------------------- /neuroir/decoders/decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from neuroir.utils.misc import aeq 5 | from neuroir.decoders.state import RNNDecoderState 6 | from neuroir.modules.global_attention import GlobalAttention 7 | 8 | 9 | class DecoderBase(nn.Module): 10 | """Abstract class for decoders. 11 | Args: 12 | attentional (bool): The decoder returns non-empty attention. 13 | """ 14 | 15 | def __init__(self, attentional=True): 16 | super(DecoderBase, self).__init__() 17 | self.attentional = attentional 18 | 19 | @classmethod 20 | def from_opt(cls, opt, embeddings): 21 | """Alternate constructor. 22 | Subclasses should override this method. 23 | """ 24 | 25 | raise NotImplementedError 26 | 27 | 28 | # many part of the codes are copied from OpenNMT-Py sources 29 | class RNNDecoderBase(nn.Module): 30 | """ 31 | Base recurrent attention-based decoder class. 32 | 33 | .. mermaid:: 34 | graph BT 35 | A[Input] 36 | subgraph RNN 37 | C[Pos 1] 38 | D[Pos 2] 39 | E[Pos N] 40 | end 41 | G[Decoder State] 42 | H[Decoder State] 43 | I[Outputs] 44 | F[Memory_Bank] 45 | A--emb-->C 46 | A--emb-->D 47 | A--emb-->E 48 | H-->C 49 | C-- attn --- F 50 | D-- attn --- F 51 | E-- attn --- F 52 | C-->I 53 | D-->I 54 | E-->I 55 | E-->G 56 | F---I 57 | 58 | Args: 59 | rnn_type (:obj:`str`): 60 | style of recurrent unit to use, one of [LSTM, GRU] 61 | bidirectional (bool) : use with a bidirectional encoder 62 | num_layers (int) : number of stacked layers 63 | hidden_size (int) : hidden size of each layer 64 | attn_type (str) : see :obj:`nqa.modules.GlobalAttention` 65 | dropout (float) : dropout value for :obj:`nn.Dropout` 66 | """ 67 | 68 | def __init__(self, 69 | rnn_type, 70 | input_size, 71 | bidirectional_encoder, 72 | num_layers, 73 | hidden_size, 74 | attn_type=None, 75 | coverage_attn=False, 76 | copy_attn=False, 77 | reuse_copy_attn=False, 78 | dropout=0.0): 79 | 80 | super(RNNDecoderBase, self).__init__() 81 | 82 | # Basic attributes. 83 | self.decoder_type = 'rnn' 84 | self.bidirectional_encoder = bidirectional_encoder 85 | self.num_layers = num_layers 86 | self.hidden_size = hidden_size 87 | self.dropout = nn.Dropout(dropout) 88 | 89 | # Build the RNN. 90 | kwargs = {'input_size': input_size, 91 | 'hidden_size': hidden_size, 92 | 'num_layers': num_layers, 93 | 'dropout': dropout, 94 | 'batch_first': True} 95 | self.rnn = getattr(nn, rnn_type)(**kwargs) 96 | 97 | # Set up the standard attention. 
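        # If an attn_type is given, a GlobalAttention module scores decoder states
        # against the encoder memory bank (optionally accumulating a coverage vector).
        # Copy attention either reuses that same attention distribution
        # (reuse_copy_attn) or builds its own separate GlobalAttention layer below.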
98 | self._coverage = coverage_attn 99 | self.attn = None 100 | if attn_type: 101 | self.attn = GlobalAttention( 102 | hidden_size, coverage=coverage_attn, 103 | attn_type=attn_type 104 | ) 105 | else: 106 | assert not self._coverage 107 | if copy_attn and reuse_copy_attn: 108 | raise RuntimeError('Attn is turned off, so reuse_copy_attn flag must be false') 109 | 110 | # Set up a separated copy attention layer, if needed. 111 | self._copy = False 112 | self._reuse_copy_attn = reuse_copy_attn 113 | if copy_attn and not reuse_copy_attn: 114 | self.copy_attn = GlobalAttention( 115 | hidden_size, attn_type=attn_type 116 | ) 117 | if copy_attn: 118 | self._copy = True 119 | 120 | def forward(self, tgt, memory_bank, state, memory_lengths=None): 121 | """ 122 | Args: 123 | tgt (`LongTensor`): sequences of padded tokens 124 | `[batch x tgt_len x nfeats]`. 125 | memory_bank (`FloatTensor`): vectors from the encoder 126 | `[batch x src_len x hidden]`. 127 | state (:obj:`onmt.models.DecoderState`): 128 | decoder state object to initialize the decoder 129 | memory_lengths (`LongTensor`): the padded source lengths 130 | `[batch]`. 131 | Returns: 132 | (`FloatTensor`,:obj:`onmt.Models.DecoderState`,`FloatTensor`): 133 | * decoder_outputs: output from the decoder (after attn) 134 | `[batch x tgt_len x hidden]`. 135 | * decoder_state: final hidden state from the decoder 136 | * attns: distribution over src at each tgt 137 | `[batch x tgt_len x src_len]`. 138 | """ 139 | # Check 140 | assert isinstance(state, RNNDecoderState) 141 | # tgt.size() returns tgt length and batch 142 | tgt_batch, _, _ = tgt.size() 143 | if self.attn is not None: 144 | memory_batch, _, _ = memory_bank.size() 145 | aeq(tgt_batch, memory_batch) 146 | # END 147 | 148 | # Run the forward pass of the RNN. 149 | decoder_final, decoder_outputs, attns = self._run_forward_pass( 150 | tgt, memory_bank, state, memory_lengths=memory_lengths) 151 | 152 | coverage = None 153 | if "coverage" in attns: 154 | coverage = attns["coverage"] 155 | # Update the state with the result. 156 | state.update_state(decoder_final, coverage) 157 | 158 | return decoder_outputs, state, attns 159 | 160 | def init_decoder_state(self, encoder_final): 161 | """ Init decoder state with last state of the encoder """ 162 | 163 | def _fix_enc_hidden(hidden): 164 | # The encoder hidden is (layers*directions) x batch x dim. 165 | # We need to convert it to layers x batch x (directions*dim). 166 | if self.bidirectional_encoder: 167 | hidden = torch.cat([hidden[0:hidden.size(0):2], 168 | hidden[1:hidden.size(0):2]], 2) 169 | return hidden 170 | 171 | if isinstance(encoder_final, tuple): # LSTM 172 | return RNNDecoderState(self.hidden_size, 173 | tuple([_fix_enc_hidden(enc_hid) 174 | for enc_hid in encoder_final])) 175 | else: # GRU 176 | return RNNDecoderState(self.hidden_size, 177 | _fix_enc_hidden(encoder_final)) 178 | -------------------------------------------------------------------------------- /neuroir/decoders/rnn_decoder.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/decoders/decoder.py 2 | import torch 3 | import torch.nn as nn 4 | 5 | from neuroir.decoders.decoder import RNNDecoderBase 6 | from neuroir.utils.misc import aeq 7 | 8 | 9 | class RNNDecoder(RNNDecoderBase): 10 | """ 11 | Standard fully batched RNN decoder with attention. 12 | Faster implementation, uses CuDNN for implementation. 13 | See :obj:`RNNDecoderBase` for options. 
14 | Based around the approach from 15 | "Neural Machine Translation By Jointly Learning To Align and Translate" 16 | :cite:`Bahdanau2015` 17 | """ 18 | 19 | def _run_forward_pass(self, tgt, memory_bank, state, memory_lengths=None): 20 | """ 21 | Private helper for running the specific RNN forward pass. 22 | Must be overriden by all subclasses. 23 | Args: 24 | tgt (LongTensor): a sequence of input tokens tensors 25 | [batch x len x nfeats]. 26 | memory_bank (FloatTensor): output(tensor sequence) from the encoder 27 | RNN of size (batch x src_len x hidden_size). 28 | state (FloatTensor): hidden state from the encoder RNN for 29 | initializing the decoder. 30 | memory_lengths (LongTensor): the source memory_bank lengths. 31 | Returns: 32 | decoder_final (Tensor): final hidden state from the decoder. 33 | decoder_outputs (Tensor): output from the decoder (after attn) 34 | `[batch x tgt_len x hidden]`. 35 | attns (Tensor): distribution over src at each tgt 36 | `[batch x tgt_len x src_len]`. 37 | """ 38 | # Initialize local and return variables. 39 | attns = {} 40 | 41 | emb = tgt 42 | assert emb.dim() == 3 43 | 44 | coverage = state.coverage 45 | 46 | if isinstance(self.rnn, nn.GRU): 47 | rnn_output, decoder_final = self.rnn(emb, state.hidden[0]) 48 | else: 49 | rnn_output, decoder_final = self.rnn(emb, state.hidden) 50 | 51 | # Check 52 | tgt_batch, tgt_len, _ = tgt.size() 53 | output_batch, output_len, _ = rnn_output.size() 54 | aeq(tgt_len, output_len) 55 | aeq(tgt_batch, output_batch) 56 | # END 57 | 58 | # Calculate the attention. 59 | if self.attn is not None: 60 | decoder_outputs, p_attn, coverage_v = self.attn( 61 | rnn_output.contiguous(), 62 | memory_bank, 63 | memory_lengths=memory_lengths, 64 | coverage=coverage 65 | ) 66 | attns["std"] = p_attn 67 | else: 68 | decoder_outputs = rnn_output.contiguous() 69 | 70 | # Update the coverage attention. 71 | if self._coverage: 72 | if coverage_v is None: 73 | coverage = coverage + p_attn \ 74 | if coverage is not None else p_attn 75 | else: 76 | coverage = coverage + coverage_v \ 77 | if coverage is not None else coverage_v 78 | attns["coverage"] = coverage 79 | 80 | decoder_outputs = self.dropout(decoder_outputs) 81 | # Run the forward pass of the copy attention layer. 82 | if self._copy and not self._reuse_copy_attn: 83 | _, copy_attn, _ = self.copy_attn(decoder_outputs, 84 | memory_bank, 85 | memory_lengths=memory_lengths) 86 | attns["copy"] = copy_attn 87 | elif self._copy: 88 | attns["copy"] = attns["std"] 89 | 90 | return decoder_final, decoder_outputs, attns 91 | -------------------------------------------------------------------------------- /neuroir/decoders/state.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/decoders/decoder.py 2 | 3 | 4 | class DecoderState(object): 5 | """Interface for grouping together the current state of a recurrent 6 | decoder. In the simplest case just represents the hidden state of 7 | the model. But can also be used for implementing various forms of 8 | input_feeding and non-recurrent models. 9 | Modules need to implement this to utilize beam search decoding. 
10 | """ 11 | 12 | def detach(self): 13 | """ Need to document this """ 14 | self.hidden = tuple([_.detach() for _ in self.hidden]) 15 | 16 | def beam_update(self, idx, positions, beam_size): 17 | """ Need to document this """ 18 | for e in self._all: 19 | sizes = e.size() 20 | br = sizes[1] 21 | if len(sizes) == 3: 22 | sent_states = e.view(sizes[0], beam_size, br // beam_size, 23 | sizes[2])[:, :, idx] 24 | else: 25 | sent_states = e.view(sizes[0], beam_size, 26 | br // beam_size, 27 | sizes[2], 28 | sizes[3])[:, :, idx] 29 | 30 | sent_states.data.copy_( 31 | sent_states.data.index_select(1, positions)) 32 | 33 | def map_batch_fn(self, fn): 34 | raise NotImplementedError() 35 | 36 | 37 | class RNNDecoderState(DecoderState): 38 | """ Base class for RNN decoder state """ 39 | 40 | def __init__(self, hidden_size, rnnstate): 41 | """ 42 | Args: 43 | hidden_size (int): the size of hidden layer of the decoder. 44 | rnnstate: final hidden state from the encoder. 45 | transformed to shape: layers x batch x (directions*dim). 46 | """ 47 | if not isinstance(rnnstate, tuple): 48 | self.hidden = (rnnstate,) 49 | else: 50 | self.hidden = rnnstate 51 | self.coverage = None 52 | 53 | @property 54 | def _all(self): 55 | return self.hidden 56 | 57 | def update_state(self, rnnstate, coverage): 58 | """ Update decoder state """ 59 | if not isinstance(rnnstate, tuple): 60 | self.hidden = (rnnstate,) 61 | else: 62 | self.hidden = rnnstate 63 | self.coverage = coverage 64 | 65 | def repeat_beam_size_times(self, beam_size): 66 | """ Repeat beam_size times along batch dimension. """ 67 | vars = [e.data.repeat(1, beam_size, 1) 68 | for e in self._all] 69 | self.hidden = tuple(vars) 70 | 71 | def map_batch_fn(self, fn): 72 | self.hidden = tuple(map(lambda x: fn(x, 1), self.hidden)) 73 | -------------------------------------------------------------------------------- /neuroir/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .encoder import * 4 | from .rnn_encoder import * 5 | -------------------------------------------------------------------------------- /neuroir/encoders/encoder.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/encoders/encoder.py 2 | """"Base class for encoders and generic multi encoders.""" 3 | 4 | from __future__ import division 5 | 6 | import torch.nn as nn 7 | from neuroir.utils.misc import aeq 8 | 9 | 10 | # from absqa.nqa.utils import aeq 11 | 12 | 13 | class EncoderBase(nn.Module): 14 | """ 15 | Base encoder class. Specifies the interface used by different encoder types 16 | and required by :obj:`onmt.Models.NMTModel`. 17 | .. 
mermaid:: 18 | graph BT 19 | A[Input] 20 | subgraph RNN 21 | C[Pos 1] 22 | D[Pos 2] 23 | E[Pos N] 24 | end 25 | F[Memory_Bank] 26 | G[Final] 27 | A-->C 28 | A-->D 29 | A-->E 30 | C-->F 31 | D-->F 32 | E-->F 33 | E-->G 34 | """ 35 | 36 | def _check_args(self, 37 | src, 38 | lengths=None, 39 | hidden=None): 40 | n_batch, _, _ = src.size() 41 | if lengths is not None: 42 | n_batch_, = lengths.size() 43 | aeq(n_batch, n_batch_) 44 | 45 | def forward(self, src, lengths=None): 46 | """ 47 | Args: 48 | src (:obj:`LongTensor`): 49 | padded sequences of sparse indices `[src_len x batch x nfeat]` 50 | lengths (:obj:`LongTensor`): length of each sequence `[batch]` 51 | Returns: 52 | (tuple of :obj:`FloatTensor`, :obj:`FloatTensor`): 53 | * final encoder state, used to initialize decoder 54 | * memory bank for attention, `[src_len x batch x hidden]` 55 | """ 56 | raise NotImplementedError 57 | -------------------------------------------------------------------------------- /neuroir/encoders/rnn_encoder.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/encoders/rnn_encoder.py 2 | """Define RNN-based encoders.""" 3 | from __future__ import division 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from neuroir.encoders.encoder import EncoderBase 10 | from torch.nn.utils.rnn import pack_padded_sequence as pack 11 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 12 | 13 | 14 | class RNNEncoder(EncoderBase): 15 | """ A generic recurrent neural network encoder. 16 | Args: 17 | rnn_type (:obj:`str`): 18 | style of recurrent unit to use, one of [RNN, LSTM, GRU, SRU] 19 | bidirectional (bool) : use a bidirectional RNN 20 | num_layers (int) : number of stacked layers 21 | hidden_size (int) : hidden size of each layer 22 | dropout (float) : dropout value for :obj:`nn.Dropout` 23 | """ 24 | 25 | def __init__(self, 26 | rnn_type, 27 | input_size, 28 | bidirectional, 29 | num_layers, 30 | hidden_size, 31 | dropout=0.0, 32 | use_bridge=False, 33 | use_last=True): 34 | super(RNNEncoder, self).__init__() 35 | 36 | num_directions = 2 if bidirectional else 1 37 | assert hidden_size % num_directions == 0 38 | hidden_size = hidden_size // num_directions 39 | 40 | # Saves preferences for layer 41 | self.nlayers = num_layers 42 | self.use_last = use_last 43 | 44 | self.rnns = nn.ModuleList() 45 | for i in range(self.nlayers): 46 | input_size = input_size if i == 0 else hidden_size * num_directions 47 | kwargs = {'input_size': input_size, 48 | 'hidden_size': hidden_size, 49 | 'num_layers': 1, 50 | 'bidirectional': bidirectional, 51 | 'batch_first': True} 52 | rnn = getattr(nn, rnn_type)(**kwargs) 53 | self.rnns.append(rnn) 54 | 55 | self.dropout = nn.Dropout(dropout) 56 | # Initialize the bridge layer 57 | self.use_bridge = use_bridge 58 | if self.use_bridge: 59 | nl = 1 if self.use_last else num_layers 60 | self._initialize_bridge(rnn_type, hidden_size, nl) 61 | 62 | def forward(self, 63 | emb, 64 | lengths=None, 65 | init_states=None): 66 | "See :obj:`EncoderBase.forward()`" 67 | self._check_args(emb, lengths) 68 | 69 | packed_emb = emb 70 | if lengths is not None: 71 | # Lengths data is wrapped inside a Tensor. 
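            # pack_padded_sequence, in the PyTorch 0.4.x versions this repo targets,
            # expects the batch sorted by decreasing sequence length; the batch is
            # therefore sorted and packed here, and `_indices` is kept so the
            # original row order can be restored after unpacking.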
72 | lengths, indices = torch.sort(lengths, 0, True) # Sort by length (keep idx) 73 | packed_emb = pack(packed_emb[indices], lengths.tolist(), batch_first=True) 74 | _, _indices = torch.sort(indices, 0) # Un-sort by length 75 | 76 | istates = [] 77 | if init_states: 78 | if isinstance(init_states, tuple): 79 | hidden_states, cell_states = init_states 80 | hidden_states = hidden_states.split(self.nlayers, dim=0) 81 | cell_states = cell_states.split(self.nlayers, dim=0) 82 | else: 83 | hidden_states = init_states 84 | hidden_states = hidden_states.split(self.nlayers, dim=0) 85 | 86 | for i in range(self.nlayers): 87 | if isinstance(init_states, tuple): 88 | istates.append((hidden_states[i], cell_states[i])) 89 | else: 90 | istates.append(hidden_states[i]) 91 | 92 | memory_bank, encoder_final = [], {'h_n': [], 'c_n': []} 93 | for i in range(self.nlayers): 94 | if i != 0: 95 | packed_emb = self.dropout(packed_emb) 96 | if lengths is not None: 97 | packed_emb = pack(packed_emb, lengths.tolist(), batch_first=True) 98 | 99 | if init_states: 100 | packed_emb, states = self.rnns[i](packed_emb, istates[i]) 101 | else: 102 | packed_emb, states = self.rnns[i](packed_emb) 103 | 104 | if isinstance(states, tuple): 105 | h_n, c_n = states 106 | encoder_final['c_n'].append(c_n) 107 | else: 108 | h_n = states 109 | encoder_final['h_n'].append(h_n) 110 | 111 | packed_emb = unpack(packed_emb, batch_first=True)[0] if lengths is not None else packed_emb 112 | if not self.use_last or i == self.nlayers - 1: 113 | memory_bank += [packed_emb[_indices]] if lengths is not None else [packed_emb] 114 | 115 | assert len(encoder_final['h_n']) != 0 116 | if self.use_last: 117 | memory_bank = memory_bank[-1] 118 | if len(encoder_final['c_n']) == 0: 119 | encoder_final = encoder_final['h_n'][-1] 120 | else: 121 | encoder_final = encoder_final['h_n'][-1], encoder_final['c_n'][-1] 122 | else: 123 | memory_bank = torch.cat(memory_bank, dim=2) 124 | if len(encoder_final['c_n']) == 0: 125 | encoder_final = torch.cat(encoder_final['h_n'], dim=0) 126 | else: 127 | encoder_final = torch.cat(encoder_final['h_n'], dim=0), \ 128 | torch.cat(encoder_final['c_n'], dim=0) 129 | 130 | if self.use_bridge: 131 | encoder_final = self._bridge(encoder_final) 132 | 133 | # TODO: Temporary hack is adopted to compatible with DataParallel 134 | # reference: https://github.com/pytorch/pytorch/issues/1591 135 | if memory_bank.size(1) < emb.size(1): 136 | dummy_tensor = torch.zeros(memory_bank.size(0), 137 | emb.size(1) - memory_bank.size(1), 138 | memory_bank.size(2)).type_as(memory_bank) 139 | memory_bank = torch.cat([memory_bank, dummy_tensor], 1) 140 | 141 | return encoder_final, memory_bank 142 | 143 | def _initialize_bridge(self, 144 | rnn_type, 145 | hidden_size, 146 | num_layers): 147 | 148 | # LSTM has hidden and cell state, other only one 149 | number_of_states = 2 if rnn_type == "LSTM" else 1 150 | # Total number of states 151 | self.total_hidden_dim = hidden_size * num_layers 152 | 153 | # Build a linear layer for each 154 | self.bridge = nn.ModuleList([nn.Linear(self.total_hidden_dim, 155 | self.total_hidden_dim, 156 | bias=True) 157 | for _ in range(number_of_states)]) 158 | 159 | def _bridge(self, hidden): 160 | """ 161 | Forward hidden state through bridge 162 | """ 163 | 164 | def bottle_hidden(linear, states): 165 | """ 166 | Transform from 3D to 2D, apply linear and return initial size 167 | """ 168 | size = states.size() 169 | result = linear(states.view(-1, self.total_hidden_dim)) 170 | return F.relu(result).view(size) 171 | 172 
| if isinstance(hidden, tuple): # LSTM 173 | outs = tuple([bottle_hidden(layer, hidden[ix]) 174 | for ix, layer in enumerate(self.bridge)]) 175 | else: 176 | outs = bottle_hidden(self.bridge[0], hidden) 177 | 178 | return outs 179 | -------------------------------------------------------------------------------- /neuroir/eval/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .ltorank import * 4 | from .squad_eval import * 5 | from . import bleu 6 | from . import rouge 7 | -------------------------------------------------------------------------------- /neuroir/eval/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .bleu import * 4 | from .bleu_scorer import * 5 | -------------------------------------------------------------------------------- /neuroir/eval/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import numpy 12 | from neuroir.eval.bleu.bleu_scorer import BleuScorer 13 | 14 | 15 | class Bleu: 16 | def __init__(self, n=4): 17 | # default compute Blue score up to 4 18 | self._n = n 19 | self._hypo_for_image = {} 20 | self.ref_for_image = {} 21 | 22 | def compute_score(self, gts, res, verbose): 23 | assert (sorted(gts.keys()) == sorted(res.keys())) 24 | imgIds = list(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert (type(hypo) is list) 33 | assert (len(hypo) == 1) 34 | assert (type(ref) is list) 35 | assert (len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | # score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=verbose) 41 | # score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | ind_scores = dict() 44 | scores = numpy.asarray(scores).transpose((1, 0)).tolist() 45 | for id, s in zip(imgIds, scores): 46 | ind_scores[id] = s 47 | # return (bleu, bleu_info) 48 | return score, ind_scores 49 | 50 | def method(self): 51 | return "Bleu" 52 | -------------------------------------------------------------------------------- /neuroir/eval/bleu/bleu_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | import math 21 | from collections import defaultdict 22 | 23 | 24 | def precook(s, n=4, out=False): 25 | """Takes a string as input and returns an object that can be given to 26 | either cook_refs or cook_test. 
This is optional: cook_refs and cook_test 27 | can take string arguments as well.""" 28 | words = s.split() 29 | counts = defaultdict(int) 30 | for k in range(1, n + 1): 31 | for i in range(len(words) - k + 1): 32 | ngram = tuple(words[i:i + k]) 33 | counts[ngram] += 1 34 | return (len(words), counts) 35 | 36 | 37 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 38 | '''Takes a list of reference sentences for a single segment 39 | and returns an object that encapsulates everything that BLEU 40 | needs to know about them.''' 41 | 42 | reflen = [] 43 | maxcounts = {} 44 | for ref in refs: 45 | rl, counts = precook(ref, n) 46 | reflen.append(rl) 47 | for (ngram, count) in counts.items(): 48 | maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) 49 | 50 | # Calculate effective reference sentence length. 51 | if eff == "shortest": 52 | reflen = min(reflen) 53 | elif eff == "average": 54 | reflen = float(sum(reflen)) / len(reflen) 55 | 56 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 57 | 58 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 59 | 60 | return (reflen, maxcounts) 61 | 62 | 63 | def cook_test(test, xxx_todo_changeme, eff=None, n=4): 64 | '''Takes a test sentence and returns an object that 65 | encapsulates everything that BLEU needs to know about it.''' 66 | (reflen, refmaxcounts) = xxx_todo_changeme 67 | testlen, counts = precook(test, n, True) 68 | 69 | result = {} 70 | 71 | # Calculate effective reference sentence length. 72 | 73 | if eff == "closest": 74 | result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1] 75 | else: ## i.e., "average" or "shortest" or None 76 | result["reflen"] = reflen 77 | 78 | result["testlen"] = testlen 79 | 80 | result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)] 81 | 82 | result['correct'] = [0] * n 83 | for (ngram, count) in counts.items(): 84 | result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) 85 | 86 | return result 87 | 88 | 89 | class BleuScorer(object): 90 | """Bleu scorer. 91 | """ 92 | 93 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 94 | 95 | # special_reflen is used in oracle (proportional effective ref len for a node). 
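    # crefs stores the cooked reference statistics (reference lengths and max
    # n-gram counts), ctest stores the matching cooked hypothesis statistics;
    # _score caches the computed BLEU and is reset to None whenever a new
    # (test, refs) pair is appended.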
96 | 97 | def copy(self): 98 | ''' copy the refs.''' 99 | new = BleuScorer(n=self.n) 100 | new.ctest = copy.copy(self.ctest) 101 | new.crefs = copy.copy(self.crefs) 102 | new._score = None 103 | return new 104 | 105 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 106 | ''' singular instance ''' 107 | 108 | self.n = n 109 | self.crefs = [] 110 | self.ctest = [] 111 | self.cook_append(test, refs) 112 | self.special_reflen = special_reflen 113 | 114 | def cook_append(self, test, refs): 115 | '''called by constructor and __iadd__ to avoid creating new instances.''' 116 | 117 | if refs is not None: 118 | self.crefs.append(cook_refs(refs)) 119 | if test is not None: 120 | cooked_test = cook_test(test, self.crefs[-1]) 121 | self.ctest.append(cooked_test) ## N.B.: -1 122 | else: 123 | self.ctest.append(None) # lens of crefs and ctest have to match 124 | 125 | self._score = None ## need to recompute 126 | 127 | def ratio(self, option=None): 128 | self.compute_score(option=option) 129 | return self._ratio 130 | 131 | def score_ratio(self, option=None): 132 | '''return (bleu, len_ratio) pair''' 133 | return (self.fscore(option=option), self.ratio(option=option)) 134 | 135 | def score_ratio_str(self, option=None): 136 | return "%.4f (%.2f)" % self.score_ratio(option) 137 | 138 | def reflen(self, option=None): 139 | self.compute_score(option=option) 140 | return self._reflen 141 | 142 | def testlen(self, option=None): 143 | self.compute_score(option=option) 144 | return self._testlen 145 | 146 | def retest(self, new_test): 147 | if type(new_test) is str: 148 | new_test = [new_test] 149 | assert len(new_test) == len(self.crefs), new_test 150 | self.ctest = [] 151 | for t, rs in zip(new_test, self.crefs): 152 | self.ctest.append(cook_test(t, rs)) 153 | self._score = None 154 | 155 | return self 156 | 157 | def rescore(self, new_test): 158 | ''' replace test(s) with new test(s), and returns the new score.''' 159 | 160 | return self.retest(new_test).compute_score() 161 | 162 | def size(self): 163 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 164 | return len(self.crefs) 165 | 166 | def __iadd__(self, other): 167 | '''add an instance (e.g., from another sentence).''' 168 | 169 | if type(other) is tuple: 170 | ## avoid creating new BleuScorer instances 171 | self.cook_append(other[0], other[1]) 172 | else: 173 | assert self.compatible(other), "incompatible BLEUs." 
174 | self.ctest.extend(other.ctest) 175 | self.crefs.extend(other.crefs) 176 | self._score = None ## need to recompute 177 | 178 | return self 179 | 180 | def compatible(self, other): 181 | return isinstance(other, BleuScorer) and self.n == other.n 182 | 183 | def single_reflen(self, option="average"): 184 | return self._single_reflen(self.crefs[0][0], option) 185 | 186 | def _single_reflen(self, reflens, option=None, testlen=None): 187 | 188 | if option == "shortest": 189 | reflen = min(reflens) 190 | elif option == "average": 191 | reflen = float(sum(reflens)) / len(reflens) 192 | elif option == "closest": 193 | reflen = min((abs(l - testlen), l) for l in reflens)[1] 194 | else: 195 | assert False, "unsupported reflen option %s" % option 196 | 197 | return reflen 198 | 199 | def recompute_score(self, option=None, verbose=0): 200 | self._score = None 201 | return self.compute_score(option, verbose) 202 | 203 | def compute_score(self, option=None, verbose=0): 204 | n = self.n 205 | small = 1e-9 206 | tiny = 1e-15 ## so that if guess is 0 still return 0 207 | bleu_list = [[] for _ in range(n)] 208 | 209 | if self._score is not None: 210 | return self._score 211 | 212 | if option is None: 213 | option = "average" if len(self.crefs) == 1 else "closest" 214 | 215 | self._testlen = 0 216 | self._reflen = 0 217 | totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n} 218 | 219 | # for each sentence 220 | for comps in self.ctest: 221 | testlen = comps['testlen'] 222 | self._testlen += testlen 223 | 224 | if self.special_reflen is None: ## need computation 225 | reflen = self._single_reflen(comps['reflen'], option, testlen) 226 | else: 227 | reflen = self.special_reflen 228 | 229 | self._reflen += reflen 230 | 231 | for key in ['guess', 'correct']: 232 | for k in range(n): 233 | totalcomps[key][k] += comps[key][k] 234 | 235 | # append per image bleu score 236 | bleu = 1. 237 | for k in range(n): 238 | bleu *= (float(comps['correct'][k]) + tiny) \ 239 | / (float(comps['guess'][k]) + small) 240 | bleu_list[k].append(bleu ** (1. / (k + 1))) 241 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 242 | if ratio < 1: 243 | for k in range(n): 244 | bleu_list[k][-1] *= math.exp(1 - 1 / ratio) 245 | 246 | if verbose > 1: 247 | print(comps, reflen) 248 | 249 | totalcomps['reflen'] = self._reflen 250 | totalcomps['testlen'] = self._testlen 251 | 252 | bleus = [] 253 | bleu = 1. 254 | for k in range(n): 255 | bleu *= float(totalcomps['correct'][k] + tiny) \ 256 | / (totalcomps['guess'][k] + small) 257 | bleus.append(bleu ** (1. / (k + 1))) 258 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 259 | if ratio < 1: 260 | for k in range(n): 261 | bleus[k] *= math.exp(1 - 1 / ratio) 262 | 263 | if verbose > 0: 264 | print(totalcomps) 265 | print("ratio:", ratio) 266 | 267 | self._score = bleus 268 | return self._score, bleu_list 269 | -------------------------------------------------------------------------------- /neuroir/eval/ltorank.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | def MAP(predictions, target): 5 | """ 6 | Compute mean average precision. 
7 | :param predictions: 2d list [batch_size x num_candidate_paragraphs] 8 | :param target: 2d list [batch_size x num_candidate_paragraphs] 9 | :return: mean average precision [a float value] 10 | """ 11 | assert predictions.shape == target.shape 12 | assert predictions.ndim == target.ndim == 2 13 | 14 | nrow, ncolumn = target.shape[0], target.shape[1] 15 | 16 | map = 0 17 | for i in range(nrow): 18 | average_precision, num_rel = 0, 0 19 | for j in range(ncolumn): 20 | if target[i, predictions[i, j]] == 1: 21 | num_rel += 1 22 | average_precision += num_rel / (j + 1) 23 | average_precision = average_precision / num_rel 24 | map += average_precision 25 | 26 | return map / nrow 27 | 28 | 29 | def precision_at_k(predictions, target, k): 30 | """ 31 | Compute precision at k. 32 | :param predictions: 2d list [batch_size x num_candidate_paragraphs] 33 | :param target: 2d list [batch_size x num_candidate_paragraphs] 34 | :return: precision@K [a float value] 35 | """ 36 | assert predictions.shape == target.shape 37 | assert predictions.ndim == target.ndim == 2 38 | 39 | nrow, ncolumn = target.shape[0], target.shape[1] 40 | assert ncolumn >= k, 'Precision@K cannot be computed, invalid value of K.' 41 | 42 | p_at_k = 0 43 | for i in range(nrow): 44 | num_rel = numpy.count_nonzero(target[i, predictions[i, :k]]) 45 | p_at_k += num_rel / k 46 | 47 | return p_at_k / nrow 48 | 49 | 50 | def recall_at_k(predictions, target, k): 51 | """ 52 | Compute recall at k. 53 | :param predictions: 2d list [batch_size x num_candidate_paragraphs] 54 | :param target: 2d list [batch_size x num_candidate_paragraphs] 55 | :return: precision@K [a float value] 56 | """ 57 | assert predictions.shape == target.shape 58 | assert predictions.ndim == target.ndim == 2 59 | 60 | nrow, ncolumn = target.shape[0], target.shape[1] 61 | assert ncolumn >= k, 'Recall@K cannot be computed, invalid value of K.' 62 | 63 | r_at_k = 0 64 | for i in range(nrow): 65 | num_rel = numpy.count_nonzero(target[i, predictions[i, :k]]) 66 | total_rel = numpy.count_nonzero(target[i]) 67 | r_at_k += num_rel / total_rel 68 | 69 | return r_at_k / nrow 70 | 71 | 72 | def NDCG_at_k(predictions, target, k): 73 | """ 74 | Compute normalized discounted cumulative gain. 75 | :param predictions: 2d list [batch_size x num_candidate_paragraphs] 76 | :param target: 2d list [batch_size x num_candidate_paragraphs] 77 | :return: NDCG@k [a float value] 78 | """ 79 | assert predictions.shape == target.shape 80 | assert predictions.ndim == target.ndim == 2 81 | 82 | nrow, ncolumn = target.shape[0], target.shape[1] 83 | assert ncolumn >= k, 'NDCG@K cannot be computed, invalid value of K.' 84 | 85 | NDCG = 0 86 | for i in range(nrow): 87 | DCG_ref = 0 88 | num_rel_docs = numpy.count_nonzero(target[i]) 89 | for j in range(ncolumn): 90 | if j == k: 91 | break 92 | if target[i, predictions[i, j]] == 1: 93 | DCG_ref += 1 / numpy.log2(j + 2) 94 | DCG_gt = 0 95 | for j in range(num_rel_docs): 96 | if j == k: 97 | break 98 | DCG_gt += 1 / numpy.log2(j + 2) 99 | NDCG += DCG_ref / DCG_gt 100 | 101 | return NDCG / nrow 102 | 103 | 104 | def MRR(predictions, target): 105 | """ 106 | Compute mean reciprocal rank. 
107 | :param predictions: 2d list [batch_size x num_candidate_paragraphs] 108 | :param target: 2d list [batch_size x num_candidate_paragraphs] 109 | :return: mean reciprocal rank [a float value] 110 | """ 111 | assert predictions.shape == target.shape 112 | assert predictions.ndim == target.ndim == 2 113 | 114 | nrow, ncolumn = target.shape[0], target.shape[1] 115 | 116 | total_reciprocal_rank = 0 117 | for i in range(nrow): 118 | for j in range(ncolumn): 119 | if target[i, predictions[i, j]] == 1: 120 | total_reciprocal_rank += 1.0 / (j + 1) 121 | break 122 | 123 | return total_reciprocal_rank / nrow 124 | -------------------------------------------------------------------------------- /neuroir/eval/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .rouge import * 4 | -------------------------------------------------------------------------------- /neuroir/eval/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 20 | """ 21 | if len(string) < len(sub): 22 | sub, string = string, sub 23 | 24 | lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] 25 | 26 | for j in range(1, len(sub) + 1): 27 | for i in range(1, len(string) + 1): 28 | if string[i - 1] == sub[j - 1]: 29 | lengths[i][j] = lengths[i - 1][j - 1] + 1 30 | else: 31 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 32 | 33 | return lengths[len(string)][len(sub)] 34 | 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | ''' 40 | 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert (len(candidate) == 1) 53 | assert (len(refs) > 0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs / float(len(token_c))) 66 | rec.append(lcs / float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if prec_max != 0 and rec_max != 0: 72 | score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) 73 | else: 74 | score = 0.0 75 
| return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param gts: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param res: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert (sorted(gts.keys()) == sorted(res.keys())) 86 | imgIds = list(gts.keys()) 87 | 88 | score = dict() 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score[id] = self.calc_score(hypo, ref) 94 | 95 | # Sanity check. 96 | assert (type(hypo) is list) 97 | assert (len(hypo) == 1) 98 | assert (type(ref) is list) 99 | assert (len(ref) > 0) 100 | 101 | average_score = np.mean(np.asarray(list(score.values()))) 102 | return average_score, score 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /neuroir/eval/squad_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Modified evaluation script for v2.0 of the SQuAD dataset. 3 | # ------------------------------------------------------------------------------ 4 | 5 | import re 6 | import string 7 | from collections import Counter 8 | 9 | 10 | def normalize_answer(s): 11 | """Lower text and remove punctuation, articles and extra whitespace.""" 12 | 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | """Compute the geometric mean of precision and recall for answer tokens.""" 31 | if len(ground_truth) == 0: 32 | return 1.0 if len(prediction) == 0 else 0.0 33 | prediction_tokens = normalize_answer(prediction).split() 34 | ground_truth_tokens = normalize_answer(ground_truth).split() 35 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 36 | num_same = sum(common.values()) 37 | if num_same == 0: 38 | return 0 39 | precision = 1.0 * num_same / len(prediction_tokens) 40 | recall = 1.0 * num_same / len(ground_truth_tokens) 41 | f1 = (2 * precision * recall) / (precision + recall) 42 | return f1 43 | 44 | 45 | def exact_match_score(prediction, ground_truth): 46 | """Check if the prediction is a (soft) exact match with the ground truth.""" 47 | return normalize_answer(prediction) == normalize_answer(ground_truth) 48 | 49 | 50 | def regex_match_score(prediction, pattern): 51 | """Check if the prediction matches the given regular expression.""" 52 | try: 53 | compiled = re.compile( 54 | pattern, 55 | flags=re.IGNORECASE + re.UNICODE + re.MULTILINE 56 | ) 57 | except BaseException: 58 | print('Regular expression failed to compile: %s' % pattern) 59 | return False 60 | return compiled.match(prediction) is not None 61 | 62 | 63 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 64 | """Given a prediction and multiple valid answers, return the score of 
65 | the best prediction-answer_n pair given a metric function. 66 | """ 67 | scores_for_ground_truths = [] 68 | for ground_truth in ground_truths: 69 | score = metric_fn(prediction, ground_truth) 70 | scores_for_ground_truths.append(score) 71 | return max(scores_for_ground_truths) 72 | -------------------------------------------------------------------------------- /neuroir/hyparam.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | ESM = { 4 | 'arch': { 5 | }, 6 | 'data': { 7 | } 8 | } 9 | 10 | DSSM = { 11 | 'arch': { 12 | 'nhid': 300, 13 | 'nout': 128 14 | }, 15 | 'data': { 16 | 'use_char_ngram': 3, 17 | 'src_vocab_size': 30000, 18 | 'embedding_file': '' 19 | } 20 | } 21 | 22 | CDSSM = { 23 | 'arch': { 24 | 'nhid': 300, 25 | 'nout': 128 26 | }, 27 | 'data': { 28 | 'use_char_ngram': 3, 29 | 'src_vocab_size': 30000, 30 | 'embedding_file': '' 31 | } 32 | } 33 | 34 | DUET = { 35 | 'arch': { 36 | 'nfilters': 300, 37 | 'local_filter_size': 1, 38 | 'dist_filter_size': 3, 39 | 'pool_size': 5, 40 | }, 41 | 'data': { 42 | 'src_vocab_size': None, 43 | 'force_pad': True, 44 | 'fix_embeddings': True 45 | } 46 | } 47 | 48 | ARCI = { 49 | 'arch': { 50 | 'filters_1d': [256, 128], 51 | 'kernel_size_1d': [3, 3], 52 | 'maxpool_size_1d': [2, 2], 53 | }, 54 | 'data': { 55 | 'src_vocab_size': None, 56 | 'force_pad': True, 57 | 'fix_embeddings': True, 58 | } 59 | } 60 | 61 | ARCII = { 62 | 'arch': { 63 | 'filters_1d': 128, 64 | 'kernel_size_1d': 3, 65 | 'filters_2d': [256, 128], 66 | 'kernel_size_2d': [[3, 3], [3, 3]], 67 | 'maxpool_size_2d': [[2, 2], [2, 2]], 68 | }, 69 | 'data': { 70 | 'src_vocab_size': None, 71 | 'force_pad': True, 72 | 'fix_embeddings': True, 73 | 'max_doc_len': 100, 74 | 'max_query_len': 10 75 | } 76 | } 77 | 78 | DRMM = { 79 | 'arch': { 80 | 'nbins': 5 81 | }, 82 | 'data': { 83 | 'src_vocab_size': None, 84 | 'fix_embeddings': True 85 | } 86 | } 87 | 88 | MATCH_TENSOR = { 89 | 'arch': { 90 | 'rnn_type': 'LSTM', 91 | 'bidirection': True, 92 | 'nlayers': 1, 93 | 'dropout_rnn': 0.2, 94 | 'featsize': 40, 95 | 'nhid_query': 30, 96 | 'nhid_doc': 140, 97 | 'nchannels': 50, 98 | 'nfilters': 6, 99 | 'match_filter_size': 20 100 | }, 101 | 'data': { 102 | 'src_vocab_size': None, 103 | 'fix_embeddings': True 104 | } 105 | } 106 | 107 | SEQ2SEQ = { 108 | 'arch': { 109 | 'rnn_type': 'LSTM', 110 | 'bidirection': True, 111 | 'nlayers': 2, 112 | 'nhid': 512, 113 | 'dropout_rnn': 0.2, 114 | 'attn_type': 'general' 115 | }, 116 | 'data': { 117 | 'tgt_vocab_size': 30000, 118 | 'fix_embeddings': True 119 | } 120 | } 121 | 122 | HREDQS = { 123 | 'arch': { 124 | 'rnn_type': 'LSTM', 125 | 'bidirection': True, 126 | 'nlayers': 1, 127 | 'nhid': 512, 128 | 'dropout_rnn': 0.2, 129 | 'nhid_session': 1024 130 | }, 131 | 'data': { 132 | 'tgt_vocab_size': 30000, 133 | 'fix_embeddings': True 134 | } 135 | } 136 | 137 | ACG = { 138 | 'arch': { 139 | 'rnn_type': 'LSTM', 140 | 'bidirection': True, 141 | 'nlayers': 1, 142 | 'nhid': 512, 143 | 'dropout_rnn': 0.2, 144 | 'attn_type': 'general', 145 | 'copy_attn': True, 146 | 'reuse_copy_attn': True, 147 | 'force_copy': False 148 | }, 149 | 'data': { 150 | 'tgt_vocab_size': 10000, 151 | 'fix_embeddings': True 152 | } 153 | } 154 | 155 | MNSRF = { 156 | 'arch': { 157 | 'rnn_type': 'LSTM', 158 | 'bidirection': True, 159 | 'nlayers': 1, 160 | 'nhid_query': 512, 161 | 'nhid_document': 512, 162 | 'nhid_session': 1024, 163 | 'dropout_rnn': 0.2, 164 | 'regularize_coeff': 0.1, 165 | 'alpha': 0.5 166 | }, 
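    # Note (assumption, not stated in this file): 'alpha' appears to weight the
    # document-ranking loss against the query-suggestion loss in the multi-task
    # objective, and 'regularize_coeff' to scale an extra regularization term;
    # the exact usage lives in neuroir/multitask/mnsrf.py.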
167 | 'data': { 168 | 'tgt_vocab_size': 30000, 169 | 'fix_embeddings': True 170 | } 171 | } 172 | 173 | M_MATCH_TENSOR = { 174 | 'arch': { 175 | 'featsize': 40, 176 | 'rnn_type': 'LSTM', 177 | 'bidirection': True, 178 | 'nlayers': 1, 179 | 'nhid_query': 30, 180 | 'nhid_document': 140, 181 | 'nhid_session': 300, 182 | 'dropout_rnn': 0.2, 183 | 'nchannels': 50, 184 | 'nfilters': 6, 185 | 'match_filter_size': 20, 186 | 'regularize_coeff': 0.1, 187 | 'alpha': 0.5 188 | }, 189 | 'data': { 190 | 'max_doc_len': 100, 191 | 'max_query_len': 10, 192 | 'tgt_vocab_size': 30000, 193 | 'fix_embeddings': True 194 | } 195 | } 196 | 197 | CARS = { 198 | 'arch': { 199 | 'rnn_type': 'LSTM', 200 | 'bidirection': True, 201 | 'nlayers': 1, 202 | 'nhid_query': 256, 203 | 'nhid_document': 256, 204 | 'nhid_click': 512, 205 | 'nhid_session_query': 512, 206 | 'nhid_session_document': 512, 207 | 'nhid_decoder': 512, 208 | 'query_session_off': False, 209 | 'doc_session_off': False, 210 | 'dropout_rnn': 0.2, 211 | 'attn_type': 'general', 212 | 'mlp_nhid': 150, 213 | 'pool_type': 'attn', 214 | 'regularize_coeff': 0.1, 215 | 'alpha': 0.1, 216 | 'lambda1': 0.01, 217 | 'lambda2': 0.0001, 218 | 'turn_ranker_off': False, 219 | 'turn_recommender_off': False 220 | }, 221 | 'data': { 222 | 'tgt_vocab_size': 30000, 223 | 'fix_embeddings': True 224 | } 225 | } 226 | 227 | MODEL_ARCHITECTURE = { 228 | 'DSSM': DSSM, 229 | 'CDSSM': CDSSM, 230 | 'ESM': ESM, 231 | 'DUET': DUET, 232 | 'ARCI': ARCI, 233 | 'ARCII': ARCII, 234 | 'DRMM': DRMM, 235 | 'MATCH_TENSOR': MATCH_TENSOR, 236 | 'SEQ2SEQ': SEQ2SEQ, 237 | 'HREDQS': HREDQS, 238 | 'ACG': ACG, 239 | 'MNSRF': MNSRF, 240 | 'M_MATCH_TENSOR': M_MATCH_TENSOR, 241 | 'CARS': CARS 242 | } 243 | 244 | 245 | def get_model_specific_params(model_name, field): 246 | return MODEL_ARCHITECTURE[model_name.upper()][field] 247 | -------------------------------------------------------------------------------- /neuroir/inputters/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .constants import * 4 | from .vocabulary import * 5 | from . import ranker 6 | -------------------------------------------------------------------------------- /neuroir/inputters/constants.py: -------------------------------------------------------------------------------- 1 | PAD = 0 2 | UNK = 1 3 | BOS = 2 4 | EOS = 3 5 | 6 | PAD_WORD = '' 7 | UNK_WORD = '' 8 | BOS_WORD = '' 9 | EOS_WORD = '' 10 | -------------------------------------------------------------------------------- /neuroir/inputters/multitask/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .data import * 4 | from .utils import * 5 | from .vector import * 6 | -------------------------------------------------------------------------------- /neuroir/inputters/multitask/data.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/data.py 2 | import numpy as np 3 | 4 | from torch.utils.data import Dataset 5 | from torch.utils.data.sampler import Sampler 6 | from .vector import vectorize 7 | 8 | 9 | # ------------------------------------------------------------------------------ 10 | # PyTorch dataset class for MSMARCO data. 
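# A minimal usage sketch (illustrative only; `examples`, `model`, and the batch
# size of 32 are assumptions, and `batchify` comes from
# neuroir.inputters.multitask.vector):
#
#   from torch.utils.data import DataLoader
#   dataset = RankerRecommenderDataset(examples, model, shuffle=True)
#   sampler = SortedBatchSampler(dataset.lengths(), batch_size=32, shuffle=True)
#   loader = DataLoader(dataset, batch_size=32, sampler=sampler, collate_fn=batchify)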
11 | # ------------------------------------------------------------------------------ 12 | 13 | 14 | class RankerRecommenderDataset(Dataset): 15 | def __init__(self, examples, model, shuffle=False): 16 | self.model = model 17 | self.examples = examples 18 | self.shuffle = shuffle 19 | 20 | def __len__(self): 21 | return len(self.examples) 22 | 23 | def __getitem__(self, index): 24 | return vectorize(self.examples[index], 25 | self.model, 26 | shuffle=self.shuffle) 27 | 28 | def lengths(self): 29 | return [len(session) for session in self.examples] 30 | 31 | 32 | # ------------------------------------------------------------------------------ 33 | # PyTorch sampler returning batched of sorted lengths (by doc and query). 34 | # ------------------------------------------------------------------------------ 35 | 36 | class SortedBatchSampler(Sampler): 37 | def __init__(self, lengths, batch_size, shuffle=True): 38 | self.lengths = lengths 39 | self.batch_size = batch_size 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | clusters = dict() 44 | for i, num_queries in enumerate(self.lengths): 45 | if num_queries in clusters: 46 | clusters[num_queries].append(i) 47 | else: 48 | clusters[num_queries] = [i] 49 | 50 | batches = [] 51 | for key, indices in clusters.items(): 52 | if len(indices) % self.batch_size != 0: 53 | num_batch = len(indices) // self.batch_size 54 | indices = indices[:(num_batch * self.batch_size)] 55 | assert len(indices) % self.batch_size == 0 56 | batches.extend([indices[i:i + self.batch_size] 57 | for i in range(0, len(indices), self.batch_size)]) 58 | if self.shuffle: 59 | np.random.shuffle(batches) 60 | return iter([i for batch in batches for i in batch]) 61 | 62 | def __len__(self): 63 | return len(self.lengths) 64 | -------------------------------------------------------------------------------- /neuroir/inputters/multitask/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | from neuroir.objects import Query, Session, Document 7 | from neuroir.utils.misc import count_file_lines 8 | from neuroir.inputters.vocabulary import Vocabulary, UnicodeCharsVocabulary 9 | from neuroir.inputters.constants import BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Data loading 16 | # ------------------------------------------------------------------------------ 17 | 18 | 19 | def load_data(args, 20 | filename, 21 | max_examples=-1, 22 | dataset_name='msmarco'): 23 | """Load examples from preprocessed file. 
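    Each example is a search session; judging from the parsing below, it is
    expected to carry a 'session_id' and a 'query' list whose entries hold an
    'id', 'tokens', and 'candidates' (documents with an 'id', 'content', and a
    boolean 'label').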
One example per line, JSON encoded.""" 24 | 25 | # Load JSON lines 26 | with open(filename) as f: 27 | data = [json.loads(line) for line in 28 | tqdm(f, total=count_file_lines(filename))] 29 | 30 | examples = [] 31 | # based on model_type, we arrange the data 32 | model_type = args.model_type.upper() 33 | for example in tqdm(data): 34 | if dataset_name == 'msmarco': 35 | session_queries = [] 36 | for query in example['query']: 37 | qObj = Query(query['id']) 38 | qObj.text = ' '.join(query['tokens']) 39 | qtokens = query['tokens'] 40 | qtokens = [BOS_WORD] + qtokens + [EOS_WORD] 41 | 42 | if len(qtokens) == 0 or len(qtokens) > args.max_query_len: 43 | continue 44 | 45 | qObj.tokens = qtokens 46 | 47 | # --- record the candidate documents 48 | candidates = [] 49 | for candidate in query['candidates']: 50 | document = Document(candidate['id']) 51 | # TODO: what should we use for documents? title/content? 52 | content_tokens = candidate['content'].split() 53 | if len(content_tokens) == 0: 54 | continue 55 | 56 | content_tokens = content_tokens[:args.max_doc_len] 57 | document.tokens = content_tokens 58 | assert isinstance(candidate['label'], bool) 59 | document.label = 1 if candidate['label'] else 0 60 | candidates.append(document) 61 | 62 | if len(candidates) == args.num_candidates: 63 | qObj.documents = candidates 64 | session_queries.append(qObj) 65 | 66 | # sessions must contain at least 2 queries 67 | if len(session_queries) < 2: 68 | continue 69 | 70 | session = Session(example['session_id']) 71 | session.queries = session_queries 72 | examples.append(session) 73 | 74 | if max_examples != -1 and len(examples) > max_examples: 75 | break 76 | 77 | return examples 78 | 79 | 80 | # ------------------------------------------------------------------------------ 81 | # Dictionary building 82 | # ------------------------------------------------------------------------------ 83 | 84 | 85 | def index_embedding_words(embedding_file): 86 | """Put all the words in embedding_file into a set.""" 87 | words = set() 88 | with open(embedding_file) as f: 89 | for line in tqdm(f, total=count_file_lines(embedding_file)): 90 | w = Vocabulary.normalize(line.rstrip().split(' ')[0]) 91 | words.add(w) 92 | 93 | words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD]) 94 | return words 95 | 96 | 97 | def load_words(args, examples, dict_size=None, only_queries=False): 98 | """Iterate and index all the words in examples (documents + questions).""" 99 | 100 | def _insert(iterable): 101 | words = [] 102 | for w in iterable: 103 | w = Vocabulary.normalize(w) 104 | if valid_words and w not in valid_words: 105 | continue 106 | words.append(w) 107 | word_count.update(words) 108 | 109 | if args.restrict_vocab and args.embedding_file: 110 | logger.info('Restricting to words in %s' % args.embedding_file) 111 | valid_words = index_embedding_words(args.embedding_file) 112 | logger.info('Num words in set = %d' % len(valid_words)) 113 | else: 114 | valid_words = None 115 | 116 | word_count = Counter() 117 | for ex in tqdm(examples): 118 | for query in ex.queries: 119 | _insert(query.tokens) 120 | if not only_queries: 121 | for document in query.documents: 122 | _insert(document.tokens) 123 | 124 | # -2 to reserve spots for PAD and UNK token 125 | dict_size = dict_size - 2 if dict_size and dict_size > 2 else dict_size 126 | most_common = word_count.most_common(dict_size) 127 | words = set(word for word, _ in most_common) 128 | return words 129 | 130 | 131 | def build_word_dict(args, examples, dict_size=None, only_queries=False): 
132 | """Return a dictionary from question and document words in 133 | provided examples. 134 | """ 135 | word_dict = Vocabulary() 136 | for w in load_words(args, examples, dict_size, only_queries): 137 | word_dict.add(w) 138 | return word_dict 139 | 140 | 141 | def build_word_and_char_dict(args, examples, dict_size=None, only_queries=False): 142 | """Return a dictionary from question and document words in 143 | provided examples. 144 | """ 145 | words = load_words(args, examples, dict_size, only_queries) 146 | dictioanry = UnicodeCharsVocabulary(words, args.max_characters_per_token) 147 | return dictioanry 148 | -------------------------------------------------------------------------------- /neuroir/inputters/multitask/vector.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/vector.py 2 | 3 | import torch 4 | import random 5 | import copy 6 | 7 | 8 | def vectorize(session, model, shuffle=False): 9 | """Torchify a single example.""" 10 | 11 | src_dict = model.src_dict 12 | tgt_dict = model.tgt_dict 13 | num_candidates = model.args.num_candidates 14 | 15 | session_len = len(session) 16 | max_source_len = max([len(query) for query in session.queries]) 17 | max_target_len = max([len(query) for query in session.queries[1:]]) 18 | max_document_len = max([len(doc) for query in session.queries for doc in query.documents]) 19 | 20 | source_tokens = [query.tokens for query in session.queries] # 2d list 21 | target_tokens = [query.tokens for query in session.queries[1:]] # 2d list 22 | 23 | source_words = torch.LongTensor(session_len, max_source_len).zero_() 24 | source_lens = torch.LongTensor(session_len).zero_() 25 | target_words = torch.LongTensor(session_len - 1, max_target_len).zero_() 26 | target_lens = torch.LongTensor(session_len - 1).zero_() 27 | target_seq = torch.LongTensor(session_len - 1, max_target_len).zero_() # use only to compute loss 28 | 29 | document_words = torch.LongTensor(session_len, num_candidates, max_document_len).zero_() 30 | document_lens = torch.LongTensor(session_len, num_candidates).zero_() 31 | document_labels = torch.LongTensor(session_len, num_candidates).zero_() 32 | 33 | for i in range(session_len): 34 | query = session.queries[i] 35 | query_len = len(query.tokens) 36 | source_lens[i] = query_len 37 | source_words[i, :query_len].copy_(torch.LongTensor( 38 | query.vectorize(word_dict=src_dict))) 39 | 40 | # candidate document ranking 41 | candidates = copy.deepcopy(query.documents) 42 | assert len(candidates) == num_candidates 43 | if shuffle: 44 | random.shuffle(candidates) 45 | for cidx in range(num_candidates): 46 | cand = candidates[cidx] 47 | document_lens[i, cidx] = len(cand.tokens) 48 | document_labels[i, cidx] = cand.label 49 | document_words[i, cidx, :len(cand.tokens)].copy_(torch.LongTensor( 50 | cand.vectorize(word_dict=src_dict))) 51 | 52 | if i != session_len - 1: 53 | # next query suggestion 54 | query = session.queries[i + 1] 55 | query_len = len(query.tokens) 56 | target_lens[i] = query_len 57 | target_words[i, :query_len].copy_(torch.LongTensor( 58 | query.vectorize(word_dict=src_dict))) 59 | target_seq[i, :query_len].copy_(torch.LongTensor( 60 | query.vectorize(word_dict=tgt_dict))) # diff is which dict is used 61 | 62 | return { 63 | 'id': session.id, 64 | 'source_tokens': source_tokens, 65 | 'source_words': source_words, 66 | 'source_lens': source_lens, 67 | 'target_tokens': target_tokens, 68 | 'target_words': target_words, 69 
| 'target_lens': target_lens, 70 | 'target_seq': target_seq, 71 | 'max_source_len': max_source_len, 72 | 'max_target_len': max_target_len, 73 | 'session_len': session_len, 74 | 'num_candidates': num_candidates, 75 | 'document_words': document_words, # 3d tensor 76 | 'document_lens': document_lens, # 2d tensor 77 | 'document_labels': document_labels, # 2d tensor 78 | 'max_document_len': max_document_len 79 | } 80 | 81 | 82 | def batchify(batch): 83 | """Gather a batch of individual examples into one batch.""" 84 | 85 | # batch is a list of vectorized examples 86 | batch_size = len(batch) 87 | max_source_len = max([b['max_source_len'] for b in batch]) 88 | max_target_len = max([b['max_target_len'] for b in batch]) 89 | max_document_len = max([b['max_document_len'] for b in batch]) 90 | session_len = batch[0]['session_len'] 91 | num_candidates = batch[0]['num_candidates'] 92 | 93 | # all the sessions must have the same length 94 | assert len(set([b['session_len'] for b in batch])) == 1 95 | 96 | ids = [ex['id'] for ex in batch] 97 | 98 | # --------- Prepare query tensors --------- 99 | source_lens = torch.LongTensor(batch_size, 100 | session_len).zero_() 101 | source_words = torch.LongTensor(batch_size, 102 | session_len, 103 | max_source_len).zero_() 104 | document_lens = torch.LongTensor(batch_size, 105 | session_len, 106 | num_candidates).zero_() 107 | document_words = torch.LongTensor(batch_size, 108 | session_len, 109 | num_candidates, 110 | max_document_len).zero_() 111 | document_labels = torch.FloatTensor(batch_size, 112 | session_len, 113 | num_candidates).zero_() 114 | target_lens = torch.LongTensor(batch_size, 115 | session_len - 1).zero_() 116 | target_words = torch.LongTensor(batch_size, 117 | session_len - 1, 118 | max_target_len).zero_() 119 | target_seq = torch.LongTensor(batch_size, 120 | session_len - 1, 121 | max_target_len).zero_() 122 | 123 | for bidx, session in enumerate(batch): 124 | source_lens[bidx] = session['source_lens'] 125 | source_words[bidx, :, :session['max_source_len']].copy_(session['source_words']) 126 | 127 | document_lens[bidx] = session['document_lens'] 128 | document_labels[bidx] = session['document_labels'] 129 | document_words[bidx, :, :, :session['max_document_len']].copy_(session['document_words']) 130 | 131 | target_lens[bidx] = session['target_lens'] 132 | target_words[bidx, :, :session['max_target_len']].copy_(session['target_words']) 133 | target_seq[bidx, :, :session['max_target_len']].copy_(session['target_seq']) 134 | 135 | return { 136 | 'batch_size': batch_size, 137 | 'ids': ids, 138 | 'source_tokens': [item['source_tokens'] for item in batch], 139 | 'source_words': source_words, 140 | 'source_lens': source_lens, 141 | 'target_tokens': [item['target_tokens'] for item in batch], 142 | 'target_words': target_words, 143 | 'target_lens': target_lens, 144 | 'target_seq': target_seq, 145 | 'session_len': session_len, 146 | 'document_words': document_words, 147 | 'document_lens': document_lens, 148 | 'document_labels': document_labels 149 | } 150 | -------------------------------------------------------------------------------- /neuroir/inputters/ranker/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .data import * 4 | from .utils import * 5 | from .vector import * 6 | -------------------------------------------------------------------------------- /neuroir/inputters/ranker/data.py: -------------------------------------------------------------------------------- 1 
| # src: https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/data.py 2 | import numpy as np 3 | 4 | from torch.utils.data import Dataset 5 | from torch.utils.data.sampler import Sampler 6 | from .vector import vectorize 7 | 8 | 9 | # ------------------------------------------------------------------------------ 10 | # PyTorch dataset class for MSMARCO data. 11 | # ------------------------------------------------------------------------------ 12 | 13 | 14 | class RankerDataset(Dataset): 15 | def __init__(self, examples, model, shuffle=False): 16 | self.model = model 17 | self.examples = examples 18 | self.shuffle = shuffle 19 | 20 | def __len__(self): 21 | return len(self.examples) 22 | 23 | def __getitem__(self, index): 24 | return vectorize(self.examples[index], 25 | self.model, 26 | shuffle=self.shuffle) 27 | 28 | def lengths(self): 29 | return [(max([len(doc.tokens) for doc in ex.documents]), 30 | len(ex.tokens)) for ex in self.examples] 31 | 32 | 33 | # ------------------------------------------------------------------------------ 34 | # PyTorch sampler returning batched of sorted lengths (by doc and query). 35 | # ------------------------------------------------------------------------------ 36 | 37 | class SortedBatchSampler(Sampler): 38 | def __init__(self, lengths, batch_size, shuffle=True): 39 | self.lengths = lengths 40 | self.batch_size = batch_size 41 | self.shuffle = shuffle 42 | 43 | def __iter__(self): 44 | lengths = np.array( 45 | [(-l[0], -l[1], np.random.random()) for l in self.lengths], 46 | dtype=[('l1', np.int_), ('l2', np.int_), ('rand', np.float_)] 47 | ) 48 | indices = np.argsort(lengths, order=('l1', 'l2', 'rand')) 49 | batches = [indices[i:i + self.batch_size] 50 | for i in range(0, len(indices), self.batch_size)] 51 | if self.shuffle: 52 | np.random.shuffle(batches) 53 | return iter([i for batch in batches for i in batch]) 54 | 55 | def __len__(self): 56 | return len(self.lengths) 57 | -------------------------------------------------------------------------------- /neuroir/inputters/ranker/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | from neuroir.objects import Document, Query 7 | from neuroir.utils.misc import count_file_lines 8 | from neuroir.inputters.vocabulary import Vocabulary, UnicodeCharsVocabulary 9 | from neuroir.inputters.constants import BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Data loading 16 | # ------------------------------------------------------------------------------ 17 | 18 | 19 | def load_data(args, 20 | filename, 21 | max_examples=-1, 22 | dataset_name='msmarco'): 23 | """Load examples from preprocessed file. 
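    Judging from the parsing below, each session carries a 'query' list whose
    entries hold an 'id', 'tokens', and 'candidates' (documents with 'content'
    and a boolean 'label'); only queries with exactly `args.num_candidates`
    candidates are kept, and tokens are optionally replaced by character
    n-grams when `args.use_char_ngram` > 0.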
One example per line, JSON encoded.""" 24 | 25 | # Load JSON lines 26 | with open(filename) as f: 27 | data = [json.loads(line) for line in 28 | tqdm(f, total=count_file_lines(filename))] 29 | 30 | examples = [] 31 | for session in tqdm(data): 32 | if dataset_name == 'msmarco': 33 | for query in session['query']: 34 | qObj = Query(query['id']) 35 | qtokens = query['tokens'] 36 | qtokens = [BOS_WORD] + qtokens + [EOS_WORD] 37 | 38 | if len(qtokens) == 0 or len(qtokens) > args.max_query_len: 39 | continue 40 | if len(query['candidates']) != args.num_candidates: 41 | continue 42 | 43 | if args.use_char_ngram > 0: 44 | char_n_grams = [] 45 | offset = args.use_char_ngram 46 | for i in range(len(qtokens)): 47 | term = '#' + qtokens[i] + '#' 48 | for j in range(0, len(term) - offset + 1): 49 | char_n_grams.append(term[j:j + offset]) 50 | qtokens = char_n_grams 51 | 52 | qObj.tokens = qtokens 53 | candidates = [] 54 | for candidate in query['candidates']: 55 | document = Document(candidate['id']) 56 | # TODO: what should we use for documents? title/content? 57 | content_tokens = candidate['content'].split() 58 | if len(content_tokens) == 0: 59 | continue 60 | content_tokens = content_tokens[:args.max_doc_len - 2] 61 | content_tokens = [BOS_WORD] + content_tokens + [EOS_WORD] 62 | 63 | if args.use_char_ngram > 0: 64 | char_n_grams = [] 65 | offset = args.use_char_ngram 66 | for i in range(len(content_tokens)): 67 | term = '#' + content_tokens[i] + '#' 68 | for j in range(0, len(term) - offset + 1): 69 | char_n_grams.append(term[j:j + offset]) 70 | content_tokens = char_n_grams 71 | 72 | document.tokens = content_tokens 73 | assert isinstance(candidate['label'], bool) 74 | document.label = 1 if candidate['label'] else 0 75 | candidates.append(document) 76 | 77 | if len(candidates) == args.num_candidates: 78 | qObj.documents = candidates 79 | examples.append(qObj) 80 | 81 | if max_examples != -1 and len(examples) > max_examples: 82 | break 83 | 84 | return examples 85 | 86 | 87 | # ------------------------------------------------------------------------------ 88 | # Dictionary building 89 | # ------------------------------------------------------------------------------ 90 | 91 | 92 | def index_embedding_words(embedding_file): 93 | """Put all the words in embedding_file into a set.""" 94 | words = set() 95 | with open(embedding_file) as f: 96 | for line in tqdm(f, total=count_file_lines(embedding_file)): 97 | w = Vocabulary.normalize(line.rstrip().split(' ')[0]) 98 | words.add(w) 99 | 100 | words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD]) 101 | return words 102 | 103 | 104 | def load_words(args, examples, dict_size=None): 105 | """Iterate and index all the words in examples (documents + questions).""" 106 | 107 | def _insert(iterable): 108 | words = [] 109 | for w in iterable: 110 | w = Vocabulary.normalize(w) 111 | if valid_words and w not in valid_words: 112 | continue 113 | words.append(w) 114 | word_count.update(words) 115 | 116 | if args.restrict_vocab and args.embedding_file: 117 | logger.info('Restricting to words in %s' % args.embedding_file) 118 | valid_words = index_embedding_words(args.embedding_file) 119 | logger.info('Num words in set = %d' % len(valid_words)) 120 | else: 121 | valid_words = None 122 | 123 | word_count = Counter() 124 | for ex in tqdm(examples): 125 | _insert(ex.tokens) 126 | for doc in ex.documents: 127 | _insert(doc.tokens) 128 | 129 | # -2 to reserve spots for PAD and UNK token 130 | dict_size = dict_size - 2 if dict_size and dict_size > 2 else dict_size 131 | 
most_common = word_count.most_common(dict_size) 132 | words = set(word for word, _ in most_common) 133 | return words 134 | 135 | 136 | def build_word_dict(args, examples, dict_size=None): 137 | """Return a dictionary from question and document words in 138 | provided examples. 139 | """ 140 | word_dict = Vocabulary() 141 | for w in load_words(args, examples, dict_size): 142 | word_dict.add(w) 143 | return word_dict 144 | 145 | 146 | def build_word_and_char_dict(args, examples, dict_size=None): 147 | """Return a dictionary from question and document words in 148 | provided examples. 149 | """ 150 | words = load_words(args, examples, dict_size) 151 | dictioanry = UnicodeCharsVocabulary(words, args.max_characters_per_token) 152 | return dictioanry 153 | -------------------------------------------------------------------------------- /neuroir/inputters/ranker/vector.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/vector.py 2 | 3 | import random 4 | import copy 5 | import torch 6 | 7 | 8 | def vectorize(ex, model, shuffle=False): 9 | """Torchify a single example.""" 10 | src_dict = model.src_dict 11 | query, candidates = ex, copy.deepcopy(ex.documents) 12 | if shuffle: 13 | random.shuffle(candidates) 14 | 15 | # Index words 16 | Q_words = torch.LongTensor(query.vectorize(word_dict=src_dict)) 17 | D_words = [torch.LongTensor(c.vectorize(word_dict=src_dict)) for c in candidates] 18 | max_doc_len = model.args.max_doc_len if model.args.force_pad \ 19 | else max([len(c.tokens) for c in candidates]) 20 | max_query_len = model.args.max_query_len \ 21 | if model.args.force_pad else len(query.tokens) 22 | 23 | # label is only used to compute loss during training 24 | label = torch.LongTensor([c.label for c in candidates]) 25 | 26 | return { 27 | 'id': query.id, 28 | 'query_tokens': query.tokens, 29 | 'query_words': Q_words, 30 | 'doc_tokens': [c.tokens for c in candidates], 31 | 'doc_words': D_words, 32 | 'label': label, 33 | 'num_candidates': model.args.num_candidates, 34 | 'max_doc_len': max_doc_len, 35 | 'max_query_len': max_query_len 36 | } 37 | 38 | 39 | def batchify(batch): 40 | """Gather a batch of individual examples into one batch.""" 41 | 42 | # batch is a list of vectorized examples 43 | batch_size = len(batch) 44 | num_candidates = batch[0]['num_candidates'] 45 | max_doc_len = max([b['max_doc_len'] for b in batch]) 46 | max_que_len = max([b['max_query_len'] for b in batch]) 47 | 48 | # --------- Prepare document tensors --------- 49 | 50 | batch_documents = [ex['doc_words'] for ex in batch] 51 | 52 | # Batch documents 53 | doc_len = torch.LongTensor(batch_size, num_candidates).zero_() 54 | doc_word = torch.LongTensor(batch_size, 55 | num_candidates, 56 | max_doc_len).zero_() 57 | 58 | for bidx, docs in enumerate(batch_documents): 59 | for didx, doc in enumerate(docs): 60 | doc_len[bidx, didx] = doc.size(0) 61 | doc_word[bidx, didx, :doc.size(0)].copy_(doc) 62 | 63 | # --------- Prepare query tensors --------- 64 | batch_queries = [ex['query_words'] for ex in batch] 65 | 66 | # Batch questions 67 | que_len = torch.LongTensor(batch_size).zero_() 68 | que_word = torch.LongTensor(batch_size, 69 | max_que_len).zero_() 70 | 71 | for bidx, query in enumerate(batch_queries): 72 | que_len[bidx] = query.size(0) 73 | que_word[bidx, :query.size(0)].copy_(query) 74 | 75 | # --------- Prepare other tensors --------- 76 | ids = [ex['id'] for ex in batch] 77 | labels = [ex['label'] for 
ex in batch] 78 | label_tensor = torch.LongTensor(batch_size, num_candidates).zero_() 79 | for bidx, label in enumerate(labels): 80 | label_tensor[bidx, :].copy_(label) 81 | 82 | return { 83 | 'batch_size': batch_size, 84 | 'ids': ids, 85 | 'doc_rep': doc_word, 86 | 'doc_len': doc_len, 87 | 'que_rep': que_word, 88 | 'que_len': que_len, 89 | 'label': label_tensor 90 | } 91 | -------------------------------------------------------------------------------- /neuroir/inputters/recommender/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .data import * 4 | from .utils import * 5 | from .vector import * 6 | -------------------------------------------------------------------------------- /neuroir/inputters/recommender/data.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/data.py 2 | import numpy as np 3 | 4 | from torch.utils.data import Dataset 5 | from torch.utils.data.sampler import Sampler 6 | from .vector import vectorize 7 | 8 | 9 | # ------------------------------------------------------------------------------ 10 | # PyTorch dataset class for MSMARCO data. 11 | # ------------------------------------------------------------------------------ 12 | 13 | 14 | class RecommenderDataset(Dataset): 15 | def __init__(self, examples, model): 16 | self.model = model 17 | self.examples = examples 18 | 19 | def __len__(self): 20 | return len(self.examples) 21 | 22 | def __getitem__(self, index): 23 | return vectorize(self.examples[index], self.model) 24 | 25 | def lengths(self): 26 | return [len(session) for session in self.examples] 27 | 28 | 29 | # ------------------------------------------------------------------------------ 30 | # PyTorch sampler returning batched of sorted lengths (by doc and query). 
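# Note: for the recommender the "length" of an example is the number of queries
# in the session, so this sampler groups sessions of equal length and trims each
# group to a multiple of the batch size; downstream batching can then assume a
# uniform session_len within every batch.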
31 | # ------------------------------------------------------------------------------ 32 | 33 | class SortedBatchSampler(Sampler): 34 | def __init__(self, lengths, batch_size, shuffle=True): 35 | self.lengths = lengths 36 | self.batch_size = batch_size 37 | self.shuffle = shuffle 38 | 39 | def __iter__(self): 40 | clusters = dict() 41 | for i, num_queries in enumerate(self.lengths): 42 | if num_queries in clusters: 43 | clusters[num_queries].append(i) 44 | else: 45 | clusters[num_queries] = [i] 46 | 47 | batches = [] 48 | for key, indices in clusters.items(): 49 | if len(indices) % self.batch_size != 0: 50 | num_batch = len(indices) // self.batch_size 51 | indices = indices[:(num_batch * self.batch_size)] 52 | assert len(indices) % self.batch_size == 0 53 | batches.extend([indices[i:i + self.batch_size] 54 | for i in range(0, len(indices), self.batch_size)]) 55 | if self.shuffle: 56 | np.random.shuffle(batches) 57 | return iter([i for batch in batches for i in batch]) 58 | 59 | def __len__(self): 60 | return len(self.lengths) 61 | -------------------------------------------------------------------------------- /neuroir/inputters/recommender/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from collections import Counter 4 | from tqdm import tqdm 5 | 6 | from neuroir.objects import Query, Session 7 | from neuroir.utils.misc import count_file_lines 8 | from neuroir.inputters.vocabulary import Vocabulary, UnicodeCharsVocabulary 9 | from neuroir.inputters.constants import BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Data loading 16 | # ------------------------------------------------------------------------------ 17 | 18 | 19 | def load_data(args, 20 | filename, 21 | max_examples=-1, 22 | dataset_name='msmarco'): 23 | """Load examples from preprocessed file. 
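    Each example is a search session; judging from the parsing below, it
    carries a 'session_id' and a 'query' list with an 'id' and 'tokens' per
    query. Sessions are then split into consecutive query pairs for SEQ2SEQ
    and ACG, or kept whole for HREDQS.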
One example per line, JSON encoded.""" 24 | 25 | # Load JSON lines 26 | with open(filename) as f: 27 | data = [json.loads(line) for line in 28 | tqdm(f, total=count_file_lines(filename))] 29 | 30 | examples = [] 31 | # based on model_type, we arrange the data 32 | model_type = args.model_type.upper() 33 | for example in tqdm(data): 34 | if dataset_name == 'msmarco': 35 | session_queries = [] 36 | for query in example['query']: 37 | qObj = Query(query['id']) 38 | qObj.text = ' '.join(query['tokens']) 39 | qtokens = query['tokens'] 40 | qtokens = [BOS_WORD] + qtokens + [EOS_WORD] 41 | 42 | if len(qtokens) == 0 or len(qtokens) > args.max_query_len: 43 | continue 44 | 45 | qObj.tokens = qtokens 46 | session_queries.append(qObj) 47 | 48 | # sessions must contain at least 2 queries 49 | if len(session_queries) < 2: 50 | continue 51 | 52 | if model_type == 'SEQ2SEQ': 53 | # every session will contain only 2 queries 54 | for i in range(len(session_queries) - 1): 55 | session = Session(example['session_id'] + str(i)) 56 | session.queries = session_queries[i:i + 2] 57 | assert len(session) == 2 58 | examples.append(session) 59 | elif model_type == 'ACG': 60 | # every session will contain only 2 queries 61 | # but the first query is the concatenation of all previous queries till timestep i 62 | for i in range(len(session_queries) - 1): 63 | session = Session(example['session_id'] + str(i)) 64 | session.add_one_query(session_queries[0:i + 1]) 65 | session.add_query(session_queries[i + 1]) 66 | assert len(session) == 2 67 | examples.append(session) 68 | elif model_type == 'HREDQS': 69 | session = Session(example['session_id']) 70 | session.queries = session_queries 71 | examples.append(session) 72 | 73 | if max_examples != -1 and len(examples) > max_examples: 74 | break 75 | 76 | return examples 77 | 78 | 79 | # ------------------------------------------------------------------------------ 80 | # Dictionary building 81 | # ------------------------------------------------------------------------------ 82 | 83 | 84 | def index_embedding_words(embedding_file): 85 | """Put all the words in embedding_file into a set.""" 86 | words = set() 87 | with open(embedding_file) as f: 88 | for line in tqdm(f, total=count_file_lines(embedding_file)): 89 | w = Vocabulary.normalize(line.rstrip().split(' ')[0]) 90 | words.add(w) 91 | 92 | words.update([BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD]) 93 | return words 94 | 95 | 96 | def load_words(args, examples, dict_size=None): 97 | """Iterate and index all the words in examples (documents + questions).""" 98 | 99 | def _insert(iterable): 100 | words = [] 101 | for w in iterable: 102 | w = Vocabulary.normalize(w) 103 | if valid_words and w not in valid_words: 104 | continue 105 | words.append(w) 106 | word_count.update(words) 107 | 108 | if args.restrict_vocab and args.embedding_file: 109 | logger.info('Restricting to words in %s' % args.embedding_file) 110 | valid_words = index_embedding_words(args.embedding_file) 111 | logger.info('Num words in set = %d' % len(valid_words)) 112 | else: 113 | valid_words = None 114 | 115 | word_count = Counter() 116 | for ex in tqdm(examples): 117 | for query in ex.queries: 118 | _insert(query.tokens) 119 | 120 | # -2 to reserve spots for PAD and UNK token 121 | dict_size = dict_size - 2 if dict_size and dict_size > 2 else dict_size 122 | most_common = word_count.most_common(dict_size) 123 | words = set(word for word, _ in most_common) 124 | return words 125 | 126 | 127 | def build_word_dict(args, examples, dict_size=None): 128 | """Return a 
dictionary from question and document words in 129 | provided examples. 130 | """ 131 | word_dict = Vocabulary() 132 | for w in load_words(args, examples, dict_size): 133 | word_dict.add(w) 134 | return word_dict 135 | 136 | 137 | def build_word_and_char_dict(args, examples, dict_size=None): 138 | """Return a dictionary from question and document words in 139 | provided examples. 140 | """ 141 | words = load_words(args, examples, dict_size) 142 | dictioanry = UnicodeCharsVocabulary(words, args.max_characters_per_token) 143 | return dictioanry 144 | -------------------------------------------------------------------------------- /neuroir/inputters/recommender/vector.py: -------------------------------------------------------------------------------- 1 | # Adapted from https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/vector.py 2 | 3 | import torch 4 | 5 | 6 | def vectorize(session, model): 7 | """Torchify a single example.""" 8 | 9 | src_dict = model.src_dict 10 | tgt_dict = model.tgt_dict 11 | 12 | session_len = len(session) 13 | max_source_len = max([len(query) for query in session.queries[:-1]]) 14 | max_target_len = max([len(query) for query in session.queries[1:]]) 15 | source_tokens = [query.tokens for query in session.queries[:-1]] # 2d list 16 | target_tokens = [query.tokens for query in session.queries[1:]] # 2d list 17 | 18 | source_words = torch.LongTensor(session_len - 1, max_source_len).zero_() 19 | source_lens = torch.LongTensor(session_len - 1).zero_() 20 | target_words = torch.LongTensor(session_len - 1, max_target_len).zero_() 21 | target_lens = torch.LongTensor(session_len - 1).zero_() 22 | target_seq = torch.LongTensor(session_len - 1, max_target_len).zero_() # use only to compute loss 23 | 24 | for i in range(session_len - 1): 25 | query = session.queries[i] 26 | query_len = len(query.tokens) 27 | source_lens[i] = query_len 28 | source_words[i, :query_len].copy_(torch.LongTensor( 29 | query.vectorize(word_dict=src_dict))) 30 | 31 | query = session.queries[i + 1] 32 | query_len = len(query.tokens) 33 | target_lens[i] = query_len 34 | target_words[i, :query_len].copy_(torch.LongTensor( 35 | query.vectorize(word_dict=src_dict))) 36 | target_seq[i, :query_len].copy_(torch.LongTensor( 37 | query.vectorize(word_dict=tgt_dict))) # diff is which dict is used 38 | 39 | source_vocab = None 40 | if session_len == 2: 41 | source_vocab = session.queries[0].src_vocab 42 | 43 | return { 44 | 'id': session.id, 45 | 'source_tokens': source_tokens, 46 | 'source_words': source_words, 47 | 'source_lens': source_lens, 48 | 'target_tokens': target_tokens, 49 | 'target_words': target_words, 50 | 'target_lens': target_lens, 51 | 'target_seq': target_seq, 52 | 'max_source_len': max_source_len, 53 | 'max_target_len': max_target_len, 54 | 'session_len': session_len, 55 | 'src_vocab': source_vocab 56 | } 57 | 58 | 59 | def batchify(batch): 60 | """Gather a batch of individual examples into one batch.""" 61 | 62 | # batch is a list of vectorized examples 63 | batch_size = len(batch) 64 | max_source_len = max([b['max_source_len'] for b in batch]) 65 | max_target_len = max([b['max_target_len'] for b in batch]) 66 | session_len = batch[0]['session_len'] 67 | 68 | # all the sessions must have the same length 69 | assert len(set([b['session_len'] for b in batch])) == 1 70 | 71 | ids = [ex['id'] for ex in batch] 72 | 73 | # --------- Prepare query tensors --------- 74 | source_lens = torch.LongTensor(batch_size, 75 | session_len - 1).zero_() 76 | source_words = torch.LongTensor(batch_size, 
77 | session_len - 1, 78 | max_source_len).zero_() 79 | target_lens = torch.LongTensor(batch_size, 80 | session_len - 1).zero_() 81 | target_words = torch.LongTensor(batch_size, 82 | session_len - 1, 83 | max_target_len).zero_() 84 | target_seq = torch.LongTensor(batch_size, 85 | session_len - 1, 86 | max_target_len).zero_() 87 | 88 | for bidx, session in enumerate(batch): 89 | source_lens[bidx] = session['source_lens'] 90 | target_lens[bidx] = session['target_lens'] 91 | source_words[bidx, :, :session['max_source_len']].copy_(session['source_words']) 92 | target_words[bidx, :, :session['max_target_len']].copy_(session['target_words']) 93 | target_seq[bidx, :, :session['max_target_len']].copy_(session['target_seq']) 94 | 95 | # --------- Prepare other tensors --------- 96 | # prepare source vocabs, alignment [required for Copy Attention] 97 | source_maps = [] 98 | alignments = [] 99 | src_vocabs = [] 100 | if session_len == 2: 101 | for idx in range(batch_size): 102 | target = batch[idx]['target_tokens'][0] 103 | context = batch[idx]['source_tokens'][0] 104 | vocab = batch[idx]['src_vocab'] 105 | src_vocabs.append(vocab) 106 | 107 | # Mapping source tokens to indices in the dynamic dict. 108 | src_map = torch.LongTensor([vocab[w] for w in context]) 109 | source_maps.append(src_map) 110 | 111 | mask = torch.LongTensor([vocab[w] for w in target]) 112 | alignments.append(mask) 113 | 114 | return { 115 | 'batch_size': batch_size, 116 | 'ids': ids, 117 | 'source_tokens': [item['source_tokens'] for item in batch], 118 | 'source_words': source_words, 119 | 'source_lens': source_lens, 120 | 'target_tokens': [item['target_tokens'] for item in batch], 121 | 'target_words': target_words, 122 | 'target_lens': target_lens, 123 | 'target_seq': target_seq, 124 | 'src_vocab': src_vocabs, 125 | 'src_map': source_maps, 126 | 'alignment': alignments, 127 | 'session_len': session_len - 1 128 | } 129 | -------------------------------------------------------------------------------- /neuroir/inputters/vocabulary.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/facebookresearch/DrQA/blob/master/drqa/reader/data.py 2 | import unicodedata 3 | import numpy as np 4 | from neuroir.inputters.constants import PAD, PAD_WORD, UNK, UNK_WORD, \ 5 | BOS, BOS_WORD, EOS, EOS_WORD 6 | 7 | 8 | class Vocabulary(object): 9 | def __init__(self): 10 | self.tok2ind = {PAD_WORD: PAD, 11 | UNK_WORD: UNK, 12 | BOS_WORD: BOS, 13 | EOS_WORD: EOS} 14 | self.ind2tok = {PAD: PAD_WORD, 15 | UNK: UNK_WORD, 16 | BOS: BOS_WORD, 17 | EOS: EOS_WORD} 18 | 19 | @staticmethod 20 | def normalize(token): 21 | return unicodedata.normalize('NFD', token) 22 | 23 | def __len__(self): 24 | return len(self.tok2ind) 25 | 26 | def __iter__(self): 27 | return iter(self.tok2ind) 28 | 29 | def __contains__(self, key): 30 | if type(key) == int: 31 | return key in self.ind2tok 32 | elif type(key) == str: 33 | return self.normalize(key) in self.tok2ind 34 | 35 | def __getitem__(self, key): 36 | if type(key) == int: 37 | return self.ind2tok.get(key, UNK_WORD) 38 | elif type(key) == str: 39 | return self.tok2ind.get(self.normalize(key), 40 | self.tok2ind.get(UNK_WORD)) 41 | else: 42 | raise RuntimeError('Invalid key type.') 43 | 44 | def __setitem__(self, key, item): 45 | if type(key) == int and type(item) == str: 46 | self.ind2tok[key] = item 47 | elif type(key) == str and type(item) == int: 48 | self.tok2ind[key] = item 49 | else: 50 | raise RuntimeError('Invalid (key, item) types.') 51 | 52 | def 
add(self, token): 53 | token = self.normalize(token) 54 | if token not in self.tok2ind: 55 | index = len(self.tok2ind) 56 | self.tok2ind[token] = index 57 | self.ind2tok[index] = token 58 | 59 | def add_tokens(self, token_list): 60 | assert isinstance(token_list, list) 61 | for token in token_list: 62 | self.add(token) 63 | 64 | def tokens(self): 65 | """Get dictionary tokens. 66 | Return all the words indexed by this dictionary, except for special 67 | tokens. 68 | """ 69 | tokens = [k for k in self.tok2ind.keys() 70 | if k not in {PAD_WORD, UNK_WORD}] 71 | return tokens 72 | 73 | def remove(self, key): 74 | if key in self.tok2ind: 75 | ind = self.tok2ind[key] 76 | del self.tok2ind[key] 77 | del self.ind2tok[ind] 78 | return True 79 | return False 80 | 81 | 82 | class UnicodeCharsVocabulary(Vocabulary): 83 | """Vocabulary containing character-level and word level information. 84 | Has a word vocabulary that is used to lookup word ids and 85 | a character id that is used to map words to arrays of character ids. 86 | The character ids are defined by ord(c) for c in word.encode('utf-8') 87 | This limits the total number of possible char ids to 256. 88 | To this we add 5 additional special ids: begin sentence, end sentence, 89 | begin word, end word and padding. 90 | """ 91 | 92 | def __init__(self, words, max_word_length): 93 | super(UnicodeCharsVocabulary, self).__init__() 94 | self._max_word_length = max_word_length 95 | 96 | # char ids 0-255 come from utf-8 encoding bytes 97 | # assign 256-259 to special chars 98 | self.bow_char = 256 # 99 | self.eow_char = 257 # 100 | self.pad_char = 258 # 101 | 102 | for w in words: 103 | self.add(w) 104 | num_words = len(self.ind2tok) 105 | 106 | self._word_char_ids = np.zeros([num_words, max_word_length], 107 | dtype=np.int32) 108 | 109 | for i, word in self.ind2tok.items(): 110 | self._word_char_ids[i] = self._convert_word_to_char_ids(word) 111 | 112 | def rebuild_word_char_ids(self): 113 | num_words = len(self.ind2tok) 114 | self._word_char_ids = np.zeros([num_words, self._max_word_length], 115 | dtype=np.int32) 116 | 117 | for i, word in self.ind2tok.items(): 118 | self._word_char_ids[i] = self._convert_word_to_char_ids(word) 119 | 120 | @property 121 | def word_char_ids(self): 122 | return self._word_char_ids 123 | 124 | @property 125 | def max_word_length(self): 126 | return self._max_word_length 127 | 128 | def _convert_word_to_char_ids(self, word): 129 | code = np.zeros([self.max_word_length], dtype=np.int32) 130 | code[:] = self.pad_char 131 | 132 | word_encoded = word.encode('utf-8', 'ignore')[:(self.max_word_length - 2)] 133 | code[0] = self.bow_char 134 | for k, chr_id in enumerate(word_encoded, start=1): 135 | code[k] = chr_id 136 | code[k + 1] = self.eow_char 137 | 138 | return code 139 | 140 | def word_to_char_ids(self, word): 141 | if word in self.tok2ind: 142 | return self._word_char_ids[self.tok2ind[word]] 143 | else: 144 | return self._convert_word_to_char_ids(word) 145 | 146 | def encode_chars(self, sentence, split=True): 147 | """ 148 | Encode the sentence as a white space delimited string of tokens. 
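        If `split` is True the sentence is split on whitespace first;
        otherwise it is treated as an already-tokenized sequence. Returns a
        list with one character-id array per token.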
149 | """ 150 | if split: 151 | chars_ids = [self.word_to_char_ids(cur_word) 152 | for cur_word in sentence.split()] 153 | else: 154 | chars_ids = [self.word_to_char_ids(cur_word) 155 | for cur_word in sentence] 156 | 157 | return chars_ids 158 | -------------------------------------------------------------------------------- /neuroir/models/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .ranker import * 4 | from .recommender import * 5 | -------------------------------------------------------------------------------- /neuroir/modules/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .copy_generator import * 4 | from .embeddings import * 5 | from .global_attention import * 6 | from .maxout import * 7 | from .util_class import * 8 | -------------------------------------------------------------------------------- /neuroir/modules/copy_generator.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/modules/copy_generator.py 2 | """ Generator module """ 3 | import torch.nn as nn 4 | import torch 5 | 6 | from neuroir.inputters import constants 7 | from neuroir.utils.misc import aeq 8 | 9 | 10 | class CopyGenerator(nn.Module): 11 | """Generator module that additionally considers copying 12 | words directly from the source. 13 | The main idea is that we have an extended "dynamic dictionary". 14 | It contains `|tgt_dict|` words plus an arbitrary number of 15 | additional words introduced by the source sentence. 16 | For each source sentence we have a `src_map` that maps 17 | each source word to an index in `tgt_dict` if it known, or 18 | else to an extra word. 19 | The copy generator is an extended version of the standard 20 | generator that computes three values. 21 | * :math:`p_{softmax}` the standard softmax over `tgt_dict` 22 | * :math:`p(z)` the probability of copying a word from 23 | the source 24 | * :math:`p_{copy}` the probility of copying a particular word. 25 | taken from the attention distribution directly. 26 | The model returns a distribution over the extend dictionary, 27 | computed as 28 | :math:`p(w) = p(z=1) p_{copy}(w) + p(z=0) p_{softmax}(w)` 29 | .. mermaid:: 30 | graph BT 31 | A[input] 32 | S[src_map] 33 | B[softmax] 34 | BB[switch] 35 | C[attn] 36 | D[copy] 37 | O[output] 38 | A --> B 39 | A --> BB 40 | S --> D 41 | C --> D 42 | D --> O 43 | B --> O 44 | BB --> O 45 | Args: 46 | input_size (int): size of input representation 47 | """ 48 | 49 | def __init__(self, input_size, generator, eps=1e-20): 50 | super(CopyGenerator, self).__init__() 51 | self.linear = generator 52 | self.linear_copy = nn.Linear(input_size, 1) 53 | self.softmax = nn.Softmax(dim=-1) 54 | self.sigmoid = nn.Sigmoid() 55 | self.eps = eps 56 | 57 | def forward(self, hidden, attn, src_map): 58 | """ 59 | Compute a distribution over the target dictionary 60 | extended by the dynamic dictionary implied by compying 61 | source words. 62 | Args: 63 | hidden (`FloatTensor`): hidden outputs `[batch, tlen, input_size]` 64 | attn (`FloatTensor`): attn for each `[batch, tlen, slen]` 65 | src_map (`FloatTensor`): 66 | A sparse indicator matrix mapping each source word to 67 | its index in the "extended" vocab containing. 
68 | `[batch, src_len, extra_words]` 69 | """ 70 | # CHECKS 71 | batch, tlen, _ = hidden.size() 72 | batch_, tlen_, slen = attn.size() 73 | batch, slen_, cvocab = src_map.size() 74 | aeq(tlen, tlen_) 75 | aeq(slen, slen_) 76 | 77 | # Original probabilities. 78 | logits = self.linear(hidden) 79 | logits[:, :, constants.PAD] = -self.eps 80 | prob = self.softmax(logits) 81 | 82 | # Probability of copying p(z=1) batch. 83 | p_copy = self.sigmoid(self.linear_copy(hidden)) 84 | # Probibility of not copying: p_{word}(w) * (1 - p(z)) 85 | out_prob = torch.mul(prob, 1 - p_copy.expand_as(prob)) 86 | mul_attn = torch.mul(attn, p_copy.expand_as(attn)) 87 | copy_prob = torch.bmm(mul_attn, src_map) # `[batch, tlen, extra_words]` 88 | return torch.cat([out_prob, copy_prob], 2) 89 | 90 | 91 | class CopyGeneratorCriterion(object): 92 | """ Copy generator criterion """ 93 | 94 | def __init__(self, vocab_size, force_copy, eps=1e-20): 95 | self.force_copy = force_copy 96 | self.eps = eps 97 | self.offset = vocab_size 98 | 99 | def __call__(self, scores, align, target): 100 | # CHECKS 101 | batch, tlen, _ = scores.size() 102 | _, _tlen = target.size() 103 | aeq(tlen, _tlen) 104 | _, _tlen = align.size() 105 | aeq(tlen, _tlen) 106 | 107 | align = align.view(-1) 108 | target = target.view(-1) 109 | scores = scores.view(-1, scores.size(2)) 110 | 111 | # Compute unks in align and target for readability 112 | align_unk = align.eq(constants.UNK).float() 113 | align_not_unk = align.ne(constants.UNK).float() 114 | target_unk = target.eq(constants.UNK).float() 115 | target_not_unk = target.ne(constants.UNK).float() 116 | 117 | # Copy probability of tokens in source 118 | out = scores.gather(1, align.view(-1, 1) + self.offset).view(-1) 119 | # Set scores for unk to 0 and add eps 120 | out = out.mul(align_not_unk) + self.eps 121 | # Get scores for tokens in target 122 | tmp = scores.gather(1, target.view(-1, 1)).view(-1) 123 | 124 | # Regular prob (no unks and unks that can't be copied) 125 | if not self.force_copy: 126 | # Add score for non-unks in target 127 | out = out + tmp.mul(target_not_unk) 128 | # Add score for when word is unk in both align and tgt 129 | out = out + tmp.mul(align_unk).mul(target_unk) 130 | else: 131 | # Forced copy. Add only probability for not-copied tokens 132 | out = out + tmp.mul(align_unk) 133 | 134 | loss = -out.log() 135 | return loss 136 | -------------------------------------------------------------------------------- /neuroir/modules/global_attention.py: -------------------------------------------------------------------------------- 1 | # copied from https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/modules/global_attention.py 2 | 3 | """" Global attention modules (Luong / Bahdanau) """ 4 | import torch 5 | import torch.nn as nn 6 | 7 | from neuroir.utils.misc import aeq, sequence_mask 8 | 9 | 10 | # This class is mainly used by decoder.py for RNNs but also 11 | # by the CNN / transformer decoder when copy attention is used 12 | # CNN has its own attention mechanism ConvMultiStepAttention 13 | # Transformer has its own MultiHeadedAttention 14 | 15 | 16 | class GlobalAttention(nn.Module): 17 | """ 18 | Global attention takes a matrix and a query vector. It 19 | then computes a parameterized convex combination of the matrix 20 | based on the input query. 21 | Constructs a unit mapping a query `q` of size `dim` 22 | and a source matrix `H` of size `n x dim`, to an output 23 | of size `dim`. 24 | .. 
mermaid:: 25 | graph BT 26 | A[Query] 27 | subgraph RNN 28 | C[H 1] 29 | D[H 2] 30 | E[H N] 31 | end 32 | F[Attn] 33 | G[Output] 34 | A --> F 35 | C --> F 36 | D --> F 37 | E --> F 38 | C -.-> G 39 | D -.-> G 40 | E -.-> G 41 | F --> G 42 | All models compute the output as 43 | :math:`c = sum_{j=1}^{SeqLength} a_j H_j` where 44 | :math:`a_j` is the softmax of a score function. 45 | Then then apply a projection layer to [q, c]. 46 | However they 47 | differ on how they compute the attention score. 48 | * Luong Attention (dot, general): 49 | * dot: :math:`score(H_j,q) = H_j^T q` 50 | * general: :math:`score(H_j, q) = H_j^T W_a q` 51 | * Bahdanau Attention (mlp): 52 | * :math:`score(H_j, q) = v_a^T tanh(W_a q + U_a h_j)` 53 | Args: 54 | dim (int): dimensionality of query and key 55 | coverage (bool): use coverage term 56 | attn_type (str): type of attention to use, options [dot,general,mlp] 57 | """ 58 | 59 | def __init__(self, dim, coverage=False, attn_type="dot"): 60 | super(GlobalAttention, self).__init__() 61 | 62 | self.dim = dim 63 | self.attn_type = attn_type 64 | assert (self.attn_type in ["dot", "general", "mlp"]), ( 65 | "Please select a valid attention type.") 66 | 67 | if self.attn_type == "general": 68 | self.linear_in = nn.Linear(dim, dim, bias=False) 69 | elif self.attn_type == "mlp": 70 | self.linear_context = nn.Linear(dim, dim, bias=False) 71 | self.linear_query = nn.Linear(dim, dim, bias=True) 72 | self.v = nn.Linear(dim, 1, bias=False) 73 | # mlp wants it with bias 74 | out_bias = self.attn_type == "mlp" 75 | self.linear_out = nn.Linear(dim * 2, dim, bias=out_bias) 76 | 77 | self.softmax = nn.Softmax(dim=-1) 78 | self.tanh = nn.Tanh() 79 | self._coverage = coverage 80 | 81 | def score(self, h_t, h_s): 82 | """ 83 | Args: 84 | h_t (`FloatTensor`): sequence of queries `[batch x tgt_len x dim]` 85 | h_s (`FloatTensor`): sequence of sources `[batch x src_len x dim]` 86 | Returns: 87 | :obj:`FloatTensor`: 88 | raw attention scores (unnormalized) for each src index 89 | `[batch x tgt_len x src_len]` 90 | """ 91 | # Check input sizes 92 | src_batch, src_len, src_dim = h_s.size() 93 | tgt_batch, tgt_len, tgt_dim = h_t.size() 94 | aeq(src_batch, tgt_batch) 95 | aeq(src_dim, tgt_dim) 96 | aeq(self.dim, src_dim) 97 | 98 | if self.attn_type in ["general", "dot"]: 99 | if self.attn_type == "general": 100 | h_t_ = h_t.view(tgt_batch * tgt_len, tgt_dim) 101 | h_t_ = self.linear_in(h_t_) 102 | h_t = h_t_.view(tgt_batch, tgt_len, tgt_dim) 103 | h_s_ = h_s.transpose(1, 2) 104 | # (batch, t_len, d) x (batch, d, s_len) --> (batch, t_len, s_len) 105 | return torch.bmm(h_t, h_s_) 106 | else: 107 | dim = self.dim 108 | wq = self.linear_query(h_t.view(-1, dim)) 109 | wq = wq.view(tgt_batch, tgt_len, 1, dim) 110 | wq = wq.expand(tgt_batch, tgt_len, src_len, dim) 111 | 112 | uh = self.linear_context(h_s.contiguous().view(-1, dim)) 113 | uh = uh.view(src_batch, 1, src_len, dim) 114 | uh = uh.expand(src_batch, tgt_len, src_len, dim) 115 | 116 | # (batch, t_len, s_len, d) 117 | wquh = self.tanh(wq + uh) 118 | 119 | return self.v(wquh.view(-1, dim)).view(tgt_batch, tgt_len, src_len) 120 | 121 | def forward(self, source, memory_bank, memory_lengths=None, coverage=None): 122 | """ 123 | Args: 124 | input (`FloatTensor`): query vectors `[batch x tgt_len x dim]` 125 | memory_bank (`FloatTensor`): source vectors `[batch x src_len x dim]` 126 | memory_lengths (`LongTensor`): the source context lengths `[batch]` 127 | coverage (`FloatTensor`): None (not supported yet) 128 | Returns: 129 | (`FloatTensor`, 
`FloatTensor`): 130 | * Computed vector `[batch x tgt_len x dim]` 131 | * Attention distribtutions for each query 132 | `[batch x tgt_len x src_len]` 133 | """ 134 | 135 | # one step input 136 | assert source.dim() == 3 137 | one_step = True if source.size(1) == 1 else False 138 | 139 | batch, source_l, dim = memory_bank.size() 140 | batch_, target_l, dim_ = source.size() 141 | aeq(batch, batch_) 142 | aeq(dim, dim_) 143 | aeq(self.dim, dim) 144 | 145 | # compute attention scores, as in Luong et al. 146 | align = self.score(source, memory_bank) 147 | 148 | if memory_lengths is not None: 149 | mask = sequence_mask(memory_lengths, max_len=align.size(-1)) 150 | mask = mask.unsqueeze(1) # Make it broadcastable. 151 | align.data.masked_fill_(~mask, -float('inf')) 152 | 153 | # We adopt coverage attn described in Paulus et al., 2018 154 | # REF: https://arxiv.org/abs/1705.04304 155 | if self._coverage: 156 | maxes = torch.max(align, 2, keepdim=True)[0] 157 | exp_score = torch.exp(align - maxes) 158 | 159 | if one_step: 160 | if coverage is None: 161 | # t = 1 in Eq(3) from Paulus et al., 2018 162 | unnormalized_score = exp_score 163 | else: 164 | # t = otherwise in Eq(3) from Paulus et al., 2018 165 | assert coverage.dim() == 3 # B x 1 x slen 166 | unnormalized_score = exp_score.div(coverage + 1e-20) 167 | else: 168 | multiplier = torch.tril(torch.ones(target_l - 1, target_l - 1)) 169 | multiplier = multiplier.unsqueeze(0).expand(batch, *multiplier.size()) 170 | multiplier = torch.autograd.Variable(multiplier) 171 | multiplier = multiplier.cuda() if align.is_cuda else multiplier 172 | 173 | penalty = torch.bmm(multiplier, exp_score[:, :-1, :]) # B x tlen-1 x slen 174 | no_penalty = torch.ones_like(penalty[:, -1, :]) # B x slen 175 | penalty = torch.cat([no_penalty.unsqueeze(1), penalty], dim=1) # B x tlen x slen 176 | assert exp_score.size() == penalty.size() 177 | unnormalized_score = exp_score.div(penalty + 1e-20) 178 | 179 | # Eq.(4) from Paulus et al., 2018 180 | align_vectors = unnormalized_score.div(unnormalized_score.sum(2, keepdim=True)) 181 | 182 | # Softmax to normalize attention weights 183 | else: 184 | align_vectors = self.softmax(align.view(batch * target_l, source_l)) 185 | align_vectors = align_vectors.view(batch, target_l, source_l) 186 | 187 | # each context vector c_t is the weighted average 188 | # over all the source hidden states 189 | c = torch.bmm(align_vectors, memory_bank) 190 | 191 | # concatenate 192 | concat_c = torch.cat([c, source], 2).view(batch * target_l, dim * 2) 193 | attn_h = self.linear_out(concat_c).view(batch, target_l, dim) 194 | if self.attn_type in ["general", "dot"]: 195 | attn_h = self.tanh(attn_h) 196 | 197 | # Check output sizes 198 | batch_, target_l_, dim_ = attn_h.size() 199 | aeq(target_l, target_l_) 200 | aeq(batch, batch_) 201 | aeq(dim, dim_) 202 | batch_, target_l_, source_l_ = align_vectors.size() 203 | aeq(target_l, target_l_) 204 | aeq(batch, batch_) 205 | aeq(source_l, source_l_) 206 | 207 | covrage_vector = None 208 | if self._coverage and one_step: 209 | covrage_vector = exp_score # B x 1 x slen 210 | 211 | return attn_h, align_vectors, covrage_vector 212 | -------------------------------------------------------------------------------- /neuroir/modules/maxout.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/allenai/allennlp/blob/master/allennlp/modules/maxout.py 2 | from typing import Sequence, Union 3 | 4 | import torch 5 | 6 | 7 | class Maxout(torch.nn.Module): 8 | 
""" 9 | This ``Module`` is a maxout neural network. 10 | Parameters 11 | ---------- 12 | input_dim : ``int`` 13 | The dimensionality of the input. We assume the input has shape ``(batch_size, input_dim)``. 14 | num_layers : ``int`` 15 | The number of maxout layers to apply to the input. 16 | output_dims : ``Union[int, Sequence[int]]`` 17 | The output dimension of each of the maxout layers. If this is a single ``int``, we use 18 | it for all maxout layers. If it is a ``Sequence[int]``, ``len(output_dims)`` must be 19 | ``num_layers``. 20 | pool_sizes : ``Union[int, Sequence[int]]`` 21 | The size of max-pools. If this is a single ``int``, we use 22 | it for all maxout layers. If it is a ``Sequence[int]``, ``len(pool_sizes)`` must be 23 | ``num_layers``. 24 | dropout : ``Union[float, Sequence[float]]``, optional 25 | If given, we will apply this amount of dropout after each layer. Semantics of ``float`` 26 | versus ``Sequence[float]`` is the same as with other parameters. 27 | """ 28 | 29 | def __init__(self, 30 | input_dim: int, 31 | num_layers: int, 32 | output_dims: Union[int, Sequence[int]], 33 | pool_sizes: Union[int, Sequence[int]], 34 | dropout: Union[float, Sequence[float]] = 0.0) -> None: 35 | super(Maxout, self).__init__() 36 | if not isinstance(output_dims, list): 37 | output_dims = [output_dims] * num_layers # type: ignore 38 | if not isinstance(pool_sizes, list): 39 | pool_sizes = [pool_sizes] * num_layers # type: ignore 40 | if not isinstance(dropout, list): 41 | dropout = [dropout] * num_layers # type: ignore 42 | if len(output_dims) != num_layers: 43 | raise ValueError("len(output_dims) (%d) != num_layers (%d)" % 44 | (len(output_dims), num_layers)) 45 | if len(pool_sizes) != num_layers: 46 | raise ValueError("len(pool_sizes) (%d) != num_layers (%d)" % 47 | (len(pool_sizes), num_layers)) 48 | if len(dropout) != num_layers: 49 | raise ValueError("len(dropout) (%d) != num_layers (%d)" % 50 | (len(dropout), num_layers)) 51 | 52 | self._pool_sizes = pool_sizes 53 | input_dims = [input_dim] + output_dims[:-1] 54 | linear_layers = [] 55 | for layer_input_dim, layer_output_dim, pool_size in zip(input_dims, output_dims, pool_sizes): 56 | linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim * pool_size)) 57 | self._linear_layers = torch.nn.ModuleList(linear_layers) 58 | dropout_layers = [torch.nn.Dropout(p=value) for value in dropout] 59 | self._dropout = torch.nn.ModuleList(dropout_layers) 60 | self._output_dims = output_dims 61 | self._output_dim = output_dims[-1] 62 | self._input_dim = input_dim 63 | 64 | def get_output_dim(self): 65 | return self._output_dim 66 | 67 | def get_input_dim(self): 68 | return self._input_dim 69 | 70 | def forward(self, inputs: torch.Tensor) -> torch.Tensor: 71 | # pylint: disable=arguments-differ 72 | output = inputs 73 | for layer, layer_output_dim, dropout, pool_size in zip(self._linear_layers, self._output_dims, 74 | self._dropout, self._pool_sizes): 75 | affine_output = layer(output) 76 | # Compute and apply the proper shape for the max. 
77 | shape = list(inputs.size()) 78 | shape[-1] = layer_output_dim 79 | shape.append(pool_size) 80 | 81 | maxed_output = torch.max(affine_output.view(*shape), dim=-1)[0] 82 | dropped_output = dropout(maxed_output) 83 | output = dropped_output 84 | return output 85 | -------------------------------------------------------------------------------- /neuroir/modules/util_class.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/modules/util_class.py 2 | 3 | """ Misc classes """ 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class LayerNorm(nn.Module): 9 | """ 10 | Layer Normalization class 11 | """ 12 | 13 | def __init__(self, features, eps=1e-6): 14 | super(LayerNorm, self).__init__() 15 | self.a_2 = nn.Parameter(torch.ones(features)) 16 | self.b_2 = nn.Parameter(torch.zeros(features)) 17 | self.eps = eps 18 | 19 | def forward(self, x): 20 | mean = x.mean(-1, keepdim=True) 21 | std = x.std(-1, keepdim=True) 22 | return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 23 | 24 | 25 | # At the moment this class is only used by embeddings.Embeddings look-up tables 26 | class Elementwise(nn.ModuleList): 27 | """ 28 | A simple network container. 29 | Parameters are a list of modules. 30 | Inputs are a 3d Tensor whose last dimension is the same length 31 | as the list. 32 | Outputs are the result of applying modules to inputs elementwise. 33 | An optional merge parameter allows the outputs to be reduced to a 34 | single Tensor. 35 | """ 36 | 37 | def __init__(self, merge=None, *args): 38 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 39 | self.merge = merge 40 | super(Elementwise, self).__init__(*args) 41 | 42 | def forward(self, inputs): 43 | inputs_ = [feat.squeeze(2) for feat in inputs.split(1, dim=2)] 44 | assert len(self) == len(inputs_) 45 | outputs = [f(x) for f, x in zip(self, inputs_)] 46 | if self.merge == 'first': 47 | return outputs[0] 48 | elif self.merge == 'concat' or self.merge == 'mlp': 49 | return torch.cat(outputs, 2) 50 | elif self.merge == 'sum': 51 | return sum(outputs) 52 | else: 53 | return outputs 54 | -------------------------------------------------------------------------------- /neuroir/multitask/__init.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | -------------------------------------------------------------------------------- /neuroir/multitask/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from neuroir.inputters import BOS, PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | from neuroir.encoders.rnn_encoder import RNNEncoder 7 | from neuroir.decoders.rnn_decoder import RNNDecoder 8 | 9 | 10 | class Embedder(nn.Module): 11 | def __init__(self, 12 | emsize, 13 | src_vocab_size, 14 | dropout_emb): 15 | super(Embedder, self).__init__() 16 | 17 | self.word_embeddings = Embeddings(emsize, 18 | src_vocab_size, 19 | PAD) 20 | self.output_size = emsize 21 | self.dropout = nn.Dropout(dropout_emb) 22 | 23 | def forward(self, 24 | sequence): 25 | word_rep = self.word_embeddings(sequence.unsqueeze(2)) # B x P x d 26 | word_rep = self.dropout(word_rep) 27 | return word_rep 28 | 29 | 30 | class Encoder(nn.Module): 31 | def __init__(self, 32 | rnn_type, 33 | input_size, 34 | bidirection, 35 | nlayers, 36 | nhid, 37 | dropout_rnn): 38 | super(Encoder, self).__init__() 39 | 40 | self.encoder = 
RNNEncoder(rnn_type, 41 | input_size, 42 | bidirection, 43 | nlayers, 44 | nhid, 45 | dropout_rnn) 46 | 47 | def forward(self, 48 | input, 49 | input_len, 50 | init_states=None): 51 | hidden, M = self.encoder(input, 52 | input_len, 53 | init_states) # B x Seq-len x h 54 | return hidden, M 55 | 56 | 57 | class Decoder(nn.Module): 58 | def __init__(self, 59 | rnn_type, 60 | input_size, 61 | bidirection, 62 | nlayers, 63 | nhid, 64 | attn_type, 65 | dropout_rnn, 66 | copy_attn, 67 | reuse_copy_attn): 68 | super(Decoder, self).__init__() 69 | 70 | attn_type = None if attn_type == 'none' else attn_type 71 | self.decoder = RNNDecoder(rnn_type, 72 | input_size, 73 | bidirection, 74 | nlayers, 75 | nhid, 76 | attn_type=attn_type, 77 | dropout=dropout_rnn, 78 | copy_attn=copy_attn, 79 | reuse_copy_attn=reuse_copy_attn) 80 | 81 | def init_decoder(self, hidden): 82 | return self.decoder.init_decoder_state(hidden) 83 | 84 | def forward(self, 85 | tgt, 86 | memory_bank, 87 | memory_len, 88 | state): 89 | decoder_outputs, _, attns = self.decoder(tgt, 90 | memory_bank, 91 | state, 92 | memory_lengths=memory_len) 93 | return decoder_outputs, attns 94 | -------------------------------------------------------------------------------- /neuroir/objects/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .document import * 4 | from .query import * 5 | from .session import * 6 | -------------------------------------------------------------------------------- /neuroir/objects/document.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | 4 | class Document(object): 5 | """Document containing annotated text, original text, selection label and 6 | all the extractive spans that can be an answer for the associated question. 
7 | """ 8 | 9 | def __init__(self, _id=None): 10 | self._id = _id 11 | self._url = None 12 | self._url_tokens = [] 13 | self._title = None 14 | self._title_tokens = [] 15 | self._content = None 16 | self._content_tokens = [] 17 | self._tokens = [] 18 | self._label = 0 # whether document is clicked 19 | 20 | @property 21 | def id(self) -> str: 22 | return self._id 23 | 24 | @property 25 | def url(self) -> str: 26 | return self._url 27 | 28 | @url.setter 29 | def url(self, param: str) -> None: 30 | self._url = param 31 | 32 | @property 33 | def url_tokens(self) -> list: 34 | return self._url_tokens 35 | 36 | @url_tokens.setter 37 | def url_tokens(self, param: list) -> None: 38 | if not isinstance(param, list): 39 | raise TypeError('Document->url.tokens must be a list') 40 | self._url_tokens = param 41 | 42 | @property 43 | def title(self) -> str: 44 | return self._title 45 | 46 | @title.setter 47 | def title(self, param: str) -> None: 48 | self._title = param 49 | 50 | @property 51 | def title_tokens(self) -> list: 52 | return self._url_tokens 53 | 54 | @title_tokens.setter 55 | def title_tokens(self, param: list) -> None: 56 | if not isinstance(param, list): 57 | raise TypeError('Document->title.tokens must be a list') 58 | self._title_tokens = param 59 | 60 | @property 61 | def content(self) -> str: 62 | return self._content 63 | 64 | @content.setter 65 | def content(self, param: str) -> None: 66 | self._content = param 67 | 68 | @property 69 | def content_tokens(self) -> list: 70 | return self._content_tokens 71 | 72 | @content_tokens.setter 73 | def content_tokens(self, param: list) -> None: 74 | if not isinstance(param, list): 75 | raise TypeError('Document->content.tokens must be a list') 76 | self._content_tokens = param 77 | 78 | @property 79 | def tokens(self) -> list: 80 | return self._tokens 81 | 82 | @tokens.setter 83 | def tokens(self, param: list) -> None: 84 | if not isinstance(param, list): 85 | raise TypeError('Document.tokens must be a list') 86 | self._tokens = param 87 | 88 | @property 89 | def label(self) -> int: 90 | return self._label 91 | 92 | @label.setter 93 | def label(self, param: int) -> None: 94 | self._label = param 95 | 96 | def __len__(self): 97 | return len(self.tokens) 98 | 99 | def vectorize(self, word_dict, _type='word') -> list: 100 | if _type == 'word': 101 | return [word_dict[w] for w in self.tokens] 102 | elif _type == 'char': 103 | return [word_dict.word_to_char_ids(w).tolist() for w in self.tokens] 104 | else: 105 | assert False 106 | -------------------------------------------------------------------------------- /neuroir/objects/query.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from neuroir.inputters import BOS_WORD, EOS_WORD, Vocabulary 4 | 5 | 6 | class Query(object): 7 | """Document containing annotated text, original text, selection label and 8 | all the extractive spans that can be an answer for the associated question. 
9 | """ 10 | 11 | def __init__(self, _id=None): 12 | self._id = _id 13 | self._text = None 14 | self._tokens = [] 15 | self._documents = [] 16 | self._src_vocab = None # required for Copy Attention 17 | 18 | @property 19 | def id(self) -> str: 20 | return self._id 21 | 22 | @property 23 | def text(self) -> str: 24 | return self._text 25 | 26 | @text.setter 27 | def text(self, param: str) -> None: 28 | self._text = param 29 | 30 | @property 31 | def tokens(self) -> list: 32 | return self._tokens 33 | 34 | @tokens.setter 35 | def tokens(self, param: list) -> None: 36 | if not isinstance(param, list): 37 | raise TypeError('Query.tokens must be a list') 38 | self._tokens = param 39 | 40 | @property 41 | def documents(self) -> list: 42 | return self._documents 43 | 44 | @documents.setter 45 | def documents(self, param: list) -> None: 46 | if not isinstance(param, list): 47 | raise TypeError('Query.documents must be a list') 48 | self._documents = param 49 | 50 | @property 51 | def src_vocab(self) -> list: 52 | if self._src_vocab is None: 53 | self.form_src_vocab() 54 | return self._src_vocab 55 | 56 | def form_src_vocab(self) -> None: 57 | self._src_vocab = Vocabulary() 58 | self._src_vocab.add_tokens(self.tokens) 59 | 60 | def vectorize(self, word_dict, _type='word'): 61 | if _type == 'word': 62 | return [word_dict[w] for w in self.tokens] 63 | elif _type == 'char': 64 | return [word_dict.word_to_char_ids(w).tolist() for w in self.tokens] 65 | else: 66 | assert False 67 | 68 | def __len__(self): 69 | return len(self.tokens) 70 | -------------------------------------------------------------------------------- /neuroir/objects/session.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .query import Query 4 | from neuroir.inputters import BOS_WORD, EOS_WORD 5 | 6 | 7 | class Session(object): 8 | """Session containing a list of Objects:Query.""" 9 | 10 | def __init__(self, _id=None): 11 | self._id = _id 12 | self._queries = [] 13 | 14 | @property 15 | def id(self) -> str: 16 | return self._id 17 | 18 | @property 19 | def queries(self) -> list: 20 | return self._queries 21 | 22 | @queries.setter 23 | def queries(self, param: list) -> None: 24 | if not isinstance(param, list): 25 | raise TypeError('Session.queries must be a list') 26 | self._queries = param 27 | 28 | def add_query(self, query: Query) -> None: 29 | self._queries.append(query) 30 | 31 | def add_one_query(self, list_of_query: list) -> None: 32 | query_text = ' '.join([query.text for query in list_of_query]) 33 | query_tokens = [query.tokens[1:-1] for query in list_of_query] 34 | query_tokens = sum(query_tokens, []) 35 | 36 | qid = list_of_query[-1].id 37 | qObj = Query(qid) 38 | qObj.text = query_text 39 | query_tokens = [BOS_WORD] + query_tokens + [EOS_WORD] 40 | qObj.tokens = query_tokens 41 | qObj.form_src_vocab() 42 | self.add_query(qObj) 43 | 44 | def __len__(self): 45 | return len(self.queries) 46 | -------------------------------------------------------------------------------- /neuroir/rankers/__init.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | -------------------------------------------------------------------------------- /neuroir/rankers/arci.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from neuroir.inputters import PAD 4 | from neuroir.modules.embeddings import Embeddings 5 | 6 | 7 | class 
ARCI(nn.Module): 8 | """Implementation of the convolutional matching model (ARC-I).""" 9 | 10 | def __init__(self, args): 11 | """"Constructor of the class.""" 12 | super(ARCI, self).__init__() 13 | 14 | self.word_embeddings = Embeddings(args.emsize, 15 | args.src_vocab_size, 16 | PAD) 17 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 18 | 19 | num_conv1d_layers = len(args.filters_1d) 20 | assert num_conv1d_layers == len(args.kernel_size_1d) 21 | assert num_conv1d_layers == len(args.maxpool_size_1d) 22 | 23 | query_feats = args.max_query_len 24 | doc_feats = args.max_doc_len 25 | 26 | query_conv1d_layers = [] 27 | doc_conv1d_layers = [] 28 | for i in range(num_conv1d_layers): 29 | inpsize = args.emsize if i == 0 else args.filters_1d[i - 1] 30 | pad = args.kernel_size_1d[i] // 2 31 | layer = nn.Sequential( 32 | nn.Conv1d(inpsize, args.filters_1d[i], args.kernel_size_1d[i], 33 | padding=pad), 34 | nn.ReLU(inplace=True), 35 | nn.MaxPool1d(args.maxpool_size_1d[i]) 36 | ) 37 | query_conv1d_layers.append(layer) 38 | layer = nn.Sequential( 39 | nn.Conv1d(inpsize, args.filters_1d[i], args.kernel_size_1d[i], 40 | padding=pad), 41 | nn.ReLU(inplace=True), 42 | nn.MaxPool1d(args.maxpool_size_1d[i]) 43 | ) 44 | doc_conv1d_layers.append(layer) 45 | 46 | doc_feats = doc_feats // args.maxpool_size_1d[i] 47 | query_feats = query_feats // args.maxpool_size_1d[i] 48 | assert query_feats != 0 and doc_feats != 0 49 | 50 | self.query_conv1d_layers = nn.ModuleList(query_conv1d_layers) 51 | self.doc_conv1d_layers = nn.ModuleList(doc_conv1d_layers) 52 | 53 | inpsize = (args.filters_1d[-1] * query_feats) + \ 54 | (args.filters_1d[-1] * doc_feats) 55 | self.mlp = nn.Sequential( 56 | nn.Linear(inpsize, inpsize // 2), 57 | nn.Linear(inpsize // 2, 1) 58 | ) 59 | 60 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 61 | """ 62 | Forward function of the match tensor model. Return average loss for a batch of sessions. 63 | :param batch_queries: 2d tensor [batch_size x max_query_length] 64 | :param query_len: 1d numpy array [batch_size] 65 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 66 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 67 | :return: click probabilities [batch_size x num_rel_docs_per_query] 68 | """ 69 | assert batch_queries.shape[0] == batch_docs.shape[0] 70 | batch_size = batch_queries.shape[0] 71 | qlen = batch_queries.shape[1] 72 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 73 | 74 | # embed query 75 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 76 | # batch_size x max_q_len x emsize 77 | embedded_queries = self.emb_drop(embedded_queries) 78 | 79 | inp_rep = embedded_queries.transpose(1, 2) 80 | for layer in self.query_conv1d_layers: 81 | inp_rep = layer(inp_rep) 82 | # batch_size x ? 83 | conv_queries = inp_rep.flatten(1) 84 | 85 | # batch_size x num_rel_docs x ? 86 | conv_queries = conv_queries.unsqueeze(1).expand( 87 | batch_size, num_docs, conv_queries.size(1)) 88 | # batch_size * num_rel_docs x ? 
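        # Editorial note (not in the original source): the query feature vector was just
        # repeated once per candidate document; the view below flattens it to
        # (batch_size * num_docs, -1) so it can be concatenated row-wise with the
        # per-document convolutional features before the final MLP scoring layer.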
89 | conv_queries = conv_queries.contiguous().view(batch_size * num_docs, -1) 90 | 91 | # embed documents 92 | doc_rep = batch_docs.view(batch_size * num_docs, dlen) 93 | embedded_docs = self.word_embeddings(doc_rep.unsqueeze(2)) 94 | # batch_size * num_rel_docs x max_doc_len x emsize 95 | embedded_docs = self.emb_drop(embedded_docs) 96 | 97 | inp_rep = embedded_docs.transpose(1, 2) 98 | for layer in self.doc_conv1d_layers: 99 | inp_rep = layer(inp_rep) 100 | # batch_size * num_rel_docs x ? 101 | conv_docs = inp_rep.flatten(1) 102 | 103 | com_rep = torch.cat((conv_queries, conv_docs), 1) 104 | score = self.mlp(com_rep).squeeze(1) 105 | return score.view(batch_size, num_docs) 106 | -------------------------------------------------------------------------------- /neuroir/rankers/arcii.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from neuroir.inputters import PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | 7 | 8 | class ARCII(nn.Module): 9 | """Implementation of the convolutional matching model (ARC-II).""" 10 | 11 | def __init__(self, args): 12 | """"Constructor of the class.""" 13 | super(ARCII, self).__init__() 14 | 15 | self.word_embeddings = Embeddings(args.emsize, 16 | args.src_vocab_size, 17 | PAD) 18 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 19 | self.conv_query = nn.Conv1d(args.emsize, 20 | args.filters_1d, 21 | args.kernel_size_1d, 22 | padding=args.kernel_size_1d // 2) 23 | self.conv_doc = nn.Conv1d(args.emsize, 24 | args.filters_1d, 25 | args.kernel_size_1d, 26 | padding=args.kernel_size_1d // 2) 27 | self.maxpool1 = nn.MaxPool2d((2, 2)) 28 | 29 | num_conv2d_layers = len(args.kernel_size_2d) 30 | assert num_conv2d_layers == len(args.maxpool_size_2d) 31 | 32 | doc_feats = args.max_doc_len // 2 33 | query_feats = args.max_query_len // 2 34 | 35 | conv2d_layers = [] 36 | for i in range(num_conv2d_layers): 37 | inpsize = args.filters_1d if i == 0 else args.filters_2d[i - 1] 38 | layer = nn.Sequential( 39 | nn.Conv2d(inpsize, args.filters_2d[i], args.kernel_size_2d[i], 40 | padding=(args.kernel_size_2d[i][0] // 2, args.kernel_size_2d[i][1] // 2)), 41 | nn.ReLU(inplace=True), 42 | nn.MaxPool2d((args.maxpool_size_2d[i][0], args.maxpool_size_2d[i][1])) 43 | ) 44 | conv2d_layers.append(layer) 45 | 46 | doc_feats = doc_feats // args.maxpool_size_2d[i][0] 47 | query_feats = query_feats // args.maxpool_size_2d[i][1] 48 | assert query_feats != 0 and doc_feats != 0 49 | 50 | self.conv2d_layers = nn.ModuleList(conv2d_layers) 51 | 52 | inpsize = args.filters_2d[-1] * query_feats * doc_feats 53 | self.mlp = nn.Sequential( 54 | nn.Linear(inpsize, inpsize // 2), 55 | nn.Linear(inpsize // 2, 1) 56 | ) 57 | 58 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 59 | """ 60 | Forward function of the match tensor model. Return average loss for a batch of sessions. 
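        Illustrative usage sketch (editorial; the variable names and tensor shapes below are
        hypothetical, not taken from the original source)::

            >>> scores = model(batch_queries, query_len, batch_docs, doc_len)
            >>> # batch_queries: [32, 10], batch_docs: [32, 5, 50]  ->  scores: [32, 5]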
61 | :param batch_queries: 2d tensor [batch_size x max_query_length] 62 | :param query_len: 1d numpy array [batch_size] 63 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 64 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 65 | :return: score representing click probability [batch_size x num_clicks_per_query] 66 | """ 67 | assert batch_queries.shape[0] == batch_docs.shape[0] 68 | batch_size = batch_queries.shape[0] 69 | qlen = batch_queries.shape[1] 70 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 71 | 72 | # embed query 73 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 74 | # batch_size x max_q_len x emsize 75 | embedded_queries = self.emb_drop(embedded_queries) 76 | # batch_size x nfilters x max_q_len 77 | embedded_queries = self.conv_query(embedded_queries.transpose(1, 2)) 78 | 79 | # batch_size x num_rel_docs x nfilters x max_q_len 80 | embedded_queries = embedded_queries.unsqueeze(1).expand(batch_size, num_docs, 81 | embedded_queries.size(1), 82 | embedded_queries.size(2)) 83 | # batch_size * num_rel_docs x nfilters x max_q_len 84 | embedded_queries = embedded_queries.contiguous().view(batch_size * num_docs, 85 | embedded_queries.size(2), 86 | embedded_queries.size(3)) 87 | 88 | # embed documents 89 | doc_rep = batch_docs.view(batch_size * num_docs, dlen) 90 | embedded_docs = self.word_embeddings(doc_rep.unsqueeze(2)) 91 | # batch_size * num_rel_docs x max_doc_len x emsize 92 | embedded_docs = self.emb_drop(embedded_docs) 93 | # batch_size * num_rel_docs x nfilters x max_doc_len 94 | embedded_docs = self.conv_doc(embedded_docs.transpose(1, 2)) 95 | 96 | # batch_size * num_rel_docs x nfilters x max_doc_len x max_q_len 97 | embedded_queries = torch.stack([embedded_queries] * dlen, dim=2) 98 | # batch_size * num_rel_docs x nfilters x max_doc_len x max_q_len 99 | embedded_docs = torch.stack([embedded_docs] * qlen, dim=3) 100 | 101 | # batch_size * num_rel_docs x nfilters x max_doc_len x max_q_len 102 | comb_rep = embedded_queries + embedded_docs 103 | # batch_size * num_rel_docs x nfilters x max_doc_len/2 x max_q_len/2 104 | comb_rep = self.maxpool1(comb_rep) 105 | 106 | for layer in self.conv2d_layers: 107 | comb_rep = layer(comb_rep) 108 | comb_rep = comb_rep.flatten(1) 109 | 110 | score = self.mlp(comb_rep).squeeze(1) 111 | return score.view(batch_size, num_docs) 112 | -------------------------------------------------------------------------------- /neuroir/rankers/cdssm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | from neuroir.inputters import PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | 7 | 8 | class CDSSM(nn.Module): 9 | """Implementation of the convolutional deep semantic similarity model.""" 10 | 11 | def __init__(self, args): 12 | """"Constructor of the class.""" 13 | super(CDSSM, self).__init__() 14 | 15 | self.window = 3 16 | self.word_embeddings = Embeddings(args.emsize, 17 | args.src_vocab_size, 18 | PAD) 19 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 20 | 21 | K = self.window * args.emsize 22 | L = args.nhid 23 | KERNEL_SIZE = 3 24 | O = args.nout 25 | 26 | self.query_conv = nn.Conv1d(K, L, KERNEL_SIZE) 27 | self.query_sem = nn.Linear(L, O) 28 | 29 | self.doc_conv = nn.Conv1d(K, L, KERNEL_SIZE) 30 | self.doc_sem = nn.Linear(L, O) 31 | 32 | def _interleave_tensor(self, inp): 33 | dim = inp.shape[1] 34 | assert dim >= self.window 35 | 
constituents = [] 36 | offset = dim - self.window + 1 37 | for i in range(self.window): 38 | constituents.append(inp[:, i:offset + i, :]) 39 | out = torch.cat(constituents, dim=-1) 40 | return out 41 | 42 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 43 | """ 44 | Forward function of the dssm model. Return average loss for a batch of queries. 45 | :param batch_queries: 2d tensor [batch_size x max_query_length] 46 | :param query_len: 1d numpy array [batch_size] 47 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 48 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 49 | :return: softmax score representing click probability [batch_size x num_rel_docs_per_query] 50 | """ 51 | assert batch_queries.shape[0] == batch_docs.shape[0] 52 | batch_size = batch_queries.shape[0] 53 | qlen = batch_queries.shape[1] 54 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 55 | 56 | # query encoding 57 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 58 | embedded_queries = self.emb_drop(embedded_queries) # b,s,h 59 | embedded_queries = self._interleave_tensor(embedded_queries) # b,s-2,3h 60 | query_rep = self.query_conv(embedded_queries.transpose(1, 2)).transpose(1, 2) 61 | query_rep = f.tanh(self.query_sem(f.tanh(query_rep))) 62 | latent_query_rep = query_rep.max(1)[0] # max-pooling 63 | 64 | # document encoding 65 | doc_rep = batch_docs.view(batch_size * num_docs, dlen).unsqueeze(2) 66 | embedded_docs = self.word_embeddings(doc_rep) 67 | embedded_docs = self.emb_drop(embedded_docs) # b,s,h 68 | embedded_docs = self._interleave_tensor(embedded_docs) # b,s-2,3h 69 | doc_rep = self.doc_conv(embedded_docs.transpose(1, 2)).transpose(1, 2) 70 | doc_rep = f.tanh(self.doc_sem(f.tanh(doc_rep))) 71 | latent_doc_rep = doc_rep.max(1)[0] # max-pooling 72 | latent_doc_rep = latent_doc_rep.view(batch_size, num_docs, -1) 73 | 74 | # compute loss 75 | latent_query_rep = latent_query_rep.unsqueeze(1).expand(*latent_doc_rep.size()) 76 | scores = f.cosine_similarity(latent_query_rep, latent_doc_rep, dim=2) 77 | return scores 78 | -------------------------------------------------------------------------------- /neuroir/rankers/drmm.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as f 5 | 6 | from neuroir.inputters import PAD 7 | from neuroir.modules.embeddings import Embeddings 8 | 9 | 10 | class DRMM(nn.Module): 11 | """Implementation of the deep relevance matching model.""" 12 | 13 | def __init__(self, args): 14 | """"Constructor of the class.""" 15 | super(DRMM, self).__init__() 16 | 17 | self.word_embeddings = Embeddings(args.emsize, 18 | args.src_vocab_size, 19 | PAD) 20 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 21 | 22 | self.nbins = args.nbins 23 | self.bins = [-1.0, -0.5, 0, 0.5, 1.0, 1.0] 24 | 25 | self.gating_network = GatingNetwork(args.emsize) 26 | self.ffnn = nn.Sequential(nn.Linear(self.nbins, 1), nn.Linear(1, 1)) 27 | self.output = nn.Linear(1, 1) 28 | 29 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 30 | """ 31 | Forward function of the match tensor model. Return average loss for a batch of sessions. 
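        Editorial note: for every (query term, candidate document) pair, the cosine
        similarities against all document terms are bucketed into ``len(self.bins) - 1``
        histogram bins via ``numpy.histogram``, and these histogram features are what the
        feed-forward scoring layers consume. A small, self-contained illustration with
        made-up similarity values::

            >>> import numpy
            >>> numpy.histogram([0.1, 0.9, 1.0, -0.7], bins=[-1.0, -0.5, 0, 0.5, 1.0, 1.0])[0]
            array([1, 0, 1, 1, 1])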
32 | :param batch_queries: 2d tensor [batch_size x max_query_length] 33 | :param query_len: 1d numpy array [batch_size] 34 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 35 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 36 | :return: score representing click probability [batch_size x num_clicks_per_query] 37 | """ 38 | assert batch_queries.shape[0] == batch_docs.shape[0] 39 | batch_size = batch_queries.shape[0] 40 | qlen = batch_queries.shape[1] 41 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 42 | use_cuda = batch_queries.is_cuda 43 | 44 | # embed query 45 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 46 | # batch_size x max_query_len x emsize 47 | embedded_queries = self.emb_drop(embedded_queries) 48 | 49 | # batch_size x num_rel_docs x max_q_len 50 | term_weights = self.gating_network(embedded_queries).unsqueeze(1).expand( 51 | batch_size, num_docs, qlen) 52 | 53 | # embed documents 54 | doc_rep = batch_docs.view(batch_size * num_docs, dlen) 55 | embedded_docs = self.word_embeddings(doc_rep.unsqueeze(2)) 56 | # batch_size * num_rel_docs x max_doc_len x emsize 57 | embedded_docs = self.emb_drop(embedded_docs) 58 | 59 | # batch_size x num_rel_docs x max_query_len x emsize 60 | embedded_queries = torch.stack([embedded_queries] * num_docs, dim=1) 61 | # batch_size * num_rel_docs x max_query_len x emsize 62 | embedded_queries = embedded_queries.contiguous().view(batch_size * num_docs, qlen, -1) 63 | 64 | # batch_size * num_rel_docs x max_query_len x max_doc_len x emsize 65 | embedded_queries = torch.stack([embedded_queries] * dlen, dim=2) 66 | # batch_size * num_rel_docs x max_query_len x max_doc_len x emsize 67 | embedded_docs = torch.stack([embedded_docs] * qlen, dim=1) 68 | 69 | cos_sim = f.cosine_similarity(embedded_queries, embedded_docs, 3) 70 | 71 | hist = numpy.apply_along_axis( 72 | lambda x: numpy.histogram(x, bins=self.bins), 2, cos_sim.detach().cpu().numpy()) 73 | histogram_feats = torch.from_numpy( 74 | numpy.array([[axis2 for axis2 in axis1] for axis1 in hist[:, :, 0]]) 75 | ).float() 76 | 77 | if use_cuda: 78 | histogram_feats = histogram_feats.cuda() 79 | 80 | ffnn_out = self.ffnn(histogram_feats).squeeze(2) 81 | ffnn_out = ffnn_out.view(batch_size, num_docs, -1).contiguous() 82 | weighted_ffnn_out = ffnn_out * term_weights 83 | score = self.output(torch.sum(weighted_ffnn_out, 2, keepdim=True)).squeeze(1) 84 | return score.view(batch_size, num_docs) 85 | 86 | 87 | class GatingNetwork(nn.Module): 88 | """Term gating network""" 89 | 90 | def __init__(self, emsize): 91 | """"Constructor of the class""" 92 | super(GatingNetwork, self).__init__() 93 | self.weight = nn.Linear(emsize, 1) 94 | 95 | def forward(self, term_embeddings): 96 | """"Defines the forward computation of the gating network layer.""" 97 | dot_out = self.weight(term_embeddings).squeeze(2) 98 | return f.softmax(dot_out, 1) 99 | -------------------------------------------------------------------------------- /neuroir/rankers/dssm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | from neuroir.inputters import PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | 7 | 8 | class DSSM(nn.Module): 9 | """Implementation of the deep semantic similarity model.""" 10 | 11 | def __init__(self, args): 12 | """"Constructor of the class.""" 13 | super(DSSM, self).__init__() 14 | 15 | 
self.word_embeddings = Embeddings(args.emsize, 16 | args.src_vocab_size, 17 | PAD) 18 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 19 | 20 | self.query_mlp = nn.Sequential( 21 | nn.Linear(args.emsize, args.nhid), 22 | nn.Tanh(), 23 | nn.Linear(args.nhid, args.nout), 24 | nn.Tanh() 25 | ) 26 | self.doc_mlp = nn.Sequential( 27 | nn.Linear(args.emsize, args.nhid), 28 | nn.Tanh(), 29 | nn.Linear(args.nhid, args.nout), 30 | nn.Tanh() 31 | ) 32 | 33 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 34 | """ 35 | Forward function of the dssm model. Return average loss for a batch of queries. 36 | :param batch_queries: 2d tensor [batch_size x max_query_length] 37 | :param query_len: 1d numpy array [batch_size] 38 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 39 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 40 | :return: softmax score representing click probability [batch_size x num_rel_docs_per_query] 41 | """ 42 | assert batch_queries.shape[0] == batch_docs.shape[0] 43 | batch_size = batch_queries.shape[0] 44 | qlen = batch_queries.shape[1] 45 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 46 | 47 | # embed query 48 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 49 | embedded_queries = self.emb_drop(embedded_queries) 50 | embedded_queries = embedded_queries.max(1)[0] # max-pooling 51 | 52 | # embed document 53 | doc_rep = batch_docs.view(batch_size * num_docs, dlen).unsqueeze(2) 54 | embedded_docs = self.word_embeddings(doc_rep) 55 | embedded_docs = self.emb_drop(embedded_docs) 56 | embedded_docs = embedded_docs.max(1)[0] # max-pooling 57 | embedded_docs = embedded_docs.view(batch_size, num_docs, -1) 58 | 59 | query_rep = self.query_mlp(embedded_queries) 60 | doc_rep = self.doc_mlp(embedded_docs) 61 | query_rep = query_rep.unsqueeze(1).expand(*doc_rep.size()) 62 | scores = f.cosine_similarity(query_rep, doc_rep, dim=2) 63 | return scores 64 | -------------------------------------------------------------------------------- /neuroir/rankers/duet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | from neuroir.inputters import PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | 7 | 8 | # verified from https://github.com/bmitra-msft/NDRM/blob/master/notebooks/Duet.ipynb 9 | class DUET(nn.Module): 10 | """Learning to Match using Local and Distributed Representations of Text for Web Search.""" 11 | 12 | def __init__(self, args): 13 | """"Constructor of the class.""" 14 | super(DUET, self).__init__() 15 | 16 | self.use_word = args.use_word 17 | if self.use_word: 18 | self.word_embeddings = Embeddings(args.emsize, 19 | args.src_vocab_size, 20 | PAD) 21 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 22 | else: 23 | raise TypeError('Non-word inputs are not supported!') 24 | 25 | self.local_model = LocalModel(args) 26 | self.distributed_model = DistributedModel(args) 27 | 28 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 29 | """ 30 | Forward function of the dssm model. Return average loss for a batch of queries. 
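        Editorial note: the returned score is the sum of the two sub-model scores, roughly
        ``total_score = local_model(queries, docs) + distributed_model(emb_q, emb_d)``
        (an illustrative pseudo-call; the placeholder names are not from the original source
        and the actual invocations appear in the method body below).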
31 | :param batch_queries: 2d tensor [batch_size x max_query_length] 32 | :param query_len: 1d numpy array [batch_size] 33 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 34 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 35 | :return: softmax score representing click probability [batch_size x num_rel_docs_per_query] 36 | """ 37 | assert batch_queries.shape[0] == batch_docs.shape[0] 38 | batch_size = batch_queries.shape[0] 39 | qlen = batch_queries.shape[1] 40 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 41 | 42 | local_score = self.local_model(batch_queries, batch_docs) 43 | # ----------Embed the questions and paragraphs from word---------- # 44 | if self.use_word: 45 | # batch_size x max_query_len x emsize 46 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 47 | embedded_queries = self.emb_drop(embedded_queries) 48 | # batch_size x num_rel_docs x max_doc_len x emsize 49 | doc_rep = batch_docs.view(batch_size * num_docs, dlen).unsqueeze(2) 50 | embedded_docs = self.word_embeddings(doc_rep) 51 | embedded_docs = self.emb_drop(embedded_docs) 52 | embedded_docs = embedded_docs.view(batch_size, num_docs, dlen, -1) 53 | else: 54 | embedded_queries = batch_queries 55 | embedded_docs = batch_docs 56 | # --------------------------------------------------------------- # 57 | distributed_score = self.distributed_model(embedded_queries, embedded_docs) 58 | total_score = local_score + distributed_score 59 | return total_score 60 | 61 | 62 | class LocalModel(nn.Module): 63 | """Implementation of the local model.""" 64 | 65 | def __init__(self, args): 66 | """"Constructor of the class.""" 67 | super(LocalModel, self).__init__() 68 | 69 | self.conv1d = nn.Conv1d(args.max_doc_len, 70 | args.nfilters, 71 | args.local_filter_size) 72 | self.drop = nn.Dropout(args.dropout) 73 | self.fc1 = nn.Linear(args.max_query_len, 1) 74 | self.fc2 = nn.Linear(args.nfilters, args.nfilters) 75 | self.fc3 = nn.Linear(args.nfilters, 1) 76 | 77 | def forward(self, batch_queries, batch_clicks): 78 | """ 79 | Forward function of the local model. 
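        Editorial note: the local model scores a document from a binary term-overlap matrix
        whose entry (i, j) is 1 when document token i and query token j share the same
        vocabulary id. An equivalent way to build it for a single query/document pair
        (illustrative only; the method body below uses a subtraction-based construction)::

            >>> import torch
            >>> q = torch.tensor([5, 7]); d = torch.tensor([5, 9, 7])
            >>> (d.unsqueeze(1) == q.unsqueeze(0)).long()
            tensor([[1, 0],
                    [0, 0],
                    [0, 1]])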
80 | Parameters 81 | -------------------- 82 | batch_queries -- 2d tensor (batch_size, max_q_len) 83 | batch_clicks -- 3d tensor (batch_size, num_rel_docs, max_doc_len) 84 | Returns 85 | -------------------- 86 | score -- 2d tensor (batch_size, num_rel_docs) local relevance score 87 | """ 88 | batch_size, num_candidates = batch_clicks.size(0), batch_clicks.size(1) 89 | max_query_len = batch_queries.size(1) 90 | max_doc_len = batch_clicks.size(2) 91 | 92 | # batch_size x num_rel_docs x max_q_len 93 | extended_queries = batch_queries.unsqueeze(1).expand(batch_size, 94 | num_candidates, 95 | max_query_len) 96 | # batch_size x num_rel_docs x max_doc_len x max_q_len 97 | query_rep = extended_queries.unsqueeze(2).expand(batch_size, 98 | num_candidates, 99 | max_doc_len, 100 | max_query_len) 101 | 102 | # batch_size x num_rel_docs x max_doc_len x max_q_len 103 | doc_rep = batch_clicks.unsqueeze(3).expand(batch_size, 104 | num_candidates, 105 | max_doc_len, 106 | max_query_len) 107 | 108 | # ----------Create binary matrix based on unigram overlapping---------- # 109 | diff_matrix = doc_rep - query_rep 110 | bin_matrix = diff_matrix.clone() 111 | bin_matrix[diff_matrix == 0] = 1 112 | bin_matrix[diff_matrix != 0] = 0 113 | # (batch_size * num_rel_docs) x max_doc_len x max_q_len 114 | bin_matrix = bin_matrix.view(-1, max_doc_len, max_query_len).contiguous() 115 | # (batch_size * num_rel_docs) x nfilters x max_q_len 116 | conv_unigram = f.tanh(self.conv1d(bin_matrix.float())) 117 | 118 | mapped_feature1 = f.tanh(self.fc1(conv_unigram)).squeeze(2) 119 | mapped_feature2 = self.drop(f.tanh(self.fc2(mapped_feature1))) 120 | score = f.tanh(self.fc3(mapped_feature2)).view(batch_size, num_candidates) 121 | return score 122 | 123 | 124 | class DistributedModel(nn.Module): 125 | """Implementation of the distributed model.""" 126 | 127 | def __init__(self, args): 128 | """"Constructor of the class.""" 129 | super(DistributedModel, self).__init__() 130 | 131 | self.conv_q = nn.Conv1d(args.emsize, 132 | args.nfilters, 133 | args.dist_filter_size) 134 | self.conv_d1 = nn.Conv1d(args.emsize, 135 | args.nfilters, 136 | args.dist_filter_size) 137 | self.conv_d2 = nn.Conv1d(args.nfilters, 138 | args.nfilters, 139 | 1) 140 | 141 | self.pool_size = args.pool_size 142 | self.dropout = nn.Dropout(args.dropout) 143 | self.fc1 = nn.Linear(args.nfilters, args.nfilters) 144 | self.fc2 = nn.Linear(args.max_doc_len - args.pool_size - 1, 1) 145 | self.fc3 = nn.Linear(args.nfilters, args.nfilters) 146 | self.fc4 = nn.Linear(args.nfilters, 1) 147 | 148 | def forward(self, embedded_q, embedded_d): 149 | """ 150 | Forward function of neural ranker. 
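        Editorial note: query and document embeddings are passed through 1-D convolutions
        and max-pooling, the pooled query vector is projected and combined with the document
        feature map via an element-wise (Hadamard) product, and fully connected layers reduce
        the result to one relevance score per candidate document.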
151 | Parameters 152 | -------------------- 153 | embedded_q -- 3d tensor (batch_size, max_q_len, emsize) 154 | embedded_d -- 4d tensor (batch_size, num_rel_docs, max_doc_len, emsize) 155 | Returns 156 | -------------------- 157 | score -- 2d tensor (batch_size, num_rel_docs) distributed relevance score 158 | """ 159 | batch_size = embedded_d.size(0) 160 | num_candidates = embedded_d.size(1) 161 | max_doc_len = embedded_d.size(2) 162 | # (batch_size * num_rel_docs) x max_doc_len x emsize 163 | embedded_d = embedded_d.view(batch_size * num_candidates, max_doc_len, -1) 164 | 165 | # ----------Apply convolution on question and paragraph embeddings---------- # 166 | # batch_size x num_filters x (max_q_len - filter_size + 1) 167 | conv_q = f.tanh(self.conv_q(embedded_q.transpose(1, 2))) 168 | # (batch_size * num_rel_docs) x num_filters x (max_doc_len - filter_size + 1) 169 | conv_p = f.tanh(self.conv_d1(embedded_d.transpose(1, 2))) 170 | 171 | # ----------Apply max-pooling on convolved question and document features---------- # 172 | # batch_size x num_filters 173 | max_pooled_q = f.max_pool1d(conv_q, conv_q.size(-1)).squeeze(2) 174 | # (batch_size * num_rel_docs) x num_filters x (max_doc_len - filter_size - pool_size + 2) 175 | max_pooled_d = f.max_pool1d(conv_p, self.pool_size, 1) 176 | 177 | # ----------Apply LT on query and convolution on paragraph representation---------- # 178 | # batch_size x num_filters 179 | query_rep = f.tanh(self.fc1(max_pooled_q)) 180 | # (batch_size * num_rel_docs) x num_filters x (max_doc_len - filter_size - pool_size + 2) 181 | doc_rep = f.tanh(self.conv_d2(max_pooled_d)) 182 | 183 | # ----------Apply hadamard (element-wise) product on question and document representation---------- # 184 | # (batch_size * num_rel_docs) x (max_doc_len - filter_size - pool_size + 2) x num_filters 185 | transposed_p = doc_rep.transpose(1, 2) 186 | # batch_size x (max_doc_len - filter_size - pool_size + 2) x num_filters 187 | transposed_q = query_rep.unsqueeze(1).expand(query_rep.size(0), *transposed_p.size()[1:]) 188 | # batch_size x num_rel_docs x (max_doc_len - filter_size - pool_size + 2) x num_filters 189 | expanded_q = transposed_q.unsqueeze(1).expand(transposed_q.size(0), num_candidates, *transposed_q.size()[1:]) 190 | # (batch_size * num_rel_docs) x (max_doc_len - filter_size - pool_size + 2) x num_filters 191 | mod_q = expanded_q.contiguous().view(-1, *expanded_q.size()[2:]) 192 | # (batch_size * num_rel_docs) x (max_doc_len - filter_size - pool_size + 2) x num_filters 193 | hadamard = mod_q * transposed_p 194 | # (batch_size * num_rel_docs) x num_filters x (max_doc_len - filter_size - pool_size + 2) 195 | hadamard = hadamard.transpose(1, 2) 196 | 197 | # ----------Apply rest of the operation---------- # 198 | # (batch_size * num_rel_docs * num_filters) 199 | mapped_features1 = f.tanh(self.fc2(hadamard.contiguous().view(-1, hadamard.size(-1)).squeeze())) 200 | # (batch_size * num_rel_docs) x num_filters 201 | mapped_features1 = mapped_features1.view(*hadamard.size()[:-1]) 202 | # (batch_size * num_rel_docs) x num_filters 203 | mapped_features2 = self.dropout(f.tanh(self.fc3(mapped_features1))) 204 | # (batch_size * num_rel_docs) 205 | score = f.tanh(self.fc4(mapped_features2).squeeze()) 206 | # batch_size x num_rel_docs 207 | score = score.view(batch_size, num_candidates) 208 | return score 209 | -------------------------------------------------------------------------------- /neuroir/rankers/esm.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | from neuroir.inputters import PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | 7 | 8 | class ESM(nn.Module): 9 | """Implementation of the embedding space model.""" 10 | 11 | def __init__(self, args): 12 | """"Constructor of the class.""" 13 | super(ESM, self).__init__() 14 | 15 | self.word_embeddings = Embeddings(args.emsize, 16 | args.src_vocab_size, 17 | PAD) 18 | 19 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 20 | """ 21 | Forward function of the dssm model. Return average loss for a batch of queries. 22 | :param batch_queries: 2d tensor [batch_size x max_query_length] 23 | :param query_len: 1d numpy array [batch_size] 24 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 25 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 26 | :return: softmax score representing click probability [batch_size x num_rel_docs_per_query] 27 | """ 28 | assert batch_queries.shape[0] == batch_docs.shape[0] 29 | batch_size = batch_queries.shape[0] 30 | qlen = batch_queries.shape[1] 31 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 32 | 33 | # embed query 34 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 35 | embedded_queries = embedded_queries.mean(1) # averaging 36 | 37 | # embed document 38 | doc_rep = batch_docs.view(batch_size * num_docs, dlen).unsqueeze(2) 39 | embedded_docs = self.word_embeddings(doc_rep) 40 | embedded_docs = embedded_docs.mean(1) # averaging 41 | doc_rep = embedded_docs.view(batch_size, num_docs, -1) 42 | 43 | query_rep = embedded_queries.unsqueeze(1).expand(*doc_rep.size()) 44 | scores = f.cosine_similarity(query_rep, doc_rep, dim=2) 45 | return scores 46 | -------------------------------------------------------------------------------- /neuroir/rankers/mtensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from neuroir.inputters import PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | from neuroir.encoders import RNNEncoder 7 | 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, args, input_size): 11 | super(Encoder, self).__init__() 12 | self.encoder = RNNEncoder(args.rnn_type, 13 | input_size, 14 | args.bidirection, 15 | args.nlayers, 16 | args.nhid, 17 | args.dropout_rnn) 18 | 19 | def forward(self, input, input_len): 20 | hidden, M = self.encoder(input, input_len) # B x Seq-len x h 21 | return hidden, M 22 | 23 | 24 | class MatchTensor(nn.Module): 25 | """Class that classifies question pair as duplicate or not.""" 26 | 27 | def __init__(self, args): 28 | """"Constructor of the class.""" 29 | super(MatchTensor, self).__init__() 30 | 31 | self.word_embeddings = Embeddings(args.emsize, 32 | args.src_vocab_size, 33 | PAD) 34 | self.emb_drop = nn.Dropout(p=args.dropout_emb) 35 | 36 | self.linear_projection = nn.Linear(args.emsize, args.featsize) 37 | 38 | self.query_encoder = RNNEncoder(args.rnn_type, 39 | args.featsize, 40 | args.bidirection, 41 | args.nlayers, 42 | args.nhid_query, 43 | args.dropout_rnn) 44 | self.document_encoder = RNNEncoder(args.rnn_type, 45 | args.featsize, 46 | args.bidirection, 47 | args.nlayers, 48 | args.nhid_doc, 49 | args.dropout_rnn) 50 | 51 | self.query_projection = nn.Linear(args.nhid_query, args.nchannels) 52 | self.document_projection = 
nn.Linear(args.nhid_doc, args.nchannels) 53 | 54 | self.exact_match_channel = ExactMatchChannel() 55 | self.conv1 = nn.Conv2d(args.nchannels + 1, args.nfilters, (3, 3), padding=1) 56 | self.conv2 = nn.Conv2d(args.nchannels + 1, args.nfilters, (3, 5), padding=(1, 2)) 57 | self.conv3 = nn.Conv2d(args.nchannels + 1, args.nfilters, (3, 7), padding=(1, 3)) 58 | self.relu = nn.ReLU() 59 | self.conv = nn.Conv2d(args.nfilters * 3, args.match_filter_size, (1, 1)) 60 | self.output = nn.Linear(args.match_filter_size, 1) 61 | 62 | def forward(self, batch_queries, query_len, batch_docs, doc_len): 63 | """ 64 | Forward function of the match tensor model. Return average loss for a batch of sessions. 65 | :param batch_queries: 2d tensor [batch_size x max_query_length] 66 | :param query_len: 1d numpy array [batch_size] 67 | :param batch_docs: 3d tensor [batch_size x num_rel_docs_per_query x max_document_length] 68 | :param doc_len: 2d numpy array [batch_size x num_clicks_per_query] 69 | :return: score representing click probability [batch_size x num_clicks_per_query] 70 | """ 71 | assert batch_queries.shape[0] == batch_docs.shape[0] 72 | batch_size = batch_queries.shape[0] 73 | qlen = batch_queries.shape[1] 74 | num_docs, dlen = batch_docs.shape[1], batch_docs.shape[2] 75 | 76 | # step1: apply embedding lookup 77 | embedded_queries = self.word_embeddings(batch_queries.unsqueeze(2)) 78 | # batch_size x max_q_len x emsize 79 | embedded_queries = self.emb_drop(embedded_queries) 80 | # 81 | doc_rep = batch_docs.view(batch_size * num_docs, dlen) 82 | embedded_docs = self.word_embeddings(doc_rep.unsqueeze(2)) 83 | # batch_size * num_rel_docs x max_doc_len x emsize 84 | embedded_docs = self.emb_drop(embedded_docs) 85 | 86 | # step2: apply linear projection on embedded queries and documents 87 | # batch_size x max_q_len x featsize 88 | embedded_queries = self.linear_projection(embedded_queries) 89 | # batch_size * num_rel_docs x max_doc_len x featsize 90 | embedded_docs = self.linear_projection(embedded_docs) 91 | 92 | # step3: pass the encoded query and doc through an RNN 93 | _, encoded_queries = self.query_encoder(embedded_queries, query_len) 94 | _, encoded_docs = self.document_encoder(embedded_docs, doc_len.reshape(-1)) 95 | 96 | # step4: apply linear projection on query hidden states 97 | 98 | # batch_size x max_q_len x nchannels 99 | projected_queries = self.query_projection(encoded_queries) 100 | projected_queries = torch.stack([projected_queries] * num_docs, dim=1) 101 | # batch_size * num_rel_docs x max_q_len x nchannels 102 | projected_queries = projected_queries.contiguous().view(batch_size * num_docs, qlen, -1) 103 | 104 | # batch_size * num_rel_docs x max_q_len x max_doc_len x nchannels 105 | projected_queries = torch.stack([projected_queries] * dlen, dim=2) 106 | 107 | # batch_size * num_rel_docs x max_doc_len x nchannels 108 | projected_docs = self.document_projection(encoded_docs) 109 | # batch_size * num_rel_docs x max_q_len x max_doc_len x nchannels 110 | projected_docs = torch.stack([projected_docs] * qlen, dim=1) 111 | 112 | # step5: 2d product between projected query and doc vectors 113 | # batch_size * num_rel_docs x max_q_len x max_doc_len x nchannels 114 | query_document_product = projected_queries * projected_docs 115 | 116 | # step6: append exact match channel 117 | exact_match = self.exact_match_channel(batch_queries, batch_docs).unsqueeze(3) 118 | query_document_product = torch.cat((query_document_product, exact_match), 3) 119 | query_document_product = 
query_document_product.transpose(2, 3).transpose(1, 2) 120 | 121 | # step7: run the convolutional operation, max-pooling and linear projection 122 | convoluted_feat1 = self.conv1(query_document_product) 123 | convoluted_feat2 = self.conv2(query_document_product) 124 | convoluted_feat3 = self.conv3(query_document_product) 125 | convoluted_feat = self.relu(torch.cat((convoluted_feat1, convoluted_feat2, convoluted_feat3), 1)) 126 | convoluted_feat = self.conv(convoluted_feat).transpose(1, 2).transpose(2, 3) 127 | 128 | max_pooled_feat = torch.max(convoluted_feat, 2)[0] 129 | max_pooled_feat = torch.max(max_pooled_feat, 1)[0] 130 | scores = self.output(max_pooled_feat).squeeze(-1) 131 | return scores.view(batch_size, num_docs) 132 | 133 | 134 | class ExactMatchChannel(nn.Module): 135 | """Exact match channel layer for the match tensor""" 136 | 137 | def __init__(self): 138 | """"Constructor of the class""" 139 | super(ExactMatchChannel, self).__init__() 140 | self.alpha = nn.Parameter(torch.FloatTensor(1)) 141 | # Initializing the value of alpha 142 | torch.nn.init.uniform_(self.alpha) 143 | 144 | def forward(self, batch_query, batch_docs): 145 | """"Computes the exact match channel""" 146 | query_tensor = batch_query.unsqueeze(1).expand(batch_query.size(0), 147 | batch_docs.size(1), 148 | batch_query.size(1)) 149 | query_tensor = query_tensor.contiguous().view(-1, query_tensor.size(2)) 150 | doc_tensor = batch_docs.view(-1, batch_docs.size(2)) 151 | 152 | query_tensor = query_tensor.unsqueeze(2).expand(*query_tensor.size(), batch_docs.size(2)) 153 | doc_tensor = doc_tensor.unsqueeze(1).expand(doc_tensor.size(0), 154 | batch_query.size(1), 155 | doc_tensor.size(1)) 156 | 157 | exact_match = (query_tensor == doc_tensor).float() 158 | return exact_match * self.alpha.expand(exact_match.size()) 159 | -------------------------------------------------------------------------------- /neuroir/recommender/__init.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | -------------------------------------------------------------------------------- /neuroir/recommender/hredqs.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as f 4 | 5 | from .layers import Embedder, Encoder, Decoder 6 | from neuroir.inputters import constants 7 | 8 | 9 | class HredQS(nn.Module): 10 | def __init__(self, args): 11 | super(HredQS, self).__init__() 12 | 13 | self.embedder = Embedder(emsize=args.emsize, 14 | src_vocab_size=args.src_vocab_size, 15 | dropout_emb=args.dropout_emb) 16 | 17 | self.encoder = Encoder(rnn_type=args.rnn_type, 18 | input_size=self.embedder.output_size, 19 | bidirection=args.bidirection, 20 | nlayers=args.nlayers, 21 | nhid=args.nhid, 22 | dropout_rnn=args.dropout_rnn) 23 | 24 | # session encoder is unidirectional 25 | self.session_encoder = Encoder(rnn_type=args.rnn_type, 26 | input_size=args.nhid, 27 | bidirection=False, 28 | nlayers=args.nlayers, 29 | nhid=args.nhid_session, 30 | dropout_rnn=args.dropout_rnn) 31 | 32 | self.decoder = Decoder(rnn_type=args.rnn_type, 33 | input_size=self.embedder.output_size, 34 | bidirection=args.bidirection, 35 | nlayers=args.nlayers, 36 | nhid=args.nhid_session, # check hidsize 37 | attn_type='none', 38 | dropout_rnn=args.dropout_rnn, 39 | copy_attn=False, 40 | reuse_copy_attn=False) 41 | 42 | self.dropout = nn.Dropout(args.dropout) 43 | self.generator = nn.Linear(args.nhid_session, 
args.tgt_vocab_size) 44 | self.log_softmax = nn.LogSoftmax(dim=-1) 45 | 46 | def encode(self, 47 | source_rep, 48 | source_len, 49 | batch_size, 50 | session_len): 51 | 52 | # (batch*session_len) x max_src_len x emsize 53 | source_word_rep = self.embedder(source_rep) 54 | 55 | # memory_bank: B x P x h; hidden: l*num_directions x B x h 56 | _, memory_bank = self.encoder(source_word_rep, source_len) 57 | memory_bank = self.dropout(memory_bank) 58 | 59 | # apply max-pooling 60 | memory_bank = self.apply_pooling(memory_bank, pool_type='max') 61 | # batch_size x session_len x nhid 62 | memory_bank = memory_bank.view(batch_size, session_len, -1).contiguous() 63 | 64 | # session level encoding 65 | hidden = None 66 | hidden_states, cell_states = [], [] 67 | for sidx in range(memory_bank.size(1)): 68 | i_input = memory_bank[:, sidx, :].unsqueeze(1) 69 | # hidden: (layers*directions) x batch x dim. 70 | hidden, session_bank = self.session_encoder(i_input, 71 | None, 72 | init_states=hidden) 73 | if isinstance(hidden, tuple): # LSTM 74 | hidden_states.append(hidden[0]) 75 | cell_states.append(hidden[1]) 76 | else: # GRU: hidden is already a single state tensor 77 | hidden_states.append(hidden) 78 | 79 | # (layers*directions) x (batch*session_len) x dim. 80 | if len(cell_states) != 0: 81 | hidden_states = torch.cat(hidden_states, dim=1) 82 | cell_states = torch.cat(cell_states, dim=1) 83 | states = (hidden_states, cell_states) 84 | else: 85 | states = torch.cat(hidden_states, dim=1) 86 | 87 | return states 88 | 89 | def forward(self, 90 | source_rep, 91 | source_len, 92 | target_rep, 93 | target_len, 94 | target_seq, 95 | source_map, 96 | alignment): 97 | """ 98 | Input: 99 | - source_rep: ``(batch_size, session_len-1, max_src_len)`` 100 | - source_len: ``(batch_size, session_len-1)`` 101 | - target_rep: ``(batch_size, session_len-1, max_tgt_len)`` 102 | - target_len: ``(batch_size, session_len-1)`` 103 | - target_seq: ``(batch_size, session_len-1, max_tgt_len)`` 104 | Output: 105 | - loss: average loss over the batch elements 106 | """ 107 | batch_size = source_rep.size(0) 108 | session_len = source_rep.size(1) 109 | 110 | source_rep = source_rep.view(batch_size * session_len, -1).contiguous() 111 | target_rep = target_rep.view(batch_size * session_len, -1).contiguous() 112 | target_seq = target_seq.view(batch_size * session_len, -1).contiguous() 113 | source_len = source_len.view(-1).contiguous() 114 | target_len = target_len.view(-1).contiguous() 115 | 116 | states = self.encode(source_rep, 117 | source_len, 118 | batch_size, 119 | session_len) 120 | 121 | # ------- Decoding ------- 122 | 123 | # (batch*session_len) x max_tgt_len x emsize 124 | target_word_rep = self.embedder(target_rep) 125 | 126 | init_decoder_state = self.decoder.init_decoder(states) 127 | decoder_outputs, _ = self.decoder(target_word_rep, 128 | None, 129 | None, 130 | init_decoder_state) 131 | 132 | target = target_seq[:, 1:].contiguous() 133 | scores = self.generator(decoder_outputs) # `(batch*session_len) x max_tgt_len x vocab_size` 134 | scores = scores[:, :-1, :].contiguous() # `(batch*session_len) x max_tgt_len - 1 x vocab_size` 135 | logll = self.log_softmax(scores) 136 | ml_loss = f.nll_loss(logll.view(-1, logll.size(2)), 137 | target.view(-1), 138 | reduce=False) 139 | 140 | ml_loss = ml_loss.view(*scores.size()[:-1]) 141 | ml_loss = ml_loss.mul(target.ne(constants.PAD).float()) 142 | ml_loss = ml_loss.sum(1).mean() 143 | return ml_loss 144 | 145 | @staticmethod 146 | def apply_pooling(encodings, pool_type): 147 | if pool_type == 'max': 148 | pooled_encodings = 
encodings.max(1)[0] 149 | elif pool_type == 'mean': 150 | pooled_encodings = encodings.mean(1) 151 | else: 152 | raise NotImplementedError 153 | 154 | return pooled_encodings 155 | 156 | def __tens2sent(self, t, tgt_dict, src_vocabs): 157 | words = [] 158 | for idx, w in enumerate(t): 159 | widx = w[0].item() 160 | if widx < len(tgt_dict): 161 | words.append(tgt_dict[widx]) 162 | elif src_vocabs: 163 | widx = widx - len(tgt_dict) 164 | words.append(src_vocabs[idx][widx]) 165 | else: 166 | raise NotImplementedError 167 | return words 168 | 169 | def decode(self, 170 | source_rep, 171 | source_len, 172 | max_len, 173 | src_dict, 174 | tgt_dict, 175 | src_map, 176 | alignment, 177 | blank, 178 | fill, 179 | source_vocabs): 180 | 181 | batch_size = source_rep.size(0) 182 | session_len = source_rep.size(1) 183 | use_cuda = source_rep.is_cuda 184 | 185 | source_rep = source_rep.view(batch_size * session_len, -1).contiguous() 186 | source_len = source_len.view(-1).contiguous() 187 | 188 | states = self.encode(source_rep, 189 | source_len, 190 | batch_size, 191 | session_len) 192 | 193 | # ------- Decoding ------- 194 | 195 | init_decoder_state = self.decoder.init_decoder(states) 196 | 197 | tgt = torch.LongTensor([constants.BOS]) 198 | if use_cuda: 199 | tgt = tgt.cuda() 200 | tgt = tgt.expand(batch_size * session_len).unsqueeze(1) # B x 1 201 | 202 | dec_preds = [] 203 | for idx in range(max_len): 204 | # (batch*session_len) x 1 x emsize 205 | target_word_rep = self.embedder(tgt) 206 | 207 | decoder_outputs, _ = self.decoder(target_word_rep, 208 | None, 209 | None, 210 | init_decoder_state) 211 | 212 | prediction = self.generator(decoder_outputs.squeeze(1)) 213 | prediction = f.softmax(prediction, dim=1) 214 | 215 | # (batch*session_len) x 1 216 | tgt = torch.max(prediction, dim=1, keepdim=True)[1] 217 | dec_preds.append(tgt.squeeze(1).clone()) 218 | 219 | words = self.__tens2sent(tgt, tgt_dict, None) 220 | words = [src_dict[w] for w in words] 221 | words = torch.Tensor(words).type_as(tgt) 222 | tgt = words.unsqueeze(1) 223 | 224 | # (batch*session_len) x max_len 225 | dec_preds = torch.stack(dec_preds, dim=1) 226 | dec_preds = dec_preds.view(batch_size, session_len, max_len).contiguous() 227 | 228 | return { 229 | 'predictions': dec_preds 230 | } 231 | -------------------------------------------------------------------------------- /neuroir/recommender/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from neuroir.inputters import BOS, PAD 5 | from neuroir.modules.embeddings import Embeddings 6 | from neuroir.encoders.rnn_encoder import RNNEncoder 7 | from neuroir.decoders.rnn_decoder import RNNDecoder 8 | 9 | 10 | class Embedder(nn.Module): 11 | def __init__(self, 12 | emsize, 13 | src_vocab_size, 14 | dropout_emb): 15 | super(Embedder, self).__init__() 16 | 17 | self.word_embeddings = Embeddings(emsize, 18 | src_vocab_size, 19 | PAD) 20 | self.output_size = emsize 21 | self.dropout = nn.Dropout(dropout_emb) 22 | 23 | def forward(self, 24 | sequence): 25 | word_rep = self.word_embeddings(sequence.unsqueeze(2)) # B x P x d 26 | word_rep = self.dropout(word_rep) 27 | return word_rep 28 | 29 | 30 | class Encoder(nn.Module): 31 | def __init__(self, 32 | rnn_type, 33 | input_size, 34 | bidirection, 35 | nlayers, 36 | nhid, 37 | dropout_rnn): 38 | super(Encoder, self).__init__() 39 | 40 | self.encoder = RNNEncoder(rnn_type, 41 | input_size, 42 | bidirection, 43 | nlayers, 44 | nhid, 45 | dropout_rnn) 46 | 47 
| def forward(self, 48 | input, 49 | input_len, 50 | init_states=None): 51 | hidden, M = self.encoder(input, 52 | input_len, 53 | init_states) # B x Seq-len x h 54 | return hidden, M 55 | 56 | 57 | class Decoder(nn.Module): 58 | def __init__(self, 59 | rnn_type, 60 | input_size, 61 | bidirection, 62 | nlayers, 63 | nhid, 64 | attn_type, 65 | dropout_rnn, 66 | copy_attn, 67 | reuse_copy_attn): 68 | super(Decoder, self).__init__() 69 | 70 | attn_type = None if attn_type == 'none' else attn_type 71 | self.decoder = RNNDecoder(rnn_type, 72 | input_size, 73 | bidirection, 74 | nlayers, 75 | nhid, 76 | attn_type=attn_type, 77 | dropout=dropout_rnn, 78 | copy_attn=copy_attn, 79 | reuse_copy_attn=reuse_copy_attn) 80 | 81 | def init_decoder(self, hidden): 82 | return self.decoder.init_decoder_state(hidden) 83 | 84 | def forward(self, 85 | tgt, 86 | memory_bank, 87 | memory_len, 88 | state): 89 | decoder_outputs, _, attns = self.decoder(tgt, 90 | memory_bank, 91 | state, 92 | memory_lengths=memory_len) 93 | return decoder_outputs, attns 94 | -------------------------------------------------------------------------------- /neuroir/recommender/seq2seq.py: -------------------------------------------------------------------------------- 1 | # Includes a simplified implementation of https://arxiv.org/abs/1708.03418 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as f 6 | 7 | from .layers import Embedder, Encoder, Decoder 8 | 9 | from neuroir.inputters import constants 10 | from neuroir.modules.copy_generator import CopyGenerator, CopyGeneratorCriterion 11 | 12 | 13 | class Seq2seq(nn.Module): 14 | def __init__(self, args): 15 | super(Seq2seq, self).__init__() 16 | 17 | self.embedder = Embedder(emsize=args.emsize, 18 | src_vocab_size=args.src_vocab_size, 19 | dropout_emb=args.dropout_emb) 20 | 21 | self.encoder = Encoder(rnn_type=args.rnn_type, 22 | input_size=self.embedder.output_size, 23 | bidirection=args.bidirection, 24 | nlayers=args.nlayers, 25 | nhid=args.nhid, 26 | dropout_rnn=args.dropout_rnn) 27 | 28 | self.decoder = Decoder(rnn_type=args.rnn_type, 29 | input_size=self.embedder.output_size, 30 | bidirection=args.bidirection, 31 | nlayers=args.nlayers, 32 | nhid=args.nhid, 33 | attn_type=args.attn_type, 34 | dropout_rnn=args.dropout_rnn, 35 | copy_attn=args.copy_attn, 36 | reuse_copy_attn=args.reuse_copy_attn) 37 | 38 | self.dropout = nn.Dropout(args.dropout) 39 | self.generator = nn.Linear(args.nhid, args.tgt_vocab_size) 40 | self.log_softmax = nn.LogSoftmax(dim=-1) 41 | self.copy_attn = args.copy_attn 42 | if self.copy_attn: 43 | self.copy_generator = CopyGenerator(args.nhid, 44 | self.generator) 45 | self.criterion = CopyGeneratorCriterion(vocab_size=args.tgt_vocab_size, 46 | force_copy=args.force_copy) 47 | 48 | def forward(self, 49 | source_rep, 50 | source_len, 51 | target_rep, 52 | target_len, 53 | target_seq, 54 | source_map, 55 | alignment): 56 | """ 57 | Input: 58 | - source_rep: ``(batch_size, max_src_len)`` 59 | - source_len: ``(batch_size)`` 60 | - target_rep: ``(batch_size, max_tgt_len)`` 61 | - target_len: ``(batch_size)`` 62 | - target_seq: ``(batch_size, max_tgt_len)`` 63 | Output: 64 | - loss: tensor with a single value 65 | """ 66 | 67 | # batch_size x max_src_len x emsize 68 | source_word_rep = self.embedder(source_rep) 69 | 70 | # memory_bank: B x P x h; hidden: l*num_directions x B x h 71 | hidden, memory_bank = self.encoder(source_word_rep, source_len) 72 | memory_bank = self.dropout(memory_bank) 73 | 74 | # batch_size x max_src_len x emsize 75 
| target_word_rep = self.embedder(target_rep) 76 | 77 | init_decoder_state = self.decoder.init_decoder(hidden) 78 | decoder_outputs, attns = self.decoder(target_word_rep, 79 | memory_bank, 80 | source_len, 81 | init_decoder_state) 82 | 83 | target = target_seq[:, 1:].contiguous() 84 | if self.copy_attn: 85 | scores = self.copy_generator(decoder_outputs, 86 | attns["copy"], 87 | source_map) 88 | scores = scores[:, :-1, :].contiguous() 89 | ml_loss = self.criterion(scores, 90 | alignment[:, 1:].contiguous(), 91 | target) 92 | else: 93 | scores = self.generator(decoder_outputs) # `batch x max_tgt_len x vocab_size` 94 | scores = scores[:, :-1, :].contiguous() # `batch x max_tgt_len - 1 x vocab_size` 95 | logll = self.log_softmax(scores) 96 | ml_loss = f.nll_loss(logll.view(-1, logll.size(2)), 97 | target.view(-1), 98 | reduce=False) 99 | 100 | ml_loss = ml_loss.view(*scores.size()[:-1]) 101 | ml_loss = ml_loss.mul(target.ne(constants.PAD).float()) 102 | ml_loss = ml_loss.sum(1).mean() 103 | return ml_loss 104 | 105 | def __tens2sent(self, t, tgt_dict, src_vocabs): 106 | words = [] 107 | for idx, w in enumerate(t): 108 | widx = w[0].item() 109 | if widx < len(tgt_dict): 110 | words.append(tgt_dict[widx]) 111 | elif src_vocabs: 112 | widx = widx - len(tgt_dict) 113 | words.append(src_vocabs[idx][widx]) 114 | else: 115 | raise NotImplementedError 116 | return words 117 | 118 | def decode(self, 119 | source_rep, 120 | source_len, 121 | max_len, 122 | src_dict, 123 | tgt_dict, 124 | src_map, 125 | alignment, 126 | blank, 127 | fill, 128 | source_vocabs): 129 | 130 | batch_size = source_rep.size(0) 131 | use_cuda = source_rep.is_cuda 132 | 133 | # batch_size x max_src_len x emsize 134 | source_word_rep = self.embedder(source_rep) 135 | 136 | # memory_bank: B x P x h; hidden: l*num_directions x B x h 137 | hidden, memory_bank = self.encoder(source_word_rep, source_len) 138 | memory_bank = self.dropout(memory_bank) 139 | 140 | init_decoder_state = self.decoder.init_decoder(hidden) 141 | 142 | tgt = torch.LongTensor([constants.BOS]) 143 | if use_cuda: 144 | tgt = tgt.cuda() 145 | tgt = tgt.expand(batch_size).unsqueeze(1) # B x 1 146 | 147 | dec_preds, attentions = [], [] 148 | for idx in range(max_len): 149 | target_word_rep = self.embedder(tgt) 150 | 151 | # decoder_outputs = batch_size x 1 x tgt_dict_size 152 | decoder_outputs, attns = self.decoder(target_word_rep, 153 | memory_bank, 154 | source_len, 155 | init_decoder_state) 156 | 157 | if self.copy_attn: 158 | prediction = self.copy_generator(decoder_outputs, 159 | attns["copy"], 160 | src_map) 161 | prediction = prediction.squeeze(1) 162 | for b in range(prediction.size(0)): 163 | if blank[b]: 164 | blank_b = torch.LongTensor(blank[b]) 165 | fill_b = torch.LongTensor(fill[b]) 166 | if use_cuda: 167 | blank_b = blank_b.cuda() 168 | fill_b = fill_b.cuda() 169 | prediction[b].index_add_(0, fill_b, 170 | prediction[b].index_select(0, blank_b)) 171 | prediction[b].index_fill_(0, blank_b, 1e-10) 172 | 173 | else: 174 | prediction = self.generator(decoder_outputs.squeeze(1)) 175 | prediction = f.softmax(prediction, dim=1) 176 | 177 | tgt = torch.max(prediction, dim=1, keepdim=True)[1] 178 | dec_preds.append(tgt.squeeze(1).clone()) 179 | if "std" in attns: 180 | attentions.append(attns["std"].squeeze(1)) 181 | 182 | words = self.__tens2sent(tgt, tgt_dict, source_vocabs) 183 | words = [src_dict[w] for w in words] 184 | words = torch.Tensor(words).type_as(tgt) 185 | tgt = words.unsqueeze(1) 186 | 187 | # batch_size x max_len 188 | dec_preds = 
torch.stack(dec_preds, dim=1) 189 | # batch_size x max_len x source_len 190 | attentions = torch.stack(attentions, dim=1) if attentions else None 191 | 192 | return { 193 | 'predictions': dec_preds, 194 | 'attentions': attentions 195 | } 196 | -------------------------------------------------------------------------------- /neuroir/utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'wasi' 2 | 3 | from .copy_utils import * 4 | from .logging import * 5 | from .misc import * 6 | from .timer import * 7 | -------------------------------------------------------------------------------- /neuroir/utils/copy_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from neuroir.inputters import constants 3 | 4 | 5 | def collapse_copy_scores(tgt_dict, src_vocabs): 6 | """ 7 | Given scores from an expanded dictionary 8 | corresponding to a batch, sums together copies, 9 | with a dictionary word when it is ambiguous. 10 | """ 11 | offset = len(tgt_dict) 12 | blank_arr, fill_arr = [], [] 13 | for b in range(len(src_vocabs)): 14 | blank = [] 15 | fill = [] 16 | src_vocab = src_vocabs[b] 17 | # Starting from 2 to ignore PAD and UNK tokens 18 | for i in range(2, len(src_vocab)): 19 | sw = src_vocab[i] 20 | ti = tgt_dict[sw] 21 | if ti != constants.UNK: 22 | blank.append(offset + i) 23 | fill.append(ti) 24 | 25 | blank_arr.append(blank) 26 | fill_arr.append(fill) 27 | 28 | return blank_arr, fill_arr 29 | 30 | 31 | def make_src_map(data): 32 | """Builds a one-hot source map of shape (batch, max_src_len, src_vocab_size) from per-example source-vocabulary ids.""" 33 | src_size = max([t.size(0) for t in data]) 34 | src_vocab_size = max([t.max() for t in data]) + 1 35 | alignment = torch.zeros(len(data), src_size, src_vocab_size) 36 | for i, sent in enumerate(data): 37 | for j, t in enumerate(sent): 38 | alignment[i, j, t] = 1 39 | return alignment 40 | 41 | 42 | def align(data): 43 | """Pads per-example alignment id tensors into a (batch, max_tgt_len) LongTensor.""" 44 | tgt_size = max([t.size(0) for t in data]) 45 | alignment = torch.zeros(len(data), tgt_size).long() 46 | for i, sent in enumerate(data): 47 | alignment[i, :sent.size(0)] = sent 48 | return alignment 49 | 50 | 51 | def replace_unknown(prediction, attn, src_raw): 52 | """ ? 
53 | attn: tgt_len x src_len 54 | """ 55 | tokens = prediction.split() 56 | for i in range(len(tokens)): 57 | if tokens[i] == constants.UNK_WORD: 58 | _, max_index = attn[i].max(0) 59 | tokens[i] = src_raw[max_index.item()] 60 | return ' '.join(tokens) 61 | -------------------------------------------------------------------------------- /neuroir/utils/logging.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | 4 | import logging 5 | 6 | logger = logging.getLogger() 7 | 8 | 9 | def init_logger(log_file=None): 10 | log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s") 11 | logger = logging.getLogger() 12 | logger.setLevel(logging.INFO) 13 | 14 | if log_file and log_file != '': 15 | file_handler = logging.FileHandler(log_file) 16 | file_handler.setFormatter(log_format) 17 | logger.addHandler(file_handler) 18 | 19 | console_handler = logging.StreamHandler() 20 | console_handler.setFormatter(log_format) 21 | logger.addHandler(console_handler) 22 | 23 | return logger 24 | -------------------------------------------------------------------------------- /neuroir/utils/misc.py: -------------------------------------------------------------------------------- 1 | # src: https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/utils/misc.py 2 | # -*- coding: utf-8 -*- 3 | 4 | import string 5 | import torch 6 | import subprocess 7 | from neuroir.inputters import constants 8 | 9 | 10 | def normalize_string(s): 11 | """Lower text and remove punctuation, and extra whitespace.""" 12 | 13 | def white_space_fix(text): 14 | return ' '.join(text.split()) 15 | 16 | def remove_punc(text): 17 | exclude = set(string.punctuation) 18 | return ''.join(ch for ch in text if ch not in exclude) 19 | 20 | def lower(text): 21 | return text.lower() 22 | 23 | return white_space_fix(remove_punc(lower(s))) 24 | 25 | 26 | def aeq(*args): 27 | """ 28 | Assert all arguments have the same value 29 | """ 30 | arguments = (arg for arg in args) 31 | first = next(arguments) 32 | assert all(arg == first for arg in arguments), \ 33 | "Not all arguments have the same value: " + str(args) 34 | 35 | 36 | def tens2sen(t, word_dict=None, src_vocabs=None): 37 | sentences = [] 38 | # loop over the batch elements 39 | for idx, s in enumerate(t): 40 | sentence = [] 41 | for wt in s: 42 | word = wt if isinstance(wt, int) \ 43 | else wt.item() 44 | if word in [constants.BOS]: 45 | continue 46 | if word in [constants.EOS]: 47 | break 48 | if word_dict and word < len(word_dict): 49 | sentence += [word_dict[word]] 50 | elif src_vocabs: 51 | word = word - len(word_dict) 52 | sentence += [src_vocabs[idx][word]] 53 | else: 54 | sentence += [str(word)] 55 | 56 | if len(sentence) == 0: 57 | # NOTE: just a trick not to score empty sentence 58 | # this has no consequence 59 | sentence = [str(constants.PAD)] 60 | 61 | sentences += [' '.join(sentence)] 62 | return sentences 63 | 64 | 65 | def sequence_mask(lengths, max_len=None): 66 | """ 67 | Creates a boolean mask from sequence lengths. 68 | """ 69 | batch_size = lengths.numel() 70 | max_len = max_len or lengths.max() 71 | return (torch.arange(0, max_len) # (0 for pad positions) 72 | .type_as(lengths) 73 | .repeat(batch_size, 1) 74 | .lt(lengths.unsqueeze(1))) 75 | 76 | 77 | def tile(x, count, dim=0): 78 | """ 79 | Tiles x on dimension dim count times. 
80 | """ 81 | perm = list(range(len(x.size()))) 82 | if dim != 0: 83 | perm[0], perm[dim] = perm[dim], perm[0] 84 | x = x.permute(perm).contiguous() 85 | out_size = list(x.size()) 86 | out_size[0] *= count 87 | batch = x.size(0) 88 | x = x.view(batch, -1) \ 89 | .transpose(0, 1) \ 90 | .repeat(count, 1) \ 91 | .transpose(0, 1) \ 92 | .contiguous() \ 93 | .view(*out_size) 94 | if dim != 0: 95 | x = x.permute(perm).contiguous() 96 | return x 97 | 98 | 99 | def use_gpu(opt): 100 | """ 101 | Creates a boolean if gpu used 102 | """ 103 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 104 | (hasattr(opt, 'gpu') and opt.gpu > -1) 105 | 106 | 107 | def generate_relative_positions_matrix(length, max_relative_positions, 108 | cache=False): 109 | """Generate the clipped relative positions matrix 110 | for a given length and maximum relative positions""" 111 | if cache: 112 | distance_mat = torch.arange(-length + 1, 1, 1).unsqueeze(0) 113 | else: 114 | range_vec = torch.arange(length) 115 | range_mat = range_vec.unsqueeze(-1).expand(-1, length).transpose(0, 1) 116 | distance_mat = range_mat - range_mat.transpose(0, 1) 117 | distance_mat_clipped = torch.clamp(distance_mat, 118 | min=-max_relative_positions, 119 | max=max_relative_positions) 120 | # Shift values to be >= 0 121 | final_mat = distance_mat_clipped + max_relative_positions 122 | return final_mat 123 | 124 | 125 | def relative_matmul(x, z, transpose): 126 | """Helper function for relative positions attention.""" 127 | batch_size = x.shape[0] 128 | heads = x.shape[1] 129 | length = x.shape[2] 130 | x_t = x.permute(2, 0, 1, 3) 131 | x_t_r = x_t.reshape(length, heads * batch_size, -1) 132 | if transpose: 133 | z_t = z.transpose(1, 2) 134 | x_tz_matmul = torch.matmul(x_t_r, z_t) 135 | else: 136 | x_tz_matmul = torch.matmul(x_t_r, z) 137 | x_tz_matmul_r = x_tz_matmul.reshape(length, batch_size, heads, -1) 138 | x_tz_matmul_r_t = x_tz_matmul_r.permute(1, 2, 0, 3) 139 | return x_tz_matmul_r_t 140 | 141 | 142 | def count_file_lines(file_path): 143 | """ 144 | Counts the number of lines in a file using wc utility. 
145 | :param file_path: path to file 146 | :return: int, no of lines 147 | """ 148 | num = subprocess.check_output(['wc', '-l', file_path]) 149 | num = num.decode('utf-8').split(' ') 150 | return int(num[0]) 151 | -------------------------------------------------------------------------------- /neuroir/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | # ------------------------------------------------------------------------------ 5 | # Utility classes 6 | # ------------------------------------------------------------------------------ 7 | 8 | 9 | class AverageMeter(object): 10 | """Computes and stores the average and current value.""" 11 | 12 | def __init__(self): 13 | self.reset() 14 | 15 | def reset(self): 16 | self.val = 0 17 | self.avg = 0 18 | self.sum = 0 19 | self.count = 0 20 | 21 | def update(self, val, n=1): 22 | self.val = val 23 | self.sum += val * n 24 | self.count += n 25 | self.avg = self.sum / self.count 26 | 27 | 28 | class Timer(object): 29 | """Computes elapsed time.""" 30 | 31 | def __init__(self): 32 | self.running = True 33 | self.total = 0 34 | self.start = time.time() 35 | 36 | def reset(self): 37 | self.running = True 38 | self.total = 0 39 | self.start = time.time() 40 | return self 41 | 42 | def resume(self): 43 | if not self.running: 44 | self.running = True 45 | self.start = time.time() 46 | return self 47 | 48 | def stop(self): 49 | if self.running: 50 | self.running = False 51 | self.total += time.time() - self.start 52 | return self 53 | 54 | def time(self): 55 | if self.running: 56 | return self.total + time.time() - self.start 57 | return self.total 58 | -------------------------------------------------------------------------------- /scripts/multitask.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SRC_DIR=../ 4 | DATA_DIR=${SRC_DIR}/data/ 5 | EMBED_DIR=${SRC_DIR}/data/fasttext/ 6 | MODEL_DIR=${SRC_DIR}/tmp/ 7 | 8 | RGPU=$1 9 | MODEL_NAME=$2 10 | DATASET=msmarco 11 | 12 | 13 | PYTHONPATH=$SRC_DIR CUDA_VISIBLE_DEVICES=$RGPU python -W ignore ${SRC_DIR}/main/multitask.py \ 14 | --model_type $MODEL_NAME \ 15 | --train_file train.json \ 16 | --dev_file dev.json \ 17 | --test_file test.json \ 18 | --max_doc_len 200 \ 19 | --max_query_len 20 \ 20 | --uncase True \ 21 | --max_examples -1 \ 22 | --emsize 300 \ 23 | --batch_size 32 \ 24 | --test_batch_size 32 \ 25 | --num_epochs 50 \ 26 | --dropout_emb 0.2 \ 27 | --dropout 0.2 \ 28 | --dropout_rnn 0.2 \ 29 | --optimizer adam \ 30 | --learning_rate 0.001 \ 31 | --weight_decay 0.0 \ 32 | --early_stop 5 \ 33 | --valid_metric bleu \ 34 | --checkpoint True \ 35 | --model_dir $MODEL_DIR \ 36 | --model_name $MODEL_NAME \ 37 | --only_test False \ 38 | --data_workers 5 \ 39 | --dataset_name $DATASET \ 40 | --data_dir ${DATA_DIR}/${DATASET}/ \ 41 | --embed_dir $EMBED_DIR \ 42 | --embedding_file crawl-300d-2M-subword.vec 43 | -------------------------------------------------------------------------------- /scripts/ranker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SRC_DIR=../ 4 | DATA_DIR=${SRC_DIR}/data/ 5 | EMBED_DIR=${SRC_DIR}/data/fasttext/ 6 | MODEL_DIR=${SRC_DIR}/tmp/ 7 | 8 | RGPU=$1 9 | MODEL_NAME=$2 10 | DATASET=msmarco 11 | 12 | 13 | PYTHONPATH=$SRC_DIR CUDA_VISIBLE_DEVICES=$RGPU python -W ignore ${SRC_DIR}/main/ranker.py \ 14 | --model_type $MODEL_NAME \ 15 | --train_file train.json \ 16 | --dev_file 
dev.json \ 17 | --test_file test.json \ 18 | --max_doc_len 200 \ 19 | --max_query_len 20 \ 20 | --uncase True \ 21 | --num_candidates 10 \ 22 | --max_examples -1 \ 23 | --emsize 300 \ 24 | --batch_size 64 \ 25 | --test_batch_size 64 \ 26 | --num_epochs 50 \ 27 | --dropout_emb 0.2 \ 28 | --dropout 0.2 \ 29 | --dropout_rnn 0.2 \ 30 | --optimizer adam \ 31 | --learning_rate 0.001 \ 32 | --weight_decay 0.0 \ 33 | --early_stop 5 \ 34 | --valid_metric map \ 35 | --checkpoint True \ 36 | --model_dir $MODEL_DIR \ 37 | --model_name $MODEL_NAME \ 38 | --only_test False \ 39 | --data_workers 5 \ 40 | --dataset_name $DATASET \ 41 | --data_dir ${DATA_DIR}/${DATASET}/ \ 42 | --embed_dir $EMBED_DIR \ 43 | --embedding_file crawl-300d-2M-subword.vec 44 | -------------------------------------------------------------------------------- /scripts/recommender.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | SRC_DIR=../ 4 | DATA_DIR=${SRC_DIR}/data/ 5 | EMBED_DIR=${SRC_DIR}/data/fasttext/ 6 | MODEL_DIR=${SRC_DIR}/tmp/ 7 | 8 | RGPU=$1 9 | MODEL_NAME=$2 10 | DATASET=msmarco 11 | 12 | 13 | PYTHONPATH=$SRC_DIR CUDA_VISIBLE_DEVICES=$RGPU python -W ignore ${SRC_DIR}/main/recommender.py \ 14 | --model_type $MODEL_NAME \ 15 | --train_file train.json \ 16 | --dev_file dev.json \ 17 | --test_file test.json \ 18 | --max_query_len 20 \ 19 | --uncase True \ 20 | --num_candidates 10 \ 21 | --max_examples -1 \ 22 | --emsize 300 \ 23 | --batch_size 64 \ 24 | --test_batch_size 64 \ 25 | --num_epochs 50 \ 26 | --dropout_emb 0.2 \ 27 | --dropout 0.2 \ 28 | --dropout_rnn 0.2 \ 29 | --optimizer adam \ 30 | --learning_rate 0.001 \ 31 | --weight_decay 0.0 \ 32 | --early_stop 5 \ 33 | --valid_metric bleu \ 34 | --checkpoint True \ 35 | --model_dir $MODEL_DIR \ 36 | --model_name $MODEL_NAME \ 37 | --only_test False \ 38 | --data_workers 5 \ 39 | --dataset_name $DATASET \ 40 | --data_dir ${DATA_DIR}/${DATASET}/ \ 41 | --embed_dir $EMBED_DIR \ 42 | --embedding_file crawl-300d-2M-subword.vec 43 | --------------------------------------------------------------------------------
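A minimal usage sketch (not part of the repository) for the `ExactMatchChannel` defined in the ranker code above. The import path is an assumption based on the directory layout (`neuroir/rankers/mtensor.py`) and is not confirmed by this dump; adjust it if the class lives elsewhere.

```python
# Hypothetical shape-check for ExactMatchChannel; the import path below is assumed.
import torch
from neuroir.rankers.mtensor import ExactMatchChannel

channel = ExactMatchChannel()

# Toy token-id tensors: 2 queries of length 4, each paired with 3 documents of length 6.
batch_queries = torch.randint(0, 50, (2, 4))   # batch_size x max_query_length
batch_docs = torch.randint(0, 50, (2, 3, 6))   # batch_size x num_docs x max_doc_length

# Each output cell equals alpha where query token i matches document token j, and 0 otherwise.
exact_match = channel(batch_queries, batch_docs)
print(exact_match.shape)  # torch.Size([6, 4, 6]) == (batch_size * num_docs) x q_len x doc_len
```

In the match-tensor forward pass, this tensor is unsqueezed on its last dimension and concatenated with the query-document channel product as an extra input channel before the convolutions.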