├── LICENSE
├── README.md
├── examples
    └── msmarco-doc
    │   ├── README.md
    │   ├── helpers
    │       ├── build_train_from_ranking.py
    │       └── topk_text_2_json.py
    │   └── run_marco.py
├── helpers
    ├── score_to_marco.py
    └── score_to_tein.py
├── setup.py
└── src
    └── reranker
        ├── __init__.py
        ├── arguments.py
        ├── data.py
        ├── dist
            ├── __init__.py
            └── sampler.py
        ├── modeling.py
        └── trainer.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Reranker
  2 | Reranker is a lightweight, effective and efficient package for training and deploying deep language model rerankers in information retrieval (IR), question answering (QA) and many other natural language processing (NLP) pipelines. 
  3 | The training procedure follows our ECIR paper [Rethink Training of BERT Rerankers in Multi-Stage Retrieval Pipeline](https://arxiv.org/abs/2101.08751) using a localized contrastive estimation (LCE) loss.
  4 | 
  5 | Reranker speaks Huggingface🤗 language! This means that you instantly get all state-of-the-art pre-trained models as soon as they are ported to HF transformers. You also get the familiar model and trainer interfaces.
  6 | 
  7 | ### State of the Art Performance.
  8 | Reranker has two submissions to MS MARCO document leaderboard. Each got 1st place, advancing the SOTA!
  9 | 
 10 | | Date  | Submission Name |  Dev MRR@100 | Eval MRR@100  |
 11 | |---|---|---|---|
 12 | | 2021/01/20 | LCE loss + HDCT (ensemble)  | 0.464 | 0.405|
 13 | | 2020/09/09 | HDCT top100 + BERT-base FirstP (single) | 0.434 | 0.382 |
 14 | 
 15 | ### Features
 16 | - Training rerankers from the state-of-the-art pre-trained language models like BERT, RoBERTa and ELECTRA.
 17 | - The state-of-the-art reranking performance with our LCE loss based training pipeline.
 18 | - GPU memory optimizations: Loss Parallelism and Gradient Cache, which allow training of larger models.
 19 | - Faster training
 20 |     - Distributed Data Parallel (DDP) for multi GPUs. 
 21 |     - Automatic Mixed Precision (AMP) training and inference with up to 2x speedup!
 22 | - Break CPU RAM limitation by memory mapping datasets with `pyarrow` through `datasets` package interface.
 23 | - Checkpoint interoperability with Hugging Face `transformers`.
 24 | 
 25 | ### Design Philosophy
 26 | The library is designed to be dedicated for text reranking modeling, training and testing. This helps us keep the code concise and focus on a more specific task. 
 27 | 
 28 | Under the hood, Reranker provides a thin layer of wrapper over Huggingface libraries. Our model wraps `PreTrainedModel` and our trainer sub-class Huggingface `Trainer`. You can then work with the familiar interfaces. 
 29 | 
 30 | ## Installation and Dependencies
 31 | Reranker uses Pytorch, Huggingface Transformers and Datasets.  Install with the following commands,
 32 | ```
 33 | git clone https://github.com/luyug/Reranker.git
 34 | cd Reranker
 35 | pip install .
 36 | ```
 37 | Reranker has been tested with `torch==1.6.0, transformers==4.2.0, datasets==1.1.3`.
 38 | 
 39 | For development, install as editable,
 40 | ```
 41 | pip install -e .
 42 | ```
 43 | 
 44 | ## Workflow
 45 | ### Inference (Reranking)
 46 | The easiest way to do inference is to use one of our uploaded [trained checkpoints](https://huggingface.co/Luyu) with `RerankerForInference`.
 47 | ```
 48 | from reranker import RerankerForInference
 49 | rk = RerankerForInference.from_pretrained("Luyu/bert-base-mdoc-bm25")  # load checkpoint
 50 | 
 51 | inputs = rk.tokenize('weather in new york', 'it is cold today in new york', return_tensors='pt')
 52 | score = rk(inputs).logits
 53 | ``` 
 54 | ### Training
 55 | For training, you will need a model, a dataset and a trainer. Say we have parsed arguments into
 56 |  `model_args`, `data_args` and `training_args` with `reranker.arguments`. First, 
 57 | initialize the reranker and tokenizer from one of 
 58 | [pre-trained language models](https://huggingface.co/transformers/pretrained_models.html) from Hugging Face.
 59 | For example, let's use RoBERTa by loading `roberta-base`.
 60 | ```
 61 | from reranker import Reranker 
 62 | from transformers import AutoTokenizer
 63 | tokenizer = AutoTokenizer.from_pretrained('roberta-base')
 64 | model = Reranker.from_pretrained(model_args, data_args, training_args, 'roberta-base')
 65 | ```
 66 | Then create the dataset,
 67 | ```
 68 | from reranker.data import GroupedTrainDataset
 69 | train_dataset = GroupedTrainDataset(
 70 |     data_args, data_args.train_path, 
 71 |     tokenizer=tokenizer, train_args=training_args
 72 | )
 73 | ```
 74 | Create a trainer and train,
 75 | ```
 76 | from reranker import RerankerTrainer
 77 | trainer = RerankerTrainer(
 78 |         model=model,
 79 |         args=training_args,
 80 |         train_dataset=train_dataset,
 81 |         data_collator=GroupCollator(tokenizer),
 82 |     )
 83 | trainer.train()
 84 | ```
 85 | See full examples in our [examples](examples).
 86 | ## Examples
 87 | [MS MARCO Document Ranking with Reranker](examples/msmarco-doc) 
 88 | 
 89 | *More to come*
 90 | 
 91 | ## Large Models
 92 | ### Loss Parallelism
 93 | We support computing a query's LCE loss with multiple GPUs with flag `--collaborative`. Note that a group size (pos + neg) 
 94 | not divisible by number of GPUs may incur undefined behaviours.
 95 | You will typically want to use it with gradient accumulation steps greater than one. 
 96 | 
 97 | *Detailed instruction to be added.*
 98 | 
 99 | ### Gradient Cache 
100 | *Experimental*    We provide subclasses `RerankerDC` and `RerankerDCTrainer`. In the MS MARCO example, you can use them with the `--distance_cache` argument to activate gradient caching with respect to computed unnormalized distance. This potentially allows training with an unlimited number of negatives beyond GPU memory limitation, up to numerical precision. 
101 | The method is described in our preprint [Scaling Deep Contrastive Learning Batch Size with Almost Constant Peak Memory Usage](https://arxiv.org/abs/2101.06983).
102 | 
103 | 
104 | *Detailed instruction to be added.*
105 | 
106 | ## Helpers
107 | We provide a few helpers in the helper directory for data formatting,
108 | ### Score Formatting
109 | - `score_to_marco.py` turns a raw score txt file into MS MARCO format.
110 | - `score_to_tein.py` turns a raw score txt file into trec eval format.
111 | 
112 | For example,
113 | ```
114 | python score_to_tein.py --score_file {path to raw score txt}
115 | ```
116 | This generates a trec eval format file in the same directory as the raw score file. 
117 | ## Data Format
118 | Reranker core utilities (batch training, batch inference) expect processed and tokenized text in token id format. 
119 | This means pre-processing should be done beforehand, e.g. with BERT tokenizer.
120 | 
121 | ### Training Data
122 | Training data is grouped by query into a json file where each line has a query, its corresponding positives and sampled negatives.
123 | ```
124 | {
125 |     "qry": {
126 |         "qid": str,
127 |         "query": List[int],
128 |     },
129 |     "pos": List[
130 |         {
131 |             "pid": str,
132 |             "passage": List[int],
133 |         }
134 |     ],
135 |     "neg": List[
136 |         {
137 |             "pid": str,
138 |             "passage": List[int]
139 |         }
140 |     ]
141 | }
142 | ```
143 | Training data is handled by class `reranker.data.GroupedTrainDataset`.
144 | ### Inference (Reranking) Data
145 | Inference data is grouped by query document(passage) pairs. Each line is a json entry to be reranked (scored).
146 | ```
147 | {
148 |     "qid": str,
149 |     "pid": str,
150 |     "qry": List[int],
151 |     "psg": List[int]
152 | }
153 | ```
154 | To speed up postprocessing, we currently take an additional tsv specifying text ids,
155 | ```
156 | qid0     pid0
157 | qid0     pid1
158 | ...
159 | ```
160 | The ordering in the two files is expected to be the same.
161 | 
162 | Inference data is handled by class `reranker.data.PredictionDataset`.
163 | ### Result Scores
164 | Scores are stored in a tsv file with columns corresponding to qid, pid and score.
165 | ```
166 | qid0     pid0     s0
167 | qid0     pid1     s1
168 | ...
169 | ```
170 | You can post-process it with our helper script into MS MARCO format or TREC eval format.
171 | 
172 | 
173 | ## Contribution
174 | We welcome contribution to the package, either adding new dataset interface or new models.
175 | 
176 | ## Contact
177 | You can reach me by email `luyug@cs.cmu.edu`. As a 2nd year master, I get busy days from time to time and may not reply very promptly. Feel free to ping me if you don't get replies.
178 | 
179 | ## Citation
180 | If you use Reranker in your research, please consider citing our [ECIR paper](https://arxiv.org/abs/2101.08751),
181 | 
182 | ```
183 | @inproceedings{gao2021lce,
184 |                title={Rethink Training of BERT Rerankers in Multi-Stage Retrieval Pipeline}, 
185 |                author={Luyu Gao and Zhuyun Dai and Jamie Callan},
186 |                year={2021},
187 |                booktitle={The 43rd European Conference On Information Retrieval (ECIR)},
188 |       
189 | }
190 | ```
191 | 
192 | For the gradient cache utility, consider citing our [preprint](https://arxiv.org/abs/2101.06983),
193 | ```
194 | @misc{gao2021scaling,
195 |       title={Scaling Deep Contrastive Learning Batch Size with Almost Constant Peak Memory Usage}, 
196 |       author={Luyu Gao and Yunyi Zhang},
197 |       year={2021},
198 |       eprint={2101.06983},
199 |       archivePrefix={arXiv},
200 |       primaryClass={cs.LG}
201 | }
202 | ```
203 | 
204 | ## License
205 | Reranker is currently licensed under APACHE-2.0.
206 | 
207 | 
208 | 


--------------------------------------------------------------------------------
/examples/msmarco-doc/README.md:
--------------------------------------------------------------------------------
  1 | # MS MARCO Document
  2 | This example walks through reranker LCE training and inference on MS MARCO document collection with BERT-base LM and HDCT retriever.
  3 | 
  4 | After downloading the data, you can also skip the train data building and model training steps by using a trained model checkpoint uploaded to Hugging Face model hub. See the Inference section for details.
  5 | 
  6 | ## Preparing Data
  7 | Download HDCT train rankings and dev file `hdct-marco-train.zip`, `dev.d100.tsv` from LTI server using this [link](http://boston.lti.cs.cmu.edu/appendices/TheWebConf2020-Zhuyun-Dai/rankings/) and unzip the former.
  8 | 
  9 | Download the MSMARCO document ranking collection files `msmarco-doctrain-qrels.tsv.gz`, `msmarco-doctrain-queries.tsv`, `msmarco-docs.tsv` from the [official repo](https://github.com/microsoft/MSMARCO-Document-Ranking). 
 10 | Decompress the latter two.
 11 | 
 12 | ## Building Localized Training Data from Target Retriever top Ranking
 13 | Helper script `build_train_from_ranking.py` takes a ranking file and generates a training set with localized negatives. It expects a tsv with 3 columns: query id, passage/document id and ranking.
 14 | ```
 15 | qid  pid1  1
 16 | qid  pid2  2
 17 | ...
 18 | ```
 19 | Run the script with following command,
 20 | ```
 21 | mkdir -p {directory to store generated json training file}
 22 | for i in $(seq -f "%03g" 0 183)
 23 | do
 24 | python helpers/build_train_from_ranking.py \
 25 |     --tokenizer_name bert-base-uncased \
 26 |     --rank_file {directory of unzipped hdct-marco-train}/${i}.txt \
 27 |     --json_dir {directory to store generated json training file} \
 28 |     --n_sample 10 \
 29 |     --sample_from_top 100 \
 30 |     --random \
 31 |     --truncate 512 \
 32 |     --qrel {path to msmarco-doctrain-qrels.tsv.gz} \
 33 |     --query_collection {path to msmarco-doctrain-queries.tsv} \
 34 |     --doc_collection {path to msmarco-docs.tsv}
 35 | done
 36 | ```
 37 | 
 38 | ## Training 
 39 | This starts training on 4 GPUs with DDP.
 40 | ```
 41 | python -m torch.distributed.launch --nproc_per_node 4 run_marco.py \
 42 |   --output_dir {directory to save checkpoints} \
 43 |   --model_name_or_path  bert-base-uncased \
 44 |   --do_train \
 45 |   --save_steps 2000 \
 46 |   --train_dir {path to a train json splits from last step} \
 47 |   --max_len 512 \
 48 |   --fp16 \
 49 |   --per_device_train_batch_size 1 \
 50 |   --train_group_size 8 \
 51 |   --gradient_accumulation_steps 1 \
 52 |   --per_device_eval_batch_size 64 \
 53 |   --warmup_ratio 0.1 \
 54 |   --weight_decay 0.01 \
 55 |   --learning_rate 1e-5 \
 56 |   --num_train_epochs 2 \
 57 |   --overwrite_output_dir \
 58 |   --dataloader_num_workers 8
 59 | ```
 60 | Validation during training is to be added. Validation over the entire dev set is too expensive to do every x steps. Suggestions of alternatives are welcome! (You can run inference during training separately by loading saved checkpoints.) After training, the last few checkpoints are usually good. 
 61 | 
 62 | ## Inference
 63 | First build the ranking input,
 64 | ```
 65 | mkdir -p {directory to save output}
 66 | python helpers/topk_text_2_json.py \
 67 |   --file {path to dev.d100.tsv} \
 68 |   --save_to {directory to save output}/all.json \
 69 |   --generate_id_to {directory to save output}/ids.tsv \
 70 |   --tokenizer bert-base-uncased \
 71 |   --truncate 512 \
 72 |   --q_truncate -1 
 73 | ```
 74 | Run inference with generated input using trained model checkpoint. You can also use DDP for inference by adding `python -m torch.distributed.launch --nproc_per_node {n_gpus}`. DP is currently not supported.
 75 | ```
 76 | python run_marco.py \
 77 |   --output_dir {score saving directory, not used for the moment} \
 78 |   --model_name_or_path {path to checkpoint} \
 79 |   --tokenizer_name bert-base-uncased \
 80 |   --do_predict \
 81 |   --max_len 512 \
 82 |   --fp16 \
 83 |   --per_device_eval_batch_size 64 \
 84 |   --dataloader_num_workers 8 \
 85 |   --pred_path {path to prediction json} \
 86 |   --pred_id_file  {path to prediction id tsv} \
 87 |   --rank_score_path {save path of the text file of scores}
 88 | ```
 89 | Or with hub model,
 90 | ```
 91 | python run_marco.py \
 92 |   --output_dir {score saving directory, not used for the moment} \
 93 |   --model_name_or_path Luyu/bert-base-mdoc-hdct \
 94 |   --do_predict \
 95 |   --max_len 512 \
 96 |   --fp16 \
 97 |   --per_device_eval_batch_size 64 \
 98 |   --dataloader_num_workers 8 \
 99 |   --pred_path {path to prediction json} \
100 |   --pred_id_file  {path to prediction id tsv} \
101 |   --rank_score_path {save path of the text file of scores}
102 | ```
103 | Convert score to MS MARCO format. This creates a MS MARCO format score file in the same directory,
104 | ```
105 | python {package root}/helpers/score_to_marco.py \
106 |   --score_file {path to inference output}
107 | ```
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/examples/msmarco-doc/helpers/build_train_from_ranking.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Reranker Author. All rights reserved.
  2 | #
  3 | # This source code is licensed under the license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | from argparse import ArgumentParser
  6 | from transformers import AutoTokenizer
  7 | import json
  8 | import os
  9 | from collections import defaultdict
 10 | import datasets
 11 | import random
 12 | from tqdm import tqdm
 13 | 
 14 | parser = ArgumentParser()
 15 | parser.add_argument('--tokenizer_name', required=True)
 16 | parser.add_argument('--rank_file', required=True)
 17 | parser.add_argument('--truncate', type=int, default=512)
 18 | 
 19 | parser.add_argument('--sample_from_top', type=int, required=True)
 20 | parser.add_argument('--n_sample', type=int, default=100)
 21 | parser.add_argument('--random', action='store_true')
 22 | parser.add_argument('--json_dir', required=True)
 23 | 
 24 | parser.add_argument('--qrel', required=True)
 25 | parser.add_argument('--query_collection', required=True)
 26 | parser.add_argument('--doc_collection', required=True)
 27 | args = parser.parse_args()
 28 | 
 29 | 
 30 | def read_qrel():
 31 |     import gzip, csv
 32 |     qrel = {}
 33 |     with gzip.open(args.qrel, 'rt', encoding='utf8') as f:
 34 |         tsvreader = csv.reader(f, delimiter=" ")
 35 |         for [topicid, _, docid, rel] in tsvreader:
 36 |             assert rel == "1"
 37 |             if topicid in qrel:
 38 |                 qrel[topicid].append(docid)
 39 |             else:
 40 |                 qrel[topicid] = [docid]
 41 |     return qrel
 42 | 
 43 | 
 44 | qrel = read_qrel()
 45 | rankings = defaultdict(list)
 46 | no_judge = set()
 47 | with open(args.rank_file) as f:
 48 |     for l in f:
 49 |         qid, pid, rank = l.split()
 50 |         if qid not in qrel:
 51 |             no_judge.add(qid)
 52 |             continue
 53 |         if pid in qrel[qid]:
 54 |             continue
 55 |         # append passage if & only if it is not juddged relevant but ranks high
 56 |         rankings[qid].append(pid)
 57 | 
 58 | print(f'{len(no_judge)} queries not judged and skipped', flush=True)
 59 | 
 60 | columns = ['did', 'url', 'title', 'body']
 61 | collection = args.doc_collection
 62 | collection = datasets.load_dataset(
 63 |     'csv',
 64 |     data_files=collection,
 65 |     column_names=['did', 'url', 'title', 'body'],
 66 |     delimiter='\t',
 67 |     ignore_verifications=True,
 68 | )['train']
 69 | qry_collection = args.query_collection
 70 | qry_collection = datasets.load_dataset(
 71 |     'csv',
 72 |     data_files=qry_collection,
 73 |     column_names=['qid', 'qry'],
 74 |     delimiter='\t',
 75 |     ignore_verifications=True,
 76 | )['train']
 77 | 
 78 | doc_map = {x['did']: idx for idx, x in enumerate(collection)}
 79 | qry_map = {str(x['qid']): idx for idx, x in enumerate(qry_collection)}
 80 | 
 81 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
 82 | 
 83 | out_file = args.rank_file
 84 | if out_file.endswith('.tsv') or out_file.endswith('.txt'):
 85 |     out_file = out_file[:-4]
 86 | out_file = os.path.join(args.json_dir, os.path.split(out_file)[1])
 87 | out_file = out_file + '.group.json'
 88 | 
 89 | queries = list(rankings.keys())
 90 | with open(out_file, 'w') as f:
 91 |     for qid in tqdm(queries):
 92 |         # pick from top of the full initial ranking
 93 |         negs = rankings[qid][:args.sample_from_top]
 94 |         # shuffle if random flag is on
 95 |         if args.random:
 96 |             random.shuffle(negs)
 97 |         # pick n samples
 98 |         negs = negs[:args.n_sample]
 99 | 
100 |         neg_encoded = []
101 |         for neg in negs:
102 |             idx = doc_map[neg]
103 |             item = collection[idx]
104 |             did, url, title, body = (item[k] for k in columns)
105 |             url, title, body = map(lambda v: v if v else '', [url, title, body])
106 |             encoded_neg = tokenizer.encode(
107 |                 url + tokenizer.sep_token + title + tokenizer.sep_token + body,
108 |                 add_special_tokens=False,
109 |                 max_length=args.truncate,
110 |                 truncation=True
111 |             )
112 |             neg_encoded.append({
113 |                 'passage': encoded_neg,
114 |                 'pid': neg,
115 |             })
116 |         pos_encoded = []
117 |         for pos in qrel[qid]:
118 |             idx = doc_map[pos]
119 |             item = collection[idx]
120 |             did, url, title, body = (item[k] for k in columns)
121 |             url, title, body = map(lambda v: v if v else '', [url, title, body])
122 |             encoded_pos = tokenizer.encode(
123 |                 url + tokenizer.sep_token + title + tokenizer.sep_token + body,
124 |                 add_special_tokens=False,
125 |                 max_length=args.truncate,
126 |                 truncation=True
127 |             )
128 |             pos_encoded.append({
129 |                 'passage': encoded_pos,
130 |                 'pid': pos,
131 |             })
132 |         q_idx = qry_map[qid]
133 |         query_dict = {
134 |             'qid': qid,
135 |             'query': tokenizer.encode(
136 |                 qry_collection[q_idx]['qry'],
137 |                 add_special_tokens=False,
138 |                 max_length=args.truncate,
139 |                 truncation=True),
140 |         }
141 |         item_set = {
142 |             'qry': query_dict,
143 |             'pos': pos_encoded,
144 |             'neg': neg_encoded,
145 |         }
146 |         f.write(json.dumps(item_set) + '\n')


--------------------------------------------------------------------------------
/examples/msmarco-doc/helpers/topk_text_2_json.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Reranker Author. All rights reserved.
  2 | #
  3 | # This source code is licensed under the license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | from transformers import AutoTokenizer
  7 | from argparse import ArgumentParser
  8 | from tqdm import tqdm
  9 | from multiprocessing import Pool
 10 | import json
 11 | import datasets
 12 | 
 13 | parser = ArgumentParser()
 14 | 
 15 | parser.add_argument('--file', required=True)
 16 | parser.add_argument('--save_to', required=True)
 17 | parser.add_argument('--tokenizer', required=True)
 18 | parser.add_argument('--generate_id_to')
 19 | parser.add_argument('--truncate', type=int, default=512)
 20 | parser.add_argument('--q_truncate', type=int, default=16)
 21 | args = parser.parse_args()
 22 | 
 23 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)
 24 | SEP = tokenizer.sep_token
 25 | 
 26 | 
 27 | columns = [
 28 |         'qid', 'query', 'did', 'url', 'title', 'body', 'unused'
 29 | ]
 30 | 
 31 | 
 32 | def encode_line(line):
 33 |     qid, qry, did, url, title, body = line.strip().split('\t')
 34 |     qry_encoded = tokenizer.encode(
 35 |         qry,
 36 |         truncation=True if args.q_truncate else False,
 37 |         max_length=args.q_truncate,
 38 |         add_special_tokens=False,
 39 |         padding=False,
 40 |     )
 41 |     doc_encoded = tokenizer.encode(
 42 |             url + SEP + title + SEP + body,
 43 |             truncation=True,
 44 |             max_length=args.truncate,
 45 |             add_special_tokens=False,
 46 |             padding=False
 47 |         )
 48 |     entry = {
 49 |         'qid': qid,
 50 |         'pid': did,
 51 |         'qry': qry_encoded,
 52 |         'psg': doc_encoded,
 53 |     }
 54 |     entry = json.dumps(entry)
 55 |     return entry, qid, did
 56 | 
 57 | def encode_item(item):
 58 |     qid, qry, did, url, title, body, _ = (item[k] for k in columns)
 59 |     url, title, body = map(lambda v: v if v else '', [url, title, body])
 60 |     qry_encoded = tokenizer.encode(
 61 |         qry,
 62 |         truncation=True if args.q_truncate else False,
 63 |         max_length=args.q_truncate,
 64 |         add_special_tokens=False,
 65 |         padding=False,
 66 |     )
 67 |     doc_encoded = tokenizer.encode(
 68 |             url + SEP + title + SEP + body,
 69 |             truncation=True,
 70 |             max_length=args.truncate,
 71 |             add_special_tokens=False,
 72 |             padding=False
 73 |         )
 74 |     entry = {
 75 |         'qid': qid,
 76 |         'pid': did,
 77 |         'qry': qry_encoded,
 78 |         'psg': doc_encoded,
 79 |     }
 80 |     entry = json.dumps(entry)
 81 |     return entry, qid, did
 82 | 
 83 | 
 84 | data_set = datasets.load_dataset(
 85 |     'csv',
 86 |     data_files=args.file,
 87 |     column_names=columns,
 88 |     delimiter='\t',
 89 |     ignore_verifications=True
 90 | )['train']
 91 | 
 92 | 
 93 | with open(args.save_to, 'w') as jfile:
 94 |     # for l in text_file:
 95 |     #     json_item = method_name(args, l, tokenizer)
 96 |     all_ids = []
 97 |     if args.q_truncate < 0:
 98 |         print('queries are not truncated', flush=True)
 99 |         args.q_truncate = None
100 |     with Pool() as p:
101 |         all_json_items = p.imap(
102 |             encode_item,
103 |             tqdm(data_set),
104 |             chunksize=100
105 |         )
106 |         for json_item, qry_id, doc_id in all_json_items:
107 |             all_ids.append((qry_id, doc_id))
108 |             jfile.write(json_item + '\n')
109 | 
110 |     if args.generate_id_to is not None:
111 |         with open(args.generate_id_to, 'w') as id_file:
112 |             for qry_id, doc_id in all_ids:
113 |                 id_file.write(f'{qry_id}\t{doc_id}\n')
114 | 


--------------------------------------------------------------------------------
/examples/msmarco-doc/run_marco.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Reranker Author. All rights reserved.
  2 | # Code structure inspired by HuggingFace run_glue.py in the transformers library.
  3 | #
  4 | # This source code is licensed under the license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import logging
  8 | import os
  9 | 
 10 | from reranker import Reranker, RerankerDC
 11 | from reranker import RerankerTrainer, RerankerDCTrainer
 12 | from reranker.data import GroupedTrainDataset, PredictionDataset, GroupCollator
 13 | from reranker.arguments import ModelArguments, DataArguments, \
 14 |     RerankerTrainingArguments as TrainingArguments
 15 | 
 16 | from transformers import AutoConfig, AutoTokenizer
 17 | from transformers import (
 18 |     HfArgumentParser,
 19 |     set_seed,
 20 | )
 21 | 
 22 | logger = logging.getLogger(__name__)
 23 | 
 24 | 
 25 | def main():
 26 |     parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
 27 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 28 |     model_args: ModelArguments
 29 |     data_args: DataArguments
 30 |     training_args: TrainingArguments
 31 | 
 32 |     if (
 33 |             os.path.exists(training_args.output_dir)
 34 |             and os.listdir(training_args.output_dir)
 35 |             and training_args.do_train
 36 |             and not training_args.overwrite_output_dir
 37 |     ):
 38 |         raise ValueError(
 39 |             f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
 40 |         )
 41 | 
 42 |     # Setup logging
 43 |     logging.basicConfig(
 44 |         format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
 45 |         datefmt="%m/%d/%Y %H:%M:%S",
 46 |         level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
 47 |     )
 48 |     logger.warning(
 49 |         "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
 50 |         training_args.local_rank,
 51 |         training_args.device,
 52 |         training_args.n_gpu,
 53 |         bool(training_args.local_rank != -1),
 54 |         training_args.fp16,
 55 |     )
 56 |     logger.info("Training/evaluation parameters %s", training_args)
 57 |     logger.info("Model parameters %s", model_args)
 58 |     logger.info("Data parameters %s", data_args)
 59 | 
 60 |     # Set seed
 61 |     set_seed(training_args.seed)
 62 | 
 63 |     num_labels = 1
 64 | 
 65 |     config = AutoConfig.from_pretrained(
 66 |         model_args.config_name if model_args.config_name else model_args.model_name_or_path,
 67 |         num_labels=num_labels,
 68 |         cache_dir=model_args.cache_dir,
 69 |     )
 70 |     tokenizer = AutoTokenizer.from_pretrained(
 71 |         model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
 72 |         cache_dir=model_args.cache_dir,
 73 |         use_fast=False,
 74 |     )
 75 | 
 76 |     _model_class = RerankerDC if training_args.distance_cache else Reranker
 77 | 
 78 |     model = _model_class.from_pretrained(
 79 |         model_args, data_args, training_args,
 80 |         model_args.model_name_or_path,
 81 |         from_tf=bool(".ckpt" in model_args.model_name_or_path),
 82 |         config=config,
 83 |         cache_dir=model_args.cache_dir,
 84 |     )
 85 | 
 86 |     # Get datasets
 87 |     if training_args.do_train:
 88 |         train_dataset = GroupedTrainDataset(
 89 |             data_args, data_args.train_path, tokenizer=tokenizer, train_args=training_args
 90 |         )
 91 |     else:
 92 |         train_dataset = None
 93 | 
 94 | 
 95 |     # Initialize our Trainer
 96 |     _trainer_class = RerankerDCTrainer if training_args.distance_cache else RerankerTrainer
 97 |     trainer = _trainer_class(
 98 |         model=model,
 99 |         args=training_args,
100 |         train_dataset=train_dataset,
101 |         data_collator=GroupCollator(tokenizer),
102 |     )
103 | 
104 |     # Training
105 |     if training_args.do_train:
106 |         trainer.train(
107 |             model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
108 |         )
109 |         trainer.save_model()
110 |         # For convenience, we also re-save the tokenizer to the same directory,
111 |         # so that you can share your model easily on huggingface.co/models =)
112 |         if trainer.is_world_process_zero():
113 |             tokenizer.save_pretrained(training_args.output_dir)
114 | 
115 |     if training_args.do_eval:
116 |         trainer.evaluate()
117 | 
118 |     if training_args.do_predict:
119 |         logging.info("*** Prediction ***")
120 | 
121 |         if os.path.exists(data_args.rank_score_path):
122 |             if os.path.isfile(data_args.rank_score_path):
123 |                 raise FileExistsError(f'score file {data_args.rank_score_path} already exists')
124 |             else:
125 |                 raise ValueError(f'Should specify a file name')
126 |         else:
127 |             score_dir = os.path.split(data_args.rank_score_path)[0]
128 |             if not os.path.exists(score_dir):
129 |                 logger.info(f'Creating score directory {score_dir}')
130 |                 os.makedirs(score_dir)
131 | 
132 |         test_dataset = PredictionDataset(
133 |             data_args.pred_path, tokenizer=tokenizer,
134 |             max_len=data_args.max_len,
135 |         )
136 |         assert data_args.pred_id_file is not None
137 | 
138 |         pred_qids = []
139 |         pred_pids = []
140 |         with open(data_args.pred_id_file) as f:
141 |             for l in f:
142 |                 q, p = l.split()
143 |                 pred_qids.append(q)
144 |                 pred_pids.append(p)
145 | 
146 |         pred_scores = trainer.predict(test_dataset=test_dataset).predictions
147 | 
148 |         if trainer.is_world_process_zero():
149 |             assert len(pred_qids) == len(pred_scores)
150 |             with open(data_args.rank_score_path, "w") as writer:
151 |                 for qid, pid, score in zip(pred_qids, pred_pids, pred_scores):
152 |                     writer.write(f'{qid}\t{pid}\t{score}\n')
153 | 
154 | 
def _mp_fn(index):
    """Per-process entry point for xla_spawn (TPUs); ``index`` is not used."""
    return main()


# Run the full pipeline when executed as a script.
if __name__ == "__main__":
    main()
162 | 


--------------------------------------------------------------------------------
/helpers/score_to_marco.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Reranker Author. All rights reserved.
 2 | #
 3 | # This source code is licensed under the license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | 
 6 | import argparse
 7 | from collections import defaultdict
 8 | 
 9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--score_file', required=True)
11 | parser.add_argument('--run_id', default='marco')
12 | args = parser.parse_args()
13 | 
14 | with open(args.score_file) as f:
15 |     lines = f.readlines()
16 | 
17 | all_scores = defaultdict(dict)
18 | 
19 | for line in lines:
20 |     if len(line.strip()) == 0:
21 |         continue
22 |     qid, did, score = line.strip().split()
23 |     score = float(score)
24 |     all_scores[qid][did] = score
25 | 
26 | qq = list(all_scores.keys())
27 | 
28 | with open(args.score_file + '.marco', 'w') as f:
29 |     for qid in qq:
30 |         score_list = sorted(list(all_scores[qid].items()), key=lambda x: x[1], reverse=True)
31 |         for rank, (did, score) in enumerate(score_list):
32 |             f.write(f'{qid}\t{did}\t{rank+1}\n')
33 | 
34 | 


--------------------------------------------------------------------------------
/helpers/score_to_tein.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Reranker Author. All rights reserved.
 2 | #
 3 | # This source code is licensed under the license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | 
 6 | import argparse
 7 | from collections import defaultdict
 8 | 
 9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--score_file', required=True)
11 | parser.add_argument('--run_id', default='marco')
12 | args = parser.parse_args()
13 | 
14 | with open(args.score_file) as f:
15 |     lines = f.readlines()
16 | 
17 | all_scores = defaultdict(dict)
18 | 
19 | for line in lines:
20 |     if len(line.strip()) == 0:
21 |         continue
22 |     qid, did, score = line.strip().split()
23 |     score = float(score)
24 |     all_scores[qid][did] = score
25 | 
26 | qq = list(all_scores.keys())
27 | 
28 | with open(args.score_file + '.teIn', 'w') as f:
29 |     for qid in qq:
30 |         score_list = sorted(list(all_scores[qid].items()), key=lambda x: x[1], reverse=True)
31 |         for rank, (did, score) in enumerate(score_list):
32 |             f.write(f'{qid}\tQ0\t{did}\t{rank+1}\t{score}\t{args.run_id}\n')
33 | 
34 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2021 Reranker Author. All rights reserved.
 3 | #
 4 | # This source code is licensed under the license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | from setuptools import setup, find_packages
 8 | 
 9 | setup(
10 |     name='reranker',
11 |     version='0.0.1',
12 |     package_dir={"": "src"},
13 |     packages=find_packages("src"),
14 |     install_requires=[
15 |         'torch>=1.6.0',
16 |         'transformers>=4.0.0',
17 |         'datasets>=1.1.3',
18 |     ],
19 |     url='',
20 |     license='CC-BY-NC 4.0',
21 |     author='Luyu Gao',
22 |     author_email='luyug@cs.cmu.edu',
23 |     description=''
24 | )
25 | 


--------------------------------------------------------------------------------
/src/reranker/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Reranker Author. All rights reserved.
2 | #
3 | # This source code is licensed under the license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | 
6 | from .modeling import Reranker, RerankerDC, RerankerForInference
7 | from .trainer import RerankerTrainer, RerankerDCTrainer
8 | 


--------------------------------------------------------------------------------
/src/reranker/arguments.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Reranker Author. All rights reserved.
 2 | #
 3 | # This source code is licensed under the license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | 
 6 | import os
 7 | from dataclasses import dataclass, field
 8 | from typing import Optional, Union, List
 9 | from transformers import TrainingArguments
10 | 
11 | 
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )
    # Left as None, no scaling is applied.
    temperature: Optional[float] = field(
        default=None,
        metadata={"help": "If set, the reranker divides its logits by this value; must be positive"},
    )
31 | 
32 | 
@dataclass
class DataArguments:
    """Locations and sizing of the training / prediction data.

    When ``train_dir`` (resp. ``pred_dir``) is given, ``train_path`` (resp.
    ``pred_path``) is replaced after construction by the list of files found
    in that directory.
    """

    train_dir: str = field(default=None, metadata={"help": "Path to train directory"})
    train_path: Union[str] = field(default=None, metadata={"help": "Path to train data"})
    train_group_size: int = 8
    dev_path: str = field(default=None, metadata={"help": "Path to dev data"})
    pred_path: List[str] = field(default=None, metadata={"help": "Path to prediction data"})
    pred_dir: str = field(default=None, metadata={"help": "Path to prediction directory"})
    pred_id_file: str = None
    rank_score_path: str = field(default=None, metadata={"help": "where to save the match score"})
    max_len: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization for passage. Sequences longer "
                    "than this will be truncated, sequences shorter will be padded."
        },
    )

    def __post_init__(self):
        """Expand ``train_dir`` / ``pred_dir`` into explicit file lists."""
        train_dir = self.train_dir
        if train_dir is not None:
            # Only names ending in 'tsv' or 'json' count as training files.
            self.train_path = [
                os.path.join(train_dir, name)
                for name in os.listdir(train_dir)
                if name.endswith(('tsv', 'json'))
            ]

        pred_dir = self.pred_dir
        if pred_dir is not None:
            # Every entry of the prediction directory is used, unfiltered.
            self.pred_path = [
                os.path.join(pred_dir, name) for name in os.listdir(pred_dir)
            ]
73 | 
74 | 
@dataclass
class RerankerTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with reranker-specific switches."""

    warmup_ratio: float = 0.1
    # Selects the RerankerDC model / RerankerDCTrainer in the run script.
    distance_cache: bool = False
    # NOTE(review): consumed outside this module (presumably the trainer) - confirm.
    distance_cache_stride: int = 2

    # When set, each group is split across ranks and logits are gathered.
    collaborative: bool = False
82 | 


--------------------------------------------------------------------------------
/src/reranker/data.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Reranker Author. All rights reserved.
  2 | #
  3 | # This source code is licensed under the license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import random
  7 | from dataclasses import dataclass
  8 | 
  9 | import datasets
 10 | from typing import Union, List, Tuple, Dict
 11 | 
 12 | import torch
 13 | from torch.utils.data import Dataset
 14 | 
 15 | from .arguments import DataArguments, RerankerTrainingArguments
 16 | from transformers import PreTrainedTokenizer, BatchEncoding
 17 | from transformers import DataCollatorWithPadding
 18 | 
 19 | 
 20 | class GroupedTrainDataset(Dataset):
 21 |     query_columns = ['qid', 'query']
 22 |     document_columns = ['pid', 'passage']
 23 | 
 24 |     def __init__(
 25 |             self,
 26 |             args: DataArguments,
 27 |             path_to_tsv: Union[List[str], str],
 28 |             tokenizer: PreTrainedTokenizer,
 29 |             train_args: RerankerTrainingArguments = None,
 30 |     ):
 31 |         self.nlp_dataset = datasets.load_dataset(
 32 |             'json',
 33 |             data_files=path_to_tsv,
 34 |             ignore_verifications=False,
 35 |             features=datasets.Features({
 36 |                 'qry': {
 37 |                     'qid': datasets.Value('string'),
 38 |                     'query': [datasets.Value('int32')],
 39 |                 },
 40 |                 'pos': [{
 41 |                     'pid': datasets.Value('string'),
 42 |                     'passage': [datasets.Value('int32')],
 43 |                 }],
 44 |                 'neg': [{
 45 |                     'pid': datasets.Value('string'),
 46 |                     'passage': [datasets.Value('int32')],
 47 |                 }]}
 48 |             )
 49 |         )['train']
 50 | 
 51 |         self.tok = tokenizer
 52 |         self.SEP = [self.tok.sep_token_id]
 53 |         self.args = args
 54 |         self.total_len = len(self.nlp_dataset)
 55 |         self.train_args = train_args
 56 | 
 57 |         if train_args is not None and train_args.collaborative:
 58 |             import torch.distributed as dist
 59 |             if not dist.is_available():
 60 |                 raise RuntimeError("Requires distributed package to be available")
 61 |             self.world_size = dist.get_world_size()
 62 |             self.rank = dist.get_rank()
 63 |             chunk_size = int(self.args.train_group_size / self.world_size)
 64 |             self.chunk_start = self.rank * chunk_size
 65 |             self.chunk_end = self.chunk_start + chunk_size
 66 | 
 67 |     def create_one_example(self, qry_encoding: List[int], doc_encoding: List[int]):
 68 |         item = self.tok.encode_plus(
 69 |             qry_encoding,
 70 |             doc_encoding,
 71 |             truncation='only_second',
 72 |             max_length=self.args.max_len,
 73 |             padding=False,
 74 |         )
 75 |         return item
 76 | 
 77 |     def __len__(self):
 78 |         return self.total_len
 79 | 
 80 |     def __getitem__(self, item) -> List[BatchEncoding]:
 81 |         group = self.nlp_dataset[item]
 82 |         examples = []
 83 |         group_batch = []
 84 |         _, qry = (group['qry'][k] for k in self.query_columns)
 85 |         _, pos_psg = [
 86 |             random.choice(group['pos'])[k] for k in self.document_columns]
 87 |         examples.append((qry, pos_psg))
 88 | 
 89 |         if len(group['neg']) < self.args.train_group_size - 1:
 90 |             negs = random.choices(group['neg'], k=self.args.train_group_size - 1)
 91 |         else:
 92 |             negs = random.sample(group['neg'], k=self.args.train_group_size - 1)
 93 | 
 94 |         for neg_entry in negs:
 95 |             _, neg_psg = [neg_entry[k] for k in self.document_columns]
 96 |             examples.append((qry, neg_psg))
 97 | 
 98 |         # collaborative mode, split the group
 99 |         if self.train_args is not None and self.train_args.collaborative:
100 |             examples = examples[self.chunk_start: self.chunk_end]
101 | 
102 |         for e in examples:
103 |             group_batch.append(self.create_one_example(*e))
104 |         return group_batch
105 | 
106 | 
class PredictionDataset(Dataset):
    """Pre-tokenized (query, passage) pairs to be scored.

    Rows are loaded from JSON files; each row provides the fields listed in
    ``columns``. Only ``qry`` and ``psg`` are used to build model inputs.
    """

    columns = [
        'qid', 'pid', 'qry', 'psg'
    ]

    def __init__(self, path_to_json: List[str], tokenizer: PreTrainedTokenizer, max_len=128):
        """
        Args:
            path_to_json: JSON file(s) passed to ``datasets.load_dataset``.
            tokenizer: tokenizer used to join query and passage ids.
            max_len: maximum length of the joined sequence.
        """
        self.nlp_dataset = datasets.load_dataset(
            'json',
            data_files=path_to_json,
        )['train']
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.nlp_dataset)

    def __getitem__(self, item):
        """Return the encoded pair for row ``item``; only the passage is truncated."""
        # Fetch the row once instead of re-indexing the dataset per column.
        row = self.nlp_dataset[item]
        return self.tok.encode_plus(
            row['qry'],
            row['psg'],
            truncation='only_second',
            max_length=self.max_len,
            padding=False,
        )
132 | 
133 | 
@dataclass
class GroupCollator(DataCollatorWithPadding):
    """
    Padding collator that also accepts grouped features.

    When each element of the batch is itself a list of encoded examples (one
    list per training group), the lists are concatenated in order into a
    single flat list before being handed to ``DataCollatorWithPadding``.
    Flat batches are passed through unchanged.
    """

    def __call__(self, features) -> Dict[str, torch.Tensor]:
        if isinstance(features[0], list):
            # Flatten in one linear pass; sum(features, []) would re-copy the
            # accumulated list for every group.
            features = [encoding for group in features for encoding in group]
        return super().__call__(features)
148 | 


--------------------------------------------------------------------------------
/src/reranker/dist/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luyug/Reranker/b99100f20aea53f55d9e55606a6b25d08e3172d5/src/reranker/dist/__init__.py


--------------------------------------------------------------------------------
/src/reranker/dist/sampler.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Reranker Author. All rights reserved.
 2 | #
 3 | # This source code is licensed under the license found in the
 4 | # LICENSE file in the root directory of this source tree.
 5 | from typing import Optional
 6 | 
 7 | import torch
 8 | from torch.utils.data import DistributedSampler, Dataset
 9 | 
10 | import logging
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
class SyncedSampler(DistributedSampler):
    """Distributed sampler that yields every index on every replica.

    ``num_samples`` and ``total_size`` are both set to the dataset length, and
    iteration returns the full (optionally shuffled) index list. The shuffle
    is seeded with ``seed + epoch``.
    """

    def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None,
                 rank: Optional[int] = None, shuffle: bool = True,
                 seed: int = 0) -> None:
        super().__init__(
            dataset,
            num_replicas=num_replicas,
            rank=rank,
            shuffle=shuffle,
            seed=seed,
        )
        # Override the per-replica sizes computed by the parent class.
        dataset_len = len(self.dataset)
        self.num_samples = dataset_len
        self.total_size = dataset_len

    def __iter__(self):
        dataset_len = len(self.dataset)  # type: ignore
        if not self.shuffle:
            order = list(range(dataset_len))
        else:
            # deterministically shuffle based on epoch and seed
            generator = torch.Generator()
            generator.manual_seed(self.seed + self.epoch)
            order = torch.randperm(dataset_len, generator=generator).tolist()

        # DO NOT SUB SAMPLE!
        assert len(order) == self.total_size
        assert len(order) == self.num_samples

        return iter(order)

    def set_epoch(self, epoch: int):
        """Record the epoch used to seed shuffling and log the change."""
        super().set_epoch(epoch)
        logger.info(f'Setting Data Sampler Epoch to {epoch}')
41 | 


--------------------------------------------------------------------------------
/src/reranker/modeling.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Reranker Author. All rights reserved.
  2 | #
  3 | # This source code is licensed under the license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | from typing import Optional
  7 | 
  8 | import torch
  9 | import torch.functional as F
 10 | import copy
 11 | from transformers import AutoModelForSequenceClassification, AutoTokenizer,\
 12 |     PreTrainedModel, PreTrainedTokenizer
 13 | from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPooling
 14 | from torch import nn
 15 | import torch.distributed as dist
 16 | 
 17 | from .arguments import ModelArguments, DataArguments, \
 18 |     RerankerTrainingArguments as TrainingArguments
 19 | import logging
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | 
 23 | 
 24 | class Reranker(nn.Module):
 25 |     def __init__(self, hf_model: PreTrainedModel, model_args: ModelArguments, data_args: DataArguments,
 26 |                  train_args: TrainingArguments):
 27 |         super().__init__()
 28 |         self.hf_model = hf_model
 29 |         self.model_args = model_args
 30 |         self.train_args = train_args
 31 |         self.data_args = data_args
 32 | 
 33 |         self.cross_entropy = nn.CrossEntropyLoss(reduction='mean')
 34 | 
 35 |         self.register_buffer(
 36 |             'target_label',
 37 |             torch.zeros(self.train_args.per_device_train_batch_size, dtype=torch.long)
 38 |         )
 39 | 
 40 |         if train_args.local_rank >= 0:
 41 |             self.world_size = dist.get_world_size()
 42 | 
 43 |     def forward(self, batch):
 44 |         ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True)
 45 |         logits = ranker_out.logits
 46 | 
 47 |         if self.model_args.temperature is not None:
 48 |             assert self.model_args.temperature > 0
 49 |             logits = logits / self.model_args.temperature
 50 | 
 51 |         if self.train_args.collaborative:
 52 |             logits = self.dist_gather_tensor(logits)
 53 |             logits = logits.view(
 54 |                 self.world_size,
 55 |                 self.train_args.per_device_train_batch_size,
 56 |                 -1  # chunk
 57 |             )
 58 |             logits = logits.transpose(0, 1).contiguous()
 59 | 
 60 |         if self.training:
 61 |             scores = logits.view(
 62 |                 self.train_args.per_device_train_batch_size,
 63 |                 self.data_args.train_group_size
 64 |             )
 65 |             loss = self.cross_entropy(scores, self.target_label)
 66 |             # if self.train_args.collaborative or self.train_args.distance_cahce:
 67 |                 # account for avg in all reduce
 68 |                 # loss = loss.float() * self.world_size
 69 | 
 70 |             return SequenceClassifierOutput(
 71 |                 loss=loss,
 72 |                 **ranker_out,
 73 |             )
 74 |         else:
 75 |             return ranker_out
 76 | 
 77 |     @classmethod
 78 |     def from_pretrained(
 79 |             cls, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments,
 80 |             *args, **kwargs
 81 |     ):
 82 |         hf_model = AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
 83 |         reranker = cls(hf_model, model_args, data_args, train_args)
 84 |         return reranker
 85 | 
 86 |     def save_pretrained(self, output_dir: str):
 87 |         self.hf_model.save_pretrained(output_dir)
 88 | 
 89 |     def dist_gather_tensor(self, t: Optional[torch.Tensor]):
 90 |         if t is None:
 91 |             return None
 92 | 
 93 |         all_tensors = [torch.empty_like(t) for _ in range(self.world_size)]
 94 |         dist.all_gather(all_tensors, t)
 95 |         all_tensors[self.train_args.local_rank] = t
 96 |         all_tensors = torch.cat(all_tensors, dim=0)
 97 | 
 98 |         return all_tensors
 99 | 
100 | 
class RerankerDC(Reranker):
    """Reranker variant whose loss gradient is computed on detached scores.

    ``compute_grad`` yields the loss and its gradient with respect to the
    scores; ``forward`` can then turn that gradient into a scalar whose
    backward pass pushes it through the wrapped model.
    """

    def compute_grad(self, scores: torch.Tensor):
        """Return the cross-entropy loss and its gradient w.r.t. ``scores``.

        Args:
            scores: Tensor reshaped here to
                ``(per_device_train_batch_size, train_group_size)``.

        Returns:
            Tuple ``(loss, grad)``: the detached loss and the gradient of the
            loss with respect to the reshaped scores.
        """
        n_rows = self.train_args.per_device_train_batch_size
        n_cols = self.data_args.train_group_size

        # Detach from whatever produced the scores and make them a fresh leaf,
        # so backward() stops at the scores and populates their .grad.
        leaf_scores = scores.view(n_rows, n_cols).detach().requires_grad_()

        loss = self.cross_entropy(leaf_scores, self.target_label)
        loss.backward()

        return loss.detach(), leaf_scores.grad

    def forward(self, batch, grad_tensor: torch.Tensor = None):
        """Run the wrapped model on ``batch``.

        Args:
            batch: Keyword arguments forwarded to ``self.hf_model``.
            grad_tensor: Optional gradient with respect to this batch's logits.

        Returns:
            In eval mode, the wrapped model's output. In training mode, the raw
            logits when ``grad_tensor`` is ``None``; otherwise the dot product
            of the flattened float logits with the flattened ``grad_tensor``.
        """
        ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True)

        if not self.training:
            return ranker_out

        logits = ranker_out.logits
        if grad_tensor is None:
            return logits

        flat_logits = logits.float().flatten()
        return torch.dot(flat_logits, grad_tensor.flatten())
124 | 
125 | 
class RerankerForInference(nn.Module):
    """Thin inference wrapper pairing a sequence classifier with its tokenizer.

    Both parts are optional at construction and can be loaded afterwards with
    ``load_pretrained_model`` / ``load_pretrained_tokenizer``.
    """

    def __init__(
            self,
            hf_model: Optional[PreTrainedModel] = None,
            tokenizer: Optional[PreTrainedTokenizer] = None
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.hf_model = hf_model

    def tokenize(self, *args, **kwargs):
        """Call the stored tokenizer with the given arguments and return its result."""
        encode = self.tokenizer
        return encode(*args, **kwargs)

    def forward(self, batch):
        """Unpack ``batch`` as keyword arguments into the wrapped model."""
        model = self.hf_model
        return model(**batch)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str):
        """Load model and tokenizer from the same name/path; the model is put in eval mode."""
        source = pretrained_model_name_or_path
        model = AutoModelForSequenceClassification.from_pretrained(source)
        tokenizer = AutoTokenizer.from_pretrained(source)

        model.eval()
        return cls(model, tokenizer)

    def load_pretrained_model(self, pretrained_model_name_or_path, *model_args, **kwargs):
        """Replace ``self.hf_model`` with a freshly loaded sequence classifier."""
        loaded = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path, *model_args, **kwargs
        )
        self.hf_model = loaded

    def load_pretrained_tokenizer(self, pretrained_model_name_or_path, *inputs, **kwargs):
        """Replace ``self.tokenizer`` with a freshly loaded tokenizer."""
        loaded = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, *inputs, **kwargs
        )
        self.tokenizer = loaded
161 | 


--------------------------------------------------------------------------------
/src/reranker/trainer.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Reranker Author. All rights reserved.
  2 | #
  3 | # This source code is licensed under the license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import os
  7 | from typing import Dict, List, Tuple, Optional, Any, Union
  8 | 
  9 | from .dist.sampler import SyncedSampler
 10 | from .modeling import Reranker, RerankerDC
 11 | 
 12 | import torch
 13 | from torch import nn
 14 | from torch.cuda.amp import autocast
 15 | from torch.utils.data import DataLoader, RandomSampler
 16 | from torch.utils.checkpoint import get_device_states, set_device_states
 17 | from torch.utils.data.distributed import DistributedSampler
 18 | 
 19 | from transformers.trainer import Trainer, nested_detach
 20 | from transformers.trainer_utils import PredictionOutput, EvalPrediction
 21 | import logging
 22 | 
 23 | logger = logging.getLogger(__name__)
 24 | 
 25 | 
 26 | class RerankerTrainer(Trainer):
 27 |     def _save(self, output_dir: Optional[str] = None):
 28 |         output_dir = output_dir if output_dir is not None else self.args.output_dir
 29 |         os.makedirs(output_dir, exist_ok=True)
 30 |         logger.info("Saving model checkpoint to %s", output_dir)
 31 |         # Save a trained model and configuration using `save_pretrained()`.
 32 |         # They can then be reloaded using `from_pretrained()`
 33 |         if not hasattr(self.model, 'save_pretrained'):
 34 |             raise NotImplementedError(
 35 |                 f'MODEL {self.model.__class__.__name__} '
 36 |                 f'does not support save_pretrained interface')
 37 |         else:
 38 |             self.model.save_pretrained(output_dir)
 39 |         if self.tokenizer is not None and self.is_world_process_zero():
 40 |             self.tokenizer.save_pretrained(output_dir)
 41 | 
 42 |         # Good practice: save your training arguments together with the trained model
 43 |         torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
 44 | 
 45 |     def _get_train_sampler(self):
 46 |         if self.args.local_rank == -1:
 47 |             return RandomSampler(self.train_dataset)
 48 |         elif self.args.collaborative:
 49 |             logger.info(f'Collaborative Mode.')
 50 |             return SyncedSampler(self.train_dataset, seed=self.args.seed)
 51 |         else:
 52 |             return DistributedSampler(self.train_dataset)
 53 | 
 54 |     def create_optimizer_and_scheduler(self, num_training_steps: int):
 55 |         if self.args.warmup_ratio > 0:
 56 |             self.args.warmup_steps = num_training_steps * self.args.warmup_ratio
 57 | 
 58 |         return super(RerankerTrainer, self).create_optimizer_and_scheduler(num_training_steps)
 59 | 
 60 |     def get_train_dataloader(self) -> DataLoader:
 61 |         """
 62 |         Returns the training :class:`~torch.utils.data.DataLoader`.
 63 | 
 64 |         Will use no sampler if :obj:`self.train_dataset` is a :obj:`torch.utils.data.IterableDataset`, a random sampler
 65 |         (adapted to distributed training if necessary) otherwise.
 66 | 
 67 |         Subclass and override this method if you want to inject some custom behavior.
 68 |         """
 69 |         if self.train_dataset is None:
 70 |             raise ValueError("Trainer: training requires a train_dataset.")
 71 |         train_sampler = self._get_train_sampler()
 72 | 
 73 |         return DataLoader(
 74 |             self.train_dataset,
 75 |             batch_size=self.args.train_batch_size,
 76 |             sampler=train_sampler,
 77 |             collate_fn=self.data_collator,
 78 |             drop_last=True,
 79 |             num_workers=self.args.dataloader_num_workers,
 80 |         )
 81 | 
 82 |     def compute_loss(self, model: Reranker, inputs):
 83 |         return model(inputs)['loss']
 84 | 
 85 |     def prediction_step(
 86 |             self,
 87 |             model: nn.Module,
 88 |             inputs: Tuple[Dict[str, Union[torch.Tensor, Any]]],
 89 |             prediction_loss_only: bool,
 90 |             ignore_keys: Optional[List[str]] = None,
 91 |     ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
 92 | 
 93 |         inputs = self._prepare_inputs(inputs)
 94 |         if ignore_keys is None:
 95 |             if hasattr(self.model, "config"):
 96 |                 ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
 97 |             else:
 98 |                 ignore_keys = []
 99 | 
100 |         with torch.no_grad():
101 |             if self.args.fp16:
102 |                 with autocast():
103 |                     outputs = model(inputs)
104 |             else:
105 |                 outputs = model(inputs)
106 | 
107 |             loss = None
108 |             if isinstance(outputs, dict):
109 |                 logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
110 |             else:
111 |                 logits = outputs
112 | 
113 |         if prediction_loss_only:
114 |             return (loss, None, None)
115 | 
116 |         logits = nested_detach(logits)
117 |         if len(logits) == 1:
118 |             logits = logits[0]
119 | 
120 |         labels = None
121 | 
122 |         return (loss, logits, labels)
123 | 
124 |     def prediction_loop(
125 |             self,
126 |             *args,
127 |             **kwargs
128 |     ) -> PredictionOutput:
129 |         pred_outs = super().prediction_loop(*args, **kwargs)
130 |         preds, label_ids, metrics = pred_outs.predictions, pred_outs.label_ids, pred_outs.metrics
131 |         preds = preds.squeeze()
132 |         if self.compute_metrics is not None:
133 |             metrics_no_label = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
134 |         else:
135 |             metrics_no_label = {}
136 | 
137 |         for key in list(metrics_no_label.keys()):
138 |             if not key.startswith("eval_"):
139 |                 metrics_no_label[f"eval_{key}"] = metrics_no_label.pop(key)
140 | 
141 |         return PredictionOutput(predictions=preds, label_ids=label_ids, metrics={**metrics, **metrics_no_label})
142 | 
class RandContext:
    """Capture the RNG state at construction and replay it inside a ``with`` block.

    On entry the RNG is forked and reset to the captured CPU state and the
    captured device states; on exit the fork is closed.
    """

    def __init__(self, *tensors):
        cpu_state = torch.get_rng_state()
        # The tensors determine which devices have their RNG state recorded.
        devices, device_states = get_device_states(*tensors)

        self.fwd_cpu_state = cpu_state
        self.fwd_gpu_devices = devices
        self.fwd_gpu_states = device_states

    def __enter__(self):
        devices = self.fwd_gpu_devices
        self._fork = torch.random.fork_rng(devices=devices, enabled=True)
        self._fork.__enter__()
        # Inside the fork, rewind to the state recorded at construction.
        torch.set_rng_state(self.fwd_cpu_state)
        set_device_states(devices, self.fwd_gpu_states)

    def __exit__(self, exc_type, exc_val, exc_tb):
        fork = self._fork
        fork.__exit__(exc_type, exc_val, exc_tb)
        self._fork = None
160 | 
class RerankerDCTrainer(RerankerTrainer):
    """Trainer that processes each batch in chunks of ``distance_cache_stride``.

    Each training step runs the model twice per chunk: once without gradients
    to collect logits, and once with gradients to back-propagate the loss
    gradient computed on the collected logits.
    """

    def _chunk_input(self, inputs: Dict[str, torch.Tensor], chunk_size: int = None):
        """Split every tensor in ``inputs`` along dim 0 into aligned chunks.

        Args:
            inputs: Mapping from argument name to tensor. It is left unmodified.
            chunk_size: Rows per chunk; defaults to
                ``self.args.distance_cache_stride``.

        Returns:
            List of dicts with the same keys as ``inputs``, one per chunk. The
            number of chunks is taken from the first key.
        """
        if chunk_size is None:
            chunk_size = self.args.distance_cache_stride

        # Build a separate mapping rather than overwriting the caller's dict.
        split_inputs = {k: v.split(chunk_size) for k, v in inputs.items()}
        keys = list(split_inputs)
        n_chunks = len(split_inputs[keys[0]])

        return [{k: split_inputs[k][i] for k in keys} for i in range(n_chunks)]

    def training_step(self, model: RerankerDC, inputs):
        """Run one chunked training step and accumulate gradients.

        Args:
            model: The (possibly wrapped) ``RerankerDC``; ``compute_grad`` is
                called on ``model.module`` when that attribute exists.
            inputs: Batch dict, prepared via ``self._prepare_inputs``.

        Returns:
            The detached loss computed over the whole batch.
        """
        model.train()
        _model = getattr(model, 'module', model)
        inputs = self._prepare_inputs(inputs)

        rnd_states = []
        all_logits = []
        chunks = self._chunk_input(inputs)

        # Pass 1: gradient-free forward over each chunk to collect logits.
        for chunk in chunks:
            # Pass the chunk tensors so the RNG state of the device they live
            # on is recorded too; without them only the CPU state is captured
            # and device-side randomness (e.g. dropout) would differ in pass 2.
            rnd_states.append(RandContext(*chunk.values()))
            with torch.no_grad():
                if self.args.fp16:
                    with autocast():
                        chunk_logits = model(chunk)
                else:
                    chunk_logits = model(chunk)
            all_logits.append(chunk_logits)

        # Loss and its gradient w.r.t. the logits, computed on the full batch.
        all_logits = torch.cat(all_logits).float()
        loss, grads = _model.compute_grad(all_logits)
        # One row of logit-gradients per chunk.
        grads = grads.view(-1, self.args.distance_cache_stride)

        # Pass 2: redo each chunk with gradients under the RNG state recorded
        # in pass 1, and back-propagate the surrogate the model returns.
        for chunk_id, chunk in enumerate(chunks):
            with rnd_states[chunk_id]:
                if self.args.fp16:
                    with autocast():
                        surrogate = model(chunk, grads[chunk_id])
                else:
                    surrogate = model(chunk, grads[chunk_id])

            if self.args.gradient_accumulation_steps > 1:
                surrogate = surrogate / self.args.gradient_accumulation_steps

            if self.args.fp16:
                self.scaler.scale(surrogate).backward()
            else:
                surrogate.backward()

        return loss.detach()
218 | 
219 | 
220 | 
221 | 


--------------------------------------------------------------------------------