├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── sentaugment_figure.png └── src ├── compress_text.py ├── faiss_retrieve.py ├── flat_retrieve.py ├── indexing.py ├── lib ├── __pycache__ │ └── indexing.cpython-36.pyc └── embeddings │ └── __pycache__ │ └── bov.cpython-36.pyc └── sase.py /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains PyCharm IDE 2 | .idea/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # macOS dir files 13 | .DS_Store 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # Checkpoints 35 | checkpoints 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | 58 | # Translations 59 | *.mo 60 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this repo 2 | 3 | ## Pull Requests 4 | 5 | In order to accept your pull request, we need you to submit a CLA. You only need 6 | to do this once to work on any of Facebook's open source projects. 7 | 8 | Complete your CLA here: 9 | 10 | ## Issues 11 | We use GitHub issues to track public bugs. Please ensure your description is 12 | clear and has sufficient instructions to be able to reproduce the issue. 13 | 14 | ## License 15 | By contributing to this repo, you agree that your contributions will be licensed 16 | under the LICENSE file in the root directory of this source tree. 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. 
Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. 
Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 
136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. 
To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. 
To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 398 | 399 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # SentAugment 3 | 4 | SentAugment is a data augmentation technique for semi-supervised learning in NLP. It uses state-of-the-art sentence embeddings to structure the information of a very large bank of sentences. The large-scale sentence embedding space is then used to retrieve in-domain unannotated sentences for any language understanding task such that semi-supervised learning techniques like self-training and knowledge-distillation can be leveraged. This means you do not need to assume the presence of unannotated sentences to use semi-supervised learning techniques. In our paper [Self-training Improves Pre-training for Natural Language Understanding](https://arxiv.org/abs/2010.02194), we show that SentAugment provides strong gains on multiple language understanding tasks when used in combination with self-training or knowledge distillation. 5 | 6 | ![Model](sentaugment_figure.png) 7 | 8 | ## Dependencies 9 | 10 | * [PyTorch](https://pytorch.org/) 11 | * [FAISS](https://github.com/facebookresearch/faiss) 12 | * [XLM](https://github.com/facebookresearch/XLM) 13 | 14 | ## I. The large-scale bank of sentences 15 | Our approach is based on a large bank of CommonCrawl web sentences. We use SentAugment to filter domain-specific unannotated data for semi-supervised learning NLP methods. 
This data can be found [here](http://www.statmt.org/cc-english/) and can be recovered from CommonCrawl with the [ccnet](https://github.com/facebookresearch/CC_Net) repository. It consists of 5 billion sentences, split across files of 100M sentences each. As an example, we are going to use the 100M sentences from the first file: 16 | 17 | ```bash 18 | mkdir data && cd data 19 | wget http://www.statmt.org/cc-english/x01.cc.5b.tar.gz 20 | ``` 21 | Then untar the archive and put all sentences into a single file: 22 | ```bash 23 | tar -xvf x01.cc.5b.tar.gz 24 | cat *.5b > keys.txt 25 | ``` 26 | 27 | Then, for fast sentence lookup, build the byte-offset reference file that lets the scripts memory-map (mmap) this text file: 28 | ```bash 29 | python src/compress_text.py --input data/keys.txt & 30 | ``` 31 | We will use this data as the bank of sentences. 32 | 33 | ## II. The SentAugment sentence embedding space (SASE) 34 | Our sentence encoder is based on the Transformer implementation of XLM. It obtains state-of-the-art performance on several STS benchmarks. To use it, first clone XLM: 35 | ```bash 36 | git clone https://github.com/facebookresearch/XLM 37 | ``` 38 | 39 | Then, download the SentAugment sentence encoder (SASE) and its SentencePiece model: 40 | ```bash 41 | cd data 42 | wget https://dl.fbaipublicfiles.com/sentaugment/sase.pth 43 | wget https://dl.fbaipublicfiles.com/sentaugment/sase.spm 44 | ``` 45 | 46 | 47 | Then, to embed sentences, run for instance: 48 | ```bash 49 | input=data/keys.txt # input text file 50 | output=data/keys.pt # output pytorch file 51 | 52 | # Encode sentences from the $input file and save their embeddings to $output 53 | python src/sase.py --input $input --model data/sase.pth --spm_model data/sase.spm --batch_size 64 --cuda "True" --output $output 54 | ``` 55 | 56 | This outputs a PyTorch file containing the sentence embeddings (dim=256). 57 | 58 | ## III. Retrieving nearest neighbor sentences from a query 59 | Now that you have constructed a sentence embedding space by encoding many sentences from CommonCrawl, you can leverage that "bank of sentences" with similarity search. 60 | From an input query sentence, you can retrieve its nearest neighbors from the bank by running: 61 | 62 | ```bash 63 | bank=data/keys.txt # bank text file (the offset file keys.ref.bin64 created by compress_text.py must be in the same folder) 64 | emb=data/keys.pt # embeddings of the bank sentences (keys) 65 | K=10000 # number of sentences to retrieve per query 66 | 67 | ## encode the input (query) sentences as SASE embeddings 68 | input=sentence.txt # input file containing a few (query) sentences 69 | python src/sase.py --input $input --model data/sase.pth --spm_model data/sase.spm --batch_size 64 --cuda "True" --output $input.pt 70 | 71 | ## use the embeddings to retrieve nearest neighbors from the bank 72 | input=sentence.txt # input file containing a few (query) sentences 73 | python src/flat_retrieve.py --input $input.pt --bank $bank --emb $emb --K $K > nn.txt & 74 | ``` 75 | 76 | Sentences in nn.txt can be used as unannotated in-domain data for semi-supervised learning. They also make good paraphrase candidates (use the cosine similarity score to keep only high-scoring pairs). 77 | 78 | In the next section, we provide approximate nearest-neighbor indexes for much faster retrieval of similar sentences. 79 | 80 | ## IV. Fast K-nearest neighbor search 81 | Fast K-nearest neighbor search is particularly important when working with a large bank of sentences. We use [FAISS](https://github.com/facebookresearch/faiss) indexes to reduce memory usage and query time.
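If you would rather build a FAISS index yourself over the `data/keys.pt` embeddings from Section II instead of downloading one of the pretrained indexes described in IV.1 below, the sketch below shows one possible way to do it. It is only illustrative: the factory string mirrors the `100M_1GPU_16GB` configuration from the table (PCA to 256 dimensions, 32768 clusters, 4-bit scalar quantization), but it is not the exact recipe used to build the released indexes, and the output path `data/my_index.faiss.idx` is just an example name.

```python
# Illustrative sketch (not the official build script): index SASE embeddings with FAISS.
import faiss
import torch

emb = torch.load("data/keys.pt").numpy()      # bank embeddings, shape (N, 256), float32
faiss.normalize_L2(emb)                       # unit-norm vectors: L2 search ranks like cosine

d = int(emb.shape[1])
index = faiss.index_factory(d, "PCAR256,IVF32768,SQ4")
index.train(emb[:2000000])                    # training the clustering/PCA/quantizer on a subsample is typically enough
index.add(emb)                                # row i of keys.pt corresponds to line i of keys.txt
faiss.write_index(index, "data/my_index.faiss.idx")

# Query: embed the query sentences with src/sase.py first (e.g. sentence.txt.pt).
query = torch.load("sentence.txt.pt").numpy()
faiss.normalize_L2(query)
faiss.extract_index_ivf(index).nprobe = 1024  # more probed clusters = better recall, slower search
dists, ids = index.search(query, 10)          # squared L2 distances and bank row ids; smaller distance = more similar
```

Because the row order of `keys.pt` follows the line order of `keys.txt`, the returned ids can be mapped back to sentences with `IndexTextQuery` from `src/indexing.py`, which is what `src/faiss_retrieve.py` does with the released indexes.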
82 | 83 | ### IV.1 - The KNN index bestiary 84 | For fast nearest-neighbor search, we provide pretrained [FAISS indexes](https://github.com/facebookresearch/faiss/wiki/The-index-factory) (see the table below). Each index enables fast NN search under a different compression scheme: the embeddings are compressed with scalar quantization (SQ4 or SQ8) and PCA dimensionality reduction (PCAR: 14, 40 or 256), and search is sped up with k-means clustering (32k or 262k centroids). Please see the [FAISS documentation](https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU) for more information on indexes and [how to train them](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index). 85 | 86 | FAISS index | \#Sentences | \#Clusters | Quantization | \#PCAR | Machine | Size 87 | |:---: |:---: |:---: | :---: |:---: | :---: | :------: | 88 | [`100M_1GPU_16GB`](https://dl.fbaipublicfiles.com/sentaugment/100M_1GPU_16GB.faiss.idx) | 100M | 32768 | SQ4 | 256 | 1GPU16 | 14GiB 89 | [`100M_1GPU_32GB`](https://dl.fbaipublicfiles.com/sentaugment/100M_1GPU_32GB.faiss.idx) | 100M | 32768 | SQ8 | 256 | 1GPU32 | 26GiB 90 | [`1B_1GPU_16GB`](https://dl.fbaipublicfiles.com/sentaugment/1B_1GPU_16GB.faiss.idx) | 1B | 262144 | SQ4 | 14 | 1GPU16 | 15GiB 91 | [`1B_1GPU_32GB`](https://dl.fbaipublicfiles.com/sentaugment/1B_1GPU_32GB.faiss.idx) | 1B | 262144 | SQ4 | 40 | 1GPU32 | 28GiB 92 | [`1B_8GPU_32GB`](https://dl.fbaipublicfiles.com/sentaugment/1B_8GPU_32GB.faiss.idx) | 1B | 262144 | SQ4 | 256 | 8GPU32 | 136GiB 93 | 94 | We provide an index that fits on a single GPU with 16GiB of memory (1GPU16), a larger one that fits on a single GPU with 32GiB of memory (1GPU32), and one that requires 8 GPUs with 32GiB each (8GPU32). The 100M-sentence indexes are built from the first file "x01.cc.5b.tar.gz", and the 1B indexes use the first ten files. All indexes are built on SASE embeddings. 95 | 96 | ### IV.2 - How to use an index to query nearest neighbors 97 | You can get the K nearest neighbors of each sentence in an input text file by running: 98 | 99 | ```bash 100 | ## encode the input (query) sentences as SASE embeddings 101 | input=sentence.txt # input file containing a few (query) sentences 102 | python src/sase.py --input $input --model data/sase.pth --spm_model data/sase.spm --batch_size 64 --cuda "True" --output $input.pt 103 | 104 | index=data/100M_1GPU_16GB.faiss.idx # FAISS index path 105 | input=sentence.txt.pt # embeddings of the input sentences (produced by the command above) 106 | bank=data/keys.txt # text file with all the data (the offset file keys.ref.bin64 must be present in the same folder) 107 | K=10 # number of sentences to retrieve per query 108 | NPROBE=1024 # number of probes for querying the index 109 | 110 | python src/faiss_retrieve.py --input $input --bank $bank --index $index --K $K --nprobe $NPROBE --gpu "True" > nn.txt & 111 | ``` 112 | This can also be used for paraphrase mining. 113 | 114 | 115 | ## Reference 116 | If you found the resources here useful, please consider citing our paper: 117 | 118 | ``` 119 | @article{du2020self, 120 | title={Self-training Improves Pre-training for Natural Language Understanding}, 121 | author={Du, Jingfei and Grave, Edouard and Gunel, Beliz and Chaudhary, Vishrav and Celebi, Onur and Auli, Michael and Stoyanov, Ves and Conneau, Alexis}, 122 | journal={arXiv preprint arXiv:2010.02194}, 123 | year={2020} 124 | } 125 | ``` 126 | 127 | ## License 128 | 129 | See the [LICENSE](LICENSE) file for more details. 130 | The majority of SentAugment is licensed under CC-BY-NC.
However, license information for PyTorch code is available at https://github.com/pytorch/pytorch/blob/master/LICENSE 131 | -------------------------------------------------------------------------------- /sentaugment_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/SentAugment/e92dc7039335dcaa96396e66cf03f50c9899dacf/sentaugment_figure.png -------------------------------------------------------------------------------- /src/compress_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | import sys 5 | import os 6 | import argparse 7 | 8 | import numpy as np 9 | import torch 10 | DIR = os.path.dirname(os.path.realpath(__file__)) 11 | sys.path.append(DIR + '/../src/lib') 12 | from indexing import CompressText 13 | 14 | 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="Generating ref file to support fetching text from memmap") 19 | parser.add_argument("--input", type=str, help="input text file") 20 | args = parser.parse_args() 21 | CompressText(args.input) 22 | 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /src/faiss_retrieve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | """ 5 | Script for retrieving nearest neighbors of sentences from the bank using a given faiss index 6 | Example: python src/faiss_retrieve.py --input $input --bank $bank --index $index --K $K 7 | """ 8 | 9 | import argparse 10 | import faiss 11 | import os 12 | import sys 13 | import time 14 | import torch 15 | 16 | from indexing import IndexLoad, IndexSearchKNN, IndexTextOpen 17 | 18 | parser = argparse.ArgumentParser(description="retrieve nearest neighbors of sentences") 19 | parser.add_argument("--input", type=str, required=True , help="input pytorch embeddings") 20 | parser.add_argument("--bank", type=str, required=True, help="compressed text file") 21 | parser.add_argument("--index", type=str, required=True, help="faiss index") 22 | parser.add_argument("--K", type=int, default=100, help="number of nearest neighbors per sentence") 23 | parser.add_argument("--nprobe", type=int, default=1024, help="number of probes for the FAISS index") 24 | parser.add_argument("--gpu", type=str, default="True", help="use gpu") 25 | 26 | args = parser.parse_args() 27 | assert args.gpu in ["True", "False"] 28 | args.gpu = eval(args.gpu) 29 | 30 | # load query embeddings 31 | query_emb = torch.load(args.input).numpy() 32 | 33 | # normalize embeddings 34 | faiss.normalize_L2(query_emb) 35 | 36 | # load the index 37 | index = IndexLoad(args.index, args.nprobe, args.gpu) 38 | 39 | # query the index and print retrieved neighbors 40 | txt_mmap, ref_mmap = IndexTextOpen(args.bank) 41 | nns = IndexSearchKNN(index, query_emb, txt_mmap, ref_mmap, args.K) 42 | for nn in nns: 43 | print(nn) 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/flat_retrieve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # 4 | """ 5 | Script that retrieves nearest neighbors of sentences from the bank (exact cosine search) 6 | Example: python src/flat_retrieve.py --input $input.pt --bank $bank --emb data/keys.pt --K $K 7 | """ 8 | 9 | import os 10 | import sys 11 | import torch 12 | import argparse 13 | import time 14 | 15 | DIR = os.path.dirname(os.path.realpath(__file__)) 16 | sys.path.append(DIR + '/../src/lib') 17 | from indexing import IndexTextOpen, IndexTextQuery 18 | 19 | parser = argparse.ArgumentParser(description="retrieve nearest neighbors of sentences") 20 | parser.add_argument("--input", type=str, required=True, help="pytorch embeddings of the query sentences") 21 | parser.add_argument("--bank", type=str, required=True, help="bank text file (its .ref.bin64 offset file must be in the same folder)") 22 | parser.add_argument("--emb", type=str, required=True, help="pytorch embeddings of the text bank") 23 | parser.add_argument("--K", type=int, default=100, help="number of nearest neighbors per sentence") 24 | 25 | args = parser.parse_args() 26 | 27 | # load query embeddings and bank embeddings 28 | query_emb = torch.load(args.input) 29 | bank_emb = torch.load(args.emb) 30 | 31 | # L2-normalize embeddings so that dot products are cosine similarities 32 | query_emb.div_(query_emb.norm(2, 1, keepdim=True).expand_as(query_emb)) 33 | bank_emb.div_(bank_emb.norm(2, 1, keepdim=True).expand_as(bank_emb)) 34 | 35 | # score and rank 36 | scores = bank_emb.mm(query_emb.t()) # B x Q 37 | _, indices = torch.topk(scores, args.K, dim=0) # K x Q 38 | 39 | # fetch and print the retrieved text 40 | txt_mmap, ref_mmap = IndexTextOpen(args.bank) 41 | for query_idx in range(indices.size(1)): 42 | for k in range(args.K): 43 | print(IndexTextQuery(txt_mmap, ref_mmap, indices[k][query_idx].item())) 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/indexing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # 4 | 5 | # indexing and search with FAISS 6 | 7 | import faiss 8 | import os.path 9 | import sys 10 | import numpy as np 11 | import torch 12 | 13 | 14 | ############################################################################### 15 | # create a FAISS index on the given data 16 | 17 | def IndexCreate(input_path, idx_type, output_path, normalize=True, dim=512): 18 | 19 | assert idx_type == 'FlatL2', 'only FlatL2 index is currently supported' 20 | x = torch.load(input_path).numpy() 21 | print(' - creating FAISS index') 22 | idx = faiss.IndexFlatL2(dim) 23 | if normalize: 24 | faiss.normalize_L2(x) 25 | idx.add(x) 26 | print(' - saving index into ' + output_path) 27 | faiss.write_index(idx, output_path) 28 | return x, idx 29 | 30 | 31 | def LoadTextSimple(text_fname): 32 | """ 33 | Naive version that loads the text into a python list, 34 | used to retrieve text from a sentence idx returned by faiss 35 | NOTE: inefficient, will be replaced with mmap 36 | """ 37 | with open(text_fname, 'r', encoding='utf-8', errors='ignore') as fin: 38 | sentences = [s.strip() for s in fin] 39 | return sentences 40 | 41 | 42 | def CompressText(txt_fname): 43 | """ 44 | generate the ref binary file storing the starting byte offset of each sentence 45 | """ 46 | fname = txt_fname.replace('.txt', '.ref.bin64') 47 | offsets = [0] 48 | with open(txt_fname, 'r', encoding='utf-8', errors='ignore') as fin: 49 | for line in fin: 50 | offsets.append(offsets[-1] + len(bytes(line, encoding='utf-8', errors='ignore'))) 51 | offsets = np.array(offsets[:-1], dtype=np.int64) # drop the end-of-file offset: one starting offset per sentence 52 | offsets.tofile(fname) 53 | 54 | 55 | ############################################################################### 56 | # Opens a text file with the sentences corresponding to the indices used 57 | # by a FAISS index 58 | # We also need the reference file with the byte offsets to the beginning 59 | # of each sentence 60 | # optionally: array with the number of words per sentence 61 | # All arrays are memory mapped 62 | 63 | def IndexTextOpen(txt_fname): 64 | # print('Reading text corpus') 65 | # print(' - texts: {:s}'.format(txt_fname)) 66 | txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8) 67 | fname = txt_fname.replace('.txt', '.ref.bin32') 68 | if os.path.isfile(fname): 69 | # print(' - sentence start offsets (32 bit): {}'.format(fname)) 70 | ref_mmap = np.memmap(fname, mode='r', dtype=np.uint32) 71 | else: 72 | fname = txt_fname.replace('.txt', '.ref.bin64') 73 | if os.path.isfile(fname): 74 | # print(' - sentence start offsets (64 bit): {}'.format(fname)) 75 | ref_mmap = np.memmap(fname, mode='r', dtype=np.uint64) 76 | else: 77 | print('ERROR: no file with sentence start offsets found for ' + txt_fname, file=sys.stderr) 78 | sys.exit(1) 79 | # print(' - found {:d} sentences'.format(ref_mmap.shape[0])) 80 | return txt_mmap, ref_mmap 81 | 82 | 83 | ############################################################################### 84 | # Return the text for the given index 85 | 86 | def IndexTextQuery(txt_mmap, ref_mmap, idx): 87 | p = int(ref_mmap[idx]) # get starting byte position 88 | i = 0 89 | dim = 10000 # max sentence length in bytes 90 | b = bytearray(dim) 91 | # copy bytes until EOL, checking the bound before reading 92 | while i < dim and txt_mmap[p+i] != 10: 93 | b[i] = txt_mmap[p+i] 94 | i += 1 95 | return b[0:i].decode('utf-8') 96 | 97 | 98 | 99 | ############################################################################### 100 | # Load a FAISS index 101 | 102 | def IndexLoad(idx_path, nprobe=0, gpu=False):
103 | print('Reading FAISS index', file=sys.stderr) 104 | print(' - index: {:s}'.format(idx_path), file=sys.stderr) 105 | index = faiss.read_index(idx_path) 106 | print(' - found {:d} sentences of dim {:d}'.format(index.ntotal, index.d), file=sys.stderr) 107 | print(' - setting nprobe to {:d}'.format(nprobe), file=sys.stderr) 108 | if gpu: 109 | print(' - transfer index to %d GPUs ' % faiss.get_num_gpus(), file=sys.stderr) 110 | index = faiss.index_cpu_to_all_gpus(index) # replicate the index on all available GPUs 111 | faiss.GpuParameterSpace().set_index_parameter(index, 'nprobe', nprobe) 112 | return index 113 | 114 | 115 | ############################################################################### 116 | # Search the [kmax] nearest vectors of [x] in the given index 117 | # and return the corresponding text lines 118 | 119 | def IndexSearchKNN(index, x, T, R, kmax=1, dedup=True): 120 | D, I = index.search(x, kmax) 121 | all_res = [] 122 | for n in range(x.shape[0]): 123 | prev = set() # for deduplication 124 | res = [] 125 | for i in range(kmax): 126 | txt = IndexTextQuery(T, R, I[n, i]) 127 | # txt = T[I[n, i]] 128 | if dedup and txt not in prev: 129 | prev.add(txt) 130 | res.append((txt, D[n, i])) 131 | all_res.append(res) 132 | return all_res 133 | -------------------------------------------------------------------------------- /src/lib/__pycache__/indexing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/SentAugment/e92dc7039335dcaa96396e66cf03f50c9899dacf/src/lib/__pycache__/indexing.cpython-36.pyc -------------------------------------------------------------------------------- /src/lib/embeddings/__pycache__/bov.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/SentAugment/e92dc7039335dcaa96396e66cf03f50c9899dacf/src/lib/embeddings/__pycache__/bov.cpython-36.pyc -------------------------------------------------------------------------------- /src/sase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # 4 | 5 | """ 6 | Script that takes text as input and outputs SASE sentence embeddings 7 | Example: python src/sase.py --input $input --model $modelpath --spm_model $spmmodel --batch_size 64 --cuda "True" --output $output 8 | """ 9 | 10 | import os 11 | import sys 12 | import torch 13 | import argparse 14 | from collections import OrderedDict 15 | import sentencepiece as spm 16 | 17 | sys.path.insert(0, 'XLM/') 18 | 19 | from src.utils import AttrDict 20 | from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD 21 | from src.model.transformer import TransformerModel 22 | 23 | parser = argparse.ArgumentParser(description="SASE encoding") 24 | 25 | 26 | def main(): 27 | parser.add_argument("--input", type=str, default="", help="input file") 28 | parser.add_argument("--model", type=str, default="", help="model path") 29 | parser.add_argument("--spm_model", type=str, default="", help="spm model path") 30 | parser.add_argument("--batch_size", type=int, default=64, help="batch size") 31 | parser.add_argument("--max_words", type=int, default=100, help="max words") 32 | parser.add_argument("--cuda", type=str, default="True", help="use cuda") 33 | parser.add_argument("--output", type=str, default="", help="output file") 34 | args = parser.parse_args() 35 | 36 | # Reload a pretrained model 37 | reloaded = torch.load(args.model) 38 | params = AttrDict(reloaded['params']) 39 | 40 | # Reload the SPM model 41 | spm_model = spm.SentencePieceProcessor() 42 | spm_model.Load(args.spm_model) 43 | 44 | # cuda 45 | assert args.cuda in ["True", "False"] 46 | args.cuda = eval(args.cuda) 47 | 48 | # build dictionary / update parameters 49 | dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts']) 50 | params.n_words = len(dico) 51 | params.bos_index = dico.index(BOS_WORD) 52 | params.eos_index = dico.index(EOS_WORD) 53 | params.pad_index = dico.index(PAD_WORD) 54 | params.unk_index = dico.index(UNK_WORD) 55 | params.mask_index = dico.index(MASK_WORD) 56 | 57 | 58 | # build model / reload weights 59 | model = TransformerModel(params, dico, True, True) 60 | reloaded['model'] = OrderedDict({key.replace('module.', ''):reloaded['model'][key] for key in reloaded['model']}) 61 | model.load_state_dict(reloaded['model']) 62 | model.eval() 63 | 64 | if args.cuda: 65 | model.cuda() 66 | 67 | # load sentences 68 | sentences = [] 69 | with open(args.input) as f: 70 | for line in f: 71 | line = spm_model.EncodeAsPieces(line.rstrip()) 72 | line = line[:args.max_words - 1] # leave room for the prepended EOS token 73 | sentences.append(line) 74 | 75 | # encode sentences 76 | embs = [] 77 | for i in range(0, len(sentences), args.batch_size): 78 | batch = sentences[i:i+args.batch_size] 79 | lengths = torch.LongTensor([len(s) + 1 for s in batch]) 80 | bs, slen = len(batch), lengths.max().item() 81 | assert slen <= args.max_words 82 | 83 | x = torch.LongTensor(slen, bs).fill_(params.pad_index) 84 | for k in range(bs): 85 | sent = torch.LongTensor([params.eos_index] + [dico.index(w) for w in batch[k]]) 86 | x[:len(sent), k] = sent 87 | 88 | if args.cuda: 89 | x = x.cuda() 90 | lengths = lengths.cuda() 91 | 92 | with torch.no_grad(): 93 | embedding = model('fwd', x=x, lengths=lengths, langs=None, causal=False).contiguous()[0].cpu() # hidden state of the first (EOS) token as the sentence embedding 94 | 95 | embs.append(embedding) 96 | 97 | # save embeddings as a (n_sentences, emb_dim) tensor 98 | torch.save(torch.cat(embs, dim=0), args.output) 99 | 100 | if __name__ == "__main__": 101 | main() 102 | --------------------------------------------------------------------------------