├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── sentaugment_figure.png └── src ├── compress_text.py ├── faiss_retrieve.py ├── flat_retrieve.py ├── indexing.py ├── lib ├── __pycache__ │ └── indexing.cpython-36.pyc └── embeddings │ └── __pycache__ │ └── bov.cpython-36.pyc └── sase.py /.gitignore: -------------------------------------------------------------------------------- 1 | # JetBrains PyCharm IDE 2 | .idea/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # macOS dir files 13 | .DS_Store 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # Checkpoints 35 | checkpoints 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | 58 | # Translations 59 | *.mo 60 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this repo 2 | 3 | ## Pull Requests 4 | 5 | In order to accept your pull request, we need you to submit a CLA. You only need 6 | to do this once to work on any of Facebook's open source projects. 7 | 8 | Complete your CLA here: 9 | 10 | ## Issues 11 | We use GitHub issues to track public bugs. Please ensure your description is 12 | clear and has sufficient instructions to be able to reproduce the issue. 13 | 14 | ## License 15 | By contributing to this repo, you agree that your contributions will be licensed 16 | under the LICENSE file in the root directory of this source tree. 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. 
Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. 
Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 
136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. 
To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. 
To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 398 | 399 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # SentAugment 3 | 4 | SentAugment is a data augmentation technique for semi-supervised learning in NLP. It uses state-of-the-art sentence embeddings to structure the information of a very large bank of sentences. The large-scale sentence embedding space is then used to retrieve in-domain unannotated sentences for any language understanding task such that semi-supervised learning techniques like self-training and knowledge-distillation can be leveraged. This means you do not need to assume the presence of unannotated sentences to use semi-supervised learning techniques. In our paper [Self-training Improves Pre-training for Natural Language Understanding](https://arxiv.org/abs/2010.02194), we show that SentAugment provides strong gains on multiple language understanding tasks when used in combination with self-training or knowledge distillation. 5 | 6 | ![Model](sentaugment_figure.png) 7 | 8 | ## Dependencies 9 | 10 | * [PyTorch](https://pytorch.org/) 11 | * [FAISS](https://github.com/facebookresearch/faiss) 12 | * [XLM](https://github.com/facebookresearch/XLM) 13 | 14 | ## I. The large-scale bank of sentences 15 | Our approach is based on a large bank of CommonCrawl web sentences. We use SentAugment to filter domain-specific unannotated data for semi-supervised learning NLP methods. 
This data can be found [here](http://www.statmt.org/cc-english/) and can be recovered from CommonCrawl with the [ccnet](https://github.com/facebookresearch/CC_Net) repository. It consists of 5 billion sentences, split across files of 100M sentences each. As an example, we are going to use the 100M sentences from the first file: 16 | 17 | ```bash 18 | mkdir data && cd data 19 | wget http://www.statmt.org/cc-english/x01.cc.5b.tar.gz 20 | ``` 21 | Then untar the archive and put all sentences into a single file: 22 | ```bash 23 | tar -xvf x01.cc.5b.tar.gz 24 | cat *.5b > keys.txt 25 | ``` 26 | 27 | Then, for fast sentence lookup, build the byte-offset reference file that lets the scripts memory-map (mmap) this text file: 28 | ```bash 29 | python src/compress_text.py --input data/keys.txt & 30 | ``` 31 | We will use this data as the bank of sentences. 32 | 33 | ## II. The SentAugment sentence embedding space (SASE) 34 | Our sentence encoder is based on the Transformer implementation of XLM. It obtains state-of-the-art performance on several STS benchmarks. To use it, first clone XLM: 35 | ```bash 36 | git clone https://github.com/facebookresearch/XLM 37 | ``` 38 | 39 | Then, download the SentAugment sentence encoder (SASE) and its SentencePiece model: 40 | ```bash 41 | cd data 42 | wget https://dl.fbaipublicfiles.com/sentaugment/sase.pth 43 | wget https://dl.fbaipublicfiles.com/sentaugment/sase.spm 44 | ``` 45 | 46 | 47 | Then, to embed sentences, run for instance: 48 | ```bash 49 | input=data/keys.txt # input text file 50 | output=data/keys.pt # output pytorch file 51 | 52 | # Encode sentences from the $input file and save their embeddings to $output 53 | python src/sase.py --input $input --model data/sase.pth --spm_model data/sase.spm --batch_size 64 --cuda "True" --output $output 54 | ``` 55 | 56 | This outputs a PyTorch file containing the sentence embeddings (dim=256). 57 | 58 | ## III. Retrieving nearest neighbor sentences from a query 59 | Now that you have constructed a sentence embedding space by encoding many sentences from CommonCrawl, you can leverage that "bank of sentences" with similarity search. 60 | From an input query sentence, you can retrieve its nearest neighbors from the bank by running: 61 | 62 | ```bash 63 | bank=data/keys.txt # bank text file (the offset file keys.ref.bin64 created by compress_text.py must be in the same folder) 64 | emb=data/keys.pt # embeddings of the bank sentences (keys) 65 | K=10000 # number of sentences to retrieve per query 66 | 67 | ## encode the input (query) sentences as SASE embeddings 68 | input=sentence.txt # input file containing a few (query) sentences 69 | python src/sase.py --input $input --model data/sase.pth --spm_model data/sase.spm --batch_size 64 --cuda "True" --output $input.pt 70 | 71 | ## use the embeddings to retrieve nearest neighbors from the bank 72 | input=sentence.txt # input file containing a few (query) sentences 73 | python src/flat_retrieve.py --input $input.pt --bank $bank --emb $emb --K $K > nn.txt & 74 | ``` 75 | 76 | Sentences in nn.txt can be used as unannotated in-domain data for semi-supervised learning. They also make good paraphrase candidates (use the cosine similarity score to keep only high-scoring pairs). 77 | 78 | In the next section, we provide approximate nearest-neighbor indexes for much faster retrieval of similar sentences. 79 | 80 | ## IV. Fast K-nearest neighbor search 81 | Fast K-nearest neighbor search is particularly important when working with a large bank of sentences. We use [FAISS](https://github.com/facebookresearch/faiss) indexes to reduce memory usage and query time.
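If you would rather build a FAISS index yourself over the `data/keys.pt` embeddings from Section II instead of downloading one of the pretrained indexes described in IV.1 below, the sketch below shows one possible way to do it. It is only illustrative: the factory string mirrors the `100M_1GPU_16GB` configuration from the table (PCA to 256 dimensions, 32768 clusters, 4-bit scalar quantization), but it is not the exact recipe used to build the released indexes, and the output path `data/my_index.faiss.idx` is just an example name.

```python
# Illustrative sketch (not the official build script): index SASE embeddings with FAISS.
import faiss
import torch

emb = torch.load("data/keys.pt").numpy()      # bank embeddings, shape (N, 256), float32
faiss.normalize_L2(emb)                       # unit-norm vectors: L2 search ranks like cosine

d = int(emb.shape[1])
index = faiss.index_factory(d, "PCAR256,IVF32768,SQ4")
index.train(emb[:2000000])                    # training the clustering/PCA/quantizer on a subsample is typically enough
index.add(emb)                                # row i of keys.pt corresponds to line i of keys.txt
faiss.write_index(index, "data/my_index.faiss.idx")

# Query: embed the query sentences with src/sase.py first (e.g. sentence.txt.pt).
query = torch.load("sentence.txt.pt").numpy()
faiss.normalize_L2(query)
faiss.extract_index_ivf(index).nprobe = 1024  # more probed clusters = better recall, slower search
dists, ids = index.search(query, 10)          # squared L2 distances and bank row ids; smaller distance = more similar
```

Because the row order of `keys.pt` follows the line order of `keys.txt`, the returned ids can be mapped back to sentences with `IndexTextQuery` from `src/indexing.py`, which is what `src/faiss_retrieve.py` does with the released indexes.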
82 | 83 | ### IV.1 - The KNN index bestiary 84 | For fast nearest-neighbor search, we provide pretrained [FAISS indexes](https://github.com/facebookresearch/faiss/wiki/The-index-factory) (see the table below). Each index enables fast NN search under a different compression scheme: the embeddings are compressed with scalar quantization (SQ4 or SQ8) and PCA dimensionality reduction (PCAR: 14, 40 or 256), and search is sped up with k-means clustering (32k or 262k centroids). Please see the [FAISS documentation](https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU) for more information on indexes and [how to train them](https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index). 85 | 86 | FAISS index | \#Sentences | \#Clusters | Quantization | \#PCAR | Machine | Size 87 | |:---: |:---: |:---: | :---: |:---: | :---: | :------: | 88 | [`100M_1GPU_16GB`](https://dl.fbaipublicfiles.com/sentaugment/100M_1GPU_16GB.faiss.idx) | 100M | 32768 | SQ4 | 256 | 1GPU16 | 14GiB 89 | [`100M_1GPU_32GB`](https://dl.fbaipublicfiles.com/sentaugment/100M_1GPU_32GB.faiss.idx) | 100M | 32768 | SQ8 | 256 | 1GPU32 | 26GiB 90 | [`1B_1GPU_16GB`](https://dl.fbaipublicfiles.com/sentaugment/1B_1GPU_16GB.faiss.idx) | 1B | 262144 | SQ4 | 14 | 1GPU16 | 15GiB 91 | [`1B_1GPU_32GB`](https://dl.fbaipublicfiles.com/sentaugment/1B_1GPU_32GB.faiss.idx) | 1B | 262144 | SQ4 | 40 | 1GPU32 | 28GiB 92 | [`1B_8GPU_32GB`](https://dl.fbaipublicfiles.com/sentaugment/1B_8GPU_32GB.faiss.idx) | 1B | 262144 | SQ4 | 256 | 8GPU32 | 136GiB 93 | 94 | We provide an index that fits on a single GPU with 16GiB of memory (1GPU16), a larger one that fits on a single GPU with 32GiB of memory (1GPU32), and one that requires 8 GPUs with 32GiB each (8GPU32). The 100M-sentence indexes are built from the first file "x01.cc.5b.tar.gz", and the 1B indexes use the first ten files. All indexes are built on SASE embeddings. 95 | 96 | ### IV.2 - How to use an index to query nearest neighbors 97 | You can get the K nearest neighbors of each sentence in an input text file by running: 98 | 99 | ```bash 100 | ## encode the input (query) sentences as SASE embeddings 101 | input=sentence.txt # input file containing a few (query) sentences 102 | python src/sase.py --input $input --model data/sase.pth --spm_model data/sase.spm --batch_size 64 --cuda "True" --output $input.pt 103 | 104 | index=data/100M_1GPU_16GB.faiss.idx # FAISS index path 105 | input=sentence.txt.pt # embeddings of the input sentences (produced by the command above) 106 | bank=data/keys.txt # text file with all the data (the offset file keys.ref.bin64 must be present in the same folder) 107 | K=10 # number of sentences to retrieve per query 108 | NPROBE=1024 # number of probes for querying the index 109 | 110 | python src/faiss_retrieve.py --input $input --bank $bank --index $index --K $K --nprobe $NPROBE --gpu "True" > nn.txt & 111 | ``` 112 | This can also be used for paraphrase mining. 113 | 114 | 115 | ## Reference 116 | If you found the resources here useful, please consider citing our paper: 117 | 118 | ``` 119 | @article{du2020self, 120 | title={Self-training Improves Pre-training for Natural Language Understanding}, 121 | author={Du, Jingfei and Grave, Edouard and Gunel, Beliz and Chaudhary, Vishrav and Celebi, Onur and Auli, Michael and Stoyanov, Ves and Conneau, Alexis}, 122 | journal={arXiv preprint arXiv:2010.02194}, 123 | year={2020} 124 | } 125 | ``` 126 | 127 | ## License 128 | 129 | See the [LICENSE](LICENSE) file for more details. 130 | The majority of SentAugment is licensed under CC-BY-NC.
However, license information for PyTorch code is available at https://github.com/pytorch/pytorch/blob/master/LICENSE 131 | -------------------------------------------------------------------------------- /sentaugment_figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/SentAugment/e92dc7039335dcaa96396e66cf03f50c9899dacf/sentaugment_figure.png -------------------------------------------------------------------------------- /src/compress_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | import sys 5 | import os 6 | import argparse 7 | 8 | import numpy as np 9 | import torch 10 | DIR = os.path.dirname(os.path.realpath(__file__)) 11 | sys.path.append(DIR + '/../src/lib') 12 | from indexing import CompressText 13 | 14 | 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser(description="Generating ref file to support fetching text from memmap") 19 | parser.add_argument("--input", type=str, help="input text file") 20 | args = parser.parse_args() 21 | CompressText(args.input) 22 | 23 | 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /src/faiss_retrieve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # 4 | """ 5 | Script for retrieving nearest neighbors of sentences from the bank using a given faiss index 6 | Example: python src/faiss_retrieve.py --input $input --bank $bank --index $index --K $K 7 | """ 8 | 9 | import argparse 10 | import faiss 11 | import os 12 | import sys 13 | import time 14 | import torch 15 | 16 | from indexing import IndexLoad, IndexSearchKNN, IndexTextOpen 17 | 18 | parser = argparse.ArgumentParser(description="retrieve nearest neighbors of sentences") 19 | parser.add_argument("--input", type=str, required=True , help="input pytorch embeddings") 20 | parser.add_argument("--bank", type=str, required=True, help="compressed text file") 21 | parser.add_argument("--index", type=str, required=True, help="faiss index") 22 | parser.add_argument("--K", type=int, default=100, help="number of nearest neighbors per sentence") 23 | parser.add_argument("--nprobe", type=int, default=1024, help="number of probes for the FAISS index") 24 | parser.add_argument("--gpu", type=str, default="True", help="use gpu") 25 | 26 | args = parser.parse_args() 27 | assert args.gpu in ["True", "False"] 28 | args.gpu = eval(args.gpu) 29 | 30 | # load query embeddings 31 | query_emb = torch.load(args.input).numpy() 32 | 33 | # normalize embeddings 34 | faiss.normalize_L2(query_emb) 35 | 36 | # load the index 37 | index = IndexLoad(args.index, args.nprobe, args.gpu) 38 | 39 | # query the index and print retrieved neighbors 40 | txt_mmap, ref_mmap = IndexTextOpen(args.bank) 41 | nns = IndexSearchKNN(index, query_emb, txt_mmap, ref_mmap, args.K) 42 | for nn in nns: 43 | print(nn) 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/flat_retrieve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
3 | # 4 | """ 5 | Script that retrieves nearest neighbors of sentences from the bank (exact cosine search) 6 | Example: python src/flat_retrieve.py --input $input.pt --bank $bank --emb data/keys.pt --K $K 7 | """ 8 | 9 | import os 10 | import sys 11 | import torch 12 | import argparse 13 | import time 14 | 15 | DIR = os.path.dirname(os.path.realpath(__file__)) 16 | sys.path.append(DIR + '/../src/lib') 17 | from indexing import IndexTextOpen, IndexTextQuery 18 | 19 | parser = argparse.ArgumentParser(description="retrieve nearest neighbors of sentences") 20 | parser.add_argument("--input", type=str, required=True, help="pytorch embeddings of the query sentences") 21 | parser.add_argument("--bank", type=str, required=True, help="bank text file (its .ref.bin64 offset file must be in the same folder)") 22 | parser.add_argument("--emb", type=str, required=True, help="pytorch embeddings of the text bank") 23 | parser.add_argument("--K", type=int, default=100, help="number of nearest neighbors per sentence") 24 | 25 | args = parser.parse_args() 26 | 27 | # load query embeddings and bank embeddings 28 | query_emb = torch.load(args.input) 29 | bank_emb = torch.load(args.emb) 30 | 31 | # L2-normalize embeddings so that dot products are cosine similarities 32 | query_emb.div_(query_emb.norm(2, 1, keepdim=True).expand_as(query_emb)) 33 | bank_emb.div_(bank_emb.norm(2, 1, keepdim=True).expand_as(bank_emb)) 34 | 35 | # score and rank 36 | scores = bank_emb.mm(query_emb.t()) # B x Q 37 | _, indices = torch.topk(scores, args.K, dim=0) # K x Q 38 | 39 | # fetch and print the retrieved text 40 | txt_mmap, ref_mmap = IndexTextOpen(args.bank) 41 | for query_idx in range(indices.size(1)): 42 | for k in range(args.K): 43 | print(IndexTextQuery(txt_mmap, ref_mmap, indices[k][query_idx].item())) 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/indexing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # 4 | 5 | # indexing and search with FAISS 6 | 7 | import faiss 8 | import os.path 9 | import sys 10 | import numpy as np 11 | import torch 12 | 13 | 14 | ############################################################################### 15 | # create a FAISS index on the given data 16 | 17 | def IndexCreate(input_path, idx_type, output_path, normalize=True, dim=512): 18 | 19 | assert idx_type == 'FlatL2', 'only FlatL2 index is currently supported' 20 | x = torch.load(input_path).numpy() 21 | print(' - creating FAISS index') 22 | idx = faiss.IndexFlatL2(dim) 23 | if normalize: 24 | faiss.normalize_L2(x) 25 | idx.add(x) 26 | print(' - saving index into ' + output_path) 27 | faiss.write_index(idx, output_path) 28 | return x, idx 29 | 30 | 31 | def LoadTextSimple(text_fname): 32 | """ 33 | Naive version that loads the text into a python list, 34 | used to retrieve text from a sentence idx returned by faiss 35 | NOTE: inefficient, will be replaced with mmap 36 | """ 37 | with open(text_fname, 'r', encoding='utf-8', errors='ignore') as fin: 38 | sentences = [s.strip() for s in fin] 39 | return sentences 40 | 41 | 42 | def CompressText(txt_fname): 43 | """ 44 | generate the ref binary file storing the starting byte offset of each sentence 45 | """ 46 | fname = txt_fname.replace('.txt', '.ref.bin64') 47 | offsets = [0] 48 | with open(txt_fname, 'r', encoding='utf-8', errors='ignore') as fin: 49 | for line in fin: 50 | offsets.append(offsets[-1] + len(bytes(line, encoding='utf-8', errors='ignore'))) 51 | offsets = np.array(offsets[:-1], dtype=np.int64) # drop the end-of-file offset: one starting offset per sentence 52 | offsets.tofile(fname) 53 | 54 | 55 | ############################################################################### 56 | # Opens a text file with the sentences corresponding to the indices used 57 | # by a FAISS index 58 | # We also need the reference file with the byte offsets to the beginning 59 | # of each sentence 60 | # optionally: array with the number of words per sentence 61 | # All arrays are memory mapped 62 | 63 | def IndexTextOpen(txt_fname): 64 | # print('Reading text corpus') 65 | # print(' - texts: {:s}'.format(txt_fname)) 66 | txt_mmap = np.memmap(txt_fname, mode='r', dtype=np.uint8) 67 | fname = txt_fname.replace('.txt', '.ref.bin32') 68 | if os.path.isfile(fname): 69 | # print(' - sentence start offsets (32 bit): {}'.format(fname)) 70 | ref_mmap = np.memmap(fname, mode='r', dtype=np.uint32) 71 | else: 72 | fname = txt_fname.replace('.txt', '.ref.bin64') 73 | if os.path.isfile(fname): 74 | # print(' - sentence start offsets (64 bit): {}'.format(fname)) 75 | ref_mmap = np.memmap(fname, mode='r', dtype=np.uint64) 76 | else: 77 | print('ERROR: no file with sentence start offsets found for ' + txt_fname, file=sys.stderr) 78 | sys.exit(1) 79 | # print(' - found {:d} sentences'.format(ref_mmap.shape[0])) 80 | return txt_mmap, ref_mmap 81 | 82 | 83 | ############################################################################### 84 | # Return the text for the given index 85 | 86 | def IndexTextQuery(txt_mmap, ref_mmap, idx): 87 | p = int(ref_mmap[idx]) # get starting byte position 88 | i = 0 89 | dim = 10000 # max sentence length in bytes 90 | b = bytearray(dim) 91 | # copy bytes until EOL, checking the bound before reading 92 | while i < dim and txt_mmap[p+i] != 10: 93 | b[i] = txt_mmap[p+i] 94 | i += 1 95 | return b[0:i].decode('utf-8') 96 | 97 | 98 | 99 | ############################################################################### 100 | # Load a FAISS index 101 | 102 | def IndexLoad(idx_path, nprobe=0, gpu=False):
103 | print('Reading FAISS index', file=sys.stderr) 104 | print(' - index: {:s}'.format(idx_path), file=sys.stderr) 105 | index = faiss.read_index(idx_path) 106 | print(' - found {:d} sentences of dim {:d}'.format(index.ntotal, index.d), file=sys.stderr) 107 | print(' - setting nprobe to {:d}'.format(nprobe), file=sys.stderr) 108 | if gpu: 109 | print(' - transfer index to %d GPUs ' % faiss.get_num_gpus(), file=sys.stderr) 110 | index = faiss.index_cpu_to_all_gpus(index) # replicate the index on all available GPUs 111 | faiss.GpuParameterSpace().set_index_parameter(index, 'nprobe', nprobe) 112 | return index 113 | 114 | 115 | ############################################################################### 116 | # Search the [kmax] nearest vectors of [x] in the given index 117 | # and return the corresponding text lines 118 | 119 | def IndexSearchKNN(index, x, T, R, kmax=1, dedup=True): 120 | D, I = index.search(x, kmax) 121 | all_res = [] 122 | for n in range(x.shape[0]): 123 | prev = set() # for deduplication 124 | res = [] 125 | for i in range(kmax): 126 | txt = IndexTextQuery(T, R, I[n, i]) 127 | # txt = T[I[n, i]] 128 | if dedup and txt not in prev: 129 | prev.add(txt) 130 | res.append((txt, D[n, i])) 131 | all_res.append(res) 132 | return all_res 133 | -------------------------------------------------------------------------------- /src/lib/__pycache__/indexing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/SentAugment/e92dc7039335dcaa96396e66cf03f50c9899dacf/src/lib/__pycache__/indexing.cpython-36.pyc -------------------------------------------------------------------------------- /src/lib/embeddings/__pycache__/bov.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/SentAugment/e92dc7039335dcaa96396e66cf03f50c9899dacf/src/lib/embeddings/__pycache__/bov.cpython-36.pyc -------------------------------------------------------------------------------- /src/sase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 | # 4 | 5 | """ 6 | Script that takes text as input and outputs SASE sentence embeddings 7 | Example: python src/sase.py --input $input --model $modelpath --spm_model $spmmodel --batch_size 64 --cuda "True" --output $output 8 | """ 9 | 10 | import os 11 | import sys 12 | import torch 13 | import argparse 14 | from collections import OrderedDict 15 | import sentencepiece as spm 16 | 17 | sys.path.insert(0, 'XLM/') 18 | 19 | from src.utils import AttrDict 20 | from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD 21 | from src.model.transformer import TransformerModel 22 | 23 | parser = argparse.ArgumentParser(description="SASE encoding") 24 | 25 | 26 | def main(): 27 | parser.add_argument("--input", type=str, default="", help="input file") 28 | parser.add_argument("--model", type=str, default="", help="model path") 29 | parser.add_argument("--spm_model", type=str, default="", help="spm model path") 30 | parser.add_argument("--batch_size", type=int, default=64, help="batch size") 31 | parser.add_argument("--max_words", type=int, default=100, help="max words") 32 | parser.add_argument("--cuda", type=str, default="True", help="use cuda") 33 | parser.add_argument("--output", type=str, default="", help="output file") 34 | args = parser.parse_args() 35 | 36 | # Reload a pretrained model 37 | reloaded = torch.load(args.model) 38 | params = AttrDict(reloaded['params']) 39 | 40 | # Reload the SPM model 41 | spm_model = spm.SentencePieceProcessor() 42 | spm_model.Load(args.spm_model) 43 | 44 | # cuda 45 | assert args.cuda in ["True", "False"] 46 | args.cuda = eval(args.cuda) 47 | 48 | # build dictionary / update parameters 49 | dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts']) 50 | params.n_words = len(dico) 51 | params.bos_index = dico.index(BOS_WORD) 52 | params.eos_index = dico.index(EOS_WORD) 53 | params.pad_index = dico.index(PAD_WORD) 54 | params.unk_index = dico.index(UNK_WORD) 55 | params.mask_index = dico.index(MASK_WORD) 56 | 57 | 58 | # build model / reload weights 59 | model = TransformerModel(params, dico, True, True) 60 | reloaded['model'] = OrderedDict({key.replace('module.', ''):reloaded['model'][key] for key in reloaded['model']}) 61 | model.load_state_dict(reloaded['model']) 62 | model.eval() 63 | 64 | if args.cuda: 65 | model.cuda() 66 | 67 | # load sentences 68 | sentences = [] 69 | with open(args.input) as f: 70 | for line in f: 71 | line = spm_model.EncodeAsPieces(line.rstrip()) 72 | line = line[:args.max_words - 1] # leave room for the prepended EOS token 73 | sentences.append(line) 74 | 75 | # encode sentences 76 | embs = [] 77 | for i in range(0, len(sentences), args.batch_size): 78 | batch = sentences[i:i+args.batch_size] 79 | lengths = torch.LongTensor([len(s) + 1 for s in batch]) 80 | bs, slen = len(batch), lengths.max().item() 81 | assert slen <= args.max_words 82 | 83 | x = torch.LongTensor(slen, bs).fill_(params.pad_index) 84 | for k in range(bs): 85 | sent = torch.LongTensor([params.eos_index] + [dico.index(w) for w in batch[k]]) 86 | x[:len(sent), k] = sent 87 | 88 | if args.cuda: 89 | x = x.cuda() 90 | lengths = lengths.cuda() 91 | 92 | with torch.no_grad(): 93 | embedding = model('fwd', x=x, lengths=lengths, langs=None, causal=False).contiguous()[0].cpu() # hidden state of the first (EOS) token as the sentence embedding 94 | 95 | embs.append(embedding) 96 | 97 | # save embeddings as a (n_sentences, emb_dim) tensor 98 | torch.save(torch.cat(embs, dim=0), args.output) 99 | 100 | if __name__ == "__main__": 101 | main() 102 | --------------------------------------------------------------------------------